In [None]:
pip install transformers



In [None]:
import re
from collections import Counter

import numpy as np
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from transformers import pipeline, set_seed



In [None]:
import nltk

# Resources used by the preprocessing step: the Punkt tokenizer behind
# word_tokenize, WordNet for the lemmatizer, and the English stop-word list.
# Newer NLTK releases (3.9+) load the tokenizer from 'punkt_tab' rather than
# the pickled 'punkt' package, so both are fetched to work on either version.
for resource in ('punkt', 'punkt_tab', 'wordnet', 'stopwords'):
    nltk.download(resource)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# Sample a continuation of a prompt from a text-generation model (GPT-Neo by default)
def generate_text(prompt, model_name='EleutherAI/gpt-neo-2.7B', max_length=50):
    """Return one sampled continuation of ``prompt``.

    Args:
        prompt: Seed text handed to the generator.
        model_name: Model identifier passed to ``pipeline`` as ``model``.
        max_length: Forwarded unchanged as the generator's ``max_length``.

    Returns:
        The ``'generated_text'`` field of the first result.

    Note:
        A new pipeline is built on every call, and ``do_sample=True`` is
        passed, so the text presumably differs between runs unless the
        random state is seeded beforehand.
    """
    text_generator = pipeline('text-generation', model=model_name)
    candidates = text_generator(prompt, max_length=max_length, do_sample=True)
    first_candidate = candidates[0]
    return first_candidate['generated_text']


In [None]:
# Function to preprocess text
def preprocess_text(text):
    """Clean, normalize, tokenize, lemmatize and stop-word-filter ``text``.

    Steps, in order:
      1. Drop ``<script>`` and ``<style>`` elements together with their contents.
      2. Remove every character that is not an ASCII letter or whitespace.
      3. Lowercase the result.
      4. Tokenize with NLTK ``word_tokenize``.
      5. Lemmatize each token with ``WordNetLemmatizer``.
      6. Discard English stop words.

    Args:
        text: Raw input string.

    Returns:
        The surviving tokens joined by single spaces.
    """
    # Each substitution feeds the next one. Tag removal must run before the
    # symbol stripping, otherwise the '<', '>' and '/' that the tag patterns
    # rely on would already be gone.
    cleaned_text = re.sub(r'<script[^>]*>[\s\S]*?</script>', '', text)
    cleaned_text = re.sub(r'<style[^>]*>[\s\S]*?</style>', '', cleaned_text)
    cleaned_text = re.sub(r'[^a-zA-Z\s]', '', cleaned_text)
    # Normalization: convert to lowercase
    cleaned_text = cleaned_text.lower()
    # Tokenization
    words = word_tokenize(cleaned_text)
    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    lemmatized_words = [lemmatizer.lemmatize(word) for word in words]
    # Remove stop words
    stop_words = set(stopwords.words('english'))
    filtered_words = [word for word in lemmatized_words if word not in stop_words]
    return ' '.join(filtered_words)

In [None]:
# Generate text with a sports-related prompt.
# Generation samples randomly, so the seed is fixed first to make the sample
# (and every number derived from it below) repeatable on a fresh kernel.
SEED = 42
set_seed(SEED)
generated_text = generate_text("Sports are", max_length=100)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.46k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/10.7G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/200 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/90.0 [00:00<?, ?B/s]

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In [None]:
# Run the generated sample through the cleaning / tokenizing / lemmatizing step
preprocessed_text = preprocess_text(text=generated_text)

In [None]:
# Show the cleaned sample under a heading
print("Preprocessed text:", preprocessed_text, sep="\n")

Preprocessed text:
sport different especially term fighting sport fight two guy really skilled hard win using kick punch said povetkin fivetime kickboxing world champion want fight biggest best guy world need use weapon different discipline threetime ufc bantamweight champ would


In [None]:
# Calculate the term frequency (TF) of every word in a single document
def calculate_tf(document):
    """Return the relative frequency of each word in ``document``.

    Args:
        document: Text whose words are separated by whitespace.

    Returns:
        Dict mapping each distinct word to ``occurrences / total_words``.
        A document with no words yields an empty dict.
    """
    words = document.split()
    word_count = len(words)
    # Count in a single pass; calling words.count() once per distinct word
    # rescans the whole list each time and is quadratic in document length.
    return {word: count / word_count for word, count in Counter(words).items()}


In [None]:
# Show the term frequencies of the preprocessed sample under a heading
print("\nTF values:", calculate_tf(preprocessed_text), sep="\n")


TF values:
{'discipline': 0.02631578947368421, 'povetkin': 0.02631578947368421, 'fighting': 0.02631578947368421, 'kick': 0.02631578947368421, 'different': 0.05263157894736842, 'weapon': 0.02631578947368421, 'want': 0.02631578947368421, 'win': 0.02631578947368421, 'biggest': 0.02631578947368421, 'hard': 0.02631578947368421, 'skilled': 0.02631578947368421, 'world': 0.05263157894736842, 'using': 0.02631578947368421, 'use': 0.02631578947368421, 'would': 0.02631578947368421, 'threetime': 0.02631578947368421, 'bantamweight': 0.02631578947368421, 'especially': 0.02631578947368421, 'sport': 0.05263157894736842, 'two': 0.02631578947368421, 'champion': 0.02631578947368421, 'need': 0.02631578947368421, 'champ': 0.02631578947368421, 'fivetime': 0.02631578947368421, 'guy': 0.05263157894736842, 'ufc': 0.02631578947368421, 'said': 0.02631578947368421, 'term': 0.02631578947368421, 'kickboxing': 0.02631578947368421, 'punch': 0.02631578947368421, 'really': 0.02631578947368421, 'fight': 0.05263157894736

In [None]:
# Calculate the inverse document frequency (IDF) of every word in the corpus
def calculate_idf(documents):
    """Return the smoothed IDF of every word that occurs in ``documents``.

    Uses ``idf(t) = ln((1 + N) / (1 + df(t))) + 1``, where ``N`` is the number
    of documents and ``df(t)`` the number of documents containing ``t`` as a
    whole word. This is the smoothed form scikit-learn's ``TfidfVectorizer``
    applies by default. It is always >= 1, so a word present in every
    document no longer gets a negative weight.

    Args:
        documents: List of whitespace-tokenizable strings.

    Returns:
        Dict mapping each distinct word to its IDF; empty for an empty corpus.
    """
    total_documents = len(documents)
    # Tokenize each document once. Membership in a token set matches whole
    # words only, whereas a substring test on the raw string would count
    # "champ" as present in a document that only contains "champion".
    doc_tokens = [set(doc.split()) for doc in documents]
    doc_freq = {}
    for tokens in doc_tokens:
        for word in tokens:
            doc_freq[word] = doc_freq.get(word, 0) + 1
    return {
        word: np.log((1 + total_documents) / (1 + count)) + 1
        for word, count in doc_freq.items()
    }

In [None]:
# Show the IDF of each word in the one-document corpus under a heading
print("\nIDF values:", calculate_idf([preprocessed_text]), sep="\n")



IDF values:
{'discipline': -0.6931471805599453, 'povetkin': -0.6931471805599453, 'fighting': -0.6931471805599453, 'kick': -0.6931471805599453, 'different': -0.6931471805599453, 'weapon': -0.6931471805599453, 'want': -0.6931471805599453, 'win': -0.6931471805599453, 'biggest': -0.6931471805599453, 'hard': -0.6931471805599453, 'skilled': -0.6931471805599453, 'world': -0.6931471805599453, 'using': -0.6931471805599453, 'use': -0.6931471805599453, 'would': -0.6931471805599453, 'threetime': -0.6931471805599453, 'bantamweight': -0.6931471805599453, 'especially': -0.6931471805599453, 'sport': -0.6931471805599453, 'two': -0.6931471805599453, 'champion': -0.6931471805599453, 'need': -0.6931471805599453, 'champ': -0.6931471805599453, 'fivetime': -0.6931471805599453, 'guy': -0.6931471805599453, 'ufc': -0.6931471805599453, 'said': -0.6931471805599453, 'term': -0.6931471805599453, 'kickboxing': -0.6931471805599453, 'punch': -0.6931471805599453, 'really': -0.6931471805599453, 'fight': -0.693147180559

In [None]:
# Combine per-document TF with corpus-wide IDF, without library help
def calculate_tfidf(documents):
    """Return one ``{word: tf * idf}`` dict per document, in input order.

    Args:
        documents: List of document strings; each is passed to
            ``calculate_tf`` and the whole list to ``calculate_idf``.

    Returns:
        List with one dict per document, mapping each of that document's
        words to its TF multiplied by the word's corpus IDF.
    """
    idf_by_word = calculate_idf(documents)
    per_doc_scores = []
    for document in documents:
        term_freqs = calculate_tf(document)
        scores = {}
        for term, freq in term_freqs.items():
            scores[term] = freq * idf_by_word[term]
        per_doc_scores.append(scores)
    return per_doc_scores


In [None]:
# Normalize TF-IDF
def normalize_tfidf(tfidf):
    """Scale each document's TF-IDF vector to unit Euclidean (L2) length.

    Args:
        tfidf: List of ``{word: weight}`` dicts, one per document.

    Returns:
        New list of dicts with every weight divided by its document's L2
        norm. A document whose norm is zero (empty or all-zero weights) is
        copied through unchanged rather than divided.
    """
    normalized_tfidf = []
    for doc_tfidf in tfidf:
        norm = np.linalg.norm(list(doc_tfidf.values()))
        if norm == 0:
            # Nothing to scale; dividing by a zero norm would produce NaN.
            normalized_tfidf.append(dict(doc_tfidf))
            continue
        normalized_tfidf.append({word: value / norm for word, value in doc_tfidf.items()})
    return normalized_tfidf

In [None]:
# Reference TF-IDF computed by scikit-learn on the same one-document corpus
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform([preprocessed_text])
feature_names = tfidf_vectorizer.get_feature_names_out()
# One single-entry {term: weight} dict per vocabulary column of row 0
sklearn_tfidf = [
    {term: tfidf_matrix[0, column]}
    for column, term in enumerate(feature_names)
]


In [None]:
# Show the scikit-learn weights under a heading
print("\nTF-IDF using scikit-learn:", sklearn_tfidf, sep="\n")


TF-IDF using scikit-learn:
[{'bantamweight': 0.14433756729740646}, {'best': 0.14433756729740646}, {'biggest': 0.14433756729740646}, {'champ': 0.14433756729740646}, {'champion': 0.14433756729740646}, {'different': 0.2886751345948129}, {'discipline': 0.14433756729740646}, {'especially': 0.14433756729740646}, {'fight': 0.2886751345948129}, {'fighting': 0.14433756729740646}, {'fivetime': 0.14433756729740646}, {'guy': 0.2886751345948129}, {'hard': 0.14433756729740646}, {'kick': 0.14433756729740646}, {'kickboxing': 0.14433756729740646}, {'need': 0.14433756729740646}, {'povetkin': 0.14433756729740646}, {'punch': 0.14433756729740646}, {'really': 0.14433756729740646}, {'said': 0.14433756729740646}, {'skilled': 0.14433756729740646}, {'sport': 0.2886751345948129}, {'term': 0.14433756729740646}, {'threetime': 0.14433756729740646}, {'two': 0.14433756729740646}, {'ufc': 0.14433756729740646}, {'use': 0.14433756729740646}, {'using': 0.14433756729740646}, {'want': 0.14433756729740646}, {'weapon': 0.14

In [None]:
# Run the hand-written TF-IDF on the same one-document corpus, then L2-normalize it
tfidf_scratch = calculate_tfidf(documents=[preprocessed_text])
normalized_tfidf_scratch = normalize_tfidf(tfidf=tfidf_scratch)

In [None]:
# Show the normalized hand-written weights under a heading
print("\nTF-IDF from scratch:", normalized_tfidf_scratch, sep="\n")


TF-IDF from scratch:
[{'discipline': -0.14433756729740646, 'povetkin': -0.14433756729740646, 'fighting': -0.14433756729740646, 'kick': -0.14433756729740646, 'different': -0.2886751345948129, 'weapon': -0.14433756729740646, 'want': -0.14433756729740646, 'win': -0.14433756729740646, 'biggest': -0.14433756729740646, 'hard': -0.14433756729740646, 'skilled': -0.14433756729740646, 'world': -0.2886751345948129, 'using': -0.14433756729740646, 'use': -0.14433756729740646, 'would': -0.14433756729740646, 'threetime': -0.14433756729740646, 'bantamweight': -0.14433756729740646, 'especially': -0.14433756729740646, 'sport': -0.2886751345948129, 'two': -0.14433756729740646, 'champion': -0.14433756729740646, 'need': -0.14433756729740646, 'champ': -0.14433756729740646, 'fivetime': -0.14433756729740646, 'guy': -0.2886751345948129, 'ufc': -0.14433756729740646, 'said': -0.14433756729740646, 'term': -0.14433756729740646, 'kickboxing': -0.14433756729740646, 'punch': -0.14433756729740646, 'really': -0.144337