In [None]:
from transformers import pipeline
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import re
from sklearn.feature_extraction.text import TfidfVectorizer

nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

def generate_documents(phrase, num_documents, model_name="EleutherAI/gpt-neo-2.7B"):
    gen_pipeline = pipeline("text-generation", model=model_name)
    documents = [gen_pipeline(phrase, max_length=150, num_return_sequences=1)[0]['generated_text'] for _ in range(num_documents)]
    return documents

def preprocess_text(text):
    cleaned_text = re.sub(r'[^a-zA-Z\s]', '', text)
    normalized_text = cleaned_text.lower()
    tokenized_text = word_tokenize(normalized_text)
    lemmatizer = WordNetLemmatizer()
    lemmatized_text = [lemmatizer.lemmatize(word) for word in tokenized_text]
    stop_words = set(stopwords.words('english'))
    processed_text = [word for word in lemmatized_text if word not in stop_words]
    unique_words = set(processed_text)
    len_3 = [unique_word for unique_word in unique_words if len(unique_word) < 3]
    processed_text = [word for word in processed_text if word not in len_3]
    unique_words = set(processed_text)
    return processed_text

# Generate documents
phrase = "The quick brown fox jumps over the lazy dog."
num_documents = 2
documents = generate_documents(phrase, num_documents)

# Preprocess documents
preprocessed_documents = [preprocess_text(doc) for doc in documents]

# Convert preprocessed documents to strings
preprocessed_documents_strings = [' '.join(doc) for doc in preprocessed_documents]

# Apply TF-IDF
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(preprocessed_documents_strings)

# Print TF-IDF matrix
print("TF-IDF Matrix:")
print(tfidf_matrix.toarray())


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


TF-IDF Matrix:
[[0.16247628 0.40461123 0.08123814 0.08123814 0.34680963 0.08123814
  0.08123814 0.08123814 0.08123814 0.40461123 0.08123814 0.08123814
  0.08123814 0.28900802 0.34680963 0.08123814 0.         0.08123814
  0.         0.40461123 0.08123814 0.08123814 0.08123814 0.08123814
  0.08123814 0.08123814 0.08123814 0.08123814 0.08123814 0.08123814
  0.08123814 0.08123814 0.08123814]
 [0.         0.43183468 0.         0.         0.48941264 0.
  0.         0.         0.         0.31667877 0.         0.
  0.         0.40304571 0.40304571 0.         0.12138572 0.
  0.12138572 0.34546775 0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.        ]]


In [6]:
import math
import numpy as np

def calculate_tf(term, document):
    term_frequency = document.count(term)
    total_terms = len(document)
    return term_frequency / total_terms if total_terms else 0

def calculate_idf(term, corpus):
    total_documents = len(corpus)
    documents_containing_term = sum(1 for doc in corpus if term in doc)
    return math.log((total_documents + 1) / (documents_containing_term + 1)) + 1

def calculate_tfidf(term, document, corpus):
    tf = calculate_tf(term, document)
    idf = calculate_idf(term, corpus)
    return tf * idf

def l2_normalize(matrix):
    norms = np.linalg.norm(matrix, axis=1)
    return matrix / norms[:, np.newaxis]

# Assume preprocessed_documents is your list of preprocessed documents
corpus = preprocessed_documents

# Get unique terms from all documents
unique_terms = set(term for doc in corpus for term in doc)

# Calculate TF-IDF matrix
tfidf_matrix = np.zeros((len(corpus), len(unique_terms)))
for i, doc in enumerate(corpus):
    for j, term in enumerate(unique_terms):
        tfidf_matrix[i, j] = calculate_tfidf(term, doc, corpus)

# Normalize TF-IDF matrix using L2 normalization
normalized_tfidf_matrix = l2_normalize(tfidf_matrix)

# Print normalized TF-IDF matrix
print("Normalized TF-IDF Matrix:")
print(normalized_tfidf_matrix)


Normalized TF-IDF Matrix:
[[0.16247628 0.08123814 0.08123814 0.40461123 0.08123814 0.40461123
  0.08123814 0.         0.08123814 0.08123814 0.08123814 0.08123814
  0.08123814 0.08123814 0.08123814 0.34680963 0.08123814 0.08123814
  0.08123814 0.40461123 0.08123814 0.08123814 0.08123814 0.08123814
  0.08123814 0.28900802 0.34680963 0.08123814 0.08123814 0.08123814
  0.         0.08123814 0.08123814]
 [0.         0.         0.         0.34546775 0.         0.43183468
  0.         0.12138572 0.         0.         0.         0.
  0.         0.         0.         0.48941264 0.         0.
  0.         0.31667877 0.         0.         0.         0.
  0.         0.40304571 0.40304571 0.         0.         0.
  0.12138572 0.         0.        ]]
