In [None]:
!pip install faker

Collecting faker
  Downloading Faker-24.3.0-py3-none-any.whl (1.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: faker
Successfully installed faker-24.3.0


In [None]:
from faker import Faker
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import string
import nltk
import re
import math
import numpy as np

# Fetch the NLTK data packages used by the cells below: the 'punkt' tokenizer
# models plus the 'wordnet' and 'stopwords' packages. Each call downloads into
# the local nltk_data directory.
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
def generate_documents(word, num_docs=5, max_length=100, seed=None):
    """Generate random Faker paragraphs that each end with ``word``.

    Args:
        word: Term appended (after a single space) to every document.
        num_docs: Number of documents to generate.
        max_length: Maximum number of characters kept from the random
            paragraph before ``word`` is appended.
        seed: Optional seed passed to the Faker instance so the same corpus
            is produced on every run. ``None`` (the default) leaves the
            generator unseeded.

    Returns:
        A list of ``num_docs`` strings.
    """
    faker = Faker()
    if seed is not None:
        faker.seed_instance(seed)

    documents = []
    for _ in range(num_docs):
        doc = faker.paragraph()
        if len(doc) > max_length:
            truncated = doc[:max_length]
            next_char = doc[len(truncated):len(truncated) + 1]
            # A hard cut can land in the middle of a word, which would put a
            # meaningless fragment into the vocabulary. If the first dropped
            # character is not whitespace, back up to the last space.
            if next_char and not next_char.isspace():
                head, sep, _fragment = truncated.rpartition(' ')
                if sep:
                    truncated = head
                # No space at all: the limit is shorter than the first word,
                # so the hard cut is kept as the only possible fallback.
            doc = truncated.rstrip()
        documents.append(doc + ' ' + word)
    return documents

In [None]:
word = "technology"
documents = generate_documents(word)

# The stop-word set and the lemmatizer are the same for every document,
# so they are created once up front.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

preprocessed_documents = []
for raw_text in documents:
    # Lower-case, then drop every character that is neither a word
    # character nor whitespace.
    cleaned = re.sub(r'[^\w\s]', '', raw_text.lower())
    kept = [tok for tok in word_tokenize(cleaned) if tok not in stop_words]
    preprocessed_documents.append(' '.join(lemmatizer.lemmatize(tok) for tok in kept))

vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(preprocessed_documents)

# Vocabulary terms, looked up below by matrix column index.
feature_names = vectorizer.get_feature_names_out()

# Print the non-zero TF-IDF entries of each document row.
for doc_idx in range(len(documents)):
    print(f"TFIDF for Document {doc_idx+1}:")
    for col in tfidf_matrix[doc_idx, :].nonzero()[1]:
        print(f"{feature_names[col]}: {tfidf_matrix[doc_idx, col]}")
    print()


TFIDF for Document 1:
technology: 0.12633109113985516
adult: 0.26511998030695105
computer: 0.26511998030695105
former: 0.26511998030695105
speak: 0.26511998030695105
strategy: 0.26511998030695105
bag: 0.26511998030695105
course: 0.26511998030695105
candidate: 0.26511998030695105
west: 0.26511998030695105
outside: 0.26511998030695105
resource: 0.26511998030695105
whole: 0.26511998030695105
plan: 0.26511998030695105
deep: 0.26511998030695105

TFIDF for Document 2:
movement: 0.2749592798183583
brother: 0.2749592798183583
talk: 0.2749592798183583
purpose: 0.2749592798183583
cold: 0.2749592798183583
national: 0.2749592798183583
end: 0.2749592798183583
decision: 0.2749592798183583
value: 0.2749592798183583
dream: 0.2749592798183583
assume: 0.2749592798183583
doctor: 0.2749592798183583
various: 0.2749592798183583
technology: 0.13101957007640605

TFIDF for Document 3:
certain: 0.2859822148170311
someone: 0.2859822148170311
parent: 0.2859822148170311
social: 0.2859822148170311
according: 0.2859

In [None]:
def preprocess_text(doc):
    """Normalize a raw document into a list of tokens.

    The text is lower-cased, every character in ``string.punctuation`` is
    removed, the result is tokenized with ``word_tokenize``, English stop
    words are dropped, and each remaining token is lemmatized.

    Args:
        doc: The raw document string.

    Returns:
        The list of processed tokens, in their original order.
    """
    # Delete all punctuation characters in a single translate pass.
    strip_table = str.maketrans('', '', string.punctuation)
    no_punct = doc.lower().translate(strip_table)

    english_stops = set(stopwords.words('english'))
    wnl = WordNetLemmatizer()
    return [
        wnl.lemmatize(token)
        for token in word_tokenize(no_punct)
        if token not in english_stops
    ]

In [None]:
def calculate_tf(word, document):
    """Return the term frequency of ``word`` in ``document``.

    Args:
        word: The term to count.
        document: The document to search; in this notebook a list of tokens,
            so ``count`` matches whole tokens.

    Returns:
        Occurrences of ``word`` divided by ``len(document)``, or ``0.0`` when
        the document is empty.
    """
    total_words = len(document)
    if total_words == 0:
        # An empty document contains no terms; returning 0.0 avoids a
        # ZeroDivisionError.
        return 0.0
    return document.count(word) / total_words

def calculate_idf(word, documents):
    """Return the smoothed inverse document frequency of ``word``.

    Computes ``ln((1 + N) / (1 + df)) + 1`` where ``N`` is the number of
    documents and ``df`` is how many of them contain ``word``.

    Args:
        word: The term to score.
        documents: A sized collection of documents, each supporting ``in``.

    Returns:
        The IDF value as a float.
    """
    containing = 0
    for doc in documents:
        if word in doc:
            containing += 1
    ratio = (1 + len(documents)) / (1 + containing)
    return math.log(ratio) + 1

def calculate_tf_idf(word, document, documents):
    """Return the TF-IDF weight of ``word`` for one document in a corpus.

    Args:
        word: The term to score.
        document: The document the term frequency is measured in.
        documents: The full corpus used for the inverse document frequency.

    Returns:
        ``calculate_tf(word, document) * calculate_idf(word, documents)``.
    """
    return calculate_tf(word, document) * calculate_idf(word, documents)


preprocessed_documents = [preprocess_text(doc) for doc in documents]

# Un-normalized TF-IDF weight of every distinct term, one dict per document.
# The comprehension variable is named `term` and stays local to the
# comprehension, so the notebook-level `word` is not overwritten.
tfidf_scores = []
for doc in preprocessed_documents:
    tfidf_doc = {
        term: calculate_tf_idf(term, doc, preprocessed_documents)
        for term in set(doc)
    }
    tfidf_scores.append(tfidf_doc)

# Scale each document's weights by the square root of their summed squares
# (L2 norm).
normalized_tfidf_scores = []
for doc in tfidf_scores:
    # Built-in sum() is used here: passing a generator to np.sum is deprecated
    # and emits a DeprecationWarning.
    sum_of_squares = sum(score ** 2 for score in doc.values())
    sqrt_sum_of_squares = math.sqrt(sum_of_squares)
    normalized_doc = {term: score / sqrt_sum_of_squares for term, score in doc.items()}
    normalized_tfidf_scores.append(normalized_doc)

# Print TFIDF for each document
for i, doc in enumerate(normalized_tfidf_scores):
    print(f"TFIDF for Document {i+1}:")
    for term, score in doc.items():
        print(f"{term}: {score}")
    print()


TFIDF for Document 1:
candidate: 0.2651199803069511
strategy: 0.2651199803069511
adult: 0.2651199803069511
bag: 0.2651199803069511
whole: 0.2651199803069511
deep: 0.2651199803069511
west: 0.2651199803069511
outside: 0.2651199803069511
former: 0.2651199803069511
speak: 0.2651199803069511
computer: 0.2651199803069511
plan: 0.2651199803069511
technology: 0.1263310911398552
course: 0.2651199803069511
resource: 0.2651199803069511

TFIDF for Document 2:
various: 0.2749592798183583
talk: 0.2749592798183583
cold: 0.2749592798183583
decision: 0.2749592798183583
movement: 0.2749592798183583
technology: 0.13101957007640605
purpose: 0.2749592798183583
dream: 0.2749592798183583
assume: 0.2749592798183583
national: 0.2749592798183583
end: 0.2749592798183583
doctor: 0.2749592798183583
value: 0.2749592798183583
brother: 0.2749592798183583

TFIDF for Document 3:
never: 0.2749592798183583
spend: 0.2749592798183583
establish: 0.2749592798183583
c: 0.2749592798183583
according: 0.2749592798183583
floor: 0

  sum_of_squares = np.sum(score ** 2 for score in doc.values())
