1. Import necessary libraries

In [1]:
import ast  # safe parsing of stringified token lists
import math  # use for log function
from collections import Counter

import pandas as pd

2. Load pre-processed file

In [3]:
# Path of the pre-processed reviews CSV; it is read into the DataFrame `df`.
file_path = "Processed_Reviews.csv"
df = pd.read_csv(filepath_or_buffer=file_path)

3. Extract the tokenized text data

In [5]:
# Parse every non-null 'tokenized' cell from its string form back into a
# Python object. ast.literal_eval accepts only Python literals, so unlike
# eval() it cannot execute arbitrary code that might be embedded in the CSV.
# NOTE(review): assumes each cell holds a list literal of tokens — confirm
# against the preprocessing step that wrote the file.
tokenized_reviews = df['tokenized'].dropna().apply(ast.literal_eval)

4. Function to compute Term Frequency (TF)

In [7]:
def compute_tf(document):
    """Return the term frequency of every distinct token in ``document``.

    Args:
        document: A sized collection of hashable tokens (e.g. a list of words).

    Returns:
        dict mapping each distinct token to its number of occurrences divided
        by ``len(document)``. An empty document yields an empty dict.
    """
    frequencies = {}
    for term, occurrences in Counter(document).items():
        # Normalise the raw count by the document length.
        frequencies[term] = occurrences / len(document)
    return frequencies

5. Function to compute Inverse Document Frequency (IDF)

In [9]:
def compute_idf(documents):
    """Compute the inverse document frequency (IDF) of every token in a corpus.

    The IDF of a token is ``log(N / df)``, where ``N`` is the number of
    documents and ``df`` is the number of documents that contain the token at
    least once (natural logarithm).

    Args:
        documents: A sized collection of documents, each an iterable of
            hashable tokens.

    Returns:
        dict mapping each token to its IDF, with keys in the order the tokens
        are first seen in the corpus. A token present in every document gets
        0.0. An empty corpus yields an empty dict.
    """
    N = len(documents)  # Total number of documents

    # Count, in a single pass over the corpus, how many documents contain each
    # token. This replaces rescanning every document for every vocabulary word.
    doc_freq = Counter()
    for doc in documents:
        # dict.fromkeys de-duplicates tokens within the document while keeping
        # their order, so each document contributes at most 1 per token and
        # the resulting key order is reproducible across runs.
        for word in dict.fromkeys(doc):
            doc_freq[word] += 1

    # Every counted token occurs in at least one document, so count >= 1.
    return {word: math.log(N / count) for word, count in doc_freq.items()}

6. Function to compute TF-IDF

In [11]:
def compute_tfidf(document, idf):
    """Weight each token's term frequency in ``document`` by its IDF.

    Args:
        document: The tokens of a single document; passed to ``compute_tf``.
        idf: Mapping of token -> IDF value. It is indexed with every distinct
            token of ``document``, so a token missing from it raises KeyError.

    Returns:
        dict mapping each distinct token of ``document`` to ``tf * idf``.
    """
    term_frequencies = compute_tf(document)
    return {
        term: frequency * idf[term]
        for term, frequency in term_frequencies.items()
    }

7. Convert tokenized reviews into a list

In [13]:
# Materialise the parsed reviews as a plain Python list (one entry per review).
documents = tokenized_reviews.to_list()

8. Compute TF scores, convert to DataFrame and save to a CSV file

In [15]:
# One {token: tf} dict per review, in the same order as `documents`.
tf_data = list(map(compute_tf, documents))
# Rows are reviews and columns are tokens; a token absent from a review has
# no entry in that review's dict and is filled with 0.
tf_df = pd.DataFrame(data=tf_data).fillna(value=0)
tf_df.to_csv(path_or_buf="tf_scores.csv", index=False)

9. Compute IDF scores, convert to DataFrame and save to a CSV file

In [17]:
# Corpus-wide IDF value for every token found in `documents`.
idf = compute_idf(documents)
# Wrapping the dict in a list gives a single-row frame with one column per token.
idf_df = pd.DataFrame(data=[idf]).fillna(value=0)
idf_df.to_csv(path_or_buf="idf_scores.csv", index=False)

10. Compute TF-IDF scores, convert to DataFrame and save to a CSV file

In [19]:
# One {token: tf-idf} dict per review, all weighted with the shared `idf` table.
tfidf_data = [compute_tfidf(tokens, idf) for tokens in documents]
# Rows are reviews and columns are tokens; a token absent from a review is
# filled with 0.
tfidf_df = pd.DataFrame(data=tfidf_data).fillna(value=0)
tfidf_df.to_csv(path_or_buf="tfidf_scores.csv", index=False)