## Computing the TF-IDF


In [42]:
import numpy as np
import pandas as pd
import re
import matplotlib.pyplot as plt
from collections import Counter
from nltk.corpus import stopwords
from tqdm import tqdm
from sklearn.decomposition import PCA
import plotly.express as px
#nltk.download('stopwords')
#nltk.download('punkt')

In [2]:
# The tokenizer of nltk was too slow for that case.
# So, we will use python regular expressions to 
# increase speed

def tokenize_and_count(text):
    """
    This function will tokenize a text using regular
    expressions and counts the number of times each
    token is present in the text
    """
    # Tokenize the text
    p = re.compile("[a-z]+") # Using a regular expression here
    tokens = p.findall(text.lower())
    tokens_counts = Counter(tokens)
    final_dict = {k: v for k, v in sorted(tokens_counts.items(), key=lambda item: item[1], reverse=True)}
    return final_dict

def fair_compose(tokens_positive, tokens_negative, n=100):
    """
    Given the most common words on each class, this function
    will compose a vocabulary with the words that are common
    to both classes while keeping the most popular words for
    each class. 
    """
    vocabulary = []
    i = 0
    j = 0
    while len(set(vocabulary)) < n:
        while True:
            w = list(tokens_positive.keys())[i]
            i +=1
            if w not in stopwords.words('english'):
                vocabulary.append(w)
                break
        while True:
            w = list(tokens_negative.keys())[j]
            j +=1
            if w not in stopwords.words('english'):
                vocabulary.append(w)
                break
    return list(set(vocabulary))

def compute_vocabulary(text_positive, text_negative, n=100):
    """
    This function will create a vocabulary with the n most frequent words
    that are present in the text_positive and text_negative at the same
    time.  
    The function will also remove the stopwords and compute the frequency
    for each word in the positive texts and the negative texts.
    Returns two dictionaries.
    """
    tokens_positive = tokenize_and_count(text_positive)
    del(tokens_positive['br'])
    tokens_negative = tokenize_and_count(text_negative)
    del(tokens_negative['br'])
    vocabulary = fair_compose(tokens_positive, tokens_negative, n)
    return vocabulary

In [3]:
data = pd.read_csv("./IMDB_Dataset.csv")

In [4]:
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [5]:
data.shape

(50000, 2)

In [None]:
# Our training set will be formed by 20k positive and 20k negative reviews
vocabulary_size = 2000
data_positive = data[data['sentiment']=='positive']
data_negative = data[data['sentiment']=='negative']
# Training set
text_positive_train = " ".join(list(data_positive['review'][:20000]))
text_negative_train = " ".join(list(data_negative["review"][:20000]))
# Get the vocabulary
vocabulary = compute_vocabulary(text_positive_train, text_negative_train, vocabulary_size)

In [10]:
len(vocabulary)

500

In [14]:
def compute_IDF(t, list_of_documents):
    """
    Compute the IDF for a given term t
    Receives a word t and a list of documents.
    Each document on the list has been already
    tokenized.
    """
    denominator = 1
    for d in list_of_documents:
        if t in d:
            denominator +=1
    return np.log(len(list_of_documents)/denominator)

def compute_TF_IDF_vector(document, IDF_dictionary, vocabulary):
    """
    Compute the TF-IDF for a given document.
    The document is already tokenized.
    Returns a list
    """
    n = len(document)
    words = list(set(document)&set(vocabulary))
    terms_counts = Counter(document)
    tf_idf_vector = [0 for i in range(len(vocabulary))]
    for w in words:
        Ix = vocabulary.index(w)
        tf_idf_vector[Ix] = terms_counts[w] / n * IDF_dictionary[w]
    return tf_idf_vector

 



In [None]:
# Tokenize all documents
list_of_documents = []
p = re.compile("[a-z]+") # The regular expression for tokenizing a document
for d in data["review"]:
    list_of_documents.append(p.findall(d.lower()))

In [None]:
# Compute the IDF_dictionary
IDF_dict = {}
for i in tqdm(range(len(vocabulary))):
    t = vocabulary[i]
    IDF_dict[t] = compute_IDF(t, list_of_documents)



100%|██████████| 500/500 [01:13<00:00,  6.78it/s]


In [25]:
# Compute the TF-IDF matrix
TF_IDF = []
for i in tqdm(range(len(list_of_documents))):
    d = list_of_documents[i]
    TF_IDF.append(compute_TF_IDF_vector(d, IDF_dict, vocabulary))

100%|██████████| 50000/50000 [00:11<00:00, 4205.44it/s]


In [29]:
TF_IDF_matrix = np.array(TF_IDF)

In [31]:
pca = PCA(n_components=2)
pca.fit(TF_IDF_matrix)

In [32]:
print(pca.explained_variance_ratio_)

[0.01026865 0.00917184]


In [33]:
reduced_data = PCA(n_components=2).fit_transform(TF_IDF_matrix)

In [41]:
np.sum(TF_IDF_matrix.sum(axis=1) == 0)

0

In [45]:
data2plot = pd.DataFrame({'x': reduced_data[:, 0],
                          'y': reduced_data[:, 1], 
                          'sentiment': data["sentiment"]})

In [58]:
data2plot.head()
# only select some values to plot
n = 100
df_positive = data2plot[data2plot['sentiment']=="positive"][:n]
df_negative = data2plot[data2plot['sentiment']=="negative"][:n]
df = pd.concat([df_positive, df_negative])


In [60]:
fig = px.scatter(df, x="x", y="y", color="sentiment", opacity=0.7)
fig.show()

## Let's do a 3D plot

In [49]:
reduced_data3D = PCA(n_components=3).fit_transform(TF_IDF_matrix)

In [61]:
data2plot3D = pd.DataFrame({'x': reduced_data3D[:, 0],
                            'y': reduced_data3D[:, 1], 
                            'z': reduced_data3D[:, 2], 
                            'sentiment': data['sentiment']})
# only select some values to plot
n = 100
df_positive = data2plot3D[data2plot3D['sentiment']=="positive"][:n]
df_negative = data2plot3D[data2plot3D['sentiment']=="negative"][:n]
df = pd.concat([df_positive, df_negative])

In [63]:
fig = px.scatter_3d(df, x='x', y='y', z='z', color='sentiment', opacity=0.9)
fig.show()