<a href="https://colab.research.google.com/github/vglykos/NLP/blob/main/git_TF_IDF_from_scratch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [30]:
#libraries
import pandas as pd
import numpy as np
import nltk

from nltk import word_tokenize
# word_tokenize relies on the Punkt tokenizer data. Newer NLTK releases (3.9+)
# load it from the "punkt_tab" package instead of the pickled "punkt" one, so
# fetch both to keep Restart & Run All working regardless of the NLTK version.
nltk.download("punkt")
nltk.download("punkt_tab")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [31]:
# Download the BBC news dataset as a CSV file.
# -nc (no-clobber) skips the download when the file is already present locally.
!wget -nc https://lazyprogrammer.me/course_files/nlp/bbc_text_cls.csv

File ‘bbc_text_cls.csv’ already there; not retrieving.



In [32]:
# Load the downloaded CSV into a DataFrame and take a quick look at the first rows
df = pd.read_csv("bbc_text_cls.csv")
df.head(n=3)

Unnamed: 0,text,labels
0,Ad sales boost Time Warner profit\n\nQuarterly...,business
1,Dollar gains on Greenspan speech\n\nThe dollar...,business
2,Yukos unit buyer faces loan claim\n\nThe owner...,business


In [33]:
# Summarize the columns: dtypes and non-null counts
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2225 entries, 0 to 2224
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    2225 non-null   object
 1   labels  2225 non-null   object
dtypes: object(2)
memory usage: 34.9+ KB


In [34]:
# Build the vocabulary: every distinct (lower-cased) token in the corpus gets an
# integer index, handed out in order of first appearance.
# Products:
#   word2idx       - dictionary mapping token -> index
#   tokenized_docs - list of lists; one list of token indices per document

word2idx = {}
tokenized_docs = []

for doc in df["text"]:  # one document at a time
    tokens = word_tokenize(doc.lower())  # break the document down into words
    # An unseen token receives the next free index, which is always the current
    # vocabulary size; a token seen before keeps the index it already has.
    tokenized_docs.append(
        [word2idx.setdefault(token, len(word2idx)) for token in tokens]
    )

idx = len(word2idx)  # next unused index (equals the vocabulary size)

In [35]:
# Sanity check: compare the number of tokenized documents with the DataFrame shape
print(len(tokenized_docs))
print(df.shape)

2225
(2225, 2)


In [36]:
# tokenized_docs[0] is the doc_as_int list (token indices) of the first document, df.text[0]

In [46]:
# Reverse mapping (index -> word); it will come in handy later for turning
# indices back into readable terms.
idx2word = dict(zip(word2idx.values(), word2idx.keys()))

In [37]:
# Dimensions of the Term-Frequency matrix
N_doc = df["text"].shape[0]  # number of documents (rows)
V = len(word2idx)            # vocabulary size (columns)

print("the size of Term-Frequency matrix: {} x {}".format(N_doc, V))

the size of Term-Frequency matrix: 2225 x 34762


In [38]:
# Term-Frequency matrix (the from-scratch counterpart of CountVectorizer):
# tf[d, t] is the number of times token index t occurs in document d.

# start from an all-zero (documents x vocabulary) matrix
tf = np.zeros(shape=(N_doc, V))

# fill one row per document
for i, doc_as_int in enumerate(tokenized_docs):
    # collapse the document into (distinct token index, occurrence count) pairs;
    # the explicit integer dtype keeps an empty document valid as an index array
    token_ids, counts = np.unique(
        np.asarray(doc_as_int, dtype=np.intp), return_counts=True
    )
    tf[i, token_ids] += counts


In [39]:
# Confirm the matrix dimensions: (number of documents, vocabulary size)
print(tf.shape)

(2225, 34762)


In [40]:
# Peek at the top-left 9 x 9 corner of the count matrix
tf[:9, :9]

array([[1., 4., 1., 3., 4., 5., 1., 5., 3.],
       [0., 0., 1., 2., 0., 0., 0., 0., 2.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 1., 1., 3., 1.],
       [0., 0., 0., 0., 0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 0., 0., 0., 0., 1.],
       [0., 0., 1., 0., 0., 0., 0., 0., 2.],
       [0., 0., 0., 0., 0., 0., 0., 0., 2.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0.]])

In [41]:
# Document Frequency: for each term, the number of documents that contain it
# at least once (the Inverse Document Frequency is derived from it afterwards).
DF = (tf > 0).sum(axis=0)
print(DF.shape[0])
DF[:5]

34762


array([ 12, 204, 127, 746,  13])

In [42]:
# Inverse Document Frequency: log(N_doc / DF), so the fewer documents a term
# appears in, the larger its weight.
idf = np.log(np.true_divide(N_doc, DF))

In [43]:
# TF-IDF: weight every term count by that term's idf.
# idf has one entry per vocabulary column, so it broadcasts across all rows.
tf_idf = np.multiply(tf, idf)

print(tf_idf.shape)
print(tf_idf[:4, :4])

(2225, 34762)
[[5.22260554 9.5575688  2.86332511 3.27835978]
 [0.         0.         2.86332511 2.18557319]
 [0.         0.         0.         0.        ]
 [0.         2.3893922  0.         0.        ]]


In [51]:
# Now let's choose a document index at random.
# The generator is seeded so that Restart & Run All always selects the same
# document; change SEED to explore a different one.
SEED = 123
rng = np.random.default_rng(SEED)
i = int(rng.integers(N_doc))  # uniform draw from 0 .. N_doc - 1

In [52]:

# Show the label and the headline (first line of the text) of the chosen document
row = df.iloc[i]
headline = row["text"].partition("\n")[0]
print("label: {}".format(row["labels"]))
print("text: {}".format(headline))

print("Top 5 terms:")

# tf-idf scores of the chosen document, one per vocabulary term
scores = tf_idf[i, :]
# term indices ordered from highest to lowest score (negate for descending order)
indeces = np.argsort(-scores)
indeces

label: politics
text: Clarke faces ID cards rebellion
Top 5 terms:


array([ 1931,  1592, 23370, ..., 11738, 11752, 34761])

In [53]:
# Translate the five highest-scoring term indices back into words, one per line
top_terms = [idx2word[j] for j in indeces[:5]]
print("\n".join(top_terms))

cards
clarke
rebellion
id
bill
