In [1]:
import re
import pandas as pd
import numpy as np
import sklearn
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

# Optional: silence every warning emitted while the notebook runs.
import warnings


def warn(*args, **kwargs):
    """Accept any arguments and do nothing; stand-in for ``warnings.warn``."""
    return None


# Replace the stdlib entry point with the no-op above, and additionally
# install an 'ignore' filter for warnings.
warnings.warn = warn
warnings.filterwarnings('ignore')

In [2]:
# Toy corpus: two short documents used to illustrate tf and tf-idf.
D = [
    "We like dogs and cats",
    "We like cars and planes",
]

In [3]:
# Term-frequency matrix for the toy corpus: CountVectorizer learns the
# vocabulary from D and counts each term per document.
cv = CountVectorizer()
tf_mat = cv.fit_transform(D)

# Wrap the counts in a DataFrame labelled with the learned vocabulary terms.
tf = pd.DataFrame(
    data=tf_mat.toarray(),
    columns=cv.get_feature_names_out(),
)
tf

Unnamed: 0,and,cars,cats,dogs,like,planes,we
0,1,0,1,1,1,0,1
1,1,1,0,0,1,1,1


In [4]:
# Build the tf-idf matrix from the term-frequency DataFrame.
# smooth_idf=False turns off idf smoothing in the transformer.
tfidf_trans = TfidfTransformer(smooth_idf=False)
tfidf_mat = tfidf_trans.fit_transform(tf)

# The transformer was fitted on a DataFrame, so the column labels are taken
# from the transformer itself.
tfidf = pd.DataFrame(
    data=tfidf_mat.toarray(),
    columns=tfidf_trans.get_feature_names_out(),
)

In [5]:
# Non-normalized tf-idf: each term-count column scaled by that term's
# fitted idf weight (idf_ broadcasts across the document rows).
pd.DataFrame(
    data=tf.to_numpy() * tfidf_trans.idf_,
    columns=tfidf_trans.get_feature_names_out(),
)

Unnamed: 0,and,cars,cats,dogs,like,planes,we
0,1.0,0.0,1.693147,1.693147,1.0,0.0,1.0
1,1.0,1.693147,0.0,0.0,1.0,1.693147,1.0


In [6]:
# Normalized tf-idf: display the DataFrame built from the TfidfTransformer
# output in In [4].
tfidf

Unnamed: 0,and,cars,cats,dogs,like,planes,we
0,0.338381,0.0,0.572929,0.572929,0.338381,0.0,0.338381
1,0.338381,0.572929,0.0,0.0,0.338381,0.572929,0.338381


In [7]:
# d: the tf-idf vector of the first document (row 0)
print(tfidf.iloc[0])
# d . d: sum of the squared components of that vector, rounded
(tfidf.iloc[0] * tfidf.iloc[0]).sum().round()

and       0.338381
cars      0.000000
cats      0.572929
dogs      0.572929
like      0.338381
planes    0.000000
we        0.338381
Name: 0, dtype: float64


1.0

In [8]:
# Download the CSV and keep only its second column (position 1) as a Series.
df = (
    pd.read_csv('https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBM-ML0187EN-SkillsNetwork/labs/module%203/data/tfidf.csv')
    .iloc[:, 1]
)

In [9]:
# Preview the first five comments.
df.head(n=5)

0    Personally I have no idea what my IQ is. I’ve ...
1    I'm skeptical. A heavier lid would be needed t...
2    I think I have 100 cm of books on the subject....
3    Is chemistry hard in uni. Ive read somewhere t...
4    In addition to the other comment, you can crit...
Name: Comment, dtype: object

In [10]:
# Text normalizer passed to CountVectorizer below: lowercases and strips digits.
def preprocess_text(text):
    """Return ``text`` lowercased with every run of digits deleted.

    Digits are removed outright (not replaced by a space), so "a1b" becomes
    "ab" and surrounding whitespace is left untouched.
    """
    lowered = text.lower()
    return re.sub(r'\d+', '', lowered)

In [11]:
# Vectorize the comments with the custom preprocessor, capping the
# vocabulary at 500 features.
cv = CountVectorizer(
    preprocessor=preprocess_text,
    max_features=500,
)
tf = cv.fit_transform(df)

# Dense view of the sparse count matrix, labelled with the vocabulary terms.
pd.DataFrame(data=tf.toarray(), columns=cv.get_feature_names_out())

Unnamed: 0,able,about,above,acid,acids,actually,add,after,again,ago,...,wouldn,wrong,www,yeah,year,years,yes,you,your,yourself
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1581,0,4,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,10,6,0
1582,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,1,0
1583,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,3,0,0
1584,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [12]:
# NOTE(review): this cell repeats the vectorization of In [11] verbatim
# (same parameters, same input), so `cv` and `tf` are simply re-created.
vectorizer_options = dict(max_features=500, preprocessor=preprocess_text)
cv = CountVectorizer(**vectorizer_options)
tf = cv.fit_transform(df)

# Show the counts as a DataFrame with one column per vocabulary term.
pd.DataFrame(
    tf.toarray(),
    columns=cv.get_feature_names_out(),
)

Unnamed: 0,able,about,above,acid,acids,actually,add,after,again,ago,...,wouldn,wrong,www,yeah,year,years,yes,you,your,yourself
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1581,0,4,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,10,6,0
1582,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,1,0
1583,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,3,0,0
1584,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [13]:
# tf-idf for the comments corpus, using the transformer's default settings.
tfidf_trans = TfidfTransformer()

# `tf` is the sparse count matrix returned by CountVectorizer; the transformer
# accepts it as-is, so there is no need to densify it with .toarray() first.
tfidf_mat = tfidf_trans.fit_transform(tf)

# Only the (small) result is densified, for display; the column labels come
# from the vectorizer because the transformer was fitted on an unlabeled matrix.
tfidf = pd.DataFrame(
    tfidf_mat.toarray(),
    columns=cv.get_feature_names_out(),
)
tfidf

Unnamed: 0,able,about,above,acid,acids,actually,add,after,again,ago,...,wouldn,wrong,www,yeah,year,years,yes,you,your,yourself
0,0.00000,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0
1,0.00000,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0
2,0.11354,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.044232,0.000000,0.0
3,0.00000,0.000000,0.0,0.0,0.0,0.0,0.0,0.188718,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0
4,0.00000,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.079460,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1581,0.00000,0.214699,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.331533,0.308927,0.0
1582,0.00000,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.096378,0.149678,0.0
1583,0.00000,0.121809,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.225714,0.000000,0.0
1584,0.00000,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0


In [14]:
# NOTE(review): this rebinds `tfidf` -- the tf-idf DataFrame after In [13] --
# to a fresh TfidfTransformer, and otherwise repeats the In [13] computation.
# The name is kept because later cells may rely on it; confirm the intent.
tfidf = TfidfTransformer()

# Fit on the sparse count matrix directly; converting it to a dense array
# first is unnecessary because the transformer accepts sparse input.
tfidf_mat = tfidf.fit_transform(tf)

# Dense view for display, labelled with the vectorizer's vocabulary terms.
pd.DataFrame(
    tfidf_mat.toarray(),
    columns=cv.get_feature_names_out(),
)

Unnamed: 0,able,about,above,acid,acids,actually,add,after,again,ago,...,wouldn,wrong,www,yeah,year,years,yes,you,your,yourself
0,0.00000,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0
1,0.00000,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0
2,0.11354,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.044232,0.000000,0.0
3,0.00000,0.000000,0.0,0.0,0.0,0.0,0.0,0.188718,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0
4,0.00000,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.079460,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1581,0.00000,0.214699,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.331533,0.308927,0.0
1582,0.00000,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.096378,0.149678,0.0
1583,0.00000,0.121809,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.225714,0.000000,0.0
1584,0.00000,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0


In [15]:
# NOTE(review): at this point `tfidf` is the TfidfTransformer assigned in
# In [14], not the tf-idf DataFrame from In [13]; confirm which one is meant
# to be displayed here.
tfidf