In [13]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer

corpus = [
    "Captain America eating pizza, Thor is eating pizza, Ironman ate pizza already",
    "Apple is announcing new iphone tomorrow",
    "Tesla is announcing new model-3 tomorrow",
    "Google is announcing new pixel-6 tomorrow",
    "Microsoft is announcing new surface tomorrow",
    "Amazon is announcing new eco-dot tomorrow",
    "I am eating dosa and you are eating mangoes"
]

In [15]:
v = TfidfVectorizer()
v.fit(corpus)
transform_output = v.transform(corpus)

In [16]:
print(v.vocabulary_)

{'captain': 9, 'america': 3, 'eating': 12, 'pizza': 23, 'thor': 26, 'is': 17, 'ironman': 16, 'ate': 8, 'already': 0, 'apple': 6, 'announcing': 5, 'new': 21, 'iphone': 15, 'tomorrow': 27, 'tesla': 25, 'model': 20, 'google': 14, 'pixel': 22, 'microsoft': 19, 'surface': 24, 'amazon': 2, 'eco': 13, 'dot': 11, 'am': 1, 'dosa': 10, 'and': 4, 'you': 28, 'are': 7, 'mangoes': 18}


In [17]:
i = v.vocabulary_.get('thor')
v.idf_[i]

np.float64(2.386294361119891)

In [18]:
# Print the idf of each word

all_feature_names = v.get_feature_names_out()

for word in all_feature_names:

  indx = v.vocabulary_.get(word)

  #get the score
  idf_score = v.idf_[indx]

  print(f"{word}: {idf_score}")


already: 2.386294361119891
am: 2.386294361119891
amazon: 2.386294361119891
america: 2.386294361119891
and: 2.386294361119891
announcing: 1.2876820724517808
apple: 2.386294361119891
are: 2.386294361119891
ate: 2.386294361119891
captain: 2.386294361119891
dosa: 2.386294361119891
dot: 2.386294361119891
eating: 1.9808292530117262
eco: 2.386294361119891
google: 2.386294361119891
iphone: 2.386294361119891
ironman: 2.386294361119891
is: 1.1335313926245225
mangoes: 2.386294361119891
microsoft: 2.386294361119891
model: 2.386294361119891
new: 1.2876820724517808
pixel: 2.386294361119891
pizza: 2.386294361119891
surface: 2.386294361119891
tesla: 2.386294361119891
thor: 2.386294361119891
tomorrow: 1.2876820724517808
you: 2.386294361119891


In [19]:
# Print the transformed output from tf-idf
print(transform_output.toarray())

[[0.23582143 0.         0.         0.23582143 0.         0.
  0.         0.         0.23582143 0.23582143 0.         0.
  0.39150408 0.         0.         0.         0.23582143 0.11201929
  0.         0.         0.         0.         0.         0.7074643
  0.         0.         0.23582143 0.         0.        ]
 [0.         0.         0.         0.         0.         0.30652086
  0.5680354  0.         0.         0.         0.         0.
  0.         0.         0.         0.5680354  0.         0.26982671
  0.         0.         0.         0.30652086 0.         0.
  0.         0.         0.         0.30652086 0.        ]
 [0.         0.         0.         0.         0.         0.30652086
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.26982671
  0.         0.         0.5680354  0.30652086 0.         0.
  0.         0.5680354  0.         0.30652086 0.        ]
 [0.         0.         0.         0.         0.         0.

#### **Custom Use case**

- E-commerce data
- 4 labels: Household, Electronics, Clothing & Books
- Task is to create a classification model that can predict a given description of a product and classify them as one of the labels using TfIdf vectorization technique

In [27]:
df = pd.read_csv('Ecommerce_data.csv')

In [29]:
df.head(5)

Unnamed: 0,Text,label
0,Urban Ladder Eisner Low Back Study-Office Comp...,Household
1,"Contrast living Wooden Decorative Box,Painted ...",Household
2,IO Crest SY-PCI40010 PCI RAID Host Controller ...,Electronics
3,ISAKAA Baby Socks from Just Born to 8 Years- P...,Clothing & Accessories
4,Indira Designer Women's Art Mysore Silk Saree ...,Clothing & Accessories


In [30]:
df.label.value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
Household,6000
Electronics,6000
Clothing & Accessories,6000
Books,6000


In [31]:
df.shape

(24000, 2)

In [32]:
df['label_num'] = df['label'].map({
    'Household': 0,
    'Electronics': 1,
    'Clothing & Accessories': 2,
    'Books': 3
})

In [33]:
df.head(5)

Unnamed: 0,Text,label,label_num
0,Urban Ladder Eisner Low Back Study-Office Comp...,Household,0
1,"Contrast living Wooden Decorative Box,Painted ...",Household,0
2,IO Crest SY-PCI40010 PCI RAID Host Controller ...,Electronics,1
3,ISAKAA Baby Socks from Just Born to 8 Years- P...,Clothing & Accessories,2
4,Indira Designer Women's Art Mysore Silk Saree ...,Clothing & Accessories,2


#### **Train Test Split**

In [34]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df.Text, df.label_num, test_size=0.2)

In [35]:
len(X_train)

19200

In [36]:
len(X_test)

4800

#### **Tfidf Vectorizer**

In [37]:
tf = TfidfVectorizer()

X_train_tf = tf.fit_transform(X_train)
X_test_tf = tf.transform(X_test)

#### **Classification Model**

In [38]:
clf = DecisionTreeClassifier()
clf.fit(X_train_tf,y_train)

y_pred = clf.predict(X_test_tf)

In [39]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.91      0.92      0.92      1152
           1       0.94      0.93      0.94      1199
           2       0.97      0.95      0.96      1228
           3       0.95      0.96      0.96      1221

    accuracy                           0.94      4800
   macro avg       0.94      0.94      0.94      4800
weighted avg       0.94      0.94      0.94      4800



#### **Testing on a new data**

In [46]:
#msg = ["Indira Designer Women's Art Mysore Silk Saree With Blouse Piece (Star-Red) This Saree Is Of Art Mysore Silk & Comes With Blouse Piece."]
msg = ["Radhika's designer women art saree silk blouse piece, saree with pipili chandua work"]
msg_tf = tf.transform(msg)

clf.predict(msg_tf)

array([2])