In [52]:
from sklearn.feature_extraction.text import TfidfVectorizer
import kagglehub
import pandas as pd
import numpy as np
import os

# Toy corpus for the TF-IDF walkthrough: six short sentences, a few of which
# share words ("is", "an", "will", "tomorrow") so that not every term ends up
# with the same IDF weight.
corpus = [
    "Thor is eating a pizza, Loki is eating an ice cream, Ironman is eating a cake",
    "Apple will announce iPhone22 tomorrow",
    "Gemini flash 2.5 is an excellent model",
    "Microsoft is announcing Q4 results tomorrow",
    "Amazon will enter the 10 minutes delivery space on Monday",
    "Something is fishy",
]

In [53]:
# Learn the vocabulary and IDF weights from the toy corpus, then encode the
# same documents as a TF-IDF document-term matrix.
v = TfidfVectorizer()
v.fit(corpus)
transformed_output = v.transform(corpus)
# Mapping of each learned term to its column index in the matrix.
print(v.vocabulary_)

{'thor': 31, 'is': 18, 'eating': 9, 'pizza': 25, 'loki': 19, 'an': 2, 'ice': 15, 'cream': 7, 'ironman': 17, 'cake': 6, 'apple': 5, 'will': 33, 'announce': 3, 'iphone22': 16, 'tomorrow': 32, 'gemini': 14, 'flash': 13, 'excellent': 11, 'model': 22, 'microsoft': 20, 'announcing': 4, 'q4': 26, 'results': 27, 'amazon': 1, 'enter': 10, 'the': 30, '10': 0, 'minutes': 21, 'delivery': 8, 'space': 29, 'on': 24, 'monday': 23, 'something': 28, 'fishy': 12}


In [54]:
# Print the learned IDF weight of every vocabulary term, one "term weight" pair
# per line.
# get_feature_names_out() returns the terms ordered by column index, which is
# the same order as v.idf_, so the two arrays are paired positionally instead
# of looking every term up again in v.vocabulary_.
all_feature_names = v.get_feature_names_out()
for word, idf_weight in zip(all_feature_names, v.idf_):
    print(f"{word} {idf_weight}")

10 2.252762968495368
amazon 2.252762968495368
an 1.8472978603872037
announce 2.252762968495368
announcing 2.252762968495368
apple 2.252762968495368
cake 2.252762968495368
cream 2.252762968495368
delivery 2.252762968495368
eating 2.252762968495368
enter 2.252762968495368
excellent 2.252762968495368
fishy 2.252762968495368
flash 2.252762968495368
gemini 2.252762968495368
ice 2.252762968495368
iphone22 2.252762968495368
ironman 2.252762968495368
is 1.336472236621213
loki 2.252762968495368
microsoft 2.252762968495368
minutes 2.252762968495368
model 2.252762968495368
monday 2.252762968495368
on 2.252762968495368
pizza 2.252762968495368
q4 2.252762968495368
results 2.252762968495368
something 2.252762968495368
space 2.252762968495368
the 2.252762968495368
thor 2.252762968495368
tomorrow 1.8472978603872037
will 1.8472978603872037


In [55]:
# Display the first two raw documents of the toy corpus.
corpus[:2]

['Thor is eating a pizza, Loki is eating an ice cream, Ironman is eating a cake',
 'Apple will announce iPhone22 tomorrow']

In [56]:
# Display the TF-IDF vectors of the first two documents.
# Slice the sparse matrix first and densify only those two rows; calling
# toarray() on the full matrix would allocate a dense (n_docs x n_terms) array
# just to throw most of it away.
transformed_output[:2].toarray()

array([[0.        , 0.        , 0.1840985 , 0.        , 0.        ,
        0.        , 0.22450645, 0.22450645, 0.        , 0.67351935,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.22450645, 0.        , 0.22450645, 0.39957152, 0.22450645,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.22450645, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.22450645, 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.47974754, 0.        ,
        0.47974754, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.47974754, 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.39339985, 0.39339985]])

In [57]:
# Download the Kaggle e-commerce text-classification dataset and build the path
# to the CSV file this notebook reads.
path = kagglehub.dataset_download("saurabhshahane/ecommerce-text-classification")
csv_path = os.path.join(path, "ecommerceDataset.csv")

# Fail fast, listing what was actually downloaded, if the expected file is not
# there. This replaces the commented-out os.walk() debugging loop.
if not os.path.isfile(csv_path):
    available = sorted(
        os.path.join(root, name)
        for root, _, names in os.walk(path)
        for name in names
    )
    raise FileNotFoundError(
        f"{csv_path} not found; files in downloaded dataset: {available}"
    )

In [58]:
# The file is read with header=None (its first line is data, not column names),
# so the two column names are supplied explicitly.
column_names = ["label", "text"]
df = pd.read_csv(csv_path, names=column_names, header=None)
df.head(10)

Unnamed: 0,label,text
0,Household,Paper Plane Design Framed Wall Hanging Motivat...
1,Household,"SAF 'Floral' Framed Painting (Wood, 30 inch x ..."
2,Household,SAF 'UV Textured Modern Art Print Framed' Pain...
3,Household,"SAF Flower Print Framed Painting (Synthetic, 1..."
4,Household,Incredible Gifts India Wooden Happy Birthday U...
5,Household,Pitaara Box Romantic Venice Canvas Painting 6m...
6,Household,Paper Plane Design Starry Night Vangoh Wall Ar...
7,Household,Pitaara Box Romantic Venice Canvas Painting 6m...
8,Household,SAF 'Ganesh Modern Art Print' Painting (Synthe...
9,Household,Paintings Villa UV Textured Modern Art Print F...


In [59]:
# Number of rows per category label in the raw dataset.
df["label"].value_counts()

label
Household                 19313
Books                     11820
Electronics               10621
Clothing & Accessories     8671
Name: count, dtype: int64

In [60]:
## handle imbalance simply: undersample every class down to the rarest one
# Derive the target size from the data instead of hard-coding it, so the cell
# keeps working if the dataset is updated.
min_size = int(df["label"].value_counts().min())


def _undersample(label):
    """Return `min_size` randomly sampled rows of `df` whose label is `label`.

    A fresh `random_state=42` is used for every class, so each class gets a
    reproducible draw that does not depend on the other classes.
    """
    return df[df["label"] == label].sample(min_size, random_state=42)


df_household = _undersample("Household")
df_books = _undersample("Books")
df_electronics = _undersample("Electronics")
df_clothing = _undersample("Clothing & Accessories")

In [61]:
# Stack the four per-class samples row-wise (original row indices are kept)
# and display the resulting label counts.
class_samples = [df_household, df_books, df_electronics, df_clothing]
df_balanced = pd.concat(class_samples)
df_balanced["label"].value_counts()

label
Household                 8671
Books                     8671
Electronics               8671
Clothing & Accessories    8671
Name: count, dtype: int64

In [62]:
# Encode each category name as an integer class id in a new column.
label_to_id = {
    "Household": 0,
    "Books": 1,
    "Electronics": 2,
    "Clothing & Accessories": 3,
}
df_balanced["new_label"] = df_balanced["label"].map(label_to_id)

In [63]:
# Display the first rows of the balanced frame, including the new integer label.
df_balanced.head()

Unnamed: 0,label,text,new_label
12340,Household,"Riedel VINUM Cognac Glasses, Set of 2 Size:Set...",0
7144,Household,ANSIO Polyester Shower Curtain for Bathroom - ...,0
4400,Household,Little India Meenakari Work Lord Ganesh Marble...,0
9992,Household,PETRICE Triple Paper Dispenser | 4 in 1 Foil C...,0
16601,Household,KC Cab Copper PVC Insulated Wire 90 m Coil(Red...,0


In [64]:
from sklearn.model_selection import train_test_split
# 80/20 train/test split of the raw text against the integer label, stratified
# on that label and seeded for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    df_balanced["text"],
    df_balanced["new_label"],
    test_size=0.2,
    stratify=df_balanced["new_label"],
    random_state=42,
)

In [65]:
# Print the size of the training split, then display its first few documents.
print(X_train.shape)
X_train.head()

(27747,)


36851    MISMXC Women's Concealed Lace Small Thigh Purs...
38321    Fashion League Ultra-Thin Transparent skin Ank...
6775     HOKIPO PVC Kitchen Placemats, Plastic, Multico...
3128     Gion Portable Pedestal 5 Blades Lightweight ai...
36637    Xs and Os Women Halter Neck Babydoll Lingerie ...
Name: text, dtype: object

In [72]:
# Print the size of the held-out split and how many of its documents are missing.
# (The former mid-cell `X_test.head()` was dropped: it was not the last
# expression of the cell, so it never displayed anything.)
print(X_test.shape)
print(X_test.isnull().sum())

# TfidfVectorizer rejects NaN documents, so replace missing text with "" in
# BOTH splits. Filling only the test split works merely because the single
# missing row happened to land there with this seed; with a different split,
# fit() on the training data would fail.
X_train = X_train.fillna("")
X_test = X_test.fillna("")

(6937,)
1


In [66]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

# Two-stage text classifier: TF-IDF features fed into a k-nearest-neighbours
# model, both with their default settings.
pipeline_steps = [
    ('vectorizer', TfidfVectorizer()),
    ('KNN', KNeighborsClassifier()),
]
clf = Pipeline(steps=pipeline_steps)

In [74]:
# Fit the pipeline on the training split, predict the held-out documents, and
# print per-class precision / recall / F1.
y_pred = clf.fit(X_train, y_train).predict(X_test)
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.94      0.95      0.94      1734
           1       0.96      0.95      0.96      1734
           2       0.95      0.96      0.96      1735
           3       0.97      0.97      0.97      1734

    accuracy                           0.96      6937
   macro avg       0.96      0.96      0.96      6937
weighted avg       0.96      0.96      0.96      6937



### Observation
F1-scores are high for every class (0.94–0.97), and overall accuracy on the test split is 0.96.

In [75]:
# Display the true labels of the first five test documents.
y_test[:5]

48662    2
37647    3
37414    3
34324    3
36843    3
Name: new_label, dtype: int64

In [76]:
# Display the predicted labels of the first five test documents.
y_pred[:5]

array([2, 3, 3, 3, 3])

## Observation
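All 5 of the first 5 predictions match the true labels :) This is only a quick sanity check; the classification report above is the actual evaluation.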