In [1]:
import pandas as pd 
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint
from sklearn.pipeline import Pipeline

In [2]:
# Record the scikit-learn version this notebook was executed with.
import sklearn
print(sklearn.__version__)


1.2.2


In [3]:
# Load the raw dataset from a CSV file in the current working directory.
df = pd.read_csv("Ecommerce_data.csv")

In [4]:
# Inspect column names, dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24000 entries, 0 to 23999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Text    24000 non-null  object
 1   label   24000 non-null  object
dtypes: object(2)
memory usage: 375.1+ KB


In [5]:
# Preview the first five rows.
df.head()

Unnamed: 0,Text,label
0,Urban Ladder Eisner Low Back Study-Office Comp...,Household
1,"Contrast living Wooden Decorative Box,Painted ...",Household
2,IO Crest SY-PCI40010 PCI RAID Host Controller ...,Electronics
3,ISAKAA Baby Socks from Just Born to 8 Years- P...,Clothing & Accessories
4,Indira Designer Women's Art Mysore Silk Saree ...,Clothing & Accessories


In [6]:
# NOTE(review): same schema check as the earlier df.info() cell; nothing has modified df in between.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24000 entries, 0 to 23999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Text    24000 non-null  object
 1   label   24000 non-null  object
dtypes: object(2)
memory usage: 375.1+ KB


In [7]:
# Count rows that are exact duplicates (across all columns) of an earlier row.
df.duplicated().sum()

10166

In [8]:
# Remove exact duplicate rows, keeping the first occurrence of each.
# The frame is modified in place and the index is NOT reset, so row labels
# have gaps from here on (label-based lookups such as df['Text'][2] still
# refer to the original row numbers).
df.drop_duplicates(inplace=True)

In [9]:
# Map each string label to an integer class id; `le` is kept so the ids can be
# translated back to label names later via le.inverse_transform / le.classes_.
le = LabelEncoder()
category_codes = le.fit_transform(df['label'])
df['Category'] = category_codes

# Class balance after encoding.
df['Category'].value_counts()

1    4103
3    3410
0    3219
2    3102
Name: Category, dtype: int64

In [10]:
# Relative frequency of every (label, encoded id) pair, shown as a flat table.
label_share = df[['label', 'Category']].value_counts(normalize=True)
label_share.reset_index()

Unnamed: 0,label,Category,0
0,Clothing & Accessories,1,0.296588
1,Household,3,0.246494
2,Books,0,0.232688
3,Electronics,2,0.22423


In [11]:
# Full text of the row labelled 2 (explicit label-based lookup).
df.loc[2, 'Text']

'IO Crest SY-PCI40010 PCI RAID Host Controller Card Brings new life to any old desktop PC. Connects up to 4 SATA II high speed SATA hard disk drives. Supports Windows 8 and Server 2012'

In [12]:
# Preview only the raw text and its label.
df.loc[:, ['Text', 'label']].head()

Unnamed: 0,Text,label
0,Urban Ladder Eisner Low Back Study-Office Comp...,Household
1,"Contrast living Wooden Decorative Box,Painted ...",Household
2,IO Crest SY-PCI40010 PCI RAID Host Controller ...,Electronics
3,ISAKAA Baby Socks from Just Born to 8 Years- P...,Clothing & Accessories
4,Indira Designer Women's Art Mysore Silk Saree ...,Clothing & Accessories


In [13]:
# Load spaCy's 'en_core_web_sm' pipeline once; preprocesor() reads this module-level `nlp`.
nlp = spacy.load('en_core_web_sm')

def preprocesor(text):
    """Clean a piece of text for bag-of-words vectorisation.

    The text is run through the module-level spaCy pipeline ``nlp``. Stop
    words, punctuation and whitespace-only tokens are discarded; every
    remaining token is replaced by its lemma.

    Parameters
    ----------
    text : str
        Raw text to clean.

    Returns
    -------
    str
        The kept lemmas joined by single spaces. No lower-casing is applied
        here; lemmas are used exactly as spaCy returns them.
    """
    doc = nlp(text)
    # spaCy yields separate tokens for newlines and runs of extra spaces. They
    # are neither stop words nor punctuation, so without the is_space check
    # they would end up in the joined string as stray whitespace.
    return " ".join(
        token.lemma_
        for token in doc
        if not (token.is_stop or token.is_punct or token.is_space)
    )


In [14]:
# Quick sanity check of the cleaning function on a short sentence.
preprocesor("I am going to the market")

'go market'

In [15]:
# Raw description of the row labelled 0, for comparison with its cleaned form.
print(df.loc[0, 'Text'])

Urban Ladder Eisner Low Back Study-Office Computer Chair(Black) A study in simple. The Eisner study chair has a firm foam cushion, which makes long hours at your desk comfortable. The flexible meshed back is designed for air-circulation and support when you lean back. The curved arms provide ergonomic forearm support. Adjust the height using the gas lift to find that comfortable position and the nylon castors make it easy to move around your space. Chrome legs refer to the images for dimension details any assembly required will be done by the UL team at the time of delivery indoor use only.


In [16]:
# The same description after stop-word/punctuation removal and lemmatisation.
first_description = df.loc[0, 'Text']
print(preprocesor(first_description))

Urban Ladder Eisner low Study Office Computer Chair(Black study simple Eisner study chair firm foam cushion make long hour desk comfortable flexible mesh design air circulation support lean curved arm provide ergonomic forearm support adjust height gas lift find comfortable position nylon castor easy space chrome leg refer image dimension detail assembly require UL team time delivery indoor use


In [17]:
# Clean every description; preprocesor() is called once per row, so this is
# the slowest step of the notebook.
processed_text = df['Text'].apply(preprocesor)
df['Text_Processed'] = processed_text

In [18]:
# Model input is the cleaned text; the target is the integer-encoded label.
x, y = df['Text_Processed'], df['Category']

In [20]:
# Step 1: hold out 20% of the rows as the final test set.
X_rest, X_test, y_rest, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)
# Step 2: split the remaining 80% into train (70%) and validation (30%),
# i.e. roughly 56% / 24% / 20% of the data overall.
X_train, X_val, y_train, y_val = train_test_split(
    X_rest, y_rest, test_size=0.3, random_state=42
)

# Multinomial Naive Bayes Classifier

In [28]:
# Baseline: unigram TF-IDF features fed into a multinomial Naive Bayes model.
nb_steps = [
    ('tfidf', TfidfVectorizer()),
    ('clf', MultinomialNB()),
]
nb = Pipeline(nb_steps)

# Fit on the training split, then score on the validation split.
y_pred = nb.fit(X_train, y_train).predict(X_val)

print(classification_report(y_val, y_pred))

              precision    recall  f1-score   support

           0       0.99      0.92      0.95      1478
           1       0.97      0.99      0.98      1410
           2       0.96      0.97      0.97      1453
           3       0.93      0.97      0.95      1419

    accuracy                           0.96      5760
   macro avg       0.96      0.96      0.96      5760
weighted avg       0.96      0.96      0.96      5760



In [33]:
# Same Naive Bayes model, but with uni-, bi- and tri-gram TF-IDF features.
ngram_tfidf = TfidfVectorizer(ngram_range=(1,3))
nb = Pipeline([
    ('tfidf', ngram_tfidf),
    ('clf', MultinomialNB()),
])

# Fit on the training split, then score on the validation split.
y_pred = nb.fit(X_train, y_train).predict(X_val)

print(classification_report(y_val, y_pred))

              precision    recall  f1-score   support

           0       0.99      0.94      0.96      1478
           1       0.98      0.98      0.98      1410
           2       0.97      0.98      0.97      1453
           3       0.94      0.97      0.96      1419

    accuracy                           0.97      5760
   macro avg       0.97      0.97      0.97      5760
weighted avg       0.97      0.97      0.97      5760



# Random Forest Classifier

In [29]:
# Baseline Random Forest on unigram TF-IDF features.
# random_state is pinned so the forest, and therefore the validation report,
# is identical on every Restart & Run All.
rf = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', RandomForestClassifier(random_state=42))
])

rf.fit(X_train, y_train)

y_pred = rf.predict(X_val)

print(classification_report(y_val, y_pred))

              precision    recall  f1-score   support

           0       0.97      0.96      0.97      1478
           1       0.97      0.98      0.98      1410
           2       0.97      0.97      0.97      1453
           3       0.95      0.96      0.96      1419

    accuracy                           0.97      5760
   macro avg       0.97      0.97      0.97      5760
weighted avg       0.97      0.97      0.97      5760



# KNeighborsClassifier

In [31]:
# k-nearest-neighbours (library default settings) on unigram TF-IDF features.
knn_steps = [
    ('tfidf', TfidfVectorizer()),
    ('clf', KNeighborsClassifier()),
]
knn = Pipeline(knn_steps)

# Fit on the training split, then score on the validation split.
y_pred = knn.fit(X_train, y_train).predict(X_val)

print(classification_report(y_val, y_pred))

              precision    recall  f1-score   support

           0       0.98      0.94      0.96      1478
           1       0.97      0.98      0.98      1410
           2       0.96      0.97      0.96      1453
           3       0.94      0.96      0.95      1419

    accuracy                           0.96      5760
   macro avg       0.96      0.96      0.96      5760
weighted avg       0.96      0.96      0.96      5760



# Hyperparameter tuning for Random Forest (TF-IDF n-gram range)

In [39]:
# Random Forest with uni- and bi-gram TF-IDF features.
# random_state is pinned so that any difference from the other n-gram
# settings comes from the features, not from the forest's random seed.
rf = Pipeline([
    ('tfidf', TfidfVectorizer(ngram_range=(1,2))),
    ('clf', RandomForestClassifier(random_state=42))
])

rf.fit(X_train, y_train)

y_pred = rf.predict(X_val)

print(classification_report(y_val, y_pred))

              precision    recall  f1-score   support

           0       0.96      0.96      0.96      1478
           1       0.97      0.98      0.98      1410
           2       0.97      0.96      0.97      1453
           3       0.96      0.96      0.96      1419

    accuracy                           0.97      5760
   macro avg       0.97      0.97      0.97      5760
weighted avg       0.97      0.97      0.97      5760



In [38]:
# Random Forest with uni-, bi- and tri-gram TF-IDF features.
# random_state is pinned so that any difference from the other n-gram
# settings comes from the features, not from the forest's random seed.
rf = Pipeline([
    ('tfidf', TfidfVectorizer(ngram_range=(1,3))),
    ('clf', RandomForestClassifier(random_state=42))
])

rf.fit(X_train, y_train)

y_pred = rf.predict(X_val)

print(classification_report(y_val, y_pred))

              precision    recall  f1-score   support

           0       0.96      0.96      0.96      1478
           1       0.97      0.98      0.97      1410
           2       0.98      0.96      0.97      1453
           3       0.96      0.96      0.96      1419

    accuracy                           0.97      5760
   macro avg       0.97      0.97      0.97      5760
weighted avg       0.97      0.97      0.97      5760



#### The best model is the Random Forest with ngram_range=(1,1) and n_estimators=100 (the scikit-learn default values for both settings)

In [21]:
# Final model: Random Forest on unigram TF-IDF features.
# n_estimators=100 is the library default and is spelled out here only to
# make the chosen configuration explicit. random_state is pinned because this
# is the pipeline that gets evaluated on the test set and saved to disk, so it
# must be reproducible on a fresh kernel.
rf = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', RandomForestClassifier(n_estimators=100, random_state=42))
])

rf.fit(X_train, y_train)

y_pred = rf.predict(X_val)

print(classification_report(y_val, y_pred))

              precision    recall  f1-score   support

           0       0.93      0.93      0.93       767
           1       0.95      0.97      0.96       973
           2       0.94      0.92      0.93       757
           3       0.91      0.91      0.91       824

    accuracy                           0.94      3321
   macro avg       0.93      0.93      0.93      3321
weighted avg       0.94      0.94      0.94      3321



In [48]:
# First five true validation labels (the index shows the original row labels).
print(y_val.head(5))

3562     0
4217     3
10701    3
4514     2
17621    0
Name: Category, dtype: int32


In [49]:
# First five entries of the most recent predictions held in y_pred.
print(y_pred[:5])

[0 3 3 2 0]


# Testing the model

In [50]:
# Final check: score the selected pipeline on the held-out test split.
# NOTE(review): the report stored under this cell lists a support of 4800,
# which does not match a 20% split of the de-duplicated data. The saved
# output appears to come from an earlier run; re-run the notebook top to
# bottom before quoting it.
y_pred = rf.predict(X_test)
test_report = classification_report(y_test, y_pred)
print(test_report)

              precision    recall  f1-score   support

           0       0.97      0.96      0.97      1224
           1       0.98      0.99      0.98      1210
           2       0.96      0.97      0.97      1145
           3       0.96      0.95      0.95      1221

    accuracy                           0.97      4800
   macro avg       0.97      0.97      0.97      4800
weighted avg       0.97      0.97      0.97      4800



In [52]:
# First five cleaned test documents.
X_test.head(5)

3111     Satyam Kraft PU Leather(Pack 1 adjustable Buck...
18679    Neva Men Thermal Neva black coloured thermal c...
17472    Nisha Furniture Sheesham Wood Bedside Table Be...
21451    Fourgee Women Skinny fit Jeans FOURGEE brand b...
20800    nxt 2 Skn Girl Silk Stockings N2S210 Beige 3 6...
Name: Text_Processed, dtype: object

In [68]:
# Save the Cleaned Data
# df.to_csv("Ecommerce_data_cleaned.csv", index=False)

In [22]:
# Save the model
import joblib

# `rf` is the fitted TF-IDF + Random Forest pipeline from the cells above.
# It was trained on the 'Text_Processed' column, so text passed to the
# reloaded model should go through preprocesor() first.
joblib.dump(rf, 'new_model.joblib')

['new_model.joblib']