In [1]:
from functools import lru_cache

import numpy as np
import pandas as pd
from sklearn.base import clone

In [2]:
# Read the training frame and the test frame from CSV files in the working directory.
train = pd.read_csv('train_df.csv')
test = pd.read_csv('test_df.csv')

In [3]:
# Preview the first rows of the training frame to check the loaded columns.
train.head()

Unnamed: 0,ID_NAME,Category,ProductName,Description
0,5194,Pants,"""Men's Basic-Fit Chino Pants""","""Brunello Cucinelli pants in basic chino twill..."
1,1694,Pants,"""Foulard Print Tech Swim Shorts""","""Elastic waistband with drawstring. All over p..."
2,9100,Activewear,"""Paul Smith Men's Artist Studio Jacquard Hoodi...","""Signature graffiti print defines this relaxed..."
3,4028,Pants,"""Men's Allsaints Cigarette Skinny Fit Jeans""","""Pre-washed fading and distressing break in th..."
4,14529,Underwear and Nightwear,"""Paul Smith - Signature Stripe Cotton-blend Bo...","""Paul Smith - These Paul Smith boxer briefs ar..."


Checking the categories

In [4]:
# Cast the label column to plain strings so every class name is compared as text.
train['Category'] = train['Category'].astype(str)

# Count the rows per class (value_counts sorts by frequency, most common first) and show them.
category_counts = train['Category'].value_counts()
print(category_counts)

Category

Accessories                1027

Shoes                       848

Shirts                      452

Activewear                  425

Pants                       397

Jackets/Coats               264

Underwear and Nightwear     216

Suits                       153

Sweaters                    111

Jewelry                      77

Name: count, dtype: int64


NLTK

In [None]:
# Remove stopwords, punctuation and symbols
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Download the NLTK resources requested by this notebook.
for resource in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(resource)

# Single lemmatizer instance shared by every preprocess_text call.
wordnet = WordNetLemmatizer()

@lru_cache(maxsize=None)
def _english_stopwords():
    """Return the NLTK English stopword list as a frozenset, loading it only once."""
    return frozenset(stopwords.words('english'))


def preprocess_text(text):
    """Normalise a raw text field before TF-IDF vectorisation.

    Every character that is not an ASCII letter is replaced by a space, the
    result is lower-cased and split on whitespace, English stopwords are
    dropped and the remaining tokens are lemmatized with the module-level
    ``wordnet`` lemmatizer.

    Parameters
    ----------
    text : str
        Raw text. Any non-string value (for example the NaN pandas stores for
        a missing CSV cell, or None) is treated as empty text.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces; ``''`` when nothing is left.
    """
    # Missing cells arrive as float NaN / None, which re.sub cannot process.
    if not isinstance(text, str):
        return ''

    tokens = re.sub('[^a-zA-Z]', ' ', text).lower().split()
    if not tokens:
        return ''

    # Fetch the cached stopword set once per call instead of rebuilding it for every token.
    stop_words = _english_stopwords()
    return ' '.join(wordnet.lemmatize(token) for token in tokens if token not in stop_words)


# Clean both text columns of the training frame and then of the test frame,
# overwriting each column with its preprocessed version.
for frame in (train, test):
    for column in ('ProductName', 'Description'):
        frame[column] = frame[column].apply(preprocess_text)

Importing different models and classes

In [6]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.preprocessing import OneHotEncoder

In [7]:
# Display the label column (one category name per training row).
train['Category']

Unnamed: 0,Category
0,Pants
1,Pants
2,Activewear
3,Pants
4,Underwear and Nightwear
...,...
3965,Underwear and Nightwear
3966,Sweaters
3967,Pants
3968,Shoes


OHE on Category

In [8]:
# One-hot encode the labels into a dense indicator matrix (sparse_output=False).
# Note: `y` is reassigned to the raw string labels in the train/test split cell,
# so this matrix is only inspected here and is not what the models are trained on.
encoder = OneHotEncoder(sparse_output=False)
y = encoder.fit_transform(train['Category'].to_numpy().reshape(-1, 1))

In [9]:
# Display the one-hot encoded label matrix produced in the previous cell.
y

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.]])

TF IDF

In [10]:
# TF-IDF is applied to each text column separately, so each column gets its own
# vectorizer (author's rationale: ProductName is the more important field).
from sklearn.preprocessing import StandardScaler  # imported here but not used in this cell

transformer = ColumnTransformer(
    transformers=[
        ('name_tfidf', TfidfVectorizer(), 'ProductName'),
        ('desc_tfidf', TfidfVectorizer(), 'Description'),
    ],
    # remainder='drop'
)

Creating pipeline

In [11]:
# The pipeline currently wraps only the feature transformer; no estimator step is attached.
# A ('classifier', LogisticRegression(C=15, penalty='l1', solver='saga')) step was included
# here when a single model was trained without grid search.
pipeline = Pipeline(steps=[('transformer', transformer)])

Train Test split

In [12]:
# Features are the two text columns; the target is the category name.
X = train[['ProductName', 'Description']]
y = train['Category']

# Hold out 20% of the rows; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

GridSearch

In [None]:
# The shared `pipeline` contains only the transformer, so GridSearchCV would reject every
# 'classifier__*' key below (and could not score a pipeline with no estimator). The search
# therefore runs over a dedicated pipeline with an explicit 'classifier' step. The tuned
# parameters (C, loss, average) are those of PassiveAggressiveClassifier, which this
# notebook already imports.
search_pipeline = Pipeline([
    ('transformer', transformer),
    ('classifier', PassiveAggressiveClassifier(random_state=42)),
])

param_grid = {
    'classifier__C': [1, 10, 100, 200],
    # '' is not a valid loss; the two supported values are 'hinge' and 'squared_hinge'.
    'classifier__loss': ['hinge', 'squared_hinge'],
    'classifier__average': [True, False],
}

# 3-fold cross-validated accuracy over the full grid, using all available cores.
grid_search = GridSearchCV(estimator=search_pipeline, param_grid=param_grid,
                           cv=3, n_jobs=-1, scoring='accuracy')

grid_search.fit(X_train, y_train)

Checking the best parameters

In [None]:
# Report the winning hyper-parameter combination and its mean cross-validated score.
print(f"Best Parameters: {grid_search.best_params_}")
print(f"Best Score: {grid_search.best_score_}")

Fitting the data

In [13]:
# Fit the TF-IDF transformer on the training split only ...
X_train_transformed = transformer.fit_transform(X_train)

# ... then reuse the fitted transformer (transform, never fit) for the held-out split
# and for the `test` frame.
X_test_transformed = transformer.transform(X_test)
test_transformed = transformer.transform(test)

In [15]:
# Display the transformed training matrix (its repr shows shape and sparsity).
X_train_transformed

<3176x9278 sparse matrix of type '<class 'numpy.float64'>'
	with 104708 stored elements in Compressed Sparse Row format>

Fitting single model

In [None]:
# Disabled single-model experiment: every line in this cell is commented out,
# so running it defines nothing (in particular, no `lr` model).
# from sklearn.svm import SVC

# lr=LogisticRegression(penalty='l2',solver='sag',C=100,random_state=86)
# lr.fit(X_train_transformed, y_train)

Predictions tried with single models

In [17]:
# Disabled evaluation of the single model: every line is commented out, so this cell
# does not define `y_pred_svm` when run as-is.
# NOTE(review): the accuracy/classification report recorded below this cell presumably
# comes from an earlier run in which these lines were active - confirm before relying on it.
# y_pred_svm = lr.predict(X_test_transformed)

# # Evaluate the model
# accuracy = accuracy_score(y_test, y_pred_svm)
# print(f'Accuracy: {accuracy:.6f}')

# # Print a detailed classification report
# print(classification_report(y_test, y_pred_svm))

Accuracy: 0.964736

                         precision    recall  f1-score   support



            Accessories       0.99      0.99      0.99       198

             Activewear       0.87      0.97      0.91        87

          Jackets/Coats       0.98      0.95      0.96        58

                Jewelry       1.00      0.93      0.96        14

                  Pants       0.94      0.92      0.93        84

                 Shirts       0.95      0.95      0.95        79

                  Shoes       0.99      1.00      0.99       169

                  Suits       1.00      0.92      0.96        36

               Sweaters       0.96      0.88      0.92        26

Underwear and Nightwear       1.00      0.95      0.98        43



               accuracy                           0.96       794

              macro avg       0.97      0.95      0.96       794

           weighted avg       0.97      0.96      0.96       794




Mapping the categories to their integer codes

In [18]:
# Integer code for each category name (codes follow the alphabetical order of the names).
# Used to turn predicted category names into numeric labels.
category_mapping = {
    'Accessories': 0,
    'Activewear': 1,
    'Jackets/Coats': 2,
    'Jewelry': 3,
    'Pants': 4,
    'Shirts': 5,
    'Shoes': 6,
    'Suits': 7,
    'Sweaters': 8,
    'Underwear and Nightwear': 9
}

# `y_pred_svm` is only produced by the single-model cells, which are currently commented
# out. Guard the conversion so this cell no longer raises NameError on a fresh
# "Restart & Run All"; when the predictions do exist they are encoded exactly as before.
if 'y_pred_svm' in globals():
    y_pred_svm = [category_mapping[label] for label in y_pred_svm]

In [19]:
# Fit the (transformer-only) pipeline on all labelled rows.
# The pipeline holds the very same `transformer` object that produced X_train_transformed
# and test_transformed; fitting it in place would silently replace that fitted vocabulary.
# A clone is fitted instead, so `transformer` stays consistent with those matrices.
pipeline = clone(pipeline).fit(X, y)
pipeline

Made 3 different models for ensemble with different train data

In [20]:
# Three 80/20 splits of the same data; only the random seed differs, so each split
# holds out a different test set.
X_train1, X_test1, y_train1, y_test1 = train_test_split(
    X, y, test_size=0.2, random_state=98)
X_train2, X_test2, y_train2, y_test2 = train_test_split(
    X, y, test_size=0.2, random_state=86)
X_train3, X_test3, y_train3, y_test3 = train_test_split(
    X, y, test_size=0.2, random_state=67)

Logistic Regression on model 1

In [21]:
# L1-regularised logistic regression (liblinear solver) trained on split 1.
lr1 = LogisticRegression(penalty='l1', solver='liblinear', C=500, random_state=98)

# Vectorise split 1 with its own clone of the TF-IDF transformer. Re-fitting the shared
# `transformer` here would overwrite the vocabulary behind X_train_transformed and
# test_transformed, which the voting ensemble is later trained and applied on.
transformer1 = clone(transformer)
X_train1_transformed = transformer1.fit_transform(X_train1)
X_test1_transformed = transformer1.transform(X_test1)

lr1.fit(X_train1_transformed, y_train1)

Logistic Regression on model 2

In [None]:
# L2-regularised logistic regression (saga solver) trained on split 2.
lr2 = LogisticRegression(penalty='l2', solver='saga', C=100, random_state=77)

# Vectorise split 2 with its own clone of the TF-IDF transformer. Re-fitting the shared
# `transformer` here would overwrite the vocabulary behind X_train_transformed and
# test_transformed, which the voting ensemble is later trained and applied on.
transformer2 = clone(transformer)
X_train2_transformed = transformer2.fit_transform(X_train2)
X_test2_transformed = transformer2.transform(X_test2)

lr2.fit(X_train2_transformed, y_train2)

Logistic Regression on model3

In [None]:
# L2-regularised logistic regression (sag solver, weak regularisation) trained on split 3.
lr3 = LogisticRegression(penalty='l2', solver='sag', C=1500, random_state=86)

# Vectorise split 3 with its own clone of the TF-IDF transformer. Re-fitting the shared
# `transformer` here would overwrite the vocabulary behind X_train_transformed and
# test_transformed, which the voting ensemble is later trained and applied on.
transformer3 = clone(transformer)
X_train3_transformed = transformer3.fit_transform(X_train3)
X_test3_transformed = transformer3.transform(X_test3)

lr3.fit(X_train3_transformed, y_train3)

Voting ensemble on my best performers

In [24]:
from sklearn.ensemble import VotingClassifier

# Majority-vote ('hard') ensemble over the three logistic-regression configurations.
voting_model = VotingClassifier(
    estimators=[('lr1', lr1), ('lr2', lr2), ('lr3', lr3)],
    voting='hard',
)

In [None]:
# Train the ensemble on the features/labels of the first (random_state=42) split.
# NOTE(review): VotingClassifier.fit is understood to refit clones of lr1/lr2/lr3 on the
# data passed here, so the earlier per-split fits would not carry over - confirm.
voting_model.fit(X_train_transformed, y_train)

In [None]:
# Predict a category name for every row of the transformed test matrix, then translate
# each name to its integer code.
test_pred = voting_model.predict(test_transformed)
test_pred_final = [category_mapping[label] for label in test_pred]

# Print one code per line. Iterating over the list itself (instead of a hard-coded
# range(993)) keeps this correct for any number of test rows.
for encoded_label in test_pred_final:
    print(encoded_label)