In [1]:
# import the Ecommerce dataset

import pandas as pd

DATASET_CSV = r"D:\Intellipaat\Projects\NLP\ecommerce_dataset.csv"

# header=None: the first line of the file is read as data, and the two columns
# are named explicitly.
df = pd.read_csv(DATASET_CSV, header=None, names=["category", "description"])
df.head()

Unnamed: 0,category,description
0,Household,Paper Plane Design Framed Wall Hanging Motivat...
1,Household,"SAF 'Floral' Framed Painting (Wood, 30 inch x ..."
2,Household,SAF 'UV Textured Modern Art Print Framed' Pain...
3,Household,"SAF Flower Print Framed Painting (Synthetic, 1..."
4,Household,Incredible Gifts India Wooden Happy Birthday U...


In [2]:
# Number of rows per category, to see how balanced the classes are.
df['category'].value_counts()

Household                 19313
Books                     11820
Electronics               10621
Clothing & Accessories     8671
Name: category, dtype: int64

In [3]:
# (rows, columns) before rows with missing values are dropped.
df.shape

(50425, 2)

In [4]:
# Remove every row that has a missing value in any column; df is modified in place.
df.dropna(inplace=True)

In [5]:
# (rows, columns) after dropna, to see how many rows were removed.
df.shape

(50424, 2)

In [6]:
# Rename the class so its name contains no spaces or '&'; it is later prefixed
# with '__label__' and used as a single label token.
# The result is assigned back to the column rather than calling
# replace(..., inplace=True) on the df['category'] selection: that chained
# in-place form is deprecated in pandas and does not update df when
# Copy-on-Write is enabled.
df['category'] = df['category'].replace("Clothing & Accessories", "Clothing_Accessories")

In [7]:
# Re-count rows per category to confirm the renamed class appears.
df['category'].value_counts()

Household               19313
Books                   11820
Electronics             10621
Clothing_Accessories     8670
Name: category, dtype: int64

In [8]:
# Turn each category into a label token by prepending '__label__'.
category_as_text = df['category'].astype(str)
df['category'] = '__label__' + category_as_text

In [9]:
# Build one "<label> <description>" string per row.
label_part = df['category']
text_part = df['description']
df['category_description'] = label_part + " " + text_part
df.head()

Unnamed: 0,category,description,category_description
0,__label__Household,Paper Plane Design Framed Wall Hanging Motivat...,__label__Household Paper Plane Design Framed W...
1,__label__Household,"SAF 'Floral' Framed Painting (Wood, 30 inch x ...",__label__Household SAF 'Floral' Framed Paintin...
2,__label__Household,SAF 'UV Textured Modern Art Print Framed' Pain...,__label__Household SAF 'UV Textured Modern Art...
3,__label__Household,"SAF Flower Print Framed Painting (Synthetic, 1...",__label__Household SAF Flower Print Framed Pai...
4,__label__Household,Incredible Gifts India Wooden Happy Birthday U...,__label__Household Incredible Gifts India Wood...


In [10]:
# Preprocessing the text

import re

def preprocess(text):
    """Normalize one text record.

    Every character that is not a word character, whitespace, or an
    apostrophe is replaced by a space; each run of whitespace is then
    collapsed into a single space, the ends are trimmed, and the result is
    lower-cased.

    All whitespace is collapsed (tabs, carriage returns, form feeds, ...),
    not only spaces and newlines, so the returned string never contains an
    embedded tab or line break.

    Args:
        text: The string to normalize.

    Returns:
        The normalized, lower-cased string.
    """
    # Underscores count as word characters, so a '__label__' prefix survives.
    text = re.sub(r'[^\w\s\']',' ', text)
    # \s+ rather than [ \n]+ so that \t, \r and other whitespace are collapsed too.
    text = re.sub(r'\s+', ' ', text)
    return text.strip().lower()

In [11]:
# Normalize every combined label + description string with preprocess().
df['category_description'] = df['category_description'].map(preprocess)
df.head()

Unnamed: 0,category,description,category_description
0,__label__Household,Paper Plane Design Framed Wall Hanging Motivat...,__label__household paper plane design framed w...
1,__label__Household,"SAF 'Floral' Framed Painting (Wood, 30 inch x ...",__label__household saf 'floral' framed paintin...
2,__label__Household,SAF 'UV Textured Modern Art Print Framed' Pain...,__label__household saf 'uv textured modern art...
3,__label__Household,"SAF Flower Print Framed Painting (Synthetic, 1...",__label__household saf flower print framed pai...
4,__label__Household,Incredible Gifts India Wooden Happy Birthday U...,__label__household incredible gifts india wood...


In [12]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation.
# random_state fixes the shuffle so the same rows land in each split on every
# rerun; stratify keeps the per-category proportions the same in train and test.
train, test = train_test_split(df, test_size=0.2, random_state=42, stratify=df['category'])

In [13]:
# Sanity-check the sizes of the two splits.
train.shape, test.shape

((40339, 3), (10085, 3))

In [14]:
# Exporting the train and test dataset to csv

# Write only the combined label + text column, one record per line, with no
# index and no header row.
for split_df, out_path in ((train, "ecommerce.train"), (test, "ecommerce.test")):
    split_df.to_csv(out_path, columns=["category_description"], index=False, header=False)

In [15]:
# Train a supervised fastText classifier on the exported training file, then
# score it on the exported test file.
# Per the fastText docs, model.test returns (number of examples, precision@1, recall@1);
# the recorded run scored about 0.968 on 10085 test examples.

import fasttext

model = fasttext.train_supervised(input="ecommerce.train")
model.test("ecommerce.test")

(10085, 0.9682697074863659, 0.9682697074863659)

In [16]:
# Classify an unseen description. The text is already lower-case and free of
# punctuation, i.e. in the same form preprocess() produces for training records.
model.predict("wintech assemble desktop pc cpu 500 gb sata hdd 4 gb ram intel c2d processor 3")

(('__label__electronics',), array([0.99943733]))

In [17]:
# Another spot check; the only punctuation in the text is the apostrophe,
# which preprocess() also keeps.
model.predict("ockey men's cotton t shirt fabric details 80 cotton 20 polyester super combed cotton rich fabric")

(('__label__clothing_accessories',), array([1.00001001]))

In [18]:
# Spot check with a short, lower-case book title.
model.predict("think and grow rich deluxe edition")

(('__label__books',), array([1.00000989]))

In [19]:
# Inspect the learned word vectors: tokens closest to "painting".
# NOTE(review): the recorded neighbours are unrelated tokens that all score ~0.999,
# so these vectors do not look semantically meaningful — presumably a side effect of
# training for classification only with default settings; TODO confirm.
model.get_nearest_neighbors("painting")

[(0.9987592697143555, 'ghrfhagw200'),
 (0.9987009763717651, 'picoting'),
 (0.9986954927444458, '27cmx27cmx13cm'),
 (0.9986816644668579, 'flicking'),
 (0.9986792802810669, '55in'),
 (0.9986792802810669, 'peppersnote'),
 (0.9986792802810669, 'including50'),
 (0.9986555576324463, '1350w'),
 (0.998654842376709, '78x72x4'),
 (0.9986546039581299, 'gudwell')]

In [20]:
# Inspect the learned word vectors: tokens closest to "sony".
# NOTE(review): as recorded, the neighbours are unrelated tokens with near-identical
# similarity (~0.999); treat these vectors as uninformative until verified — TODO confirm.
model.get_nearest_neighbors("sony")

[(0.9993221759796143, 'plus6'),
 (0.9993221759796143, 'z4'),
 (0.9992969632148743, "43''style"),
 (0.9992969632148743, 'earphonenotes'),
 (0.9992969632148743, 'ventionport'),
 (0.9992969632148743, 'interfacematerial'),
 (0.9992969632148743, 'phonefeatures'),
 (0.999292254447937, '7lbpackage'),
 (0.999292254447937, '1700g'),
 (0.999292254447937, '890g')]