## FastText Classification of Text

In [1]:
# import necessary libraries for FastText classification
import fasttext
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score

In [2]:
# read the data
df = pd.read_csv("data/ecommerceDataset.csv", names= ["cat", "desc"])
df = pd.DataFrame(df)
df.sample(5)

Unnamed: 0,cat,desc
10154,Household,Signoraware Sprinkles N Spice Small Salt and P...
30384,Books,Geography for UPSC and State Civil Services Ex...
16113,Household,Frigidaire 131977000 Washer/Dryer Combo Timer ...
46945,Electronics,Well Point Silver Plated Stainless Steel Long ...
20140,Books,Indra Nooyi A Biography About the Book: Indra ...


In [3]:
df["cat"].value_counts()


cat
Household                 19313
Books                     11820
Electronics               10621
Clothing & Accessories     8671
Name: count, dtype: int64

In [4]:
df.describe()

Unnamed: 0,cat,desc
count,50425,50424
unique,4,27802
top,Household,Think & Grow Rich About the Author NAPOLEON HI...
freq,19313,30


In [5]:
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50425 entries, 0 to 50424
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   cat     50425 non-null  object
 1   desc    50424 non-null  object
dtypes: object(2)
memory usage: 788.0+ KB


In [6]:
df.dropna(inplace=True)

In [None]:
df.sample(5) 

Unnamed: 0,cat,desc
38772,Clothing & Accessories,Cover Affair Tropical Leaves Printed Clear Des...
37660,Clothing & Accessories,MomToBe® Cotton Maternity Trouser Description:...
11604,Household,Bagaholics Ethnic Saree Clutch Leather Mobile ...
29358,Books,Bedside clinics in Medicine Part - 1 bedside c...
25634,Books,"Introduction to Public Health, 5/e Introductio..."


In [8]:
df['cat'] = df['cat'].str.replace(' ', '', regex=False)
df['cat'] = df['cat'].str.replace(r'[^A-Za-z0-9]', '_', regex=True)
df.sample(5)

Unnamed: 0,cat,desc
23968,Books,The Dhoni Touch: Unravelling the Enigma That I...
7734,Household,Home Desirica Curtain Tie Back (Rope) Set Of 6...
44869,Electronics,Lambent Charge2Plus SpillProof Wireless Blueto...
1854,Household,CLIMAX Table Baby VICE REVOLVING CLAMP 40MM [P...
14046,Household,TELEbrands-HBN Phoenix Gold Iron with Built-In...


In [9]:

df['cat']='__label__'+df['cat'].astype(str)
df.head()

Unnamed: 0,cat,desc
0,__label__Household,Paper Plane Design Framed Wall Hanging Motivat...
1,__label__Household,"SAF 'Floral' Framed Painting (Wood, 30 inch x ..."
2,__label__Household,SAF 'UV Textured Modern Art Print Framed' Pain...
3,__label__Household,"SAF Flower Print Framed Painting (Synthetic, 1..."
4,__label__Household,Incredible Gifts India Wooden Happy Birthday U...


In [10]:
df['category_description'] = df['cat'] + ' ' + df['desc']
df.head(3)

Unnamed: 0,cat,desc,category_description
0,__label__Household,Paper Plane Design Framed Wall Hanging Motivat...,__label__Household Paper Plane Design Framed W...
1,__label__Household,"SAF 'Floral' Framed Painting (Wood, 30 inch x ...",__label__Household SAF 'Floral' Framed Paintin...
2,__label__Household,SAF 'UV Textured Modern Art Print Framed' Pain...,__label__Household SAF 'UV Textured Modern Art...


In [11]:
import re

In [12]:
def preprocess(text):
    text = re.sub(r'[^\w\s\']',' ', text)
    text = re.sub(' +', ' ', text)
    return text.strip().lower()

In [13]:
df['category_description'] = df['category_description'].apply(preprocess)
df.head()

Unnamed: 0,cat,desc,category_description
0,__label__Household,Paper Plane Design Framed Wall Hanging Motivat...,__label__household paper plane design framed w...
1,__label__Household,"SAF 'Floral' Framed Painting (Wood, 30 inch x ...",__label__household saf 'floral' framed paintin...
2,__label__Household,SAF 'UV Textured Modern Art Print Framed' Pain...,__label__household saf 'uv textured modern art...
3,__label__Household,"SAF Flower Print Framed Painting (Synthetic, 1...",__label__household saf flower print framed pai...
4,__label__Household,Incredible Gifts India Wooden Happy Birthday U...,__label__household incredible gifts india wood...


In [14]:
train,test = train_test_split(df,test_size=0.2)
train.shape, test.shape

((40339, 3), (10085, 3))

In [15]:

train.to_csv("ecommerce.train", columns=["category_description"], index=False, header=False)
test.to_csv("ecommerce.test", columns=["category_description"], index=False, header=False)

In [16]:
model=fasttext.train_supervised(input="ecommerce.train")
model.test("ecommerce.test")

Read 4M words
Number of words:  79445
Number of labels: 4
Progress: 100.0% words/sec/thread: 4744987 lr:  0.000000 avg.loss:  0.178086 ETA:   0h 0m 0s


(10085, 0.968368864650471, 0.968368864650471)

In [20]:
model.predict("samsung galaxy a32 smartphone 64gb 6gb ram black")

ValueError: Unable to avoid copy while creating an array as requested.
If using `np.array(obj, copy=False)` replace it with `np.asarray(obj)` to allow a copy when needed (no behavior change in NumPy 1.x).
For more details, see https://numpy.org/devdocs/numpy_2_0_migration_guide.html#adapting-to-changes-in-the-copy-keyword.