In [1]:
import pandas as pd
import numpy as np

# Load ML Pkgs
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB

# Transformers
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
# Others
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix,plot_confusion_matrix

from sklearn.multioutput import MultiOutputClassifier

In [2]:
raw_data = pd.read_csv('styles.csv')
raw_data.shape
raw_data.head()

Unnamed: 0,id,gender,masterCategory,subCategory,articleType,baseColour,season,year,usage,productDisplayName,Unnamed: 10,Unnamed: 11
0,15970,Men,Apparel,Topwear,Shirts,Navy Blue,Fall,2011.0,Casual,Turtle Check Men Navy Blue Shirt,,
1,39386,Men,Apparel,Bottomwear,Jeans,Blue,Summer,2012.0,Casual,Peter England Men Party Blue Jeans,,
2,59263,Women,Accessories,Watches,Watches,Silver,Winter,2016.0,Casual,Titan Women Silver Watch,,
3,21379,Men,Apparel,Bottomwear,Track Pants,Black,Fall,2011.0,Casual,Manchester United Men Solid Black Track Pants,,
4,53759,Men,Apparel,Topwear,Tshirts,Grey,Summer,2012.0,Casual,Puma Men Grey T-shirt,,


In [3]:
data = raw_data.iloc[:, 1:10]
data = data.drop(['year', 'season', 'baseColour'], axis=1)
data.head()

Unnamed: 0,gender,masterCategory,subCategory,articleType,usage,productDisplayName
0,Men,Apparel,Topwear,Shirts,Casual,Turtle Check Men Navy Blue Shirt
1,Men,Apparel,Bottomwear,Jeans,Casual,Peter England Men Party Blue Jeans
2,Women,Accessories,Watches,Watches,Casual,Titan Women Silver Watch
3,Men,Apparel,Bottomwear,Track Pants,Casual,Manchester United Men Solid Black Track Pants
4,Men,Apparel,Topwear,Tshirts,Casual,Puma Men Grey T-shirt


In [4]:
import nltk
import unicodedata
from nltk.corpus import stopwords
import re

def unicode_to_ascii(s):
    return ''.join(c for c in unicodedata.normalize('NFD', s) if unicodedata.category(c) != 'Mn')

def clean_stopwords_shortwords(w):
    stopwords_list=stopwords.words('english')
    words = w.split() 
    clean_words = [word for word in words if (word not in stopwords_list) and len(word) > 2]
    return " ".join(clean_words) 

def preprocess_sentence(w):
    w = unicode_to_ascii(w.lower().strip())
    w = re.sub(r"([?.!,¿])", r" ", w)
    w = re.sub(r'[" "]+', " ", w)
    w = re.sub(r"[^a-zA-Z?.!,¿]+", " ", w)
    w = re.sub(r"text+","",w) #removing "text" from every sentence
    w=clean_stopwords_shortwords(w)
    w=re.sub(r'@\w+', '',w)
    return w

#CLEANING SENTENCES
sentences = data['productDisplayName']
sentences = sentences.astype(str)
sentences = sentences.map(preprocess_sentence)
data['productDisplayName'] = sentences
data.head()

Unnamed: 0,gender,masterCategory,subCategory,articleType,usage,productDisplayName
0,Men,Apparel,Topwear,Shirts,Casual,turtle check men navy blue shirt
1,Men,Apparel,Bottomwear,Jeans,Casual,peter england men party blue jeans
2,Women,Accessories,Watches,Watches,Casual,titan women silver watch
3,Men,Apparel,Bottomwear,Track Pants,Casual,manchester united men solid black track pants
4,Men,Apparel,Topwear,Tshirts,Casual,puma men grey shirt


In [5]:
# Features & Labels

data.replace([np.inf, -np.inf], np.nan, inplace=True)
data.fillna("NA", inplace=True)

Xfeatures = data['productDisplayName']
ylabels = data[['gender', 'masterCategory','subCategory','articleType', 'usage']]


In [6]:
# Split Data
x_train,x_test,y_train,y_test = train_test_split(Xfeatures,ylabels,test_size=0.1,random_state=30)


In [7]:
pipe_lr = Pipeline(steps=[('cv',CountVectorizer()),
                          ('lr_multi',MultiOutputClassifier(LogisticRegression()))])

In [8]:
# Fit on Dataset
pipe_lr.fit(x_train,y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [9]:
# Accuracy Score
pipe_lr.score(x_test,y_test)

0.9001124859392576

In [10]:
pipe_RF = Pipeline(steps=[('cv',CountVectorizer()),('RF',KNeighborsClassifier(n_neighbors=4))])
pipe_RF.fit(x_train,y_train)
pipe_KN = Pipeline(steps=[('cv',CountVectorizer()),('KN',KNeighborsClassifier(n_neighbors=4))])
pipe_KN.fit(x_train,y_train)

In [19]:
text = ' polkadot dress '

#print(pipe_RF.predict([text]))
print(pipe_lr.predict([text]))


[['Women' 'Apparel' 'Dress' 'Dresses' 'Casual']]


In [12]:
%store -r good_sentences

In [13]:
good_sentences

0        keep smiling classic tee colors possible men w...
1        make classic tee colors possible men women shi...
2        classic tee colors possible men women shirts p...
3        women classic tee colors possible men women sh...
4        high school classic tee colors possible men wo...
                               ...                        
11747                           try essential blues shorts
11748                              ball play sports hannah
11749                               red compression shorts
11750                                                 long
11751                                        mirror shorts
Name: text, Length: 11752, dtype: object

In [14]:
import pickle

filename = 'finalized_model.sav'
pickle.dump(pipe_lr, open(filename, 'wb'))


In [15]:
loaded_model = pickle.load(open(filename, 'rb'))
result = loaded_model.score(x_test, y_test)
print(result)

0.9001124859392576


In [16]:
%store loaded_model

Stored 'loaded_model' (Pipeline)
