In [1]:
##importing libraries
import numpy as np
import pandas as pd
import seaborn as sns

In [2]:
##loading the articles dataset
articles=pd.read_csv('articles.csv',usecols=['prod_name','product_type_name','product_group_name',
                                            'graphical_appearance_name','colour_group_name',
                                             'perceived_colour_value_name'])
articles.head()

Unnamed: 0,prod_name,product_type_name,product_group_name,graphical_appearance_name,colour_group_name,perceived_colour_value_name
0,Strap top,Vest top,Garment Upper body,Solid,Black,Dark
1,Strap top,Vest top,Garment Upper body,Solid,White,Light
2,Strap top (1),Vest top,Garment Upper body,Stripe,Off White,Dusty Light
3,OP T-shirt (Idro),Bra,Underwear,Solid,Black,Dark
4,OP T-shirt (Idro),Bra,Underwear,Solid,White,Light


In [3]:
def make_df(ordered_cols):
    df=articles
    df['query']=df['prod_name']
    ##adding attributes to the product name to generate queries
    ##reversing ordered_cols
    ordered_cols.reverse()
    for col in ordered_cols:
        df['query']=df[col]+' '+df['query']
    ##reduce to the input and output columns
    df=df[['query','product_group_name','product_type_name']]
    ##reducing all text to lowercase
    for col in df.columns:
        df[col]=df[col].apply(lambda x:x.lower())
    ##return the created dataframe
    return df

In [4]:
##creating combinations
set_1=make_df(['graphical_appearance_name'])
set_2=make_df(['colour_group_name'])
set_3=make_df(['perceived_colour_value_name'])
set_4=make_df(['graphical_appearance_name','colour_group_name'])
set_5=make_df(['perceived_colour_value_name','colour_group_name'])
set_6=make_df(['graphical_appearance_name','perceived_colour_value_name','colour_group_name'])

##deleting the articles dataset
del(articles)

##concatenating all dataframes into a single one
data=pd.concat([set_1,set_2,set_3,set_4,set_5,set_6])

#resetting index
data.reset_index(inplace=True)
data.drop('index',axis=1,inplace=True)

##deleting all the sets
del(set_1,set_2,set_3,set_4,set_5,set_6)

data.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col]=df[col].apply(lambda x:x.lower())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col]=df[col].apply(lambda x:x.lower())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col]=df[col].apply(lambda x:x.lower())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .l

Unnamed: 0,query,product_group_name,product_type_name
0,solid strap top,garment upper body,vest top
1,solid strap top,garment upper body,vest top
2,stripe strap top (1),garment upper body,vest top
3,solid op t-shirt (idro),underwear,bra
4,solid op t-shirt (idro),underwear,bra


In [5]:
##ecoding the target
from sklearn.preprocessing import LabelEncoder

##category encoder
cat_enc=LabelEncoder()
data['product_group_name']=cat_enc.fit_transform(data['product_group_name'])

##subcategory encoder
sub_enc=LabelEncoder()
data['product_type_name']=sub_enc.fit_transform(data['product_type_name'])

##check the dataset
data.head()

Unnamed: 0,query,product_group_name,product_type_name
0,solid strap top,8,120
1,solid strap top,8,120
2,stripe strap top (1),8,120
3,solid op t-shirt (idro),16,15
4,solid op t-shirt (idro),16,15


In [6]:
##splitting data into train and test sets
from sklearn.model_selection import StratifiedShuffleSplit

##creating feature and target sets
X=data['query']
y=data.drop('query',axis=1)

##instantiating
sss=StratifiedShuffleSplit(test_size=0.2,random_state=42)

##creating train and test sets
train_idx,test_idx=next(sss.split(X,y))
X_train,X_test,y_train,y_test=X.iloc[train_idx],X.iloc[test_idx],y.iloc[train_idx],y.iloc[test_idx]

In [7]:
##tokenizing the text data
from keras.preprocessing.text import Tokenizer

##instantiating and fitting the tokenizer
tokenizer=Tokenizer(num_words=100000,lower=True)
tokenizer.fit_on_texts(X_train)

##tokeinizng the text
X_train=tokenizer.texts_to_sequences(X_train)
X_test=tokenizer.texts_to_sequences(X_test)

In [8]:
##vocabulary size
vocab_size=len(tokenizer.word_index)+1

In [9]:
##padding the textual data [error converting to tensor without conversion]
from keras.utils import pad_sequences

max_length=20 ##maximum length of sentence

X_train=pad_sequences(X_train,padding='post',maxlen=max_length)
X_test=pad_sequences(X_test,padding='post',maxlen=max_length)

In [10]:
##creating the rnn model
from keras.models import Sequential
from keras.layers import Embedding,Input,Dense,LSTM,SimpleRNN
from keras.layers import Dropout, Activation, Bidirectional, GlobalMaxPool1D

##instantiating the model
model=Sequential()

##adding layers to the model
model.add(Embedding(input_dim=vocab_size,output_dim=64))
model.add(Bidirectional(layer=LSTM(64,return_sequences=True)))
model.add(Bidirectional(layer=LSTM(32)))
model.add(Dense(64,activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(1))

In [11]:
##compiling the model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [12]:
##training the model
model.fit(X_train, y_train['product_group_name'], epochs=10, batch_size=32)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x1b04256e400>

In [14]:
model.evaluate(X_test,y_test['product_group_name'])



[9.202386195283907e-07, 0.10571570694446564]