In [1]:
##importing libraries
import numpy as np
import pandas as pd
import seaborn as sns

In [2]:
##loading the articles dataset
articles=pd.read_csv('articles.csv',usecols=['prod_name','product_type_name','product_group_name',
                                            'graphical_appearance_name','colour_group_name',
                                             'perceived_colour_value_name'])
articles.head()

Unnamed: 0,prod_name,product_type_name,product_group_name,graphical_appearance_name,colour_group_name,perceived_colour_value_name
0,Strap top,Vest top,Garment Upper body,Solid,Black,Dark
1,Strap top,Vest top,Garment Upper body,Solid,White,Light
2,Strap top (1),Vest top,Garment Upper body,Stripe,Off White,Dusty Light
3,OP T-shirt (Idro),Bra,Underwear,Solid,Black,Dark
4,OP T-shirt (Idro),Bra,Underwear,Solid,White,Light


In [3]:
##creating dictionaries for categories in 'product_group_name' and 'product_type_name'
type_dict={v:k for k,v in dict(enumerate(articles['product_type_name'].unique())).items()}
group_dict={v:k for k,v in dict(enumerate(articles['product_group_name'].unique())).items()}

##creating the reverse of these dictionaries for final decoding
reverse_type_dict={v:k for k,v in type_dict.items()}
reverse_group_dict={v:k for k,v in group_dict.items()}

##replacing values in articles with placeholders
articles['product_type_name']=articles['product_type_name'].apply(lambda x:str(type_dict[x]))
articles['product_group_name']=articles['product_group_name'].apply(lambda x:str(group_dict[x]))

##checking the head of the dataset
articles.head()

Unnamed: 0,prod_name,product_type_name,product_group_name,graphical_appearance_name,colour_group_name,perceived_colour_value_name
0,Strap top,0,0,Solid,Black,Dark
1,Strap top,0,0,Solid,White,Light
2,Strap top (1),0,0,Stripe,Off White,Dusty Light
3,OP T-shirt (Idro),1,1,Solid,Black,Dark
4,OP T-shirt (Idro),1,1,Solid,White,Light


In [4]:
def make_df(ordered_cols):
    df=articles
    df['query']=df['prod_name']
    ##adding attributes to the product name to generate queries
    ##reversing ordered_cols
    ordered_cols.reverse()
    for col in ordered_cols:
        df['query']=df[col]+' '+df['query']
    ##combining product_group_name and product_type_name into a single output column
    df['output']=df['product_group_name']+' '+df['product_type_name']
    ##reduce to the input and output columns
    df=df[['query','output']]
    ##reducing all text to lowercase
    for col in df.columns:
        df[col]=df[col].apply(lambda x:x.lower())
    ##return the created dataframe
    return df

In [5]:
##creating combinations
set_1=make_df(['graphical_appearance_name'])
set_2=make_df(['colour_group_name'])
set_3=make_df(['perceived_colour_value_name'])
set_4=make_df(['graphical_appearance_name','colour_group_name'])
set_5=make_df(['perceived_colour_value_name','colour_group_name'])
set_6=make_df(['graphical_appearance_name','perceived_colour_value_name','colour_group_name'])

##deleting the articles dataset
del(articles)

##concatenating all dataframes into a single one
data=pd.concat([set_1,set_2,set_3,set_4,set_5,set_6])

#resetting index
data.reset_index(inplace=True)
data.drop('index',axis=1,inplace=True)

##deleting all the sets
del(set_1,set_2,set_3,set_4,set_5,set_6)

data.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col]=df[col].apply(lambda x:x.lower())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col]=df[col].apply(lambda x:x.lower())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col]=df[col].apply(lambda x:x.lower())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .l

Unnamed: 0,query,output
0,solid strap top,0 0
1,solid strap top,0 0
2,stripe strap top (1),0 0
3,solid op t-shirt (idro),1 1
4,solid op t-shirt (idro),1 1


In [6]:
##USING ENCODER-DECODER MODEL

In [7]:
##adding 'start_ ' and ' _end' at the extemities of 'output' so that the decoder knows when the sentence starts and ends
data['output']=data['output'].apply(lambda x:'start_ '+x+' _end')

##creating vocabularies for both 'query' and 'output'
def get_vocab(col):
    container=set()
    for line in data[col]:
        for word in line.split(' '):
            if word not in container:
                container.add(word)
    ##number of tokens for encoder/decoder = number of unique elements in vocab
    num_tokens=len(list(container))
    ##return vocab and num_tokens
    return sorted(list(container)), num_tokens

query_vocab,num_encoder_tokens=get_vocab('query')
output_vocab,num_decoder_tokens=get_vocab('output')

In [8]:
##calculate the maximum lengths of sentences in 'query' and 'output'
max_query_len=max(data['query'].apply(lambda x:len(x.split(' '))))
max_output_len=max(data['output'].apply(lambda x:len(x.split(' '))))

In [9]:
query_token_index = dict([(word, i) for i, word in enumerate(query_vocab)])
output_token_index = dict([(word, i) for i, word in enumerate(output_vocab)])

In [10]:
encoder_input_data = np.zeros((len(data['query']), max_query_len),dtype='float32')
decoder_input_data = np.zeros((len(data['output']), max_output_len),dtype='float32')
decoder_target_data = np.zeros((len(data['output']), max_output_len, num_decoder_tokens),dtype='float32')

In [11]:
for i,(input_text,target_text) in enumerate(zip(data['query'],data['output'])):
    for t,word in enumerate(input_text.split(' ')):
        encoder_input_data[i,t]=query_token_index[word]
    for t,word in enumerate(target_text.split(' ')):
        # decoder_target_data is ahead of decoder_input_data by one timestep
        decoder_input_data[i,t]=output_token_index[word]
        if t > 0:
            # decoder_target_data will be ahead by one timestep
            # and will not include the start character.
            decoder_target_data[i,t-1,output_token_index[word]]=1

In [12]:
##inmporting libraries for building the model
from keras.layers import Input, LSTM, Embedding, Dense
from keras.models import Model
from keras.utils import plot_model
import tensorflow as tf

In [13]:
##building the encoder

encoder_inputs=Input(shape=(None,)) ##input layer for the encoder
enx=Embedding(num_encoder_tokens, 64)(encoder_inputs) ##converts integers to embeddings
encoder=LSTM(64,return_state=True) ##encoder units for the seq2seq model
encoder_outputs,state_h,state_c=encoder(enx) ##extracting states using the LSTM encoder units
##we discard `encoder_outputs` and only keep the states
##'encoder_outputs' are not required as they are neither inferred nor used by the decoder
encoder_states=[state_h,state_c]

In [14]:
##building the decoder

decoder_inputs=Input(shape=(None,)) ##input layer for the decoder
dex=Embedding(num_decoder_tokens,64)
final_dex=dex(decoder_inputs)
decoder=LSTM(64,return_state=True,return_sequences=True)
decoder_outputs,_,_=decoder(final_dex,initial_state=encoder_states)
decoder_dense=Dense(num_decoder_tokens,activation='softmax') ##probability output for the decoder outputs
decoder_outputs=decoder_dense(decoder_outputs)

In [15]:
##compiling the model

model = Model([encoder_inputs,decoder_inputs], decoder_outputs)
model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['acc'])
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, None)]       0           []                               
                                                                                                  
 input_2 (InputLayer)           [(None, None)]       0           []                               
                                                                                                  
 embedding (Embedding)          (None, None, 64)     905088      ['input_1[0][0]']                
                                                                                                  
 embedding_1 (Embedding)        (None, None, 64)     8512        ['input_2[0][0]']                
                                                                                              

In [16]:
model.fit([encoder_input_data, decoder_input_data], decoder_target_data,
          batch_size=32,
          epochs=10,
          validation_split=0.05)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x240fe906f10>

In [17]:
##saving the trained model
model.save('seq2seq_lstm')



INFO:tensorflow:Assets written to: seq2seq_lstm\assets


INFO:tensorflow:Assets written to: seq2seq_lstm\assets


In [18]:
##decoding sequences back to text

In [19]:
##creating encoder model
encoder_model=Model(encoder_inputs,encoder_states)
encoder_model.summary()

Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, None)]            0         
                                                                 
 embedding (Embedding)       (None, None, 64)          905088    
                                                                 
 lstm (LSTM)                 [(None, 64),              33024     
                              (None, 64),                        
                              (None, 64)]                        
                                                                 
Total params: 938,112
Trainable params: 938,112
Non-trainable params: 0
_________________________________________________________________


In [20]:
##creating decoder model
decoder_state_input_h=Input(shape=(64,)) ##takes state_h values from the encoder model
decoder_state_input_c=Input(shape=(64,)) ##takes state_c values from the encoder model
decoder_states_inputs=[decoder_state_input_h,decoder_state_input_c]

final_dex2=dex(decoder_inputs) ##embedding the input received by the decoder

decoder_outputs2,state_h2,state_c2=decoder(final_dex2,initial_state=decoder_states_inputs)
decoder_states2=[state_h2,state_c2]
decoder_outputs2=decoder_dense(decoder_outputs2)

decoder_model=Model(([decoder_inputs],decoder_states_inputs),([decoder_outputs2],decoder_states2))
decoder_model.summary()

Model: "model_2"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_2 (InputLayer)           [(None, None)]       0           []                               
                                                                                                  
 embedding_1 (Embedding)        (None, None, 64)     8512        ['input_2[0][0]']                
                                                                                                  
 input_3 (InputLayer)           [(None, 64)]         0           []                               
                                                                                                  
 input_4 (InputLayer)           [(None, 64)]         0           []                               
                                                                                            

In [21]:
##creating reverse token dictionaries
reverse_query_token_index={v:k for k,v in query_token_index.items()}
reverse_output_token_index={v:k for k,v in output_token_index.items()}

In [22]:
##function to get encoder input for a sentence
def get_encoder_input():
    seq=input().split(' ')
    encoder_input_data=np.zeros(max_query_len)
    for i in range(len(seq)):
        if seq[i] not in query_token_index.keys():
            encoder_input_data[i]=0
        else:
            encoder_input_data[i]=query_token_index[seq[i]]
    return np.array([encoder_input_data])

In [23]:
##function to decode sequences
def decode_seq():
    ##taking input sequence from the user
    input_seq=get_encoder_input()
    ##encoder_input_seq=np.zeros(1,max_query_len),dtype='float32')
    #encode the input as state vectors.
    states_value=encoder_model.predict(input_seq)
    #generate empty target sequence of length 1
    target_seq=np.zeros((1,1))
    #populate the first character of target sequence with the start character.
    target_seq[0,0]=output_token_index['start_']
    #sampling loop for a batch of sequences
    #(to simplify, here we assume a batch of size 1).
    stop_condition=False
    decoded_sentence=''
    while not stop_condition:
        [output_tokens],_states=decoder_model.predict(([target_seq],states_value))
        #sample a token
        sampled_token_index=np.argmax(output_tokens[0, -1, :])
        sampled_char=reverse_output_token_index[sampled_token_index]
        decoded_sentence += ' '+sampled_char
        
        #exit condition: either hit max length
        #or find stop character
        if (sampled_char == '_end' or
           len(decoded_sentence)>4):
            stop_condition=True
            
        #update the target sequence (of length 1)
        target_seq=np.zeros((1,1))
        target_seq[0, 0]=sampled_token_index
        
        #update states
        states_value=[_states[0],_states[1]]
        
    [cat_token,sub_token]=decoded_sentence.split(' ')[1:3]
    output_dict={'category':reverse_group_dict[int(cat_token)],
                'subcategory':reverse_type_dict[int(sub_token)]}
    
    return output_dict

In [24]:
decode_seq()

solid black shirt men


{'category': 'Garment Upper body', 'subcategory': 'Shirt'}

In [25]:
decode_seq()

denim pants blue


{'category': 'Garment Lower body', 'subcategory': 'Trousers'}

In [26]:
decode_seq()

solid jackets


{'category': 'Garment Upper body', 'subcategory': 'Sweater'}

In [27]:
decode_seq()

red dress


{'category': 'Garment Full body', 'subcategory': 'Dress'}

In [28]:
decode_seq()

tshirt for men


{'category': 'Garment Upper body', 'subcategory': 'Bodysuit'}

##tokenizing the text data
from keras.preprocessing.text import Tokenizer

##instantiating and fitting the query tokenizer
query_tokenizer=Tokenizer(num_words=100000,lower=True)
query_tokenizer.fit_on_texts(X_train)

##tokeinizng the query
X_train=query_tokenizer.texts_to_sequences(X_train)
X_test=query_tokenizer.texts_to_sequences(X_test)

##instantiating and fitting the output tokenizer
output_tokenizer=Tokenizer(num_words=100000,lower=True)
output_tokenizer.fit_on_texts(y_train)

##tokeinizng the query
y_train=output_tokenizer.texts_to_sequences(y_train)
y_test=output_tokenizer.texts_to_sequences(y_test)

##padding the query and ouput
from keras.utils import pad_sequences

query_length=15 ##max length of query
output_length=7 ##max length of output

X_train=pad_sequences(X_train,padding='post',maxlen=query_length)
X_test=pad_sequences(X_test,padding='post',maxlen=query_length)

y_train=pad_sequences(y_train,padding='post',maxlen=output_length)
y_test=pad_sequences(y_test,padding='post',maxlen=output_length)