#### IMPORTING THE LIBRARIES

In [1]:
# Importing the libraries
import nltk
import tensorflow as tf
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.externals import joblib
import numpy as np
import pandas as pd
#from tensorflow.keras.models import load_model
nltk.download('vader_lexicon')
nltk.download('stopwords')
import re
import time
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Conv1D
from tensorflow.keras.layers import concatenate
from tensorflow.keras.layers import MaxPool1D
from tensorflow.keras.layers import Flatten
from tensorflow.keras.layers import Dropout
from tensorflow.keras.utils import plot_model
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import AveragePooling1D
from tensorflow.keras.layers import GlobalAveragePooling1D
from tensorflow.keras.layers import Attention
#import fasttext as ft
from tensorflow.keras.layers import BatchNormalization
from sklearn.metrics import mean_squared_log_error as msle

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\Wazir\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Wazir\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


#### FINAL FUNCTION FOR PREDICTION

This function takes a single data point (one product listing) and predicts its price.

In [10]:
def final(X):
    """Predict the price of a single product listing.

    Parameters
    ----------
    X : sequence
        One listing given as six positional fields: ``X[0]`` product name,
        ``X[1]`` item condition id, ``X[2]`` category name (levels separated
        by ``/``), ``X[3]`` brand name (may be missing), ``X[4]`` shipping
        flag and ``X[5]`` item description.

    Returns
    -------
    The model output for the listing (``prediction[0][0]``), or ``0`` when the
    brand is missing and cannot be recovered from the product name; in that
    case a message is printed.

    Notes
    -----
    Reads ``training_brands.txt``, ``Tokenizer_pritem_desc.pkl``,
    ``Tokenizer_brcat.pkl``, ``minmaxscaler.pkl``, ``embedding_matrix.npy`` and
    ``best_model_weights.hdf5`` from the working directory and rebuilds the
    network on every call.
    """
    # Build the stop-word set once; looking it up per token re-reads the corpus each time
    stop_words = set(nltk.corpus.stopwords.words('english'))
    punctuation_pattern = '[!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~]+'

    # Contraction rules, applied in this order (specific forms before general ones)
    contraction_rules = (
        (r"won\'t", "will not"),
        (r"can\'t", "can not"),
        (r"n\'t", " not"),
        (r"\'re", " are"),
        (r"\'s", " is"),
        (r"\'d", " would"),
        (r"\'ll", " will"),
        (r"\'t", " not"),
        (r"\'ve", " have"),
        (r"\'m", " am"),
    )

    def decontracted(phrase):
        """Expand English contractions (anything with an apostrophe)."""
        for pattern, replacement in contraction_rules:
            phrase = re.sub(pattern, replacement, phrase)
        return phrase

    def preprocess(sentence):
        """Clean the product name, item description or brand name into lowercase tokens."""
        sent = decontracted(str(sentence))
        # These are the literal two-character sequences (backslash + letter), not control characters
        for escaped in ('\\r', '\\t', '\\"', '\\n'):
            sent = sent.replace(escaped, ' ')
        sent = re.sub('[^A-Za-z0-9]+', ' ', sent)
        sent = re.sub(punctuation_pattern, ' ', sent)  # Removing the punctuations
        kept = [e for e in sent.split() if e.lower() not in stop_words and len(e) >= 3]
        return ' '.join(kept).lower().strip()

    # Unique brand names seen during training
    training_brands = pd.read_csv("training_brands.txt", sep='\t')
    unique_brand_set = set(training_brands.brands.values)

    def brand_name_replace(brand_name, product_desc):
        """Return the shortest 1- to 8-word prefix of ``product_desc`` that is a known brand.

        Falls back to ``brand_name`` unchanged when no prefix matches or when
        ``product_desc`` is missing or empty.
        """
        if pd.isnull(product_desc):
            return brand_name
        words = str(product_desc).split()
        for n_words in range(1, min(len(words), 8) + 1):
            candidate = ' '.join(words[:n_words])
            if candidate in unique_brand_set:
                return candidate
        return brand_name

    def count_of_words(text):
        """Count the words longer than three characters that are not stop words.

        Punctuation and digits are stripped first. Returns 0 when ``text`` is
        not a string.
        """
        try:
            text = text.replace('\\t', '')  # Removing the tabs
            text = text.replace('\\r', '')  # Removing the \r
            text = text.replace('\\n', '')  # Removing the newline character
            text = re.sub(punctuation_pattern, '', text)  # Removing the punctuations
            text = re.sub('[0-9]+', '', text)  # Removing the numbers
        except (AttributeError, TypeError):
            return 0
        return sum(1 for word in text.split() if word.lower() not in stop_words and len(word) > 3)

    sid = SentimentIntensityAnalyzer()

    def sentiment_score(sentence):
        """Return the VADER polarity scores dictionary for ``sentence``."""
        return sid.polarity_scores(str(sentence))

    def category_split(category):
        """Split a category name into exactly three labels.

        Levels beyond the third are dropped and missing levels (or a
        non-string category) are filled with ``"No Label"``, so the caller can
        always unpack three values.
        """
        try:
            parts = category.split('/')[:3]
        except AttributeError:
            parts = []
        return parts + ["No Label"] * (3 - len(parts))

    def clean_category(label, is_subcategory):
        """Normalise a category label to lowercase, underscore-separated text."""
        label = label.replace(" & ", "_").replace(" ", "_")
        if is_subcategory:
            # Plain str.replace is literal, so the parentheses must not be backslash-escaped
            label = label.replace("-", "_").replace("'s", "")
            label = label.replace("(", "").replace(")", "")
        return label.lower()

    # Preprocessing the product name
    preprocessed_name = preprocess(X[0])

    # Preprocessing the brand name, recovering it from the product name when it is missing
    if pd.isnull(X[3]):
        brand_name = brand_name_replace(X[3], X[0])
        if pd.isnull(brand_name):
            # NOTE(review): final_metric fills missing brands with "No Brand Name"
            # instead of giving up; confirm which behaviour is wanted here.
            print("No Brands present for this product")
            return 0
        preprocessed_brand_nm = preprocess(brand_name)
    else:
        preprocessed_brand_nm = preprocess(X[3])

    # Individual categories: general_cat, subcat_1 and subcat_2
    general_cat, subcat_1, subcat_2 = category_split(X[2])
    general_cat = clean_category(general_cat, is_subcategory=False)
    subcat_1 = clean_category(subcat_1, is_subcategory=True)
    subcat_2 = clean_category(subcat_2, is_subcategory=True)

    # Preprocessing the item description and measuring its length
    preprocessed_item_desc = preprocess(X[5])
    item_desc_length = count_of_words(preprocessed_item_desc)

    # Text inputs of the model
    name_item_desc = preprocessed_name + " " + preprocessed_item_desc
    concatenated_brcat = preprocessed_brand_nm + " " + general_cat + " " + subcat_1 + " " + subcat_2

    # Sentiment score of the preprocessed item description
    sent_sc = sentiment_score(str(preprocessed_item_desc))

    # Numerical features.
    # NOTE(review): the sentiment values are taken in the dictionary's own order,
    # while final_metric selects 'neg', 'pos', 'neu', 'compound' explicitly;
    # verify which order the scaler was fitted with.
    numerical_features = [X[1], X[4], item_desc_length] + list(sent_sc.values())

    # Loading the tokenizers, the min-max scaler and the embedding matrix
    Tokenizer_pritem_desc = joblib.load('Tokenizer_pritem_desc.pkl')
    Tokenizer_brcat = joblib.load('Tokenizer_brcat.pkl')
    scaler = joblib.load('minmaxscaler.pkl')
    embedding_matrix = np.load('embedding_matrix.npy')

    # Model variables
    maxlen_pritem_desc = 154
    maxlen_brcat = 8
    embedding_dim_brcat = 50
    num_tokens_brcat = len(Tokenizer_brcat.word_index)+1
    embedding_dim_preprocessed_pritemdesc = 300
    num_tokens_preprocessed_pritemdesc = len(Tokenizer_pritem_desc.word_index)+1

    # Model architecture. The layers are created in the same order as before so
    # that the saved weights keep matching the layer topology.
    Inp1 = Input(shape = (maxlen_pritem_desc,), dtype='int64')
    Emb1 = Embedding(input_dim = num_tokens_preprocessed_pritemdesc, output_dim = embedding_dim_preprocessed_pritemdesc, input_length = maxlen_pritem_desc,
                    embeddings_initializer = tf.keras.initializers.constant(embedding_matrix), trainable = False)(Inp1)
    LSTM_layer_1 = LSTM(units=50, return_sequences = True)(Emb1)

    Inp2 = Input(shape = (maxlen_brcat,), dtype='int64')
    Emb2 = Embedding(input_dim = num_tokens_brcat, output_dim = embedding_dim_brcat, input_length = maxlen_brcat,trainable = True)(Inp2)
    LSTM_layer_2 = LSTM(units=50, return_sequences = True)(Emb2)

    # Brand/category sequence is the query, name/description sequence is the value
    att_contextvec = Attention()([LSTM_layer_2,LSTM_layer_1])
    avgpool = GlobalAveragePooling1D()(att_contextvec)
    Flatten_1 = Flatten()(avgpool)

    Inp3 = Input(shape = (7,))
    Dense_1 = Dense(units = 4, activation = 'relu', kernel_initializer = 'he_normal')(Inp3)

    concat = concatenate([Flatten_1,Dense_1])
    BN_1 = BatchNormalization()(concat)

    Dense_2 = Dense(units = 16, kernel_initializer = 'he_normal')(BN_1)
    Dense_2 = tf.keras.layers.PReLU()(Dense_2)
    dropout_1 = Dropout(0.5)(Dense_2)

    Dense_3 = Dense(units = 8, kernel_initializer = 'he_normal')(dropout_1)
    Dense_3 = tf.keras.layers.PReLU()(Dense_3)
    dropout_2 = Dropout(0.4)(Dense_3)

    Output = Dense(units = 1, activation = 'relu')(dropout_2)

    model = Model(inputs = [Inp1, Inp2, Inp3], outputs = [Output])

    # Loading the best model weights
    model.load_weights('best_model_weights.hdf5')

    # Scaling the numerical features
    numerical_features_scaled = scaler.transform(np.array(numerical_features).reshape(1,-1))

    # Padded sequence for the concatenated product name and item description
    pritem_desc_sequence = Tokenizer_pritem_desc.texts_to_sequences([name_item_desc])
    pritem_desc_padded_sequence = pad_sequences(pritem_desc_sequence, padding = 'post', truncating = 'post', maxlen = maxlen_pritem_desc)

    # Padded sequence for the concatenated brand name and categories
    brcat_sequence = Tokenizer_brcat.texts_to_sequences([concatenated_brcat])
    brcat_padded_sequence = pad_sequences(brcat_sequence, padding = 'post', truncating = 'post', maxlen = maxlen_brcat)

    # Predicting on the single instance
    prediction = model.predict([np.array(pritem_desc_padded_sequence).reshape(1,-1), np.array(brcat_padded_sequence).reshape(1,-1), numerical_features_scaled.reshape(1,-1)])

    # Returning the predicted price
    return prediction[0][0]

In [4]:
# Load the tab-separated test split and preview its first rows
test_data = pd.read_csv("test_stg2.tsv", sep="\t")

test_data.head()

Unnamed: 0,test_id,name,item_condition_id,category_name,brand_name,shipping,item_description
0,0,"Breast cancer ""I fight like a girl"" ring",1,Women/Jewelry/Rings,,1,Size 7
1,1,"25 pcs NEW 7.5""x12"" Kraft Bubble Mailers",1,Other/Office supplies/Shipping Supplies,,1,"25 pcs NEW 7.5""x12"" Kraft Bubble Mailers Lined..."
2,2,Coach bag,1,Vintage & Collectibles/Bags and Purses/Handbag,Coach,1,Brand new coach bag. Bought for [rm] at a Coac...
3,3,Floral Kimono,2,Women/Sweaters/Cardigan,,0,-floral kimono -never worn -lightweight and pe...
4,4,Life after Death,3,Other/Books/Religion & Spirituality,,1,Rediscovering life after the loss of a loved o...


In [16]:
# Time one end-to-end call of final() on the fifth test row (every column after test_id).
# The measured time covers everything final() does for that row.
start_time = time.time()
test_prediction = final(list(test_data.iloc[4,1:]))
end_time = time.time()

# Elapsed wall-clock time in seconds
diff = end_time - start_time

No Brands present for this product


In [17]:
# Report the value returned by final() together with the elapsed time measured above
print("The prediction of price is {}  in {} seconds".format(test_prediction, diff))

The prediction of price is 0  in 0.23447561264038086 seconds


In [8]:
# Inspect the full item description (last column) of the third test row
test_data.iloc[2,6]

'Brand new coach bag. Bought for [rm] at a Coach outlet.'

#### FINAL FUNCTION FOR CALCULATING ERROR METRIC

In [None]:
def final_metric(X,Y):
    """Compute the root mean squared logarithmic error of the saved model on a dataset.

    Parameters
    ----------
    X : pandas.DataFrame
        Listings with at least the columns ``name``, ``item_condition_id``,
        ``category_name``, ``brand_name``, ``shipping`` and
        ``item_description``. The frame is copied; the caller's object is not
        modified.
    Y : array-like
        True prices, one per row of ``X`` and in the same row order.

    Returns
    -------
    float
        The RMSLE between the model predictions and ``Y``; the value is also
        printed.

    Notes
    -----
    Reads ``training_brands.txt``, ``Tokenizer_pritem_desc.pkl``,
    ``Tokenizer_brcat.pkl``, ``minmaxscaler.pkl``, ``embedding_matrix.npy`` and
    ``best_model_weights.hdf5`` from the working directory.
    """
    # tqdm is only used for progress bars; fall back to plain iteration if it is unavailable
    try:
        from tqdm import tqdm
    except ImportError:
        def tqdm(iterable, *args, **kwargs):
            return iterable

    # Work on a copy so that the feature columns added below do not leak into the caller's frame
    X = X.copy()

    # Build the stop-word set once; looking it up per token re-reads the corpus each time
    stop_words = set(nltk.corpus.stopwords.words('english'))
    punctuation_pattern = '[!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~]+'

    # Contraction rules, applied in this order (specific forms before general ones)
    contraction_rules = (
        (r"won\'t", "will not"),
        (r"can\'t", "can not"),
        (r"n\'t", " not"),
        (r"\'re", " are"),
        (r"\'s", " is"),
        (r"\'d", " would"),
        (r"\'ll", " will"),
        (r"\'t", " not"),
        (r"\'ve", " have"),
        (r"\'m", " am"),
    )

    def decontracted(phrase):
        """Expand English contractions (anything with an apostrophe)."""
        for pattern, replacement in contraction_rules:
            phrase = re.sub(pattern, replacement, phrase)
        return phrase

    def preprocess(text_array):
        """Clean every entry of ``text_array`` and return the results as a list of strings."""
        preprocessed = []
        for sentence in tqdm(text_array):
            sent = decontracted(str(sentence))
            # These are the literal two-character sequences (backslash + letter), not control characters
            for escaped in ('\\r', '\\t', '\\"', '\\n'):
                sent = sent.replace(escaped, ' ')
            sent = re.sub('[^A-Za-z0-9]+', ' ', sent)
            sent = re.sub(punctuation_pattern, ' ', sent)  # Removing the punctuations
            kept = [e for e in sent.split() if e.lower() not in stop_words and len(e) >= 3]
            preprocessed.append(' '.join(kept).lower().strip())
        return preprocessed

    # Unique brand names seen during training
    training_brands = pd.read_csv("training_brands.txt", sep='\t')
    unique_brand_set = set(training_brands.brands.values)

    def brand_name_replace(brand_name, product_desc):
        """Fill missing brands with the shortest 1- to 8-word name prefix that is a known brand.

        Returns a new list; entries that already have a brand, or whose name
        is missing, empty or has no matching prefix, are left unchanged.
        """
        filled = list(brand_name)
        for i, (brand, name) in enumerate(zip(filled, product_desc)):
            if not pd.isnull(brand) or pd.isnull(name):
                continue
            words = str(name).split()
            for n_words in range(1, min(len(words), 8) + 1):
                candidate = ' '.join(words[:n_words])
                if candidate in unique_brand_set:
                    filled[i] = candidate
                    break
        return filled

    def count_of_words(text):
        """Count the words longer than three characters that are not stop words.

        Punctuation and digits are stripped first. Returns 0 when ``text`` is
        not a string.
        """
        try:
            text = text.replace('\\t', '')  # Removing the tabs
            text = text.replace('\\r', '')  # Removing the \r
            text = text.replace('\\n', '')  # Removing the newline character
            text = re.sub(punctuation_pattern, '', text)  # Removing the punctuations
            text = re.sub('[0-9]+', '', text)  # Removing the numbers
        except (AttributeError, TypeError):
            return 0
        return sum(1 for word in text.split() if word.lower() not in stop_words and len(word) > 3)

    def category_split(category):
        """Split a category name into exactly three labels.

        Levels beyond the third are dropped and missing levels (or a
        non-string category) are filled with ``"No Label"``.
        """
        try:
            parts = category.split('/')[:3]
        except AttributeError:
            parts = []
        return parts + ["No Label"] * (3 - len(parts))

    # Preprocessing the product name and keeping it as a column for the concatenation below
    X["preprocessed_name"] = preprocess(X["name"].values)

    # Recovering missing brand names from the product names, then filling what is still missing
    X["brand_name"] = brand_name_replace(X["brand_name"].values, X["name"].values)
    X["brand_name"] = X["brand_name"].fillna("No Brand Name")

    # Preprocessing the brand names
    X["preprocessed_brand_nm"] = preprocess(X["brand_name"].values)

    # Individual categories: general_cat, subcat_1 and subcat_2
    split_categories = [category_split(category) for category in X["category_name"].values]
    X["general_cat"] = [parts[0] for parts in split_categories]
    X["subcat_1"] = [parts[1] for parts in split_categories]
    X["subcat_2"] = [parts[2] for parts in split_categories]

    # Preprocessing the item description
    X["preprocessed_item_desc"] = preprocess(X["item_description"].values)

    # Word count of every item description.
    # NOTE(review): this counts on the raw description, while final() counts on
    # the preprocessed one; confirm which matches the training pipeline.
    X["item_description_Length"] = [
        count_of_words(item_description) for item_description in tqdm(X["item_description"].values)
    ]

    # Preprocessing the category labels; replacements are literal, not regular expressions
    X["general_cat"] = (
        X["general_cat"].str.replace(" & ", "_", regex=False)
                        .str.replace(" ", "_", regex=False)
                        .str.lower()
    )
    for column in ("subcat_1", "subcat_2"):
        cleaned = X[column]
        for old, new in ((" & ", "_"), (" ", "_"), ("-", "_"), ("'s", ""), ("(", ""), (")", "")):
            cleaned = cleaned.str.replace(old, new, regex=False)
        X[column] = cleaned.str.lower()

    # Sentiment scores of the preprocessed item descriptions of X
    sid = SentimentIntensityAnalyzer()
    sentiment = [sid.polarity_scores(str(sentence)) for sentence in tqdm(X["preprocessed_item_desc"].values)]

    # Assigning plain lists keeps the scores aligned by position even when X has a non-default index
    for key in ("neg", "neu", "pos", "compound"):
        X[key] = [scores[key] for scores in sentiment]

    # Text inputs of the model
    X["name_item_desc"] = X["preprocessed_name"] + " " + X["preprocessed_item_desc"]
    X["concatenated_brcat"] = X["preprocessed_brand_nm"] + " " + X["general_cat"] + " " + X["subcat_1"] + " " + X["subcat_2"]

    # Numerical features.
    # NOTE(review): final() feeds the sentiment values in the order returned by
    # polarity_scores(); verify which order the scaler was fitted with.
    numerical_features = X.loc[:,['item_condition_id', 'shipping', 'item_description_Length','neg','pos','neu','compound']]

    # Loading the tokenizers, the min-max scaler and the embedding matrix
    Tokenizer_pritem_desc = joblib.load('Tokenizer_pritem_desc.pkl')
    Tokenizer_brcat = joblib.load('Tokenizer_brcat.pkl')
    scaler = joblib.load('minmaxscaler.pkl')
    embedding_matrix = np.load('embedding_matrix.npy')

    # Scaling the numerical features
    numerical_features_scaled = scaler.transform(numerical_features.values)

    # Missing values (floats) are turned into strings so that the tokenizers accept them
    pritem_desc_list = [str(text) if isinstance(text, float) else text for text in X["name_item_desc"]]
    brcat_list = [str(text) if isinstance(text, float) else text for text in X["concatenated_brcat"]]

    # Model variables
    maxlen_pritem_desc = 154
    maxlen_brcat = 8

    # Padded sequences for the name/description text and for the brand/category text
    pritem_desc_sequence = Tokenizer_pritem_desc.texts_to_sequences(pritem_desc_list)
    pritem_desc_padded_sequence = pad_sequences(pritem_desc_sequence, padding = 'post', truncating = 'post', maxlen = maxlen_pritem_desc)

    brcat_sequence = Tokenizer_brcat.texts_to_sequences(brcat_list)
    brcat_padded_sequence = pad_sequences(brcat_sequence, padding = 'post', truncating = 'post', maxlen = maxlen_brcat)

    embedding_dim_brcat = 50
    num_tokens_brcat = len(Tokenizer_brcat.word_index)+1
    embedding_dim_preprocessed_pritemdesc = 300
    num_tokens_preprocessed_pritemdesc = len(Tokenizer_pritem_desc.word_index)+1

    # Model architecture. The layers are created in the same order as before so
    # that the saved weights keep matching the layer topology.
    Inp1 = Input(shape = (maxlen_pritem_desc,), dtype='int64')
    Emb1 = Embedding(input_dim = num_tokens_preprocessed_pritemdesc, output_dim = embedding_dim_preprocessed_pritemdesc, input_length = maxlen_pritem_desc,
                    embeddings_initializer = tf.keras.initializers.constant(embedding_matrix), trainable = False)(Inp1)
    LSTM_layer_1 = LSTM(units=50, return_sequences = True)(Emb1)

    Inp2 = Input(shape = (maxlen_brcat,), dtype='int64')
    Emb2 = Embedding(input_dim = num_tokens_brcat, output_dim = embedding_dim_brcat, input_length = maxlen_brcat,trainable = True)(Inp2)
    LSTM_layer_2 = LSTM(units=50, return_sequences = True)(Emb2)

    # Brand/category sequence is the query, name/description sequence is the value
    att_contextvec = Attention()([LSTM_layer_2,LSTM_layer_1])
    avgpool = GlobalAveragePooling1D()(att_contextvec)
    Flatten_1 = Flatten()(avgpool)

    Inp3 = Input(shape = (7,))
    Dense_1 = Dense(units = 4, activation = 'relu', kernel_initializer = 'he_normal')(Inp3)

    concat = concatenate([Flatten_1,Dense_1])
    BN_1 = BatchNormalization()(concat)

    Dense_2 = Dense(units = 16, kernel_initializer = 'he_normal')(BN_1)
    Dense_2 = tf.keras.layers.PReLU()(Dense_2)
    dropout_1 = Dropout(0.5)(Dense_2)

    Dense_3 = Dense(units = 8, kernel_initializer = 'he_normal')(dropout_1)
    Dense_3 = tf.keras.layers.PReLU()(Dense_3)
    dropout_2 = Dropout(0.4)(Dense_3)

    Output = Dense(units = 1, activation = 'relu')(dropout_2)

    model = Model(inputs = [Inp1, Inp2, Inp3], outputs = [Output])

    # Loading the best model weights
    model.load_weights('best_model_weights.hdf5')

    # The model is never compiled here, so evaluate() cannot be used; the metric
    # is computed directly from the predictions instead.
    # NOTE(review): assumes the network outputs prices on the original scale,
    # as final() treats them; confirm against the training notebook.
    predictions = model.predict([np.array(pritem_desc_padded_sequence), np.array(brcat_padded_sequence), numerical_features_scaled])
    y_pred = np.asarray(predictions, dtype=float).reshape(-1)
    y_true = np.asarray(Y, dtype=float).reshape(-1)
    score = float(np.sqrt(np.mean(np.square(np.log1p(y_pred) - np.log1p(y_true)))))

    # Printing the metric
    print("The rmsle on the data given is : {}".format(score))

    # Returning the evaluation metric
    return score

    

In [2]:
'''Importing the train data'''

# Load the tab-separated training split and preview its first rows
training_data = pd.read_csv("train.tsv", sep="\t")

training_data.head()

Unnamed: 0,train_id,name,item_condition_id,category_name,brand_name,price,shipping,item_description
0,0,MLB Cincinnati Reds T Shirt Size XL,3,Men/Tops/T-shirts,,10.0,1,No description yet
1,1,Razer BlackWidow Chroma Keyboard,3,Electronics/Computers & Tablets/Components & P...,Razer,52.0,0,This keyboard is in great condition and works ...
2,2,AVA-VIV Blouse,1,Women/Tops & Blouses/Blouse,Target,10.0,1,Adorable top with a hint of lace and a key hol...
3,3,Leather Horse Statues,1,Home/Home Décor/Home Décor Accents,,35.0,1,New with tags. Leather horses. Retail for [rm]...
4,4,24K GOLD plated rose,1,Women/Jewelry/Necklaces,,44.0,0,Complete with certificate of authenticity


In [None]:
# Feature columns (everything except the price) and the price target kept as a one-column frame
X = training_data.loc[:,["train_id","name","item_condition_id","category_name","brand_name","shipping","item_description"]]
Y = training_data[["price"]]

In [None]:
# Score the saved model on the training features and prices; final_metric also prints the value itself
rmsle = final_metric(X,Y)

print("The root mean squared logarithmic error on the training data is : {}".format(rmsle))

In [3]:
# Inspect the product name of the second training row
training_data.iloc[1,1]

'Razer BlackWidow Chroma Keyboard'

In [4]:
# Inspect the untruncated category name of the second training row
training_data.iloc[1,3]

'Electronics/Computers & Tablets/Components & Parts'

In [7]:
# Inspect the full item description of the fourth training row
training_data.iloc[3,7]

'New with tags. Leather horses. Retail for [rm] each. Stand about a foot high. They are being sold as a pair. Any questions please ask. Free shipping. Just got out of storage'