# <center> Step 3 - Embedding </center>

In [28]:
#import modules

import os.path
import pandas as pd
import numpy as np
import json

import re
import gensim
from gensim import corpora, similarities
from gensim.test.utils import common_dictionary, common_corpus, get_tmpfile
from gensim.models import LsiModel, FastText
from gensim.models.coherencemodel import CoherenceModel

import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

# Reading the Data

In [None]:
# for use on google colab
from google.colab import files
uploaded = files.upload()

In [2]:
def read_jl_file(file_name):
    values = []
    with open(file_name, 'rb') as f:
        line = '---'
        while len(line)>1:
            line = f.readline()
            values.append(line)
    values = values[:-1]
    values = [json.loads(i) for i in values]
    df = pd.DataFrame(values)
    return df

# Reading file
#df = read_jl_file('/Users/Thomas/Documents/Data Science X/Cours/Capgemini project/1 - Scrapping : SWOT/ReviewRestoSpider.jl')

df = read_jl_file('/Users/Thomas/Documents/GitHub/Capgemini_NLP_project-ongoing-/Coding/ReviewRestoSpider_10pages.jl')
df.Rating = df.Rating.apply(lambda x: x[0])
df.Rating = df.Rating.apply(lambda x: int(x[0]))

In [3]:
df

Unnamed: 0,partial content,name,title,Restaurant_name,Rating
0,I have been using this restaurant for years. T...,tracey h,"Best authentic Food, Chefs are top quality",[ Karahi Junction ],5
1,"The most amazing breakfast, ingredients fresh ...",kazowen,Breakfast Bliss,[ Melucci's ],5
2,We went with friends ( 2 couples). The food he...,elthamfams,Excellent Food,[ Melucci's ],5
3,So why not five stars? Everything was was very...,richardcC3581NV,"Friendly staff, good food, good portions",[ Awesome Thai ],4
4,"Great food, friendly staff, decent prices. I h...",leona939,Great food,[ Namaste Gurkha ],5
...,...,...,...,...,...
3648,We were a large group from the states; was the...,I8025IKdonnal,Over the top exceptional!,[ Mezzet Lebanese Restaurant ],5
3649,Mezzet is always very welcoming and you will n...,VMD67,Simply Delicious,[ Mezzet Lebanese Restaurant ],5
3650,Visited for our friends anniversary lunch. We ...,GarethrT,Anniversary lunch,[ The French Table ],5
3651,We recently went to Mezzet for the second time...,jackdL849SV,Fantastic Meal!,[ Mezzet Lebanese Restaurant ],5


In [4]:
type(df['partial content'][0])

str

# Preprocessing (data cleaning)

In [5]:
def preprocess_data(raw_text):
    """
    Input  : raw text to clean
    Purpose: preprocess (i.e. clean) text (tokenizing, removing stopwords, stemming)
    Output : preprocessed text
    """
    # initialize regex tokenizer
    tokenizer = RegexpTokenizer(r'\w+')
    
    # create English stop words list
    en_stop = set(stopwords.words('english'))
    
    # Create p_stemmer of class PorterStemmer
    p_stemmer = PorterStemmer()
    
    # initialize list for tokenized documents in loop
    df["cleaned_description"] = ""
    
    # clean and tokenize document string
    if raw_text!=None:
        raw = raw_text.lower()
    else:
        raw = "none"
    tokens = tokenizer.tokenize(raw)
    
    # remove stop words from tokens
    stopped_tokens = [i for i in tokens if not i in en_stop]
    
    # stem tokens
    stemmed_tokens = [p_stemmer.stem(i) for i in stopped_tokens]

    return stemmed_tokens

# clean
df["clean_description"] = df["partial content"].apply(preprocess_data)
df.head()

Unnamed: 0,partial content,name,title,Restaurant_name,Rating,cleaned_description,clean_description
0,I have been using this restaurant for years. T...,tracey h,"Best authentic Food, Chefs are top quality",[ Karahi Junction ],5,,"[use, restaur, year, chef, excel, alway, good,..."
1,"The most amazing breakfast, ingredients fresh ...",kazowen,Breakfast Bliss,[ Melucci's ],5,,"[amaz, breakfast, ingredi, fresh, cook, perfec..."
2,We went with friends ( 2 couples). The food he...,elthamfams,Excellent Food,[ Melucci's ],5,,"[went, friend, 2, coupl, food, excel, staff, g..."
3,So why not five stars? Everything was was very...,richardcC3581NV,"Friendly staff, good food, good portions",[ Awesome Thai ],4,,"[five, star, everyth, pleasant, busi, popular,..."
4,"Great food, friendly staff, decent prices. I h...",leona939,Great food,[ Namaste Gurkha ],5,,"[great, food, friendli, staff, decent, price, ..."


In [None]:
def get_dictionary(doc_clean):
    """
    Input  : clean document
    Purpose: get the whole associated vocabulary
    Output : term dictionary where every unique term is assigned an index
    """
    return corpora.Dictionary(doc_clean)

# get vocabulary
dictionary = get_dictionary(df.clean_description)
for i,j in dictionary.items():
    print("word =", j, "--> ID =", i)

# Embeddings

## TFIDF (built from scratch)

In [20]:
def get_TF_matrix(doc_clean, useTransfertDict=True):
    """
    Input  : clean document
    Purpose: get the term frequency matrix from a corpus
    Output : Document Term Frequency Matrix
    """
    # Creating the term dictionary of our courpus, where every unique term is assigned an index 
    dictionary = corpora.Dictionary(doc_clean)

    # Converting list of documents (corpus) into Document Term Matrix using dictionary prepared above
    return [dictionary.doc2bow(doc) for doc in doc_clean]
    
def get_normTF_matrix(doc_clean):
    """
    Input  : clean document
    Purpose: get the term frequency matrix from a corpus
    Output : Document Term Frequency Matrix
    """
    # Creating the term dictionary of our courpus, where every unique term is assigned an index
    dictionary = corpora.Dictionary(doc_clean)

    # Converting list of documents (corpus) into Document Term Matrix using dictionary prepared above
    TFmat = pd.DataFrame(0,
                         columns=[i for i in range(len(doc_clean))],
                         index=[i for i in range(len(dictionary))]
                        )
    i=0
    for doc in doc_clean:
        vec = [0 for i in range(len(dictionary))]
        for wrd, freq in dictionary.doc2bow(doc):
            vec[wrd] = round(freq / len(doc), 4) # normalization
        TFmat[i] = vec
        i+=1
    return TFmat

def get_IDF_matrix(doc_clean, useTransfertDict=True):
    """
    Input  : clean document
    Purpose: create IDF matrix
    Output : IDF Matrix
    """
    # Create vocabulary
    if useTransfertDict:
        vocabulary = common_dictionary
    else:
        vocabulary = get_dictionary(doc_clean)
    
    # Creating tf matrix
    TF_matrix = get_TF_matrix(doc_clean)
    
    # Converting list of documents (corpus) into IDF Matrix using dictionary prepared above
    word_idf_values = {i:0 for i in list(vocabulary.keys())}
    
    for w in range(len(vocabulary)): 
        for doc in TF_matrix:
            for wrd in doc:
                if w == wrd[0]:
                    word_idf_values[w]+=1
        word_idf_values[w] = np.log(len(doc_clean) / (1 + word_idf_values[w]))
    return word_idf_values

def get_TFIDF_matrix(doc_clean):
    """
    Input  : clean document
    Purpose: create TFIDF matrix
    Output : TFIDF Matrix
    """
    # Creating tf matrix
    TF_matrix = get_normTF_matrix(doc_clean)

    # Create IDF
    IDF_vector = get_IDF_matrix(doc_clean)

    # initialize TFIDF as TF
    TFIDF_matrix = TF_matrix.copy()
    
    # update TF Matrix using IDF term.
    for doc in range(len(doc_clean)):
        for word, score in IDF_vector.items():
            TFIDF_matrix[doc][word] = TFIDF_matrix[doc][word] * score
    return TFIDF_matrix

# Compute TFIDF matrix
corpus_TFIDFmatrix = get_TFIDF_matrix(df.clean_description)
corpus_TFIDFmatrix

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,3643,3644,3645,3646,3647,3648,3649,3650,3651,3652
0,0.396297,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0000,0.475461,0.190184,0.0,0.0000
1,0.090799,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0000,0.000000,0.000000,0.0,0.0000
2,0.215121,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0000,0.000000,0.000000,0.0,0.0000
3,0.156819,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0000,0.000000,0.000000,0.0,0.0000
4,0.190390,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0000,0.000000,0.000000,0.0,0.0000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6265,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0000,0.000000,0.000000,0.0,0.0000
6266,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0256,0.000000,0.000000,0.0,0.0000
6267,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0256,0.000000,0.000000,0.0,0.0000
6268,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0256,0.000000,0.000000,0.0,0.0000


## LSI Embedding

In [21]:
def create_gensim_lsi_model(clean_documents_list, k=None):
    """
    Input  : clean document, dictionary
    Purpose: create LSI model (Latent Semantic Indexing) from corpus and dictionary
    Output : return LSI model
    """
    
    # LSI model consists of Singular Value Decomposition (SVD) of
    # Term Document Matrix M: M = T x S x D
    # and dimensionality reductions of T, S and D' ("Derivation")
    
    dictionary = get_dictionary(clean_documents_list)
    corpus = get_TF_matrix(clean_documents_list)
    
    if k is not None:
        lsi_model = LsiModel(
                corpus = corpus,
                id2word = dictionary,
                num_topics = int(k)
                )
    else:
            lsi_model = LsiModel(
            corpus = corpus,
            id2word = dictionary 
            )
    
    print(); print(); print("="*20, "Training LSI model report", "="*20); print() # for output design
    print("Initial TF matrix (NwordsXNdocuments): ")
    
    TF = []
    for x in corpus:
        wrds = [0 for i in range(len(dictionary))]
        for i, j in x:
            wrds[i] = j 
        TF.append(wrds)

    print(pd.np.transpose(TF))
    print()
    print("Derivation of Term Matrix T of Training Document Word Stems: ")
    print(lsi_model.get_topics())
    print()
    
    #Derivation of Term Document Matrix of Training Document Word Stems = M' * [Derivation of T]
    print("LSI Vectors of Training Document Word Stems: ")
    print([lsi_model[document_word_stems] for document_word_stems in corpus])
    print("="*70); print(); print()
    return lsi_model

def get_lsi_vector(lsi_model, clean_text):
    return lsi_model[dictionary.doc2bow(clean_text)]

def select_optimal_k_value(singular_values, significativity=75):
    singular_values.sort()
    lsi_model.projection.s = lsi_model.projection.s[::-1]
    sum_of_singular_values = sum(singular_values)
    s = 0; k = 0 
    while((s < significativity * sum_of_singular_values / 100) and 
          k < max(2, len(singular_values) - 2)):
        s+=singular_values[k]
        k+=1
    return(k)

In [24]:
# train lsi model (init)
lsi_model = create_gensim_lsi_model(df.clean_description, k=None)




Initial TF matrix (NwordsXNdocuments): 
[[4 0 0 ... 2 0 0]
 [1 0 0 ... 0 0 0]
 [1 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 1]]

Derivation of Term Matrix T of Training Document Word Stems: 
[[ 7.23705607e-02  6.97908324e-02  4.70463610e-03 ...  1.39348665e-04
   1.39348665e-04  1.34751860e-04]
 [ 3.75794766e-02  5.24634529e-02 -1.46296916e-02 ... -1.61322341e-05
  -1.61322341e-05  1.72675783e-05]
 [ 1.22073780e-02 -8.25197875e-02  1.07036888e-03 ... -4.52808118e-04
  -4.52808118e-04 -2.28038634e-04]
 ...
 [ 5.94529913e-03  1.90138412e-02  2.10619885e-02 ...  3.13933687e-03
   3.13933687e-03 -1.75720093e-03]
 [-9.51369553e-03 -1.21448589e-02 -2.14777424e-02 ...  3.23036416e-04
   3.23036416e-04  5.08535382e-04]
 [-5.69904118e-03 -2.18282804e-03  3.90754960e-03 ... -3.77929032e-04
  -3.77929032e-04  4.19763646e-04]]

LSI Vectors of Training Document Word Stems: 


IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [26]:
class color:
    PURPLE = '\033[95m'
    CYAN = '\033[96m'
    DARKCYAN = '\033[36m'
    BLUE = '\033[94m'
    GREEN = '\033[92m'
    YELLOW = '\033[93m'
    RED = '\033[91m'
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'
    END = '\033[0m'

In [27]:
optimal_k = select_optimal_k_value(lsi_model.projection.s, significativity=75)
print(color.BOLD, "Dimension reduction - optimal k is: ", color.END, optimal_k)

[1m Dimension reduction - optimal k is:  [0m 175


In [28]:
# train lsi model (optimal_k)
lsi_model = create_gensim_lsi_model(df.clean_description, k=optimal_k)




Initial TF matrix (NwordsXNdocuments): 
[[4 0 0 ... 2 0 0]
 [1 0 0 ... 0 0 0]
 [1 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 1]]

Derivation of Term Matrix T of Training Document Word Stems: 
[[ 7.23705879e-02  6.97906795e-02  4.70473752e-03 ...  1.39348779e-04
   1.39348779e-04  1.34747845e-04]
 [-3.75782983e-02 -5.24650501e-02  1.46274887e-02 ...  1.59579103e-05
   1.59579103e-05 -1.68235266e-05]
 [ 1.22030008e-02 -8.25370239e-02  1.09320727e-03 ... -4.50397972e-04
  -4.50397972e-04 -2.25314951e-04]
 ...
 [ 7.13572006e-04  4.45298318e-02  1.15942290e-02 ...  3.26972889e-04
   3.26972889e-04  1.76289808e-04]
 [-4.37734403e-03 -4.61756133e-03  4.84904691e-03 ...  2.58569631e-03
   2.58569631e-03 -1.71858652e-05]
 [ 5.30136764e-04  7.24309674e-03  7.62407484e-03 ... -1.38575750e-04
  -1.38575750e-04 -4.00436337e-04]]

LSI Vectors of Training Document Word Stems: 


IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [None]:
# display singular values
# lsi_model.projection.s

# display left singular vectors
# lsi_model.projection.u

# display right singular vectors (can be reconstructed if needed).
#print(color.BOLD, "LSI Vectors of Training Document Word Stems: ", color.END)
#[lsi_model[d] for d in get_TF_matrix(df.clean_description)]

## Word2Vec Embedding

In [7]:
corpus = df.clean_description.tolist()
max_embedding = 100

path = get_tmpfile("word2vec.model")

model = gensim.models.Word2Vec(size=max_embedding, window=3, min_count=5, workers=4, seed=1, iter=50)
model.build_vocab(corpus[:3000])
model.train(corpus[:3000], total_examples=model.corpus_count, epochs=model.iter)
model.save("word2vec.model")

  


In [8]:
model = gensim.models.Word2Vec.load("word2vec.model")

In [9]:
model.wv.most_similar("dessert", topn=10)

[('browni', 0.5312399864196777),
 ('chocol', 0.5199911594390869),
 ('bread', 0.510167121887207),
 ('pistachio', 0.4986397624015808),
 ('tiramisu', 0.4875122904777527),
 ('mango', 0.4767257571220398),
 ('bitter', 0.47206079959869385),
 ('chees', 0.468998521566391),
 ('tomato', 0.4586534798145294),
 ('cream', 0.4545954167842865)]

In [10]:
model.wv['use'].shape

(100,)

In [11]:
embedding_matrix = dict()

for word in model.wv.vocab.keys():
    embedding_matrix[word] = list(model.wv[word])
    
embedding_matrix = pd.DataFrame(embedding_matrix)

In [12]:
embedding_matrix.head()

Unnamed: 0,use,restaur,year,chef,excel,alway,good,portion,food,fresh,...,martin,tunisian,cultur,moniz,portugues,masti,skewd,beehiv,gezi,kulcha
0,-0.788264,0.038806,0.183511,-1.572962,-0.188596,-0.130794,-0.005102,0.107818,0.898989,-0.901034,...,-0.088729,-0.017593,0.081196,0.094013,0.096104,0.152688,-0.380789,0.210192,-0.117906,-0.288409
1,-0.194942,0.829009,1.469187,1.423897,0.366503,1.367753,0.533291,-0.994256,0.186483,-0.478119,...,0.029562,0.070191,0.00175,-0.290413,0.012101,0.07441,0.044519,0.224215,-0.2531,0.150007
2,-0.144564,0.507244,0.066529,-0.22282,0.460443,-0.192375,-0.33475,1.666057,-0.118697,0.775607,...,-0.1369,-0.052802,0.022676,-0.095946,-0.214366,-0.127279,-0.06102,-0.434667,-0.279612,0.041811
3,0.01841,0.360076,0.611324,-0.099415,-0.923164,-0.225428,-1.382897,0.030158,-0.638276,-0.992657,...,0.117096,-0.116768,0.045856,0.018902,-0.211426,0.006323,-0.180946,0.245709,-0.118267,-0.403101
4,0.030509,-0.546878,-0.531355,-0.658993,0.480832,-0.339031,0.406634,-0.670564,0.22176,0.171875,...,0.306006,-0.23292,-0.15812,-0.134483,-0.123283,0.014353,-0.250805,0.054812,-0.249806,0.256638


## Fast Text Embedding

In [35]:
model_FT = gensim.models.FastText(size=max_embedding, window=3, 
                                  min_count=5, workers=4, sg=1, seed=1, iter=50)

In [36]:
model_FT.build_vocab(corpus[:3000])
model_FT.train(corpus[:3000], total_examples=model.corpus_count, epochs=model.iter)

  


In [37]:
model_FT.wv.most_similar("dessert", topn=10)

[('desert', 0.6242941617965698),
 ('chocol', 0.5313356518745422),
 ('mango', 0.5165298581123352),
 ('appet', 0.49837106466293335),
 ('sorbet', 0.48578161001205444),
 ('appetis', 0.46856147050857544),
 ('appetit', 0.4657207429409027),
 ('appl', 0.458345890045166),
 ('fondant', 0.44326961040496826),
 ('pud', 0.436087965965271)]

In [38]:
model_FT.wv['use'].shape

(100,)

In [39]:
embedding_matrix_FT = dict()

for word in model_FT.wv.vocab.keys():
    embedding_matrix_FT[word] = list(model_FT.wv[word])
    
embedding_matrix_FT = pd.DataFrame(embedding_matrix_FT)

embedding_matrix_FT.head()

Unnamed: 0,use,restaur,year,chef,excel,alway,good,portion,food,fresh,...,martin,tunisian,cultur,moniz,portugues,masti,skewd,beehiv,gezi,kulcha
0,-0.44192,0.198596,0.544554,0.627261,0.194475,0.00895,0.201402,-0.031286,0.041259,0.038559,...,0.349276,0.28822,0.265725,0.157498,-0.132034,0.122696,0.509829,0.045211,0.366371,0.040678
1,-0.282004,-0.37162,-0.625822,-0.0241,-0.125392,0.169849,-0.199216,-0.472755,-0.359557,-0.245306,...,-0.346349,-0.360504,-0.362741,-0.469855,-0.53589,-0.681524,-0.501094,-0.260336,-0.556496,-0.256627
2,0.121368,0.004188,0.171993,-0.036525,0.141909,0.207596,0.166569,0.0517,0.076973,-0.194531,...,-0.114003,0.022822,0.094328,0.416586,0.02123,0.65988,0.165043,-0.161475,0.10574,-0.062406
3,-0.24456,-0.020557,0.08102,0.188281,-0.037971,-0.01292,-0.002355,-0.260551,-0.122618,-0.501384,...,0.554781,-0.088594,-0.025938,0.091139,0.157923,-0.067108,0.552165,-0.095319,-0.197058,0.005904
4,0.333622,-0.23836,0.500754,0.225712,-0.251495,-0.13204,-0.311705,0.114618,-0.034449,-0.158335,...,-0.06177,-0.427533,-0.001008,0.655511,-0.262465,0.058301,-0.436308,-0.025215,-0.217383,-0.002148


# Classification

## Split train/test data

We start by splitting the data into a train set and a validation set, in order to train and validate our models.

In [14]:
n = len(df)
df.sample(n=n, random_state=16) #shuffle
n = int(2 * n / 3)
df_dataset_train = df[:n]
df_dataset_test = df[n:]
print("Split train/test: ", df_dataset_train.shape, "VS", df_dataset_test.shape)

corpus_TFmatrix_train = get_TF_matrix(df_dataset_train.clean_description)
corpus_TFmatrix_test = get_TF_matrix(df_dataset_test.clean_description)

Split train/test:  (2435, 7) VS (1218, 7)


## Cosine Distance Classifier

### Cosine Distance on LSI embeddings

In [30]:
def distance_classifier_cosine_traning(lsi_vector_trainDB):
    """
    Input  : LSI vectors
    Purpose: calculate cosine similarity matrix for all training document LSI vectors
    Output : return similarity matrix
    """
    return similarities.MatrixSimilarity(lsi_vector_trainDB)

def distance_classifier_cosine_test(classification_model, training_data, test_doc_lsi_vector, N=1):
    """
    Input  : trained classifier model, the training data (list of descriptions),
             lsi vectors of a document and N nearest document in the training data base
    Purpose: calculate cosine similarity matrix against all training document LSI vectors
    Output : return nearest N document and classes
    """
    cosine_similarities = classification_model[test_doc_lsi_vector]
    most_similar_document_test = training_data[np.argmax(cosine_similarities)]
    return most_similar_document_test

def reco_rate(ref_labels, predicted_labels):
    commun_labels = (pd.np.array(ref_labels)==pd.np.array(predicted_labels)).sum()
    return 100 * commun_labels / len(ref_labels)

In [31]:
# train classification model

classification_model = distance_classifier_cosine_traning(lsi_model[corpus_TFmatrix_train])
classification_model

<gensim.similarities.docsim.MatrixSimilarity at 0x1a2f8d1e80>

In [32]:
# test on train DB

predicted_class = [distance_classifier_cosine_test(classification_model, 
                                df_dataset_train.Rating, 
                                get_lsi_vector(lsi_model, df_dataset_train.clean_description.iloc[i]))
                                for i in range(df_dataset_train.shape[0])]

print(color.BOLD + "Classifier performances on train DB: %.2f" 
      % reco_rate(df_dataset_train.Rating, predicted_class), "%" + color.END)

[1mClassifier performances on train DB: 100.00 %[0m


In [36]:
# test on test DB

predicted_class_test = [distance_classifier_cosine_test(classification_model, 
                                 df_dataset_train.Rating, 
                                 get_lsi_vector(lsi_model, 
                                               df_dataset_test.clean_description.iloc[i]
                                              ))
                   for i in range(df_dataset_test.shape[0])]

print(color.BOLD + "Classifier performances on test DB: %.2f" 
      % (reco_rate(df_dataset_test.Rating, predicted_class_test)), "%" + color.END)

[1mClassifier performances on test DB: 68.56 %[0m


## CNN Classifier

### CNN on LSI embeddings

In [15]:
from keras.models import Sequential
from keras.layers import Dense,GRU,LSTM
from keras.optimizers import Adam
from keras.layers import Dropout,Bidirectional,Conv1D,Conv2D,GlobalMaxPooling1D,GlobalMaxPooling2D,MaxPooling1D,Flatten,BatchNormalization,Embedding
from keras.callbacks import EarlyStopping
from keras.utils import to_categorical

Using TensorFlow backend.


In [128]:
# For the CNN:
# `x_train` and `x_val` = embedded reviews (reformated for CNN in the following lines of code)
# `y_train` and `y_val` = ratings (one-hot encoded)

x_train = [get_lsi_vector(lsi_model, df_dataset_train.clean_description.iloc[i]) for i in range(df_dataset_train.shape[0])]
# previous line is equiv to: x_train = np.asarray(lsi_model[corpus_TFmatrix_train])
x_train = np.asarray([[x_train[j][i][1] for i in range(len(x_train[j]))] for j in range(len(x_train))])
x_train = x_train.reshape((x_train.shape[0], 1, x_train.shape[1])) # because CNN has to be fed a 3D object
print(x_train.shape)

x_val = [get_lsi_vector(lsi_model, df_dataset_test.clean_description.iloc[i]) for i in range(df_dataset_test.shape[0])]
x_val = np.asarray([[x_val[j][i][1] for i in range(len(x_val[j]))] for j in range(len(x_val))])
x_val = x_val.reshape((x_val.shape[0], 1, x_val.shape[1]))
print(x_val.shape)

y_train = np.asarray(df_dataset_train.Rating)
y_train = to_categorical(y_train)
print(y_train.shape)

y_val = np.asarray(df_dataset_test.Rating)
y_val = to_categorical(y_val)
print(y_val.shape)

num_classes = y_train.shape[1]

(2435, 1, 175)
(1218, 1, 175)
(2435, 6)
(1218, 6)


In [125]:
model_CNN = Sequential()

model_CNN.add(Conv1D(64,input_shape=(1, 175),kernel_size=3,padding='same',activation='relu',strides=1))
model_CNN.add(GlobalMaxPooling1D())
model_CNN.add(Dense(128,activation='relu'))
model_CNN.add(Dropout(0.2))
model_CNN.add(BatchNormalization())
model_CNN.add(Dense(num_classes,activation='sigmoid'))

model_CNN.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
model_CNN.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d_9 (Conv1D)            (None, 1, 64)             33664     
_________________________________________________________________
global_max_pooling1d_9 (Glob (None, 64)                0         
_________________________________________________________________
dense_15 (Dense)             (None, 128)               8320      
_________________________________________________________________
dropout_9 (Dropout)          (None, 128)               0         
_________________________________________________________________
batch_normalization_8 (Batch (None, 128)               512       
_________________________________________________________________
dense_16 (Dense)             (None, 6)                 774       
Total params: 43,270
Trainable params: 43,014
Non-trainable params: 256
_________________________________________________________________


In [126]:
early_stop = EarlyStopping(monitor = "val_loss", mode = "min", patience = 3)

history_CNN = model_CNN.fit(x_train, y_train, 
                            validation_data=(x_val, y_val),
                            epochs=4,
                            batch_size=32, 
                            verbose=1,
                            callbacks = [early_stop])

Train on 2435 samples, validate on 1218 samples
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


Accuracy is much higher than with a simpler model (cosine distance).

### CNN on Word2Vec embeddings

In [15]:
embedding_matrix.head()

Unnamed: 0,use,restaur,year,chef,excel,alway,good,portion,food,fresh,...,martin,tunisian,cultur,moniz,portugues,masti,skewd,beehiv,gezi,kulcha
0,-0.429325,-0.240608,0.424857,-0.125756,0.038207,-0.202852,-0.076771,0.409456,0.211813,-0.145445,...,0.076039,-0.075377,-0.119985,0.016004,0.012585,0.014679,-0.009203,0.191222,-0.227424,-0.038551
1,0.001536,-0.160717,-0.022637,0.47118,0.155363,0.592102,0.332712,-1.040051,0.072936,0.311472,...,-0.115206,-0.301491,0.076295,0.021358,-0.210045,-0.058071,0.069347,-0.024982,0.080424,-0.143771
2,0.551598,0.228602,0.469899,0.354883,0.200154,0.138538,0.86581,0.127472,0.146347,0.060945,...,-0.251848,0.167571,-0.132739,-0.129528,0.014269,-0.098752,0.16086,-0.161444,0.023455,-0.242818
3,-0.010923,0.254806,0.219923,0.247228,0.07753,0.105497,0.134444,-0.844041,0.06645,0.025498,...,0.143763,-0.03412,-0.125462,0.099024,-0.016786,0.007096,0.120283,-0.044866,0.266713,0.017892
4,0.247785,-0.121593,0.001933,-0.499855,0.236961,-0.05299,0.256099,0.300218,0.127449,0.614651,...,0.050362,-0.044157,-0.154888,-0.001106,0.038348,0.029304,-0.05464,-0.072185,-0.047177,0.008093


We first embed the clean review descriptions from the train set to build `x_train`:

In [16]:
max_words = 20
max_embedding = 100 #see 'Word2Vec Embedding' section

In [17]:
# keeping only the 20 first words of each clean description (i.e. each review)
x_train = [df_dataset_train.clean_description.iloc[i][:max_words] for i in range(df_dataset_train.shape[0])]

# embedding each word in each review, and padding with 0 vectors if review is shorter than 20 words
x_train_w2v = []
for i in range(len(x_train)):
    x = np.array([np.array(embedding_matrix[word]) for word in x_train[i] if word in embedding_matrix.columns])
    while x.shape[0] < max_words:
        x = np.append(x, [np.zeros(100)], axis=0)
    x_train_w2v.append(x)

# converting as array and checking shape
x_train = np.array(x_train_w2v)
x_train.shape

(2435, 20, 100)

We do the same with the test set to build `x_val`:

In [18]:
# keeping only the 20 first words of each clean description (i.e. each review)
x_val = [df_dataset_test.clean_description.iloc[i][:max_words] for i in range(df_dataset_test.shape[0])]

# embedding each word in each review, and padding with 0 vectors if review is shorter than 20 words
x_val_w2v = []
for i in range(len(x_val)):
    x = np.array([np.array(embedding_matrix[word]) for word in x_val[i] if word in embedding_matrix.columns])
    while x.shape[0] < max_words:
        x = np.reshape(x, (x.shape[0],max_embedding))
        x = np.concatenate((x, [np.zeros(max_embedding)]), axis=0)
    x_val_w2v.append(x)

# converting as array and checking shape
x_val = np.array(x_val_w2v)
x_val.shape

(1218, 20, 100)

For `y_train` and `y_val`, we keep the same as in the previous CNN (one-hot encoded ratings):

In [19]:
y_train = np.asarray(df_dataset_train.Rating)
y_train = to_categorical(y_train)
print(y_train.shape)

y_val = np.asarray(df_dataset_test.Rating)
y_val = to_categorical(y_val)
print(y_val.shape)

(2435, 6)
(1218, 6)


Let's now train and test the CNN classifier:

In [36]:
model_CNN = Sequential()

model_CNN.add(Conv1D(128,input_shape=(max_words, max_embedding),kernel_size=3,padding='same',activation='relu',strides=1))
model_CNN.add(MaxPooling1D())
model_CNN.add(Conv1D(64,kernel_size=3,padding='same',activation='relu',strides=1))
model_CNN.add(GlobalMaxPooling1D())
model_CNN.add(Dense(128,activation='relu'))
model_CNN.add(Dropout(0.2))
model_CNN.add(BatchNormalization())
model_CNN.add(Dense(num_classes,activation='sigmoid'))

model_CNN.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
model_CNN.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d_9 (Conv1D)            (None, 20, 128)           38528     
_________________________________________________________________
max_pooling1d_2 (MaxPooling1 (None, 10, 128)           0         
_________________________________________________________________
conv1d_10 (Conv1D)           (None, 10, 64)            24640     
_________________________________________________________________
global_max_pooling1d_6 (Glob (None, 64)                0         
_________________________________________________________________
dense_7 (Dense)              (None, 128)               8320      
_________________________________________________________________
dropout_4 (Dropout)          (None, 128)               0         
_________________________________________________________________
batch_normalization_4 (Batch (None, 128)               512       
__________

In [37]:
early_stop = EarlyStopping(monitor = "val_loss", mode = "min", patience = 3)

history_CNN = model_CNN.fit(x_train, y_train, 
                            validation_data=(x_val, y_val),
                            epochs=4,
                            batch_size=32, 
                            verbose=1,
                            callbacks = [early_stop])

Train on 2435 samples, validate on 1218 samples
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


Accuracy is more or less the same as with LSI embeddings. For this last network, adding a 2nd convolutional layer improves performance a little, but it remains just a little lower as the previous CNN based on LSI embeddings. 

To be more precise about what model works best to predict ratings, we should test our models on a larger set of data (the full scrapped data) and take more time to tune hyperparameters (model architecture, dropout rate, batch normalization, pooling method, kernel size, padding and stride, embedding length, number of words per review, etc. - the list is long...!).

## LSTM classifier

We finally tried LSTM networks, expecting that the LSTM cells would work best at analyzing text thanks to their "memory" property.

### LSTM with Word2Vec embeddings

In [26]:
max_features = 1615 # nb of words in the vocabulary

model_LSTM = Sequential()
model_LSTM.add(LSTM(64, input_shape=(max_words, max_embedding), dropout=0.3, return_sequences=True))  
model_LSTM.add(LSTM(32, dropout=0.3, return_sequences=False))
model_LSTM.add(Dense(num_classes, activation='softmax'))

model_LSTM.compile(loss='binary_crossentropy',
                    optimizer=Adam(lr=0.001),
                    metrics=['accuracy'])
model_LSTM.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_7 (LSTM)                (None, 20, 64)            42240     
_________________________________________________________________
lstm_8 (LSTM)                (None, 32)                12416     
_________________________________________________________________
dense_3 (Dense)              (None, 6)                 198       
Total params: 54,854
Trainable params: 54,854
Non-trainable params: 0
_________________________________________________________________


In [27]:
history_LSTM = model_LSTM.fit(x_train, y_train,
                              validation_data=(x_val, y_val),
                              epochs=4,
                              batch_size=64,
                              verbose=1)

Train on 2435 samples, validate on 1218 samples
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


The LSTM network gets a little higher accuracy on the validation set than previous DL models - even though training time is a little longer (more parameters). We think that with model fine-tuning and training on a larger dataset, LSTM could be the preferable and most performant model choice.

### LSTM with FastText embeddings

As a last try, we implemented a LSTM based on FastText embeddings:

In [40]:
# Pre-processing

x_train = [df_dataset_train.clean_description.iloc[i][:max_words] for i in range(df_dataset_train.shape[0])]

x_train_FT = []
for i in range(len(x_train)):
    x = np.array([np.array(embedding_matrix_FT[word]) for word in x_train[i] if word in embedding_matrix_FT.columns])
    while x.shape[0] < max_words:
        x = np.append(x, [np.zeros(100)], axis=0)
    x_train_FT.append(x)

x_train = np.array(x_train_FT)
x_train.shape

(2435, 20, 100)

In [41]:
x_val = [df_dataset_test.clean_description.iloc[i][:max_words] for i in range(df_dataset_test.shape[0])]

x_val_FT = []
for i in range(len(x_val)):
    x = np.array([np.array(embedding_matrix_FT[word]) for word in x_val[i] if word in embedding_matrix_FT.columns])
    while x.shape[0] < max_words:
        x = np.reshape(x, (x.shape[0],max_embedding))
        x = np.concatenate((x, [np.zeros(max_embedding)]), axis=0)
    x_val_FT.append(x)

x_val = np.array(x_val_FT)
x_val.shape

(1218, 20, 100)

In [42]:
max_features = 1615 # nb of words in the vocabulary

model_LSTM = Sequential()
model_LSTM.add(LSTM(64, input_shape=(max_words, max_embedding), dropout=0.3, return_sequences=True))  
model_LSTM.add(LSTM(32, dropout=0.3, return_sequences=False))
model_LSTM.add(Dense(num_classes, activation='softmax'))

model_LSTM.compile(loss='binary_crossentropy',
                    optimizer=Adam(lr=0.001),
                    metrics=['accuracy'])
model_LSTM.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_9 (LSTM)                (None, 20, 64)            42240     
_________________________________________________________________
lstm_10 (LSTM)               (None, 32)                12416     
_________________________________________________________________
dense_4 (Dense)              (None, 6)                 198       
Total params: 54,854
Trainable params: 54,854
Non-trainable params: 0
_________________________________________________________________


In [43]:
history_LSTM = model_LSTM.fit(x_train, y_train,
                              validation_data=(x_val, y_val),
                              epochs=4,
                              batch_size=64,
                              verbose=1)

Train on 2435 samples, validate on 1218 samples
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


Accuracy is very similar to LSTM with Word2Vec, which remains our preferable choice. We think that accuracy for the 2 last models (LSTM with Word2Vec, LSTM with FastText) could be significantly improved by:

* training the network on a larger dataset (full scrapped data)
* fine-tuning network hyper-parameters (model architecture, dropout rate, batch normalization, etc.)
* adjusting the preprocessing part (number of words per review)
* training the FastText embedding model on a larger dataset
* fine-tuning the hyperparameters of the FastText model (embedding size, nb of epochs, etc.)