In [1]:
import pandas as pd
import numpy as np
import re
from re import sub
import multiprocessing
from unidecode import unidecode

from gensim.models.phrases import Phrases, Phraser
from gensim.models import Word2Vec
from gensim.test.utils import get_tmpfile
from gensim.models import KeyedVectors


from time import time 
from collections import defaultdict

import logging  # Setting up the loggings to monitor gensim
logging.basicConfig(format="%(levelname)s - %(asctime)s: %(message)s", datefmt= '%H:%M:%S', level=logging.INFO)

In [2]:
#pip install unidecode

In [8]:
file = pd.read_csv("amazon_alexa.tsv",sep='\t')
file.head()
print('shape',file.shape)

shape (3150, 5)


In [5]:
file_cleaned = file.dropna().drop_duplicates().reset_index(drop=True).rename(columns={'verified_reviews':'title'})

In [10]:
print(file_cleaned.head())
print('shape',file_cleaned.shape)

   rating       date         variation  \
0       5  31-Jul-18  Charcoal Fabric    
1       5  31-Jul-18  Charcoal Fabric    
2       4  31-Jul-18    Walnut Finish    
3       5  31-Jul-18  Charcoal Fabric    
4       5  31-Jul-18  Charcoal Fabric    

                                               title  feedback  
0                                      Love my Echo!         1  
1                                          Loved it!         1  
2  Sometimes while playing a game, you can answer...         1  
3  I have had a lot of fun with this thing. My 4 ...         1  
4                                              Music         1  
shape (2435, 5)


In [11]:
file_cleaned.rating.value_counts()/len(file_cleaned)

INFO - 12:05:46: NumExpr defaulting to 4 threads.


5    0.721150
4    0.141684
1    0.057906
3    0.046407
2    0.032854
Name: rating, dtype: float64

In [12]:
file_cleaned[file_cleaned.rating==0]

Unnamed: 0,rating,date,variation,title,feedback


In [13]:
file_cleaned = file_cleaned[file_cleaned.rating!=0]

In [15]:
def text_to_word_list(text, remove_polish_letters):
    ''' Pre process and convert texts to a list of words 
    method inspired by method from eliorc github repo: https://github.com/eliorc/Medium/blob/master/MaLSTM.ipynb'''
    text = remove_polish_letters(text)
    text = str(text)
    text = text.lower()

    # Clean the text
    text = sub(r"[^A-Za-z0-9^,!?.\/'+]", " ", text)
    text = sub(r"\+", " plus ", text)
    text = sub(r",", " ", text)
    text = sub(r"\.", " ", text)
    text = sub(r"!", " ! ", text)
    text = sub(r"\?", " ? ", text)
    text = sub(r"'", " ", text)
    text = sub(r":", " : ", text)
    text = sub(r"\s{2,}", " ", text)
    
    text = text.split()

    return text

In [16]:
file_cleaned.title = file_cleaned.title.apply(lambda x: text_to_word_list(x, unidecode))

In [17]:
file_model = file_cleaned.copy()
file_model = file_model[file_model.title.str.len()>1]

In [19]:
file_model.head()

Unnamed: 0,rating,date,variation,title,feedback
0,5,31-Jul-18,Charcoal Fabric,"[love, my, echo, !]",1
1,5,31-Jul-18,Charcoal Fabric,"[loved, it, !]",1
2,4,31-Jul-18,Walnut Finish,"[sometimes, while, playing, a, game, you, can,...",1
3,5,31-Jul-18,Charcoal Fabric,"[i, have, had, a, lot, of, fun, with, this, th...",1
5,5,31-Jul-18,Heather Gray Fabric,"[i, received, the, echo, as, a, gift, i, neede...",1


In [20]:
sent = [row for row in file_model.title]
phrases = Phrases(sent, min_count=1, progress_per=50000)
bigram = Phraser(phrases)
sentences = bigram[sent]
sentences[1]
sent[1]

INFO - 12:22:24: collecting all words and their counts
INFO - 12:22:24: PROGRESS: at sentence #0, processed 0 words and 0 word types
INFO - 12:22:24: collected 31976 word types from a corpus of 66936 words (unigram + bigrams) and 2337 sentences
INFO - 12:22:24: using 31976 counts as vocab in Phrases<0 vocab, min_count=1, threshold=10.0, max_vocab_size=40000000>
INFO - 12:22:24: source_vocab length 31976
INFO - 12:22:24: Phraser built with 1562 phrasegrams


['loved', 'it', '!']

In [21]:
sentences[1]

['loved', 'it', '!']

In [22]:
w2v_model = Word2Vec(min_count=3,
                     window=4,
                     size=300,
                     sample=1e-5, 
                     alpha=0.03, 
                     min_alpha=0.0007, 
                     negative=20,
                     workers=multiprocessing.cpu_count()-1)

start = time()

w2v_model.build_vocab(sentences, progress_per=50000)

print('Time to build vocab: {} mins'.format(round((time() - start) / 60, 2)))

INFO - 12:22:36: collecting all words and their counts
INFO - 12:22:36: PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO - 12:22:36: collected 5389 word types from a corpus of 58484 raw words and 2337 sentences
INFO - 12:22:36: Loading a fresh vocabulary
INFO - 12:22:36: effective_min_count=3 retains 2100 unique words (38% of original 5389, drops 3289)
INFO - 12:22:36: effective_min_count=3 leaves 54026 word corpus (92% of original 58484, drops 4458)
INFO - 12:22:36: deleting the raw counts dictionary of 5389 items
INFO - 12:22:36: sample=1e-05 downsamples 2100 most-common words
INFO - 12:22:36: downsampling leaves estimated 6516 word corpus (12.1% of prior 54026)
INFO - 12:22:36: estimated required memory for 2100 words and 300 dimensions: 6090000 bytes
INFO - 12:22:36: resetting layer weights


Time to build vocab: 0.02 mins


In [23]:
start = time()

w2v_model.train(sentences, total_examples=w2v_model.corpus_count, epochs=30, report_delay=1)

print('Time to train the model: {} mins'.format(round((time() - start) / 60, 2)))

w2v_model.init_sims(replace=True)

INFO - 12:23:43: training model with 3 workers on 2100 vocabulary and 300 features, using sg=0 hs=0 sample=1e-05 negative=20 window=4
INFO - 12:23:43: worker thread finished; awaiting finish of 2 more threads
INFO - 12:23:43: worker thread finished; awaiting finish of 1 more threads
INFO - 12:23:43: worker thread finished; awaiting finish of 0 more threads
INFO - 12:23:43: EPOCH - 1 : training on 58484 raw words (6438 effective words) took 0.2s, 31174 effective words/s
INFO - 12:23:44: worker thread finished; awaiting finish of 2 more threads
INFO - 12:23:44: worker thread finished; awaiting finish of 1 more threads
INFO - 12:23:44: worker thread finished; awaiting finish of 0 more threads
INFO - 12:23:44: EPOCH - 2 : training on 58484 raw words (6573 effective words) took 0.3s, 23583 effective words/s
INFO - 12:23:44: worker thread finished; awaiting finish of 2 more threads
INFO - 12:23:44: worker thread finished; awaiting finish of 1 more threads
INFO - 12:23:44: worker thread finis

INFO - 12:23:49: EPOCH - 24 : training on 58484 raw words (6540 effective words) took 0.2s, 27129 effective words/s
INFO - 12:23:49: worker thread finished; awaiting finish of 2 more threads
INFO - 12:23:49: worker thread finished; awaiting finish of 1 more threads
INFO - 12:23:49: worker thread finished; awaiting finish of 0 more threads
INFO - 12:23:49: EPOCH - 25 : training on 58484 raw words (6524 effective words) took 0.3s, 25094 effective words/s
INFO - 12:23:50: worker thread finished; awaiting finish of 2 more threads
INFO - 12:23:50: worker thread finished; awaiting finish of 1 more threads
INFO - 12:23:50: worker thread finished; awaiting finish of 0 more threads
INFO - 12:23:50: EPOCH - 26 : training on 58484 raw words (6491 effective words) took 0.2s, 27893 effective words/s
INFO - 12:23:50: worker thread finished; awaiting finish of 2 more threads
INFO - 12:23:50: worker thread finished; awaiting finish of 1 more threads
INFO - 12:23:50: worker thread finished; awaiting fi

Time to train the model: 0.13 mins


In [24]:
w2v_model.save("word2vec_amazon.model")

INFO - 12:26:14: saving Word2Vec object under word2vec_amazon.model, separately None
INFO - 12:26:14: not storing attribute vectors_norm
INFO - 12:26:14: not storing attribute cum_table
INFO - 12:26:14: saved word2vec_amazon.model


In [26]:
file_export = file_model.copy()
file_export['old_title'] = file_export.title
file_export.old_title = file_export.old_title.str.join(' ')
file_export.title = file_export.title.apply(lambda x: ' '.join(bigram[x]))
file_export.rating = file_export.rating.astype('int8')

In [29]:
file_export['old_title']

0                                          love my echo !
1                                              loved it !
2       sometimes while playing a game you can answer ...
3       i have had a lot of fun with this thing my 4 y...
5       i received the echo as a gift i needed another...
                              ...                        
2429    listening to music searching locations checkin...
2430    i do love these things i have them running my ...
2431    only complaint i have is that the sound qualit...
2433                           nice little unit no issues
2434    the echo dot was easy to set up and use it hel...
Name: old_title, Length: 2337, dtype: object

In [31]:

file_export[['title', 'rating']].to_csv('cleaned_dataset_amazon.csv', index=False)

In [32]:
import pandas as pd
import numpy as np
from gensim.models import Word2Vec
from sklearn.cluster import KMeans

In [36]:
word_vectors = Word2Vec.load("../preprocessing_and_embeddings/word2vec_amazon.model").wv

INFO - 12:39:57: loading Word2Vec object from ../preprocessing_and_embeddings/word2vec_amazon.model
INFO - 12:39:57: loading wv recursively from ../preprocessing_and_embeddings/word2vec_amazon.model.wv.* with mmap=None
INFO - 12:39:57: setting ignored attribute vectors_norm to None
INFO - 12:39:57: loading vocabulary recursively from ../preprocessing_and_embeddings/word2vec_amazon.model.vocabulary.* with mmap=None
INFO - 12:39:57: loading trainables recursively from ../preprocessing_and_embeddings/word2vec_amazon.model.trainables.* with mmap=None
INFO - 12:39:57: setting ignored attribute cum_table to None
INFO - 12:39:57: loaded ../preprocessing_and_embeddings/word2vec_amazon.model


In [37]:
model = KMeans(n_clusters=2, max_iter=1000, random_state=True, n_init=50).fit(X=word_vectors.vectors)

In [38]:
word_vectors.similar_by_vector(model.cluster_centers_[0], topn=10, restrict_vocab=None)

INFO - 12:40:22: precomputing L2-norms of word weight vectors


[('support', 0.9999841451644897),
 ('cleaning', 0.999981164932251),
 ('functionality', 0.9999806880950928),
 ('light', 0.9999804496765137),
 ('issues', 0.9999803900718689),
 ('supposed_to', 0.9999799728393555),
 ('z_wave', 0.9999797344207764),
 ('alexa_app', 0.9999796748161316),
 ('miss', 0.9999791383743286),
 ('my_favorite', 0.9999787211418152)]

In [39]:

positive_cluster_center = model.cluster_centers_[0]
negative_cluster_center = model.cluster_centers_[1]

In [42]:
len(word_vectors.vocab.keys())

2100

In [43]:

words = pd.DataFrame(word_vectors.vocab.keys())
words.columns = ['words']
words['vectors'] = words.words.apply(lambda x: word_vectors.wv[f'{x}'])
words['cluster'] = words.vectors.apply(lambda x: model.predict([np.array(x)]))
words.cluster = words.cluster.apply(lambda x: x[0])

  This is separate from the ipykernel package so we can avoid doing imports until


In [44]:
words

Unnamed: 0,words,vectors,cluster
0,love,"[-0.026340038, 0.07963559, -0.031905122, 0.008...",1
1,my,"[-0.027167274, 0.08063432, -0.03152201, 0.0081...",1
2,echo,"[-0.0275069, 0.08009132, -0.031630386, 0.00876...",1
3,!,"[-0.02724483, 0.080774404, -0.031526707, 0.008...",0
4,loved,"[-0.026332214, 0.080499455, -0.031330157, 0.00...",1
...,...,...,...
2095,tv_shows,"[-0.02728316, 0.08089661, -0.031357393, 0.0087...",0
2096,inexpensive,"[-0.02732649, 0.0801433, -0.030886805, 0.00870...",0
2097,regular_tv,"[-0.02627897, 0.079481065, -0.0322978, 0.00856...",1
2098,buffering,"[-0.025984358, 0.08114232, -0.032166343, 0.007...",1


In [71]:
len(words.iloc[2].vectors)

300

In [72]:
model.transform([words.iloc[0].vectors])

array([[0.0077113 , 0.00766221]])

In [45]:
words['cluster_value'] = [1 if i==0 else -1 for i in words.cluster]
words['closeness_score'] = words.apply(lambda x: 1/(model.transform([x.vectors]).min()), axis=1)
words['sentiment_coeff'] = words.closeness_score * words.cluster_value

In [46]:
words.head(10)

Unnamed: 0,words,vectors,cluster,cluster_value,closeness_score,sentiment_coeff
0,love,"[-0.026340038, 0.07963559, -0.031905122, 0.008...",1,-1,130.510623,-130.510623
1,my,"[-0.027167274, 0.08063432, -0.03152201, 0.0081...",1,-1,129.987863,-129.987863
2,echo,"[-0.0275069, 0.08009132, -0.031630386, 0.00876...",1,-1,131.840632,-131.840632
3,!,"[-0.02724483, 0.080774404, -0.031526707, 0.008...",0,1,129.175936,129.175936
4,loved,"[-0.026332214, 0.080499455, -0.031330157, 0.00...",1,-1,129.854841,-129.854841
5,it,"[-0.027279275, 0.07926728, -0.031287294, 0.008...",0,1,127.546892,127.546892
6,sometimes,"[-0.027463114, 0.080234565, -0.03178065, 0.008...",0,1,133.607922,133.607922
7,while_playing,"[-0.027587425, 0.07945566, -0.03169518, 0.0081...",0,1,112.188644,112.188644
8,a,"[-0.027170578, 0.08081399, -0.031429715, 0.007...",0,1,137.553236,137.553236
9,you_can,"[-0.027126111, 0.08069215, -0.030887974, 0.007...",1,-1,135.410788,-135.410788


In [47]:
words[['words', 'sentiment_coeff']].to_csv('sentiment_dictionary.csv', index=False)

In [48]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score
from IPython.display import display

In [49]:
final_file = pd.read_csv('cleaned_dataset_amazon.csv')

In [50]:
sentiment_map = pd.read_csv('sentiment_dictionary.csv')
sentiment_dict = dict(zip(sentiment_map.words.values, sentiment_map.sentiment_coeff.values))

In [51]:
file_weighting = final_file.copy()


In [61]:
file_weighting.rating

0       5
1       5
2       4
3       5
4       5
       ..
2332    5
2333    5
2334    5
2335    5
2336    5
Name: rating, Length: 2337, dtype: int64

In [52]:
tfidf = TfidfVectorizer(tokenizer=lambda y: y.split(), norm=None)
tfidf.fit(file_weighting.title)
features = pd.Series(tfidf.get_feature_names())
transformed = tfidf.transform(file_weighting.title)



In [53]:
def create_tfidf_dictionary(x, transformed_file, features):
    '''
    create dictionary for each input sentence x, where each word has assigned its tfidf score
    
    inspired  by function from this wonderful article: 
    https://medium.com/analytics-vidhya/automated-keyword-extraction-from-articles-using-nlp-bfd864f41b34
    
    x - row of dataframe, containing sentences, and their indexes,
    transformed_file - all sentences transformed with TfidfVectorizer
    features - names of all words in corpus used in TfidfVectorizer

    '''
    vector_coo = transformed_file[x.name].tocoo()
    vector_coo.col = features.iloc[vector_coo.col].values
    dict_from_coo = dict(zip(vector_coo.col, vector_coo.data))
    return dict_from_coo

def replace_tfidf_words(x, transformed_file, features):
    '''
    replacing each word with it's calculated tfidf dictionary with scores of each word
    x - row of dataframe, containing sentences, and their indexes,
    transformed_file - all sentences transformed with TfidfVectorizer
    features - names of all words in corpus used in TfidfVectorizer
    '''
    dictionary = create_tfidf_dictionary(x, transformed_file, features)   
    return list(map(lambda y:dictionary[f'{y}'], x.title.split()))

In [54]:
%%time
replaced_tfidf_scores = file_weighting.apply(lambda x: replace_tfidf_words(x, transformed, features), axis=1)#this step takes around 3-4 minutes minutes to calculate

Wall time: 1.68 s


In [55]:
def replace_sentiment_words(word, sentiment_dict):
    '''
    replacing each word with its associated sentiment score from sentiment dict
    '''
    try:
        out = sentiment_dict[word]
    except KeyError:
        out = 0
    return out

In [56]:
replaced_closeness_scores = file_weighting.title.apply(lambda x: list(map(lambda y: replace_sentiment_words(y, sentiment_dict), x.split())))

In [59]:
replacement_df = pd.DataFrame(data=[replaced_closeness_scores, replaced_tfidf_scores, file_weighting.title, file_weighting.rating]).T
replacement_df.columns = ['sentiment_coeff', 'tfidf_scores', 'sentence', 'sentiment']
replacement_df['sentiment_rate'] = replacement_df.apply(lambda x: np.array(x.loc['sentiment_coeff']) @ np.array(x.loc['tfidf_scores']), axis=1)
replacement_df['prediction'] = (replacement_df.sentiment_rate>0).astype('int8')
replacement_df['sentiment'] = [1 if i==1 else 0 for i in replacement_df.sentiment]

In [60]:

replacement_df

Unnamed: 0,sentiment_coeff,tfidf_scores,sentence,sentiment,sentiment_rate,prediction
0,"[-130.51062274205154, -129.9878628551397, -131...","[2.330562684574323, 2.4149297233108626, 3.3725...",love my echo !,0,-758.058879,0
1,"[-129.8548408762586, 127.54689240323215, 129.1...","[5.230690617415853, 1.659502291417221, 2.35845...",loved it !,0,-162.910349,0
2,"[133.60792231119075, 112.18864393473083, 137.5...","[5.38975531204554, 7.3707567809121235, 2.42199...",sometimes while_playing a game you_can answer ...,0,6983.826723,1
3,"[-129.95542160860472, 123.77409090397802, 132....","[3.5953052597960764, 2.707317686800056, 3.8592...",i have had a_lot of fun with this_thing my 4 y...,0,1151.176776,1
4,"[-129.95542160860472, -128.21009717746549, 116...","[3.5953052597960764, 5.923837797975797, 1.7431...",i received the echo as a_gift i needed another...,0,1406.115325,1
...,...,...,...,...,...,...
2332,"[-138.9777684256068, -133.5282246028226, 99.95...","[5.355853760369858, 3.259882916738812, 6.67760...",listening_to music searching locations checkin...,0,1164.165536,1
2333,"[-129.95542160860472, 137.0524957050663, -130....","[7.190610519592153, 7.748498438891286, 4.66112...",i do love these_things i have them running my ...,0,10300.795597,1
2334,"[-122.24625312487345, -129.95542160860472, 123...","[6.677609600352178, 8.98826314949019, 10.82927...",only_complaint i have is that the sound_qualit...,0,5284.852601,1
2335,"[-126.68754077843764, 140.11219733080557, 132....","[4.662706579809913, 5.42484663185681, 5.578997...",nice little unit no_issues,0,206.344459,1


In [73]:
predicted_classes = replacement_df.prediction
y_test = replacement_df.sentiment

conf_matrix = pd.DataFrame(confusion_matrix(replacement_df.sentiment, replacement_df.prediction))
print('Confusion Matrix')
display(conf_matrix)

test_scores = accuracy_score(y_test,predicted_classes), precision_score(y_test, predicted_classes), recall_score(y_test, predicted_classes), f1_score(y_test, predicted_classes)

print('\n \n Scores')
scores = pd.DataFrame(data=[test_scores])
scores.columns = ['accuracy', 'precision', 'recall', 'f1']
scores = scores.T
scores.columns = ['scores']
display(scores)

Confusion Matrix


Unnamed: 0,0,1
0,619,1589
1,17,112



 
 Scores


Unnamed: 0,scores
accuracy,0.312794
precision,0.065844
recall,0.868217
f1,0.122404
