In [1]:
#from gensim.models import Word2Vec 
import numpy as np
import re
import pandas as pd
import gensim.downloader as api
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt
import seaborn as sb
from sklearn.metrics import classification_report, confusion_matrix
# https://code.google.com/archive/p/word2vec/
# https://nlp.stanford.edu/projects/glove/

### GloVe (~100MB)

In [2]:
%%time
glove_model = api.load('glove-twitter-25')


CPU times: total: 21.5 s
Wall time: 21.8 s


In [3]:
df = pd.read_csv('yelp_labelled.txt', header=None, delimiter='\t', names=['txt', 'sentiment'])
df

Unnamed: 0,txt,sentiment
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1
...,...,...
995,I think food should have flavor and texture an...,0
996,Appetite instantly gone.,0
997,Overall I was not impressed and would not go b...,0
998,"The whole experience was underwhelming, and I ...",0


# Explore dataset, answer one or more of these questions:
### 1. How many words are in each entry?
### 2. Can you normalize words ending with n't (like hadn't, don't) to had not, do not etc?
### 3. What is the size of each class?
### 4. What is the longest and the shortest entry?
### 5. How many words are known to GloVe in each entry?
### 6. Does the dataset contains duplications? Empty values?
### 7. Assign a good/bad sentiment score to each entry based on the word vectors
### 8. How correlated is [7] with the 0/1 labels?

# Solutions

### 1. How many words are in each entry?

In [4]:
df['num_of_words'] = df['txt'].apply(lambda x : len(x.split(' ')))
df.sample(5)

Unnamed: 0,txt,sentiment,num_of_words
931,If you want to wait for mediocre food and down...,0,19
314,"Great food for the price, which is very high q...",1,13
922,"Every time I eat here, I see caring teamwork t...",1,13
841,The food came out at a good pace.,1,8
438,Wow very spicy but delicious.,1,5


### 2. Can you normalize words ending with n't (like hadn't, don't) to had not, do not etc?

In [5]:
# I used groups to avoid cases as xxx n't xxx and \b for cases as wasn'tttt

print(re.sub(r"(\w+)n't\b", r"\g<1> not", r"n't wasn't wasn'ttttt"))

df['normed_txt'] = df['txt'].apply(lambda x : re.sub(r"(\w+)n't\b", r"\g<1> not", x))

df[df.txt.str.contains("n't")].sample(5)

n't was not wasn'ttttt


Unnamed: 0,txt,sentiment,num_of_words,normed_txt
713,If you are reading this please don't go there.,0,9,If you are reading this please do not go there.
855,My fella got the huevos rancheros and they did...,0,12,My fella got the huevos rancheros and they did...
858,I probably won't be coming back here.,0,7,I probably wo not be coming back here.
731,Ryan's Bar is definitely one Edinburgh establi...,0,11,Ryan's Bar is definitely one Edinburgh establi...
920,"Needless to say, I won't be going back anytime...",0,10,"Needless to say, I wo not be going back anytim..."


### 3. What is the size of each class?

In [6]:
df.sentiment.value_counts()

1    500
0    500
Name: sentiment, dtype: int64

### 4. What is the longest and the shortest entry?

In [7]:
# assuming num of words
df.sort_values(by=['num_of_words']).head(1)


Unnamed: 0,txt,sentiment,num_of_words,normed_txt
165,DELICIOUS!!,1,1,DELICIOUS!!


In [8]:
df.sort_values(by=['num_of_words']).tail(1)

Unnamed: 0,txt,sentiment,num_of_words,normed_txt
623,a drive thru means you do not want to wait aro...,0,32,a drive thru means you do not want to wait aro...


### 5. How many words are known to GloVe in each entry?

In [9]:
df['known_to_glove'] = df['txt'].apply(lambda x : sum([1 for item in x.split(' ') if glove_model.__contains__(item)]))

In [10]:
df.sample(5)

Unnamed: 0,txt,sentiment,num_of_words,normed_txt,known_to_glove
365,"I find wasting food to be despicable, but this...",0,12,"I find wasting food to be despicable, but this...",8
60,At least think to refill my water before I str...,0,17,At least think to refill my water before I str...,13
219,- the food is rich so order accordingly.,1,8,- the food is rich so order accordingly.,7
153,Crostini that came with the salad was stale.,0,8,Crostini that came with the salad was stale.,6
663,Both great!,1,2,Both great!,0


### 6. Does the dataset contains duplications? Empty values?

In [11]:
df.duplicated(subset='txt', keep='first').sum()

4

In [12]:
df[df.duplicated(subset='txt', keep=False)]

Unnamed: 0,txt,sentiment,num_of_words,normed_txt,known_to_glove
334,I love this place.,1,4,I love this place.,2
380,I won't be back.,0,4,I wo not be back.,2
383,The food was terrible.,0,4,The food was terrible.,2
505,I would not recommend this place.,0,6,I would not recommend this place.,4
814,I love this place.,1,4,I love this place.,2
816,The food was terrible.,0,4,The food was terrible.,2
843,I won't be back.,0,4,I wo not be back.,2
846,I would not recommend this place.,0,6,I would not recommend this place.,4


In [13]:
# another way to find the duplicates...
df.txt.value_counts()

I would not recommend this place.                                                                                                         2
I love this place.                                                                                                                        2
I won't be back.                                                                                                                          2
The food was terrible.                                                                                                                    2
Wow... Loved this place.                                                                                                                  1
                                                                                                                                         ..
The burger... I got the "Gold Standard" a $17 burger and was kind of disappointed.                                                        1
OMG, the food was de

In [14]:
# empty texts
sum(df['txt'].apply(len) == 0)

0

### 7. Assign a good/bad sentiment score to each entry based on the word vectors

In [15]:
def sentence_distance_from_word(good_or_bad, sentence):
    # we define sentence disance as the highest similarity of any of the words in the sentecnce with the target word.
    
    # will tokenizing - remover non-letters and lowercase the word
    list_of_word_vectors = [glove_model[item] for item in re.split('\W+', sentence.lower()) if glove_model.__contains__(item)]
    
    if len(list_of_word_vectors) == 0:
        return 0
    
    # return the closest vector
    return max([cosine_similarity(X=[vector], Y=[glove_model[good_or_bad]])[0][0] for vector in list_of_word_vectors])


In [16]:
sentence_distance_from_word('good', 'be best')

0.9234228

In [17]:
%%time
df['distance_to_good'] = df['txt'].apply(lambda x : sentence_distance_from_word('good', x))

df['distance_to_bad'] = df['txt'].apply(lambda x : sentence_distance_from_word('bad', x))

CPU times: total: 3.19 s
Wall time: 3.2 s


In [18]:
df

Unnamed: 0,txt,sentiment,num_of_words,normed_txt,known_to_glove,distance_to_good,distance_to_bad
0,Wow... Loved this place.,1,4,Wow... Loved this place.,1,0.938756,0.922743
1,Crust is not good.,0,4,Crust is not good.,3,1.000000,0.929481
2,Not tasty and the texture was just nasty.,0,8,Not tasty and the texture was just nasty.,6,0.891364,0.929481
3,Stopped by during the late May bank holiday of...,1,15,Stopped by during the late May bank holiday of...,11,0.935348,0.936768
4,The selection on the menu was great and so wer...,1,12,The selection on the menu was great and so wer...,10,0.937852,0.911714
...,...,...,...,...,...,...,...
995,I think food should have flavor and texture an...,0,12,I think food should have flavor and texture an...,10,0.911711,0.934835
996,Appetite instantly gone.,0,3,Appetite instantly gone.,1,0.814328,0.867360
997,Overall I was not impressed and would not go b...,0,10,Overall I was not impressed and would not go b...,8,0.891364,0.929481
998,"The whole experience was underwhelming, and I ...",0,16,"The whole experience was underwhelming, and I ...",10,0.916969,0.934835


In [19]:
df['afeka_sentiment'] = (df['distance_to_good'] > df['distance_to_bad']).astype(int)

In [20]:
df

Unnamed: 0,txt,sentiment,num_of_words,normed_txt,known_to_glove,distance_to_good,distance_to_bad,afeka_sentiment
0,Wow... Loved this place.,1,4,Wow... Loved this place.,1,0.938756,0.922743,1
1,Crust is not good.,0,4,Crust is not good.,3,1.000000,0.929481,1
2,Not tasty and the texture was just nasty.,0,8,Not tasty and the texture was just nasty.,6,0.891364,0.929481,0
3,Stopped by during the late May bank holiday of...,1,15,Stopped by during the late May bank holiday of...,11,0.935348,0.936768,0
4,The selection on the menu was great and so wer...,1,12,The selection on the menu was great and so wer...,10,0.937852,0.911714,1
...,...,...,...,...,...,...,...,...
995,I think food should have flavor and texture an...,0,12,I think food should have flavor and texture an...,10,0.911711,0.934835,0
996,Appetite instantly gone.,0,3,Appetite instantly gone.,1,0.814328,0.867360,0
997,Overall I was not impressed and would not go b...,0,10,Overall I was not impressed and would not go b...,8,0.891364,0.929481,0
998,"The whole experience was underwhelming, and I ...",0,16,"The whole experience was underwhelming, and I ...",10,0.916969,0.934835,0


### 8. How correlated is [7] with the 0/1 labels?

In [21]:
print(classification_report(df['sentiment'], df['afeka_sentiment']))

print(confusion_matrix(df['sentiment'], df['afeka_sentiment']))

              precision    recall  f1-score   support

           0       0.66      0.56      0.61       500
           1       0.62      0.72      0.66       500

    accuracy                           0.64      1000
   macro avg       0.64      0.64      0.63      1000
weighted avg       0.64      0.64      0.63      1000

[[279 221]
 [142 358]]
