In [1]:
#from gensim.models import Word2Vec 
import numpy as np
import re
import pandas as pd
import gensim.downloader as api
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt
import seaborn as sb
from sklearn.metrics import classification_report, confusion_matrix
# https://code.google.com/archive/p/word2vec/
# https://nlp.stanford.edu/projects/glove/

### GloVe (~100MB)

In [2]:
%%time
glove_model = api.load('glove-twitter-25')


CPU times: total: 22.7 s
Wall time: 23 s


In [3]:
df = pd.read_csv('yelp_labelled.txt', header=None, delimiter='\t', names=['txt', 'sentiment'])
df

Unnamed: 0,txt,sentiment
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1
...,...,...
995,I think food should have flavor and texture an...,0
996,Appetite instantly gone.,0
997,Overall I was not impressed and would not go b...,0
998,"The whole experience was underwhelming, and I ...",0


# Explore dataset, answer one or more of these questions:
### 1. How many words are in each entry?
### 2. Can you normalize words ending with n't (like hadn't, don't) to had not, do not etc?
### 3. What is the size of each class?
### 4. What is the longest and the shortest entry?
### 5. How many words are known to GloVe in each entry?
### 6. Does the dataset contains duplications? Empty values?
### 7. Assign a good/bad sentiment score to each entry based on the word vectors
### 8. How correlated is [7] with the 0/1 labels?

# Solutions

### 1. How many words are in each entry?

In [4]:
df['num_of_words'] = df['txt'].apply(lambda x : len(x.split(' ')))
df.sample(5)

Unnamed: 0,txt,sentiment,num_of_words
605,The food is good.,1,4
176,The service here leaves a lot to be desired.,0,9
738,I love the decor with the Chinese calligraphy ...,1,10
26,- They never brought a salad we asked for.,0,9
68,We got the food and apparently they have never...,0,20


### 2. Can you normalize word ending with n't (like hadn't, don't) to had not, do not etc?

In [5]:
# I used groups to avoid cases as xxx n't xxx and \b for cases s/a wasn'tttt
print(re.sub(r"(\w+)n't\b", r"\g<1> not", r"i n't wasn't here"))
df['normed_txt'] = df['txt'].apply(lambda x : re.sub(r"(\w+)n't\b", r"\g<1> not", x))

df[df.txt.str.contains("n't")].sample(5)

i n't was not here


Unnamed: 0,txt,sentiment,num_of_words,normed_txt
141,The waiter wasn't helpful or friendly and rare...,0,11,The waiter was not helpful or friendly and rar...
126,Bland... Not a liking this place for a number ...,0,26,Bland... Not a liking this place for a number ...
6,Honeslty it didn't taste THAT fresh.),0,6,Honeslty it did not taste THAT fresh.)
604,Couldn't ask for a more satisfying meal.,1,7,Could not ask for a more satisfying meal.
843,I won't be back.,0,4,I wo not be back.


### 3. What is the size of each class?

In [6]:
df.sentiment.value_counts()

1    500
0    500
Name: sentiment, dtype: int64

### 4. What is the longest and the shortest entry?

In [7]:
# assuming num of words
df.sort_values(by=['num_of_words']).head(1)


Unnamed: 0,txt,sentiment,num_of_words,normed_txt
165,DELICIOUS!!,1,1,DELICIOUS!!


In [8]:
df.sort_values(by=['num_of_words']).tail(1)

Unnamed: 0,txt,sentiment,num_of_words,normed_txt
623,a drive thru means you do not want to wait aro...,0,32,a drive thru means you do not want to wait aro...


### 5. How many words are known to GloVe in each entry?

In [9]:
df['known_to_glove'] = df['txt'].apply(lambda x : sum([1 for item in x.split(' ') if glove_model.__contains__(item)]))

In [10]:
df.sample(5)

Unnamed: 0,txt,sentiment,num_of_words,normed_txt,known_to_glove
934,The place was fairly clean but the food simply...,0,12,The place was fairly clean but the food simply...,10
518,"Anyway, I do not think i will go back there.",0,10,"Anyway, I do not think i will go back there.",8
904,-My order was not correct.,0,5,-My order was not correct.,3
777,So in a nutshell: 1) The restaraunt smells lik...,0,19,So in a nutshell: 1) The restaraunt smells lik...,14
718,"After one bite, I was hooked.",1,6,"After one bite, I was hooked.",2


### 6. Does the dataset contains duplications? Empty values?

In [11]:
df.duplicated(subset='txt', keep='first').sum()

4

In [12]:
df[df.duplicated(subset='txt', keep=False)]

Unnamed: 0,txt,sentiment,num_of_words,normed_txt,known_to_glove
334,I love this place.,1,4,I love this place.,2
380,I won't be back.,0,4,I wo not be back.,2
383,The food was terrible.,0,4,The food was terrible.,2
505,I would not recommend this place.,0,6,I would not recommend this place.,4
814,I love this place.,1,4,I love this place.,2
816,The food was terrible.,0,4,The food was terrible.,2
843,I won't be back.,0,4,I wo not be back.,2
846,I would not recommend this place.,0,6,I would not recommend this place.,4


In [13]:
# another way to find the duplicates...
df.txt.value_counts()

I would not recommend this place.                                                                                                         2
I love this place.                                                                                                                        2
I won't be back.                                                                                                                          2
The food was terrible.                                                                                                                    2
Wow... Loved this place.                                                                                                                  1
                                                                                                                                         ..
The burger... I got the "Gold Standard" a $17 burger and was kind of disappointed.                                                        1
OMG, the food was de

In [14]:
# empty texts
sum(df['txt'].apply(len) == 0)

0

### 7. Assign a good/bad sentiment score to each entry based on the word vectors

In [15]:
def sentence_distance_from_word(good_or_bad, sentence):
    
    list_of_word_vectors = [glove_model[item] for item in re.split('\W+', sentence.lower()) if glove_model.__contains__(item)]
    
    # find and return the closest vector
    best_vector_score = 0
    for vector in list_of_word_vectors:
        similarity = cosine_similarity(X=[vector], Y=[glove_model[good_or_bad]])[0][0]
        if similarity > best_vector_score:
            best_vector_score = similarity
    return best_vector_score



In [16]:
sentence_distance_from_word('good', 'be best')

0.9234228

In [17]:
%%time
df['distance_to_good'] = df['txt'].apply(lambda x : sentence_distance_from_word('good', x))
df['distance_to_bad'] = df['txt'].apply(lambda x : sentence_distance_from_word('bad', x))

CPU times: total: 3.44 s
Wall time: 3.44 s


In [18]:
df

Unnamed: 0,txt,sentiment,num_of_words,normed_txt,known_to_glove,distance_to_good,distance_to_bad
0,Wow... Loved this place.,1,4,Wow... Loved this place.,1,0.938756,0.922743
1,Crust is not good.,0,4,Crust is not good.,3,1.000000,0.929481
2,Not tasty and the texture was just nasty.,0,8,Not tasty and the texture was just nasty.,6,0.891364,0.929481
3,Stopped by during the late May bank holiday of...,1,15,Stopped by during the late May bank holiday of...,11,0.935348,0.936768
4,The selection on the menu was great and so wer...,1,12,The selection on the menu was great and so wer...,10,0.937852,0.911714
...,...,...,...,...,...,...,...
995,I think food should have flavor and texture an...,0,12,I think food should have flavor and texture an...,10,0.911711,0.934835
996,Appetite instantly gone.,0,3,Appetite instantly gone.,1,0.814328,0.867360
997,Overall I was not impressed and would not go b...,0,10,Overall I was not impressed and would not go b...,8,0.891364,0.929481
998,"The whole experience was underwhelming, and I ...",0,16,"The whole experience was underwhelming, and I ...",10,0.916969,0.934835


In [19]:
df['afeka_sentiment'] = (df['distance_to_good'] > df['distance_to_bad']).astype(int)

In [20]:
df

Unnamed: 0,txt,sentiment,num_of_words,normed_txt,known_to_glove,distance_to_good,distance_to_bad,afeka_sentiment
0,Wow... Loved this place.,1,4,Wow... Loved this place.,1,0.938756,0.922743,1
1,Crust is not good.,0,4,Crust is not good.,3,1.000000,0.929481,1
2,Not tasty and the texture was just nasty.,0,8,Not tasty and the texture was just nasty.,6,0.891364,0.929481,0
3,Stopped by during the late May bank holiday of...,1,15,Stopped by during the late May bank holiday of...,11,0.935348,0.936768,0
4,The selection on the menu was great and so wer...,1,12,The selection on the menu was great and so wer...,10,0.937852,0.911714,1
...,...,...,...,...,...,...,...,...
995,I think food should have flavor and texture an...,0,12,I think food should have flavor and texture an...,10,0.911711,0.934835,0
996,Appetite instantly gone.,0,3,Appetite instantly gone.,1,0.814328,0.867360,0
997,Overall I was not impressed and would not go b...,0,10,Overall I was not impressed and would not go b...,8,0.891364,0.929481,0
998,"The whole experience was underwhelming, and I ...",0,16,"The whole experience was underwhelming, and I ...",10,0.916969,0.934835,0


### 8. How correlated is [7] with the 0/1 labels?

In [21]:
print(classification_report(df['sentiment'], df['afeka_sentiment']))

print(confusion_matrix(df['sentiment'], df['afeka_sentiment']))

              precision    recall  f1-score   support

           0       0.66      0.56      0.61       500
           1       0.62      0.72      0.66       500

    accuracy                           0.64      1000
   macro avg       0.64      0.64      0.63      1000
weighted avg       0.64      0.64      0.63      1000

[[279 221]
 [142 358]]
