In [12]:
import spacy
import numpy as np

In [2]:
#%%capture
#!python -m spacy download en_core_web_lg

In [3]:
nlp = spacy.load("en_core_web_lg")

In [32]:
nlp(u"The quick brown fox jumped").vector.shape

(300,)

In [8]:
nlp(u"fox").vector.shape

(300,)

In [35]:
tokens1 = nlp(u"lion cat pet")
tokens2 = nlp(u"like love hate")

In [36]:
# Print the similarity of every ordered pair of tokens drawn from `tokens2`
# (each token is also compared with itself).
# NOTE(review): both loops iterate `tokens2`; `tokens1` is not used in this
# cell — confirm a tokens1-vs-tokens2 comparison was not intended.
for token1 in tokens2:
    for token2 in tokens2:
        print(token1.text,token2.text,token1.similarity(token2))

like like 1.0
like love 0.657904
like hate 0.65746516
love like 0.657904
love love 1.0
love hate 0.63930994
hate like 0.65746516
hate love 0.63930994
hate hate 1.0


In [39]:
nlp.vocab.vectors.shape,len(nlp.vocab.vectors)

((684831, 300), 684831)

In [41]:
tokens = nlp(u"dog car John")
# is_oov --> "out of vocabulary"

# For each token, print its text, whether it has a vector, the norm of that
# vector, and its out-of-vocabulary flag.
for token in tokens:
    print(token.text, token.has_vector, token.vector_norm, token.is_oov)

dog True 7.0336733 False
car True 7.149045 False
John True 6.533578 False


In [44]:
from scipy import spatial

def cosine_similarity(vec1, vec2):
    """Return the cosine similarity between two 1-D vectors.

    ``spatial.distance.cosine`` returns a cosine *distance*; subtracting it
    from 1 converts it to a similarity.

    Parameters
    ----------
    vec1, vec2 : array-like
        The two vectors to compare.

    Returns
    -------
    float
        ``1 - spatial.distance.cosine(vec1, vec2)``.
    """
    return 1 - spatial.distance.cosine(vec1, vec2)

In [45]:
king = nlp.vocab["king"].vector 
man = nlp.vocab["man"].vector
woman = nlp.vocab["woman"].vector

In [47]:
# king - man + woman ---> new_vector, expected to be close to "queen", "princess", "highness"

new_vector = king - man + woman
new_vector.shape

(300,)

In [66]:
# Score each vocabulary entry that has a vector and is lowercase and
# alphabetic against `new_vector`, collecting (lexeme, similarity) pairs.
computed_similarities = []

for word in nlp.vocab:
    if not (word.has_vector and word.is_lower and word.is_alpha):
        continue
    similarity = cosine_similarity(new_vector, word.vector)
    computed_similarities.append((word, similarity))

In [67]:
computed_similarities[:10]

[(<spacy.lexeme.Lexeme at 0x7fb0d871cb90>, 0.018078623339533806),
 (<spacy.lexeme.Lexeme at 0x7fb0d8654280>, 0.034739524126052856),
 (<spacy.lexeme.Lexeme at 0x7fb0d86baa50>, -0.06031614914536476),
 (<spacy.lexeme.Lexeme at 0x7fb0d956aaf0>, -0.0755949541926384),
 (<spacy.lexeme.Lexeme at 0x7fb0d86babe0>, 0.06920962780714035),
 (<spacy.lexeme.Lexeme at 0x7fb0d86baf50>, -0.05690332129597664),
 (<spacy.lexeme.Lexeme at 0x7fb0d86580f0>, -0.10847809165716171),
 (<spacy.lexeme.Lexeme at 0x7fb0d864b1e0>, 0.021246029064059258),
 (<spacy.lexeme.Lexeme at 0x7fb0d864be10>, 0.019021468237042427),
 (<spacy.lexeme.Lexeme at 0x7fb0d864b3c0>, 0.021773068234324455)]

In [68]:
computed_similarities = sorted(computed_similarities, key = lambda item:-item[1])
computed_similarities[:10]

[(<spacy.lexeme.Lexeme at 0x7fb0d67eab90>, 0.8024259805679321),
 (<spacy.lexeme.Lexeme at 0x7fb0d5b8b1e0>, 0.7880843877792358),
 (<spacy.lexeme.Lexeme at 0x7fb0d54cb190>, 0.6401076912879944),
 (<spacy.lexeme.Lexeme at 0x7fb0d5abd8c0>, 0.6208544373512268),
 (<spacy.lexeme.Lexeme at 0x7fb0d5c3f3c0>, 0.6125636100769043),
 (<spacy.lexeme.Lexeme at 0x7fb0d7191e60>, 0.5800970792770386),
 (<spacy.lexeme.Lexeme at 0x7fb0d4d36230>, 0.5787012577056885),
 (<spacy.lexeme.Lexeme at 0x7fb0d6cbbdc0>, 0.5743793845176697),
 (<spacy.lexeme.Lexeme at 0x7fb0d74ea280>, 0.563362181186676),
 (<spacy.lexeme.Lexeme at 0x7fb0d5009c30>, 0.5520980954170227)]

In [69]:
print([t[0].text for t in computed_similarities[:10]])

['king', 'queen', 'prince', 'kings', 'princess', 'royal', 'throne', 'queens', 'monarch', 'kingdom']


# Sentiment Analysis

In [81]:
import nltk
import spacy
import zipfile
import pandas as pd
import numpy as np

from google.colab import drive
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [80]:
# Extract the course archive into the current working directory.
# The `with` block closes the archive even if extraction raises.
with zipfile.ZipFile("/content/drive/MyDrive/NLP_Vol1/UPDATED_NLP_COURSE.zip","r") as zip_file:
    zip_file.extractall()

In [74]:
nltk.download("vader_lexicon")

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


True

In [75]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer

sid = SentimentIntensityAnalyzer()

In [76]:
a = "This is a good movie"

sid.polarity_scores(a)

{'compound': 0.4404, 'neg': 0.0, 'neu': 0.508, 'pos': 0.492}

In [77]:
a = "This was the best, most awesome movie EVER MADE!!!"

sid.polarity_scores(a)

{'compound': 0.8877, 'neg': 0.0, 'neu': 0.425, 'pos': 0.575}

In [78]:
a = 'This was the worst film to ever disgrace the screen.'
sid.polarity_scores(a)

{'compound': -0.8074, 'neg': 0.477, 'neu': 0.523, 'pos': 0.0}

In [83]:
df = pd.read_csv("/content/UPDATED_NLP_COURSE/TextFiles/amazonreviews.tsv",sep = "\t")
print(f"Shape : {df.shape}")
print()
df.head()

Shape : (10000, 2)



Unnamed: 0,label,review
0,pos,Stuning even for the non-gamer: This sound tra...
1,pos,The best soundtrack ever to anything.: I'm rea...
2,pos,Amazing!: This soundtrack is my favorite music...
3,pos,Excellent Soundtrack: I truly like this soundt...
4,pos,"Remember, Pull Your Jaw Off The Floor After He..."


In [84]:
df.isnull().sum()

label     0
review    0
dtype: int64

In [85]:
df["label"].value_counts()

neg    5097
pos    4903
Name: label, dtype: int64

In [87]:
from sklearn.model_selection import train_test_split
from sklearn import metrics 

In [90]:
# Collect the index labels of reviews that consist only of whitespace.
# The "review" column is read directly rather than unpacking every column of
# each row, so this cell still runs after more columns are added to `df`.
blanks = [
    idx
    for idx, rv in df["review"].items()
    if isinstance(rv, str) and rv.isspace()
]

blanks

[]

In [91]:
df.iloc[0]["review"]

'Stuning even for the non-gamer: This sound track was beautiful! It paints the senery in your mind so well I would recomend it even to people who hate vid. game music! I have played the game Chrono Cross but out of all of the games I have ever played it has the best music! It backs away from crude keyboarding and takes a fresher step with grate guitars and soulful orchestras. It would impress anyone who cares to listen! ^_^'

In [92]:
sid.polarity_scores(df.iloc[0]["review"])

{'compound': 0.9454, 'neg': 0.088, 'neu': 0.669, 'pos': 0.243}

In [93]:
# Store the score dictionary returned by `sid.polarity_scores` for every
# review in a new "score" column.
df["score"] = df["review"].apply(sid.polarity_scores)
df.head()

Unnamed: 0,label,review,score
0,pos,Stuning even for the non-gamer: This sound tra...,"{'neg': 0.088, 'neu': 0.669, 'pos': 0.243, 'co..."
1,pos,The best soundtrack ever to anything.: I'm rea...,"{'neg': 0.018, 'neu': 0.837, 'pos': 0.145, 'co..."
2,pos,Amazing!: This soundtrack is my favorite music...,"{'neg': 0.04, 'neu': 0.692, 'pos': 0.268, 'com..."
3,pos,Excellent Soundtrack: I truly like this soundt...,"{'neg': 0.09, 'neu': 0.615, 'pos': 0.295, 'com..."
4,pos,"Remember, Pull Your Jaw Off The Floor After He...","{'neg': 0.0, 'neu': 0.746, 'pos': 0.254, 'comp..."


In [94]:
df["compound"] = df["score"].apply(lambda d : d["compound"])
df.head()

Unnamed: 0,label,review,score,compound
0,pos,Stuning even for the non-gamer: This sound tra...,"{'neg': 0.088, 'neu': 0.669, 'pos': 0.243, 'co...",0.9454
1,pos,The best soundtrack ever to anything.: I'm rea...,"{'neg': 0.018, 'neu': 0.837, 'pos': 0.145, 'co...",0.8957
2,pos,Amazing!: This soundtrack is my favorite music...,"{'neg': 0.04, 'neu': 0.692, 'pos': 0.268, 'com...",0.9858
3,pos,Excellent Soundtrack: I truly like this soundt...,"{'neg': 0.09, 'neu': 0.615, 'pos': 0.295, 'com...",0.9814
4,pos,"Remember, Pull Your Jaw Off The Floor After He...","{'neg': 0.0, 'neu': 0.746, 'pos': 0.254, 'comp...",0.9781


In [95]:
# Turn the compound score into a label: values >= 0 (including exactly 0)
# become "pos", everything else becomes "neg".
df["comp_score"] = df["compound"].apply(lambda score : "pos" if score >= 0 else "neg")
df.head()

Unnamed: 0,label,review,score,compound,comp_score
0,pos,Stuning even for the non-gamer: This sound tra...,"{'neg': 0.088, 'neu': 0.669, 'pos': 0.243, 'co...",0.9454,pos
1,pos,The best soundtrack ever to anything.: I'm rea...,"{'neg': 0.018, 'neu': 0.837, 'pos': 0.145, 'co...",0.8957,pos
2,pos,Amazing!: This soundtrack is my favorite music...,"{'neg': 0.04, 'neu': 0.692, 'pos': 0.268, 'com...",0.9858,pos
3,pos,Excellent Soundtrack: I truly like this soundt...,"{'neg': 0.09, 'neu': 0.615, 'pos': 0.295, 'com...",0.9814,pos
4,pos,"Remember, Pull Your Jaw Off The Floor After He...","{'neg': 0.0, 'neu': 0.746, 'pos': 0.254, 'comp...",0.9781,pos


In [97]:
print("Accuracy : ", metrics.accuracy_score(df["label"],df["comp_score"]).round(2))

Accuracy :  0.71


In [98]:
print(metrics.confusion_matrix(df["label"],df["comp_score"]))

[[2623 2474]
 [ 435 4468]]


In [99]:
print(metrics.classification_report(df["label"],df["comp_score"]))

              precision    recall  f1-score   support

         neg       0.86      0.51      0.64      5097
         pos       0.64      0.91      0.75      4903

    accuracy                           0.71     10000
   macro avg       0.75      0.71      0.70     10000
weighted avg       0.75      0.71      0.70     10000



# Movie Review Project

In [103]:
df = pd.read_csv("/content/UPDATED_NLP_COURSE/TextFiles/moviereviews.tsv",sep = "\t")
df.isnull().sum()

label      0
review    35
dtype: int64

In [104]:
print(f"Shape : {df.shape}")
print()
df.head()

Shape : (2000, 2)



Unnamed: 0,label,review
0,neg,how do films like mouse hunt get into theatres...
1,neg,some talented actresses are blessed with a dem...
2,pos,this has been an extraordinary year for austra...
3,pos,according to hollywood movies made in last few...
4,neg,my first press screening of 1998 and already i...


In [105]:
df.dropna(inplace = True)
df.isnull().sum()

label     0
review    0
dtype: int64

In [106]:
# Find reviews that are whitespace-only and drop those rows.
# The "review" column is read directly rather than unpacking every column of
# each row, so this cell still runs after more columns are added to `df`.
blanks = [
    idx
    for idx, rv in df["review"].items()
    if isinstance(rv, str) and rv.isspace()
]

df.drop(blanks, inplace = True)

In [108]:
df["label"].value_counts()

neg    969
pos    969
Name: label, dtype: int64

In [109]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer

sid = SentimentIntensityAnalyzer()

In [110]:
# Score dictionary returned by `sid.polarity_scores` for each review.
df["scores"] = df["review"].apply(sid.polarity_scores)

# Pull out the "compound" entry of each score dictionary.
df["compound"] = df["scores"].apply(lambda score_dict : score_dict["compound"])

# Label by sign: compound >= 0 is "pos", otherwise "neg".
df["comp_score"] = df["compound"].apply(lambda value : "pos" if value >= 0 else "neg")

df.head()

Unnamed: 0,label,review,scores,compound,comp_score
0,neg,how do films like mouse hunt get into theatres...,"{'neg': 0.121, 'neu': 0.778, 'pos': 0.101, 'co...",-0.9125,neg
1,neg,some talented actresses are blessed with a dem...,"{'neg': 0.12, 'neu': 0.775, 'pos': 0.105, 'com...",-0.8618,neg
2,pos,this has been an extraordinary year for austra...,"{'neg': 0.067, 'neu': 0.783, 'pos': 0.15, 'com...",0.9953,pos
3,pos,according to hollywood movies made in last few...,"{'neg': 0.069, 'neu': 0.786, 'pos': 0.145, 'co...",0.9972,pos
4,neg,my first press screening of 1998 and already i...,"{'neg': 0.09, 'neu': 0.822, 'pos': 0.088, 'com...",-0.7264,neg


In [112]:
print("Accuracy : ", metrics.accuracy_score(df["label"],df["comp_score"]).round(2))
print()
print("Confusion Matrix")
print(metrics.confusion_matrix(df["label"],df["comp_score"]))
print()
print(metrics.classification_report(df["label"],df["comp_score"]))

Accuracy :  0.64

Confusion Matrix
[[427 542]
 [162 807]]

              precision    recall  f1-score   support

         neg       0.72      0.44      0.55       969
         pos       0.60      0.83      0.70       969

    accuracy                           0.64      1938
   macro avg       0.66      0.64      0.62      1938
weighted avg       0.66      0.64      0.62      1938



# Exercise

In [114]:
import spacy

nlp = spacy.load("en_core_web_lg")

In [115]:
word1 = nlp.vocab["wolf"].vector
word2 = nlp.vocab["dog"].vector
word3 = nlp.vocab["cat"].vector

In [117]:
from scipy import spatial

def consine_sim(x, y):
    """Return the cosine similarity between two 1-D vectors.

    ``spatial.distance.cosine`` returns a cosine *distance*; subtracting it
    from 1 converts it to a similarity.

    The name is a misspelling of "cosine"; it is kept unchanged because the
    similarity loop further down calls the function by this name.
    """
    return 1 - spatial.distance.cosine(x, y)

In [118]:
new_vector = word1 -word2 + word3
new_vector.shape

(300,)

In [119]:
# Score each vocabulary entry that has a vector and is lowercase and
# alphabetic against `new_vector`, collecting (lexeme, similarity) pairs.
computed_sims = []

for word in nlp.vocab:
    if not (word.has_vector and word.is_lower and word.is_alpha):
        continue
    sim = consine_sim(new_vector, word.vector)
    computed_sims.append((word, sim))

In [120]:
computed_sims[:10]

[(<spacy.lexeme.Lexeme at 0x7fb0e65b82d0>, -0.008464708924293518),
 (<spacy.lexeme.Lexeme at 0x7fb0e65b4230>, 0.07630855590105057),
 (<spacy.lexeme.Lexeme at 0x7fb0e65b8410>, -0.05378255248069763),
 (<spacy.lexeme.Lexeme at 0x7fb0e65b8320>, 0.08638708293437958),
 (<spacy.lexeme.Lexeme at 0x7fb0e65b87d0>, 0.14269191026687622),
 (<spacy.lexeme.Lexeme at 0x7fb0e65b8370>, -0.01335872896015644),
 (<spacy.lexeme.Lexeme at 0x7fb0e65b8fa0>, 0.04877590388059616),
 (<spacy.lexeme.Lexeme at 0x7fb0e65b8d70>, 0.01627308689057827),
 (<spacy.lexeme.Lexeme at 0x7fb0e65b8550>, 0.03511975333094597),
 (<spacy.lexeme.Lexeme at 0x7fb0e65b8230>, 0.0932503342628479)]

In [121]:
computed_sims = sorted(computed_sims, key = lambda item : -item[-1])
computed_sims[:10]

[(<spacy.lexeme.Lexeme at 0x7fb0b2b6b230>, 0.8239490985870361),
 (<spacy.lexeme.Lexeme at 0x7fb0b2776230>, 0.6372271180152893),
 (<spacy.lexeme.Lexeme at 0x7fb0b2ba3a50>, 0.6041543483734131),
 (<spacy.lexeme.Lexeme at 0x7fb0b2b212d0>, 0.5882272124290466),
 (<spacy.lexeme.Lexeme at 0x7fb0b1cf6af0>, 0.5815284252166748),
 (<spacy.lexeme.Lexeme at 0x7fb0b34b9af0>, 0.5802767872810364),
 (<spacy.lexeme.Lexeme at 0x7fb0b325c2d0>, 0.57982337474823),
 (<spacy.lexeme.Lexeme at 0x7fb0b2146960>, 0.573781430721283),
 (<spacy.lexeme.Lexeme at 0x7fb0b2ceb230>, 0.5730767846107483),
 (<spacy.lexeme.Lexeme at 0x7fb0b27b4eb0>, 0.5641543865203857)]

In [123]:
print([w[0].text for w in computed_sims[:10]])

['wolf', 'wolves', 'panther', 'lynx', 'owl', 'tiger', 'lion', 'fox', 'cat', 'otter']


In [128]:
def vector_math(a,b,c,topn=10):
    """Return the words whose vectors are most similar to ``a - b + c``.

    Looks up the vectors of ``a``, ``b`` and ``c`` in ``nlp.vocab``, computes
    ``vector(a) - vector(b) + vector(c)`` and ranks every vocabulary entry
    that has a vector and is lowercase and alphabetic by cosine similarity
    to that result.

    Parameters
    ----------
    a, b, c : str
        Words to look up in ``nlp.vocab``.
    topn : int, optional
        Maximum number of words to return. Defaults to 10, the value that
        was previously hard-coded.

    Returns
    -------
    list of str
        The texts of up to ``topn`` vocabulary entries, most similar first.
        The input words are not filtered out, so they may appear in the
        result.
    """
    import heapq

    from scipy import spatial

    target = nlp.vocab[a].vector - nlp.vocab[b].vector + nlp.vocab[c].vector

    def cosine_sim(x, y):
        # scipy returns a cosine distance; convert it to a similarity.
        return 1 - spatial.distance.cosine(x, y)

    scored = [
        (word, cosine_sim(target, word.vector))
        for word in nlp.vocab
        if word.has_vector and word.is_lower and word.is_alpha
    ]

    # nlargest selects the best `topn` pairs without fully sorting the
    # vocabulary; ties keep their original order, as the stable sort did.
    best = heapq.nlargest(topn, scored, key=lambda pair: pair[-1])

    return [word.text for word, _ in best]

In [130]:
vector_math("king","man","women")

['king',
 'women',
 'queen',
 'kings',
 'queens',
 'royal',
 'princes',
 'womens',
 'nobles',
 'princess']

In [131]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer

sid = SentimentIntensityAnalyzer()

In [133]:
review = 'This movie portrayed real people, and was based on actual events.'

sid.polarity_scores(review)

{'compound': 0.0, 'neg': 0.0, 'neu': 1.0, 'pos': 0.0}

In [134]:
def review_rating(string):
    """Label `string` as "Neutral", "Positive" or "Negative".

    The label is derived from the ``compound`` value returned by
    ``sid.polarity_scores``: exactly zero gives "Neutral", a value above
    zero gives "Positive", and anything else gives "Negative".
    """
    compound = sid.polarity_scores(string)["compound"]

    if compound == 0:
        return "Neutral"

    return "Positive" if compound > 0 else "Negative"

In [135]:
review_rating(review)

'Neutral'