In [1]:
# Semantics & Word Vectors

# word2vec -> package
# Goals:
# - Allow us to get vector from text
# - Detect similarities mathematically
# Definition: word2vec is a two-layer neural net that processes text

# Given enough data, usage and contexts, word2vec can make
#  highly accurate guesses about a word's meaning based on past appearances.

# Note: Think of each word as now being represented by a vector (a list of numerical elements)
# Note: In spaCy each of these vectors has 300 dimensions

In [1]:
import spacy

# Load the spaCy pipeline named 'en_core_web_lg'; it must already be installed
# in this environment. Later cells read word vectors through `nlp`.
nlp = spacy.load('en_core_web_lg')

In [3]:
# Three space-separated words to compare with each other.
text = "lion cat pet"
# Run the pipeline on the text; the pairwise-similarity cell iterates over `doc`.
doc = nlp(text)

In [6]:
# Shape of the vector attached to the whole doc; the recorded output is (300,).
doc.vector.shape

(300,)

In [4]:
# Compare every token in `doc` with every token (itself included) and print
# one "left right score" line per ordered pair.
for left in doc:
    for right in doc:
        score = left.similarity(right)
        print(left.text, right.text, score)

lion lion 1.0
lion cat 0.5265437
lion pet 0.39923772
cat lion 0.5265437
cat cat 1.0
cat pet 0.7505456
pet lion 0.39923772
pet cat 0.7505456
pet pet 1.0


In [7]:
# Takeaway from the scores above: words used in similar contexts tend to get
# a higher similarity score.
# Size of the loaded vector table: its length first, then its full shape.
vectors = nlp.vocab.vectors
print(len(vectors))
print(vectors.shape)

684831
(684831, 300)


In [8]:
# Two ordinary words plus a made-up one ("nargle") to show what a token
# without a vector looks like.
tks = nlp("dog cat nargle")
for token in tks:
    # text, whether a vector exists, the vector's norm, out-of-vocabulary flag
    row = (token.text, token.has_vector, token.vector_norm, token.is_oov)
    print(*row)

dog True 7.0336733 False
cat True 6.6808186 False
nargle False 0.0 True


In [11]:
# Doing Arithmetic with vectors
from scipy import spatial

def cosine_similarity(vec1, vec2):
    """Return the cosine similarity of two vectors.

    Computed as 1 minus the cosine distance reported by
    ``scipy.spatial.distance.cosine``.
    """
    distance = spatial.distance.cosine(vec1, vec2)
    return 1 - distance

# Look up the stored vector for each word of the analogy.
king, man, woman, queen = (
    nlp.vocab[word].vector for word in ('king', 'man', 'woman', 'queen')
)

# king - man + woman, then measure how close the result is to "queen".
new_vector = king - man + woman
analogy_score = cosine_similarity(new_vector, queen)
print(analogy_score)

0.7880843877792358


In [None]:
# Sentiment Analysis
# We explored text classification and we used it to predict sentiment
#  labels on pre-labeled movie reviews. But what if we don't already have those labels?

# Are there methods of attempting to discern sentiment on raw unlabeled text?

# VADER (Valence Aware Dictionary for sEntiment Reasoning) is a model used for 
# text sentiment analysis that is sensitive to both polarity (positive/negative)
# and intensity (strength) of emotion.
#  How does it work?
#  - It basically takes the sentiment score (positive or negative) of each word and, at the end,
#     sums the scores of all the words.
#  - VADER also understands the context of the sentence
#  - It understands upper- and lower-case text


# Sentiment on raw text is always challenging due to a variety of possible factors:
# - Positive and Negative sentiment in the same text data
# - Sarcasm using positive words in a negative way

In [12]:
import nltk

# Fetch the 'vader_lexicon' resource with NLTK's downloader; the recorded run
# saved it under /root/nltk_data and returned True.
# NOTE(review): presumably this is the lexicon SentimentIntensityAnalyzer needs — confirm.
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


True

In [19]:
from pathlib import Path
import pandas as pd

from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [15]:
# One analyzer instance, reused by the scoring cells that follow.
sid = SentimentIntensityAnalyzer()

# Sample sentences: a mild positive, an emphatic positive (capitals and "!!"),
# and an emphatic negative (capitalised "WORST").
mystring = "This is a good movie"
mystring2 = "This was the best, most awesome movie EVER MADE!!"
mystring3 = "This was the WORST that has ever disgraced the screen"

In [16]:
# polarity_scores(str) returns a dict with 'neg', 'neu', 'pos' and 'compound' scores;
# print one dict per sample sentence, in order.
for sample in (mystring, mystring2, mystring3):
    print(sid.polarity_scores(sample))

{'neg': 0.0, 'neu': 0.508, 'pos': 0.492, 'compound': 0.4404}
{'neg': 0.0, 'neu': 0.433, 'pos': 0.567, 'compound': 0.88}
{'neg': 0.495, 'neu': 0.505, 'pos': 0.0, 'compound': -0.8331}


In [24]:
# Locate the dataset relative to the current working directory.
current_path = str(Path('.').absolute())
data_path = current_path + '/data/amazonreviews.tsv'

# The file is tab-separated; preview the first rows after loading.
df = pd.read_csv(data_path, sep='\t')
df.head()

Unnamed: 0,label,review
0,pos,Stuning even for the non-gamer: This sound tra...
1,pos,The best soundtrack ever to anything.: I'm rea...
2,pos,Amazing!: This soundtrack is my favorite music...
3,pos,Excellent Soundtrack: I truly like this soundt...
4,pos,"Remember, Pull Your Jaw Off The Floor After He..."


In [25]:
# Cleaning data: discard every row that contains a missing value.
df = df.dropna()

In [26]:
# Collect the index labels of reviews that consist only of whitespace.
# The 'review' column is read by name rather than by unpacking each row tuple,
# so the cell keeps working if the frame gains or loses other columns.
# Non-string values are skipped; isinstance() is the idiomatic type check.
blanks = [
    idx
    for idx, review in df['review'].items()
    if isinstance(review, str) and review.isspace()
]

In [27]:
# if blanks
# df.drop(blanks, inplace=True)

[]

In [29]:
# df.iloc[0] accesses the first row by position; score that row's review text.
first_review = df.iloc[0]['review']
print(sid.polarity_scores(first_review))

Stuning even for the non-gamer: This sound track was beautiful! It paints the senery in your mind so well I would recomend it even to people who hate vid. game music! I have played the game Chrono Cross but out of all of the games I have ever played it has the best music! It backs away from crude keyboarding and takes a fresher step with grate guitars and soulful orchestras. It would impress anyone who cares to listen! ^_^
{'neg': 0.088, 'neu': 0.669, 'pos': 0.243, 'compound': 0.9454}
