In [1]:
# Uncomment to install the dependency into the running kernel's environment
# (`%pip` targets the kernel, unlike `!pip`):
# %pip install vaderSentiment

In [2]:
from vaderSentiment.vaderSentiment \
        import SentimentIntensityAnalyzer

In [3]:
# One analyzer instance, reused by every VADER cell below.
sa = SentimentIntensityAnalyzer()

In [4]:
"""
`SentimentIntensityAnalyzer.lexicon` is the dictionary that maps each
token to its sentiment score; displaying it shows the raw entries.
"""
sa.lexicon

{'$:': -1.5,
 '%)': -0.4,
 '%-)': -1.5,
 '&-:': -0.4,
 '&:': -0.7,
 "( '}{' )": 1.6,
 '(%': -0.9,
 "('-:": 2.2,
 "(':": 2.3,
 '((-:': 2.1,
 '(*': 1.1,
 '(-%': -0.7,
 '(-*': 1.3,
 '(-:': 1.6,
 '(-:0': 2.8,
 '(-:<': -0.4,
 '(-:o': 1.5,
 '(-:O': 1.5,
 '(-:{': -0.1,
 '(-:|>*': 1.9,
 '(-;': 1.3,
 '(-;|': 2.1,
 '(8': 2.6,
 '(:': 2.2,
 '(:0': 2.4,
 '(:<': -0.2,
 '(:o': 2.5,
 '(:O': 2.5,
 '(;': 1.1,
 '(;<': 0.3,
 '(=': 2.2,
 '(?:': 2.1,
 '(^:': 1.5,
 '(^;': 1.5,
 '(^;0': 2.0,
 '(^;o': 1.9,
 '(o:': 1.6,
 ")':": -2.0,
 ")-':": -2.1,
 ')-:': -2.1,
 ')-:<': -2.2,
 ')-:{': -2.1,
 '):': -1.8,
 '):<': -1.9,
 '):{': -2.3,
 ');<': -2.6,
 '*)': 0.6,
 '*-)': 0.3,
 '*-:': 2.1,
 '*-;': 2.4,
 '*:': 1.9,
 '*<|:-)': 1.6,
 '*\\0/*': 2.3,
 '*^:': 1.6,
 ',-:': 1.2,
 "---'-;-{@": 2.3,
 '--<--<@': 2.2,
 '.-:': -1.2,
 '..###-:': -1.7,
 '..###:': -1.9,
 '/-:': -1.3,
 '/:': -1.3,
 '/:<': -1.4,
 '/=': -0.9,
 '/^:': -1.0,
 '/o:': -1.4,
 '0-8': 0.1,
 '0-|': -1.2,
 '0:)': 1.9,
 '0:-)': 1.4,
 '0:-3': 1.5,
 '0:03': 1.9,
 '

If you use a stemmer (or lemmatizer) in your pipeline, you’ll need
to apply that stemmer to the VADER lexicon, too, combining the
scores for all the words that go together in a single stem or lemma.

In [5]:
"""
If you use a stemmer (or lemmatizer) in your pipeline, you’ll need
to apply that stemmer to the VADER lexicon, too, combining the
scores for all the words that go together in a single stem or lemma.

Out of the roughly 7,500 tokens defined in VADER, only 4 contain spaces
(they are listed by the expression below): 3 of those are actual
n-grams; the other is an emoticon for “kiss.”
"""
[(tok, score) for tok, score in sa.lexicon.items() if " " in tok]

[("( '}{' )", 1.6),
 ("can't stand", -2.0),
 ('fed up', -1.8),
 ('screwed up', -1.5)]

The VADER algorithm considers the intensity of sentiment
polarity in three separate scores (positive, negative, and neutral) and
then combines them together into a compound positivity sentiment.

In [6]:
# Score a plainly positive sentence; the result holds the 'neg', 'neu',
# 'pos' and 'compound' entries.
sa.polarity_scores(
    text="Python is very readable and it's great for NLP."
)

{'neg': 0.0, 'neu': 0.661, 'pos': 0.339, 'compound': 0.6249}

Notice that VADER handles negation pretty well—“great” has a
slightly more positive sentiment than “not bad.” VADER’s built-in
tokenizer ignores any words that aren’t in its lexicon, and it doesn’t
consider n-grams at all.

In [7]:
# Score a negated negative ("not a bad choice") to compare with the
# plainly positive sentence scored earlier.
sa.polarity_scores(
    text="Python is not a bad choice for most applications."
)

{'neg': 0.0, 'neu': 0.737, 'pos': 0.263, 'compound': 0.431}

In [8]:
# Three short documents: clearly positive, clearly negative, and mixed.
# The pasted `...` REPL continuation prompts were removed: they only parsed
# because IPython strips them, and are a SyntaxError in plain Python.
corpus = [
    "Absolutely perfect! Love it! :-) :-) :-)",
    "Horrible! Completely useless. :(",
    "It was OK. Some good and some bad things.",
]

In [9]:
# Print the signed compound score next to each document.
for doc in corpus:
    scores = sa.polarity_scores(doc)
    print(f"{scores['compound']:+}: {doc}")

+0.9428: Absolutely perfect! Love it! :-) :-) :-)
-0.8768: Horrible! Completely useless. :(
-0.1531: It was OK. Some good and some bad things.


### Naive Bayes
A Naive Bayes model tries to find keywords in a set of
documents that are predictive of your target (output) variable.
When your target variable is the sentiment you are trying to
predict, the model will find words that predict that sentiment.
The nice thing about a Naive Bayes model is that the internal
coefficients will map words or tokens to scores just like
VADER does. Only this time you won’t have to be limited to
just what an individual human decided those scores should be.
The machine will find the “best” scores for any problem.

In [26]:
import warnings
# Silences every warning category for the rest of the session.
# NOTE(review): this blanket filter also hides deprecation warnings raised by
# the libraries used below; consider narrowing it to the specific warning it
# was added for -- confirm which one that was.
warnings.filterwarnings('ignore')

In [27]:
from nlpia.data.loaders import get_data

In [39]:
# Load the labelled movie-review snippets bundled with nlpia.
movies = get_data('hutto_movies')

INFO:nlpia.futil:Reading CSV with `read_csv(*('C:\\Users\\Yassine Yazidi\\anaconda3\\lib\\site-packages\\nlpia\\data\\hutto_ICWSM_2014/movieReviewSnippets_GroundTruth.csv.gz',), **{'nrows': None, 'low_memory': False})`...


In [40]:
# Preview the first rows, rounded to two decimals for display.
movies.head().round(2)

Unnamed: 0_level_0,sentiment,text
id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,2.27,The Rock is destined to be the 21st Century's ...
2,3.53,The gorgeously elaborate continuation of ''The...
3,-0.6,Effective but too tepid biopic
4,1.47,If you sometimes like to go to the movies to h...
5,1.73,"Emerges as something rare, an issue movie that..."


In [14]:
# Summary statistics of the sentiment ratings, rounded to two decimals.
movies.describe().round(2)

Unnamed: 0,sentiment
count,10605.0
mean,0.0
std,1.92
min,-3.88
25%,-1.77
50%,-0.08
75%,1.83
max,3.94


In [15]:
import pandas as pd

In [16]:
# Limit pandas' text display width to 75 characters.
pd.set_option('display.width', 75)

In [17]:
from nltk.tokenize import casual_tokenize

In [18]:
# Will hold one token-count mapping per review; filled by a later cell.
bags_of_words = []

In [19]:
from collections import Counter

In [20]:
# One Counter of token frequencies per movie review.
# The list is rebuilt from scratch so that re-running this cell cannot append
# a second copy of every review, which would leave the bag-of-words table
# with more rows than `movies`.
bags_of_words = [Counter(casual_tokenize(text)) for text in movies.text]

In [21]:
# Turn the per-review token counts into a table (one record per review).
df_bows = pd.DataFrame.from_records(bags_of_words)

In [22]:
# Replace missing entries with 0 and store the counts as integers.
df_bows = df_bows.fillna(0).astype(int)

In [23]:
# (number of reviews, number of distinct tokens)
df_bows.shape

(10605, 20756)

In [24]:
# Preview the first five rows of the count table.
df_bows.head()

Unnamed: 0,The,Rock,is,destined,to,be,the,21st,Century's,new,...,Ill,slummer,Rashomon,dipsticks,Bearable,Staggeringly,’,ve,muttering,dissing
0,1,1,1,1,2,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,0,1,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,1,0,4,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [25]:
# Same preview, restricted to the tokens that occur in the first review.
df_bows.head()[list(bags_of_words[0].keys())]

Unnamed: 0,The,Rock,is,destined,to,be,the,21st,Century's,new,...,Schwarzenegger,",",Jean,Claud,Van,Damme,or,Steven,Segal,.
0,1,1,1,1,2,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
1,2,0,1,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,4
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,1,0,4,0,1,0,0,0,...,0,1,0,0,0,0,0,0,0,1
4,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,1


In [31]:
from sklearn.naive_bayes import MultinomialNB 

In [41]:
# Train a multinomial Naive Bayes classifier on the token counts; the
# target is the boolean "rating is above zero".
nb = MultinomialNB().fit(df_bows, movies['sentiment'] > 0)

In [43]:
# Column 1 of predict_proba holds the probability of the positive class.
positive_sentiment_probs = nb.predict_proba(df_bows)[:, 1]

# Map the probability range [0, 1] linearly onto the rating scale [-4, 4].
predicted_sentiment = 8 * positive_sentiment_probs - 4

# Keep the rescaled prediction next to the human ratings.
movies['predicted_sentiment'] = predicted_sentiment

In [49]:
# Absolute difference between predicted and human-rated sentiment.
movies['error'] = (
    movies['predicted_sentiment'] - movies['sentiment']
).abs()

In [50]:
# Mean absolute error over all reviews, shown to one decimal place.
mean_error = movies['error'].mean()
round(mean_error, 1)

1.9

In [51]:
# 1 when the human rating is above zero, otherwise 0.
movies['sentiment_ispositive'] = movies['sentiment'].gt(0).astype(int)

In [52]:
# 1 when the predicted rating is above zero, otherwise 0.
# NOTE: the column name is spelled 'predicted_ispostive' (sic); the accuracy
# cell further down reads the same spelling, so it is kept as is.
movies['predicted_ispostive'] = movies['predicted_sentiment'].gt(0).astype(int)

In [53]:
# Compare the human rating, the predicted rating and the true label
# for the first eight reviews.
movies[['sentiment', 'predicted_sentiment', 'sentiment_ispositive']].head(8)

Unnamed: 0_level_0,sentiment,predicted_sentiment,sentiment_ispositive
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,2.266667,2.511515,1
2,3.533333,3.999904,1
3,-0.6,-3.655976,0
4,1.466667,1.940954,1
5,1.733333,3.910373,1
6,2.533333,3.995188,1
7,2.466667,3.960466,1
8,1.266667,-1.918701,1


In [55]:
# Fraction of reviews whose predicted polarity matches the true polarity.
# This is measured on the same reviews the classifier was fitted on, so it
# is a training-set figure rather than a held-out estimate.
(movies['predicted_ispostive'] == movies['sentiment_ispositive']).sum() / len(movies)

0.9344648750589345

In [70]:
# Load the product-review snippets and count the tokens of each review.
# `bags_of_words` is rebound here, so from this point on it holds the
# product reviews rather than the movie reviews.
products = get_data('hutto_products')
bags_of_words = [Counter(casual_tokenize(text)) for text in products.text]

INFO:nlpia.futil:Reading CSV with `read_csv(*('C:\\Users\\Yassine Yazidi\\anaconda3\\lib\\site-packages\\nlpia\\data\\hutto_ICWSM_2014/amazonReviewSnippets_GroundTruth.csv.gz',), **{'nrows': None, 'low_memory': False})`...


In [71]:
# Token-count table for the product reviews (missing counts become 0).
df_product_bows = pd.DataFrame.from_records(bags_of_words)
df_product_bows = df_product_bows.fillna(0).astype(int)

# Stack movie rows on top of product rows so both share one column set.
# pd.concat replaces DataFrame.append, which is deprecated since pandas 1.4
# and removed in pandas 2.0; the defaults keep the original row labels and
# column order exactly as append did.
df_all_bows = pd.concat([df_bows, df_product_bows])
df_all_bows.columns

Index(['The', 'Rock', 'is', 'destined', 'to', 'be', 'the', '21st',
       'Century's', 'new',
       ...
       'sligtly', 'owner', '81', 'defectively', 'warrranty', 'expire',
       'expired', 'voids', 'baghdad', 'harddisk'],
      dtype='object', length=23302)

In [72]:
# Keep only the product rows (everything after the movie rows) and only the
# columns of the movie vocabulary, in the same order as `df_bows`.
df_product_bows = df_all_bows.iloc[len(movies):].loc[:, df_bows.columns]
df_product_bows.shape

(3546, 20756)

In [73]:
# Shape of the movie table, to compare column counts with the product table.
df_bows.shape

(10605, 20756)

In [74]:
# 1 when the human rating of the product review is above zero, otherwise 0.
products['ispos'] = products['sentiment'].gt(0).astype(int)

In [78]:
# Fill missing counts with 0 before predicting (presumably the movie-only
# tokens that never occur in a product review -- TODO confirm).
df_product_bows = df_product_bows.fillna(0)

# Classify the product reviews with the model fitted on the movie reviews
# and store the labels as 0/1 integers.
predicted_ispositive = nb.predict(df_product_bows.values).astype(int)

# Keep the predictions next to the true labels.
products['predicted_ispositive'] = predicted_ispositive


In [79]:
# Preview the product reviews together with the label columns added so far.
products.head()

Unnamed: 0,id,sentiment,text,ispos,predicted_ispositive
0,1_1,-0.9,troubleshooting ad-2500 and ad-2600 no picture...,0,0
1,1_2,-0.15,"repost from january 13, 2004 with a better fit...",0,0
2,1_3,-0.2,does your apex dvd player only play dvd audio ...,0,0
3,1_4,-0.1,or does it play audio and video but scrolling ...,0,0
4,1_5,-0.5,before you try to return the player or waste h...,0,0


In [81]:
# Fraction of product reviews whose predicted polarity matches the true one.
(products['predicted_ispositive'] == products['ispos']).sum() / len(products)

0.5572476029328821