In [40]:
import pandas as pd
import numpy as np
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer

In [41]:
# Load the labelled IMDB review sentences (a `review` text column and a 0/1 `sentiment` label)
imdb = pd.read_csv('https://raw.githubusercontent.com/skathirmani/datasets/master/imdb_sentiment.csv')

In [42]:
# Preview the first five rows to check which columns were loaded
imdb.head()

Unnamed: 0,review,sentiment
0,"A very, very, very slow-moving, aimless movie ...",0
1,Not sure who was more lost - the flat characte...,0
2,Attempting artiness with black & white and cle...,0
3,Very little music or anything to speak of.,0
4,The best scene in the movie was when Gerardo i...,1


In [43]:
# Show the first review; a bare last expression is displayed by the notebook itself,
# so wrapping it in print() is unnecessary
imdb['review'].head(1)

0    A very, very, very slow-moving, aimless movie ...
Name: review, dtype: object


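- Naive sentence polarity = (number of positive words) - (number of negative words).
- A plain word count ignores how strongly each word expresses sentiment. To account for intensity, NLTK ships VADER (Valence Aware Dictionary and sEntiment Reasoner), a lexicon that assigns each word a signed intensity (valence) score.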

In [44]:
# Fetch the VADER lexicon used by SentimentIntensityAnalyzer (reported as up to date when already installed)
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\Name\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [45]:
# Create one VADER analyzer and score a short, clearly positive sentence.
# polarity_scores returns a dict with 'neg', 'neu', 'pos' and 'compound' entries.
sentiment = SentimentIntensityAnalyzer()
sentiment.polarity_scores('i love you')

{'compound': 0.6369, 'neg': 0.0, 'neu': 0.192, 'pos': 0.808}

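- compound score = score / sqrt(score ** 2 + alpha), where score is the sum of the word valences in the sentence and alpha is a normalisation constant (15 in the worked example below)
- positive = positive / (positive + neutral + negative)
- negative = negative / (positive + neutral + negative)
- neutral = neutral / (positive + neutral + negative)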

In [46]:
# A mixed sentence: 'love' and 'hate' pull the score in opposite directions
sentiment.polarity_scores('i love india i hate girl ')

{'compound': 0.128, 'neg': 0.374, 'neu': 0.202, 'pos': 0.424}

In [47]:
# Reproduce the compound score by hand: score / sqrt(score**2 + alpha),
# with score = 0.5 and alpha = 15
compound = 0.5 / np.sqrt(0.5 ** 2 + 15)
compound

0.12803687993289598

In [48]:
# Positive share by hand: positive mass (4.2) over the total mass
# (4.2 positive + 1 + 1 neutral + 3.7 negative)
positive = 4.2 / (4.2 + 1 + 1 + 3.7)
positive

0.42424242424242425

In [49]:
# Negative share by hand: negative mass (3.7) over the same total mass
negative = 3.7 / (4.2 + 1 + 1 + 3.7)
negative

0.37373737373737376

In [50]:
# Neutral share by hand: neutral mass (2, i.e. 1 + 1) over the same total mass
neutral = 2 / (4.2 + 1 + 1 + 3.7)
neutral

0.20202020202020202

In [51]:
def get_sentiment(text, analyzer=None):
    """Classify ``text`` as positive (1) or not positive (0) using VADER.

    Parameters
    ----------
    text : str
        Sentence or review to score.
    analyzer : optional
        Object exposing ``polarity_scores(text)`` that returns a mapping with a
        ``'compound'`` entry. When omitted, one shared
        ``SentimentIntensityAnalyzer`` is created on first use and reused for
        every later call.

    Returns
    -------
    int
        1 when the compound score is strictly greater than 0, otherwise 0
        (a compound score of exactly 0 is therefore labelled 0).
    """
    if analyzer is None:
        # Constructing the analyzer re-reads the lexicon, so build it once
        # instead of once per row when this function is used with .apply()
        analyzer = getattr(get_sentiment, '_analyzer', None)
        if analyzer is None:
            analyzer = SentimentIntensityAnalyzer()
            get_sentiment._analyzer = analyzer
    compound = analyzer.polarity_scores(text)['compound']
    return 1 if compound > 0 else 0

# Store the VADER prediction (1 = positive compound score, 0 = otherwise) for every review in a new column
imdb['sentiment_vader'] = imdb['review'].apply(get_sentiment)

In [52]:
from sklearn.metrics import accuracy_score

# Fraction of reviews whose VADER label matches the dataset's own sentiment label
accuracy_score(y_true=imdb['sentiment'], y_pred=imdb['sentiment_vader'])

0.7941176470588235

# Basic Topics
- Basic feature extraction from text data
- Text cleaning / transformation
- Text classification
- Bag-of-words analysis
- Text clustering
- Sentiment analysis (supervised, unsupervised)

In [53]:
# Load the Amazon product reviews; the `reviewText` column is the text analysed below
amazon = pd.read_csv('https://raw.githubusercontent.com/skathirmani/datasets/master/amazon_reviews.csv')

In [54]:
# Preview the first five rows (five is the default for head())
amazon.head()

Unnamed: 0.1,Unnamed: 0,asin,helpful,overall,reviewText,reviewTime,reviewerID,reviewerName,summary,unixReviewTime
0,0,528881469,"[0, 0]",5.0,We got this GPS for my husband who is an (OTR)...,"06 2, 2013",AO94DHGC771SJ,amazdnu,Gotta have GPS!,1370131000.0
1,1,528881469,"[12, 15]",1.0,"I'm a professional OTR truck driver, and I bou...","11 25, 2010",AMO214LNFCEI4,Amazon Customer,Very Disappointed,1290643000.0
2,2,528881469,"[43, 45]",3.0,"Well, what can I say. I've had this unit in m...","09 9, 2010",A3N7T0DY83Y4IG,C. A. Freeman,1st impression,1283990000.0
3,3,528881469,"[9, 10]",2.0,"Not going to write a long review, even thought...","11 24, 2010",A1H8PY3QHMQQA0,"Dave M. Shaw ""mack dave""","Great grafics, POOR GPS",1290557000.0
4,4,528881469,"[0, 0]",1.0,I've had mine for a year and here's what we go...,"09 29, 2011",A24EV6RXELQZ63,Wayne Smith,"Major issues, only excuses for support",1317254000.0


In [55]:
# Normalise the raw review text: missing reviews become empty strings, everything
# is lower-cased, and every character other than a-z or a space is removed.
# regex=True is explicit because pandas >= 2.0 treats the pattern as a literal
# string by default, which would silently leave the text untouched.
docs = (
    amazon['reviewText']
    .fillna('')
    .str.lower()
    .str.replace('[^a-z ]', '', regex=True)
)

# The stop-word corpus is a separate NLTK download; fetch it only when it is missing
try:
    stopwords = nltk.corpus.stopwords.words('english')
except LookupError:
    nltk.download('stopwords')
    stopwords = nltk.corpus.stopwords.words('english')
# Set lookup keeps the per-word membership test constant-time
stopword_set = frozenset(stopwords)
stemmer = nltk.stem.PorterStemmer()


def clean_senetence(text):
    """Remove English stop words from ``text`` and stem the remaining words.

    ``text`` is split on whitespace (so repeated spaces produce no empty
    tokens), words found in the NLTK English stop-word list are dropped, the
    remaining words are reduced with the Porter stemmer, and the result is
    joined back together with single spaces.

    Parameters
    ----------
    text : str
        Lower-cased text to clean.

    Returns
    -------
    str
        Space-separated stemmed words.
    """
    return ' '.join(
        stemmer.stem(word) for word in text.split() if word not in stopword_set
    )


# Correctly spelled alias; the original (misspelt) name is kept for existing callers
clean_sentence = clean_senetence

docs_clean = docs.apply(clean_senetence)
docs_clean.head()

0    got gp husband otr road trucker  impress ship ...
1    im profession otr truck driver bought tnd  tru...
2    well say  ive unit truck four day  prior garmi...
3    go write long review even thought unit deserv ...
4    ive mine year here got tri rout non truck rout...
Name: reviewText, dtype: object

In [56]:
from sklearn.feature_extraction.text import CountVectorizer

# Build the document-term matrix: one row per review, one column per stemmed term.
# The original call was CountVectorizer(5). The positional 5 bound to the `input`
# parameter, so it did not prune the vocabulary, and positional arguments are
# rejected by scikit-learn >= 1.0. Defaults are used here to keep the same matrix.
# TODO(review): if rare terms were meant to be dropped, use CountVectorizer(min_df=5).
vectorizer = CountVectorizer()
dtm = vectorizer.fit_transform(docs_clean)

# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
# and fall back only on versions that predate it
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

df_dtm = pd.DataFrame(dtm.toarray(), columns=feature_names)
df_dtm.shape

(999, 7172)

In [57]:
from sklearn.metrics.pairwise import cosine_similarity

# Compare two terms: each term is represented by its column of per-document counts,
# so the two columns are turned into the two rows handed to cosine_similarity
cosine_similarity(df_dtm[['kindl', 'camera']].values.T)

array([[1.        , 0.06940188],
       [0.06940188, 1.        ]])

In [58]:
def get_similar_words(input_word, sim_mat, top_n=5):
    """Return the words most similar to ``input_word``.

    Parameters
    ----------
    input_word : str
        Word to look up; it must label both a column and a row of ``sim_mat``.
    sim_mat : pandas.DataFrame
        Word-by-word similarity matrix labelled with the words on both axes.
    top_n : int, default 5
        Maximum number of neighbours to return.

    Returns
    -------
    pandas.Series
        Similarity scores of up to ``top_n`` other words, highest first and
        indexed by word. ``input_word`` itself is excluded.

    Raises
    ------
    KeyError
        If ``input_word`` is not a column of ``sim_mat``.
    """
    if input_word not in sim_mat.columns:
        raise KeyError(
            '{!r} is not in the similarity matrix vocabulary'.format(input_word)
        )
    cos_vals = sim_mat[input_word].sort_values(ascending=False)
    # Drop the word itself, which otherwise always ranks first
    return cos_vals.drop(input_word).head(top_n)

In [59]:
# Term-by-term cosine similarity: transposing the document-term matrix makes each
# row one term's vector of per-document counts. The result is labelled with the
# terms on both axes so it can be indexed by word.
sim_mat = pd.DataFrame(
    cosine_similarity(df_dtm.T),
    index=df_dtm.columns,
    columns=df_dtm.columns,
)
sim_mat['tablet'].sort_values(ascending=False).head(5)

tablet     1.000000
app        0.625789
android    0.572984
invit      0.534830
version    0.513131
Name: tablet, dtype: float64

In [60]:
# Nearest neighbours of 'app' by cosine similarity ('app' itself is dropped by the helper)
get_similar_words('app', sim_mat)

tablet        0.625789
android       0.555623
marketplac    0.536400
bird          0.532974
doesnt        0.508315
Name: app, dtype: float64