# Sentiment Analysis using NLTK Part-1

### The goal here is to predict the sentiments behind movie reviews.

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from random import shuffle
from nltk.corpus import movie_reviews as mr
from nltk.corpus import sentiwordnet as swn
from nltk.corpus import stopwords

## Preparing Data

We will convert the **CategorizedPlaintextCorpusReader** data into **python list**

In [2]:
reviews = []
for fileid in mr.fileids():
    tag, filename = fileid.split('/')
    reviews.append([mr.raw(fileid), tag])

In [3]:
len(reviews)

2000

In our dataset first 1000 reviews are negative and later are positives. So we will shuffle the data using **shuffle**

In [4]:
shuffle(reviews)

In [5]:
reviews[1768]

['in _daylight_ , sylvester stallone breaks no new ground , cinematically speaking , but he covers familiar territory quite well . \nyesterday , as i was going about my business in the university bookstore , i noticed this stand with these _daylight_ movie posters on it . \ni had never heard of the movie , so i stopped by to check it out . \nit turned out that they were giving away free passes to the movie for people who had or applied for the credit card they were offering . \nso , i shrugged , thought , " hey , they probably won\'t give me the card anyway , and i could always use a free movie , " so i signed up . \nall in all , i don\'t think i got that bad of a deal , either . \nrob cohen , who has previously brought us such films as _dragon : the bruce lee story_ and , more recently , _dragonheart_ ( gee , i wonder if he likes dragons for some reason ? \nyou think ? ) , now brings us a very nicely executed disaster picture , with fx by industrial light and magic , about the after-e

Now we divide the dataset into train and test. 

In [6]:
train=reviews[:1500]
test=reviews[1500:]

We will create a vocabulary for each review and use it to get unigram features from the data

In [7]:
#Required for Bag of words (unigram features) creation
vocabulary = [x.lower() for tagged_sent in train for x in tagged_sent[0].split()]
vocabulary = list(set(vocabulary))
vocabulary.sort() #sorting the list
print(len(vocabulary))
#print(vocabulary)

44116


In [8]:
def get_unigram_features(data,vocab):
    fet_vec_all = []
    for tup in data:
        single_feat_vec = []
        sent = tup[0].lower() #lowercasing the dataset
        for v in vocab:
            if sent.__contains__(v):
                single_feat_vec.append(1)
            else:
                single_feat_vec.append(0)
        fet_vec_all.append(single_feat_vec)
    return fet_vec_all

In [9]:
def get_senti_wordnet_features(data):
    fet_vec_all = []
    for tup in data:
        sent = tup[0].lower()
        words = sent.split()
        pos_score = 0
        neg_score = 0
        for w in words:
            senti_synsets = swn.senti_synsets(w.lower())
            for senti_synset in senti_synsets:
                p = senti_synset.pos_score()
                n = senti_synset.neg_score()
                pos_score+=p
                neg_score+=n
                break #take only the first synset (Most frequent sense)
        fet_vec_all.append([float(pos_score),float(neg_score)])
    return fet_vec_all

In [10]:
def merge_features(featureList1,featureList2):
    # For merging two features
    if featureList1==[]:
        return featureList2
    merged = []
    for i in range(len(featureList1)):
        m = featureList1[i]+featureList2[i]
        merged.append(m)
    return merged

In [11]:
def get_lables(data):
    labels = []
    for tup in data:
        if tup[1].lower()=="neg":
            labels.append(-1)
        else:
            labels.append(1)
    return labels

In [19]:
def calculate_precision(prediction, actual):
    prediction = list(prediction)
    correct_labels = [prediction[i]  for i in range(len(prediction)) if actual[i] == prediction[i]]
    precision = float(len(correct_labels))/float(len(prediction))
    return precision

In [13]:
def real_time_test(classifier,vocab):
    print("Enter a sentence: ")
    inp = input()
    feat_vec_uni = get_unigram_features(inp,vocab)
    feat_vec_swn =get_senti_wordnet_features(test_data)
    feat_vec = merge_features(feat_vec_uni, feat_vec_swn)

    predict = classifier.predict(feat_vec)
    if predict[0]==1:
        print("The sentiment expressed is: positive")
    else:
        print("The sentiment expressed is: negative") 

In [14]:
training_features = get_unigram_features(train,vocabulary) # vocabulary extracted in the beginning
training_labels = get_lables(train)

test_features = get_unigram_features(test,vocabulary)
test_gold_labels = get_lables(test)

In [15]:
pre_lst=[]

In [20]:
from sklearn.naive_bayes import MultinomialNB
nb_classifier = MultinomialNB(alpha=0.1).fit(training_features,training_labels) #training process

print("Precision of NB classifier is")
pred = nb_classifier.predict(training_features)
precision = calculate_precision(pred,training_labels)
print("Training data\t" + str(precision))
pred = nb_classifier.predict(test_features)
precision = calculate_precision(pred,test_gold_labels)
print("Test data\t" + str(precision))
pre_lst.append(precision)

Precision of NB classifier is
Training data	0.9913333333333333
Test data	0.81


In [22]:
from sklearn.svm import SVC
clf = SVC(C=0.01, kernel='sigmoid').fit(training_features,training_labels)
print("Precision of SVM classifier is")
pred = clf.predict(training_features)
precision = calculate_precision(pred,training_labels)
print("Training data\t" + str(precision))
pred = clf.predict(test_features)
precision = calculate_precision(pred,test_gold_labels)
print("Test data\t" + str(precision))
pre_lst.append(precision)

Precision of SVM classifier is
Training data	0.5053333333333333
Test data	0.484


In [23]:
from sklearn.tree import DecisionTreeClassifier
clf = DecisionTreeClassifier(max_depth=5, max_features='sqrt', random_state=42).fit(training_features,training_labels)
print("Precision of Decision Tree classifier is")
pred = clf.predict(training_features)
precision = calculate_precision(pred,training_labels)
print("Training data\t" + str(precision))
pred = clf.predict(test_features)
precision = calculate_precision(pred,test_gold_labels)
print("Test data\t" + str(precision))
pre_lst.append(precision)

Precision of Decision Tree classifier is
Training data	0.5926666666666667
Test data	0.554


In [24]:
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression(random_state=42).fit(training_features,training_labels)
print("Precision of Logistic Regression is")
pred = clf.predict(training_features)
precision = calculate_precision(pred,training_labels)
print("Training data\t" + str(precision))
pred = clf.predict(test_features)
precision = calculate_precision(pred,test_gold_labels)
print("Test data\t" + str(precision))
pre_lst.append(precision)

Precision of Logistic Regression is
Training data	1.0
Test data	0.828


# Sentiment Analysis using NLTK Part-2

### The goal here is to predict the sentiments behind tweets.

In [25]:
from nltk.corpus import twitter_samples as ts
import json

In [26]:
mypath= 'C:/Users/Tejas/AppData/Roaming/nltk_data/corpora/twitter_samples/'
pos = []
for line in open (mypath + r'positive_tweets.json', 'r'):
    pos.append(json.loads(line))

neg = []
for line in open (mypath + r'negative_tweets.json', 'r'):
    neg.append(json.loads(line))

In [27]:
len(pos)

5000

In [28]:
#colname = ["created_at", "favorite_count", "retweet_count", "id", "place", "coordinates", "geo", "text"]

df0 = []
for dic in pos:
    ll = []
    ll.append(dic['text'])
    ll.append("pos")
    df0.append(ll)

df1 = []
for dic in neg:
    ll = []
    ll.append(dic['text'])
    ll.append("neg")
    df1.append(ll)

In [29]:
df0.extend(df1)

In [30]:
df=df0
len(df)

10000

In [31]:
shuffle(df)

In [32]:
train=df[:7000]
test=df[7000:]

In [33]:
#Required for Bag of words (unigram features) creation
vocabulary = [x.lower() for tagged_sent in train for x in tagged_sent[0].split()]
vocabulary = list(set(vocabulary))
vocabulary.sort() #sorting the list
print(len(vocabulary))
#print(vocabulary)

19555


In [34]:
training_features = get_unigram_features(train,vocabulary) # vocabulary extracted in the beginning
training_labels = get_lables(train)

test_features = get_unigram_features(test,vocabulary)
test_gold_labels = get_lables(test)

In [35]:
from sklearn.naive_bayes import MultinomialNB
nb_classifier = MultinomialNB(alpha=0.1).fit(training_features,training_labels) #training process

print("Precision of NB classifier is")
pred = nb_classifier.predict(training_features)
precision = calculate_precision(pred,training_labels)
print("Training data\t" + str(precision))
pred = nb_classifier.predict(test_features)
precision = calculate_precision(pred,test_gold_labels)
print("Test data\t" + str(precision))
pre_lst.append(precision)

Precision of NB classifier is
Training data	0.9995714285714286
Test data	0.9863333333333333


In [36]:
from sklearn.svm import SVC
clf = SVC(C=0.01, kernel='sigmoid').fit(training_features,training_labels)
print("Precision of SVM classifier is")
pred = clf.predict(training_features)
precision = calculate_precision(pred,training_labels)
print("Training data\t" + str(precision))
pred = clf.predict(test_features)
precision = calculate_precision(pred,test_gold_labels)
print("Test data\t" + str(precision))
pre_lst.append(precision)

Precision of SVM classifier is
Training data	0.503
Test data	0.493


In [37]:
from sklearn.tree import DecisionTreeClassifier
clf = DecisionTreeClassifier(max_depth=5, max_features='sqrt', random_state=42).fit(training_features,training_labels)
print("Precision of Decision Tree classifier is")
pred = clf.predict(training_features)
precision = calculate_precision(pred,training_labels)
print("Training data\t" + str(precision))
pred = clf.predict(test_features)
precision = calculate_precision(pred,test_gold_labels)
print("Test data\t" + str(precision))
pre_lst.append(precision)

Precision of Decision Tree classifier is
Training data	0.5354285714285715
Test data	0.5183333333333333


In [38]:
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression(random_state=42).fit(training_features,training_labels)
print("Precision of Logistic Regression is")
pred = clf.predict(training_features)
precision = calculate_precision(pred,training_labels)
print("Training data\t" + str(precision))
pred = clf.predict(test_features)
precision = calculate_precision(pred,test_gold_labels)
print("Test data\t" + str(precision))
pre_lst.append(precision)

Precision of Logistic Regression is
Training data	1.0
Test data	0.9996666666666667


In [46]:
print('''
(Dataset)        (Naive Bayes)          (SVM)           (Decision-tree)       (Logistic-Regression) 

movie_review        {}                    {}                 {}                     {}

twitter_dataset     {}           {}      {}              {}

'''.format(*pre_lst))


(Dataset)        (Naive Bayes)          (SVM)           (Decision-tree)       (Logistic-Regression) 

movie_review        0.81                    0.484                 0.554                     0.828

twitter_dataset     0.9863333333333333           0.493      0.5183333333333333              0.9996666666666667


