### In this tutorial we will train a sentiment classifier on a sample dataset

In [21]:
import csv
import random
from nltk.corpus import sentiwordnet as swn

In [22]:
################## Loading data file #######################
reader_train = csv.reader(open('data/sentiment_analysis/training.csv','r'))
reader_test = csv.reader(open('data/sentiment_analysis/test.csv','r'))
training_data = []
test_data = []
header = 1
for row in reader_train:
        if header==1:
                header=0
                continue
        training_data.append(row)
header=1
for row in reader_test:
        if header==1:
                header=0
                continue
        test_data.append(row)

In [23]:
# Examples from training data
print(training_data[1])
print(len(training_data), len(test_data))

["Sorry, I\\'ve given GA several chances and it\\'s never delivered more than what it is: A predictable, prime time Melodrama geared to the feminine Soap Opera audience.", 'neg']
385 80


In [24]:
#Required for Bag of words (unigram features) creation
vocabulary = [x.lower() for tagged_sent in training_data for x in tagged_sent[0].split()]
print(len(vocabulary))
vocabulary = list(set(vocabulary))
vocabulary.sort() #sorting the list
print(len(vocabulary))
# print(vocabulary)

10495
2759


################## Extracting Features #########################
#### Prepare a unigram feature vector based on the presence or absence of words######### 

In [25]:
def get_unigram_features(data,vocab):
    fet_vec_all = []
    for tup in data:
        single_feat_vec = []
        sent = tup[0].lower() #lowercasing the dataset
        for v in vocab:
            if sent.__contains__(v):
                single_feat_vec.append(1)
            else:
                single_feat_vec.append(0)
        fet_vec_all.append(single_feat_vec)
    return fet_vec_all

#### Add sentiment scores from sentiwordnet, here we take the average sentiment scores of all words 

In [26]:
def get_senti_wordnet_features(data):
    fet_vec_all = []
    for tup in data:
        sent = tup[0].lower()
        words = sent.split()
        pos_score = 0
        neg_score = 0
        for w in words:
            senti_synsets = swn.senti_synsets(w.lower())
            for senti_synset in senti_synsets:
                p = senti_synset.pos_score()
                n = senti_synset.neg_score()
                pos_score+=p
                neg_score+=n
                break #take only the first synset (Most frequent sense)
        fet_vec_all.append([float(pos_score),float(neg_score)])
    return fet_vec_all

#### Merge the two scores ####

In [27]:
def merge_features(featureList1,featureList2):
    # For merging two features
    if featureList1==[]:
        return featureList2
    merged = []
    for i in range(len(featureList1)):
        m = featureList1[i]+featureList2[i]
        merged.append(m)
    return merged

In [28]:
#extract the sentiment labels by making positive reviews as class 1 and negative reviews as class 2
def get_lables(data):
    labels = []
    for tup in data:
        if tup[1].lower()=="neg":
            labels.append(-1)
        else:
            labels.append(1)
    return labels

In [29]:
def calculate_precision(prediction, actual):
    prediction = list(prediction)
    correct_labels = [predictions[i]  for i in range(len(predictions)) if actual[i] == predictions[i]]
    precision = float(len(correct_labels))/float(len(prediction))
    return precision

In [30]:
def real_time_test(classifier,vocab):
    print("Enter a sentence: ")
    inp = input()
    print(inp)
    feat_vec_uni = get_unigram_features(inp,vocab)
    feat_vec_swn =get_senti_wordnet_features(test_data)
    feat_vec = merge_features(feat_vec_uni, feat_vec_swn)

    predict = classifier.predict(feat_vec)
    if predict[0]==1:
        print("The sentiment expressed is: positive")
    else:
        print("The sentiment expressed is: negative")   


################# Training and Evaluation #######################
#### Preparing training and test tuples
#### The feature_vecor set looks like [featurevector1, featurevector2,...,featurevectorN] where each featurevectorX is a list
#### The label set looks like [label1,label2,...,labelN]

In [31]:
training_unigram_features = get_unigram_features(training_data,vocabulary) # vocabulary extracted in the beginning
training_swn_features = get_senti_wordnet_features(training_data)

training_features = merge_features(training_unigram_features,training_swn_features)

training_labels = get_lables(training_data)

test_unigram_features = get_unigram_features(test_data,vocabulary)
test_swn_features=get_senti_wordnet_features(test_data)
test_features= merge_features(test_unigram_features,test_swn_features)

test_gold_labels = get_lables(test_data)

In [32]:
# Naive Bayes Classifier 
from sklearn.naive_bayes import MultinomialNB
nb_classifier = MultinomialNB().fit(training_features,training_labels) #training process
predictions = nb_classifier.predict(test_features)

print("Precision of NB classifier is")
predictions = nb_classifier.predict(training_features)
precision = calculate_precision(predictions,training_labels)
print("Training data\t" + str(precision))
predictions = nb_classifier.predict(test_features)
precision = calculate_precision(predictions,test_gold_labels)
print("Test data\t" + str(precision))

Precision of NB classifier is
Training data	0.987012987012987
Test data	0.775


In [33]:
#Real time tesing
real_time_test(nb_classifier,vocabulary)

Enter a sentence: 
I like movie
I like movie
The sentiment expressed is: negative


In [34]:
# SVM Classifier
#Refer to : http://scikit-learn.org/stable/modules/svm.html
from sklearn.svm import LinearSVC
svm_classifier = LinearSVC(penalty='l2', C=0.01).fit(training_features,training_labels)
predictions = svm_classifier.predict(training_features)

print("Precision of linear SVM classifier is:")
precision = calculate_precision(predictions,training_labels)
print("Training data\t" + str(precision))
predictions = svm_classifier.predict(test_features)
precision = calculate_precision(predictions,test_gold_labels)
print("Test data\t" + str(precision))

Precision of linear SVM classifier is:
Training data	0.9584415584415584
Test data	0.775


In [36]:
#Real time tesing
real_time_test(svm_classifier,vocabulary)

Enter a sentence: 
movie is awesome
movie is awesome
The sentiment expressed is: positive
