## NLP Sentiment Analysis

In [1]:
from sklearn.metrics import classification_report
import pandas as pd
import numpy as np
# pd.set_option("display.max_rows",2000)
# pd.set_option("display./max_seq_items",2000)

In [2]:
data = pd.read_csv('train.tsv', sep='\t', header=0)

### each row in our data has these features:
    - unique PhraseId
    - original sentenceId
    - Pharse
    - label- sentiment 

sentiment is from 0 - very negative to 4- very positive

In [3]:
data.head()

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,A series of escapades demonstrating the adage ...,1
1,2,1,A series of escapades demonstrating the adage ...,2
2,3,1,A series,2
3,4,1,A,2
4,5,1,series,2


In [4]:
#tokenize each pharse to list of words

In [5]:
from tokenizing import Tokenizer

In [6]:
tok = Tokenizer(preserve_case=False)
data['Phrase_tokens'] = data['Phrase'].apply(tok.tokenize)


In [7]:
data.head()

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment,Phrase_tokens
0,1,1,A series of escapades demonstrating the adage ...,1,"[a, series, of, escapades, demonstrating, the,..."
1,2,1,A series of escapades demonstrating the adage ...,2,"[a, series, of, escapades, demonstrating, the,..."
2,3,1,A series,2,"[a, series]"
3,4,1,A,2,[a]
4,5,1,series,2,[series]


In [9]:
sentiment_dict = {}
max_train = 140353 #use only train data to build sentiment dict
iterator = data.iterrows()
  
for _ in range(max_train):  
    index,row = iterator.next()
    word_list = row['Phrase_tokens']
    if len(word_list)==1:
        sentiment_dict[word_list[0].lower()] = data['Sentiment'][index]

In [10]:
# Remove stop words from phrases

import nltk

from nltk.corpus import stopwords # Import the stop word list
print stopwords.words("english") 



[u'i', u'me', u'my', u'myself', u'we', u'our', u'ours', u'ourselves', u'you', u'your', u'yours', u'yourself', u'yourselves', u'he', u'him', u'his', u'himself', u'she', u'her', u'hers', u'herself', u'it', u'its', u'itself', u'they', u'them', u'their', u'theirs', u'themselves', u'what', u'which', u'who', u'whom', u'this', u'that', u'these', u'those', u'am', u'is', u'are', u'was', u'were', u'be', u'been', u'being', u'have', u'has', u'had', u'having', u'do', u'does', u'did', u'doing', u'a', u'an', u'the', u'and', u'but', u'if', u'or', u'because', u'as', u'until', u'while', u'of', u'at', u'by', u'for', u'with', u'about', u'against', u'between', u'into', u'through', u'during', u'before', u'after', u'above', u'below', u'to', u'from', u'up', u'down', u'in', u'out', u'on', u'off', u'over', u'under', u'again', u'further', u'then', u'once', u'here', u'there', u'when', u'where', u'why', u'how', u'all', u'any', u'both', u'each', u'few', u'more', u'most', u'other', u'some', u'such', u'no', u'nor', u

In [12]:
data['Phrase_tokens'] = data['Phrase_tokens'].apply(lambda words:[w for w in words if not w in stopwords.words("english") and w.isalpha()])

In [13]:
data.head(10)


Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment,Phrase_tokens
0,1,1,A series of escapades demonstrating the adage ...,1,"[series, escapades, demonstrating, adage, good..."
1,2,1,A series of escapades demonstrating the adage ...,2,"[series, escapades, demonstrating, adage, good..."
2,3,1,A series,2,[series]
3,4,1,A,2,[]
4,5,1,series,2,[series]
5,6,1,of escapades demonstrating the adage that what...,2,"[escapades, demonstrating, adage, good, goose]"
6,7,1,of,2,[]
7,8,1,escapades demonstrating the adage that what is...,2,"[escapades, demonstrating, adage, good, goose]"
8,9,1,escapades,2,[escapades]
9,10,1,demonstrating the adage that what is good for ...,2,"[demonstrating, adage, good, goose]"


###  Approch 1: create features 

we combine our dataset with the Sentiwordnet.

#### use the following features to represent each sentense:
    - avg sentiment of the words in the pharse
    - max sentiment in phrase
    - min sentiment in pharse
    - binary indicators of positive words and negative words from SentiWordnet

#### min,max,avg calc of sentiment

In [14]:
# sentiment_dict = {}
# max_train = 140353 #use only train data tot build sentiment dict

data = pd.DataFrame(data[data['Phrase_tokens'].str.len() >0])
data = data.reset_index(drop=True)

# iterator = data.iterrows()
  
# for _ in range(max_train):  
#     index,row = iterator.next()
#     word_list = row['Phrase_tokens']
#     if len(word_list)==1:
#         sentiment_dict[word_list[0].lower()] = data['Sentiment'][index]

# data = pd.DataFrame(data[data['Phrase_tokens'].str.len() >0])
# data = data.reset_index(drop=True)

In [15]:
#count each term appearance for TF-IDF weights
count_dict = {}
for i,word_list in enumerate(data['Phrase_tokens'][:max_train]):
    for word in word_list:
        word = word.lower()
        if count_dict.has_key(word):
           count_dict[word] = count_dict[word] + 1
        else:
            count_dict[word] = 1
            

In [16]:
num_document_train = max_train

def TF_IDF_weighting(word):
    if count_dict.has_key(word):
        return np.log(num_document_train/(count_dict[word]))
    else: return 1


In [17]:
def creat_sentiment_list(word_list):
    sent_list = []
    for word in word_list:
        word_sent  = sentiment_dict.get(word.lower(),2)
        #weighted_word_sent = word_sent *TF_IDF_weighting(word.lower())
        sent_list.append(word_sent)
    return sent_list

data['sentiment list'] = data['Phrase_tokens'].apply(creat_sentiment_list)
#data['avg sentiment'] = data['sentiment list'].apply(lambda x: sum(x)/float(len(x)))
data['max sentiment'] = data['sentiment list'].apply(lambda x: max(x))
data['min sentiment'] =  data['sentiment list'].apply(lambda x: min(x))

In [18]:
data.head()

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment,Phrase_tokens,sentiment list,max sentiment,min sentiment
0,1,1,A series of escapades demonstrating the adage ...,1,"[series, escapades, demonstrating, adage, good...","[2, 2, 2, 2, 3, 2, 2, 3, 2, 2, 3, 2, 2, 2, 2]",3,2
1,2,1,A series of escapades demonstrating the adage ...,2,"[series, escapades, demonstrating, adage, good...","[2, 2, 2, 2, 3, 2]",3,2
2,3,1,A series,2,[series],[2],2,2
3,5,1,series,2,[series],[2],2,2
4,6,1,of escapades demonstrating the adage that what...,2,"[escapades, demonstrating, adage, good, goose]","[2, 2, 2, 3, 2]",3,2


In [19]:
def weight_sentiment(word_list,sentiment_list):
    weight_list = []
    for word in word_list:
        word_TF_IDF =  TF_IDF_weighting(word.lower())
        weight_list.append(word_TF_IDF)
    weight_sum = sum(weight_list)
    return np.dot(sentiment_list,weight_list)/weight_sum

In [20]:
weighted_avg = []
for word_list,sentiment_list in zip(data['Phrase_tokens'],data['sentiment list']):
    result = weight_sentiment(word_list,sentiment_list)
    weighted_avg.append(result)
data['avg sentiment']=  pd.Series(weighted_avg)


In [21]:
data.head()

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment,Phrase_tokens,sentiment list,max sentiment,min sentiment,avg sentiment
0,1,1,A series of escapades demonstrating the adage ...,1,"[series, escapades, demonstrating, adage, good...","[2, 2, 2, 2, 3, 2, 2, 3, 2, 2, 3, 2, 2, 2, 2]",3,2,2.172159
1,2,1,A series of escapades demonstrating the adage ...,2,"[series, escapades, demonstrating, adage, good...","[2, 2, 2, 2, 3, 2]",3,2,2.092056
2,3,1,A series,2,[series],[2],2,2,2.0
3,5,1,series,2,[series],[2],2,2,2.0
4,6,1,of escapades demonstrating the adage that what...,2,"[escapades, demonstrating, adage, good, goose]","[2, 2, 2, 3, 2]",3,2,2.105791


#### binary indicators of positivity/negativity 

explain about sentiwordnet and thresholding

In [22]:
thresh = 0.8
pos = [] #list of positive words
neg = [] # list of negative words
with open('SentiWordNet.txt','r') as f:
    for line in f:
        if line[0]=='#' or line[0].isspace() : continue #skip comment rows in database  
        e = line.split('\t')
        if float(e[2])>=thresh: pos.append(e[4].split('#')[0])
        elif float(e[3])>=thresh: neg.append(e[4].split('#')[0])

In [23]:
for senti_word in pos+neg:
    
    def check_if_word_exist(word_list):
        lower_pharse = [x.lower() for x in word_list]
        return senti_word in lower_pharse
    
    data['contain_'+senti_word] = data['Phrase_tokens'].apply(lambda word_list: check_if_word_exist(word_list))


In [24]:
data.head()

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment,Phrase_tokens,sentiment list,max sentiment,min sentiment,avg sentiment,contain_veracious,...,contain_resent,contain_pound,contain_mislead,contain_desensitize,contain_burn,contain_twinge,contain_stink,contain_smell,contain_trouble,contain_humbug
0,1,1,A series of escapades demonstrating the adage ...,1,"[series, escapades, demonstrating, adage, good...","[2, 2, 2, 2, 3, 2, 2, 3, 2, 2, 3, 2, 2, 2, 2]",3,2,2.172159,False,...,False,False,False,False,False,False,False,False,False,False
1,2,1,A series of escapades demonstrating the adage ...,2,"[series, escapades, demonstrating, adage, good...","[2, 2, 2, 2, 3, 2]",3,2,2.092056,False,...,False,False,False,False,False,False,False,False,False,False
2,3,1,A series,2,[series],[2],2,2,2.0,False,...,False,False,False,False,False,False,False,False,False,False
3,5,1,series,2,[series],[2],2,2,2.0,False,...,False,False,False,False,False,False,False,False,False,False
4,6,1,of escapades demonstrating the adage that what...,2,"[escapades, demonstrating, adage, good, goose]","[2, 2, 2, 3, 2]",3,2,2.105791,False,...,False,False,False,False,False,False,False,False,False,False


### check results of classifier

In [25]:
# train on ~90% of the data and check on the rest

In [26]:
from sklearn.preprocessing import normalize
from sklearn.multiclass import OneVsRestClassifier
from sklearn.ensemble import RandomForestClassifier


In [27]:
# select only features columms. split to train and test datasets
contain_list = [column for column in data.columns.values if column.startswith('contain') ]
feature_list = ['avg sentiment', 'max sentiment', 'min sentiment'] + contain_list
features = data[feature_list]
features = normalize(features)
x_train,y_train = features[:max_train],data['Sentiment'][:max_train]
x_test,y_test = features[max_train:],data['Sentiment'][max_train:]

In [28]:
model = OneVsRestClassifier(RandomForestClassifier(n_jobs=3))
model.fit(x_train,y_train)
predict = model.predict(x_test)

In [29]:
print classification_report(y_test,predict)

             precision    recall  f1-score   support

          0       0.16      0.11      0.13       606
          1       0.31      0.18      0.23      2442
          2       0.58      0.77      0.66      5882
          3       0.32      0.24      0.28      2503
          4       0.16      0.15      0.15       719

avg / total       0.43      0.47      0.44     12152



# bag of words

In [None]:
data.shape[0]



In [None]:
#creating vocabulary
NUM_OF_PHARSE = data.shape[0]
vocab = []
for i in range(NUM_OF_PHARSE):
    vocab.extend(data['Phrase'].iloc[i])
vocab = list(set(vocab))


In [None]:
len(vocab)

In [None]:
data['Bag_of_Words']=pd.Series(0)
data.head()

In [None]:
bag_of_words = np.zeros((NUM_OF_PHARSE,len(vocab)))
for i in range(NUM_OF_PHARSE):
    for word in data['Phrase'][i]:
        indx = vocab.index(word)
        bag_of_words[i][indx] +=1

In [None]:
print "Training the random forest..."
from sklearn.ensemble import RandomForestClassifier

# Initialize a Random Forest classifier with 100 trees
forest = RandomForestClassifier(n_estimators = 5) 

# Fit the forest to the training set, using the bag of words as 
# features and the sentiment labels as the response variable
#
# This may take a few minutes to run
forest = forest.fit( bag_of_words, data["Sentiment"] )