### Handlabeling Negative-Positive Evaluator


This is a manual (hand-labeling) approach; we compare its success with the dictionary approach:

** aJavaher: This is to test the validity of the dictionary approach.


> Our model first treats the text as a bag of words. Here a word is a combination of letters and digits separated by one of: {SPACE ! , . / : ; ? " ' # - _ + }, and each word can carry a positive/neutral/negative sentiment.


> A negative/neutral/positive vote is decided based on the percentage of readers who share the most common view. If the readers' votes differ, the effect of the vote is reduced (i.e. it would be a weak negative, for example).

> For simplicity, we didn't include confidence measure of the words.


In [1]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import time
import pandas as pd
import numpy as np
import re
import math
from hatesonar import Sonar

In [2]:
# Score every advert text with the VADER dictionary analyzer and derive a
# coarse positive/neutral/negative flag from the compound score.
analyzer = SentimentIntensityAnalyzer()
# reading the dataset (tab-separated)
ActualData = pd.read_csv('../Data/data_IRA_Ads.csv', sep='\t')
# dictionary analyzer: one column per VADER score, NaN until a row is scored
ActualData['pos'] = np.nan
ActualData['neg'] = np.nan
ActualData['neu'] = np.nan
ActualData['compound'] = np.nan
# 'flag' holds strings, so create it with object dtype; assigning strings into
# a float64 column is deprecated in recent pandas releases.
ActualData['flag'] = pd.Series(np.nan, index=ActualData.index, dtype=object)

# Series.items() replaces Series.iteritems(), which was removed in pandas 2.0.
for index, eachAdText in ActualData['AD_TEXT'].items():
    # skip empty texts; their score and flag columns stay NaN
    if pd.isnull(eachAdText):
        continue

    # calculate the senti-index for each advert text
    tempRespond = analyzer.polarity_scores(eachAdText)

    ActualData.loc[index, 'pos'] = tempRespond['pos']
    ActualData.loc[index, 'neg'] = tempRespond['neg']
    ActualData.loc[index, 'neu'] = tempRespond['neu']
    ActualData.loc[index, 'compound'] = tempRespond['compound']

    # make the flags
    # NOTE(review): VADER's README suggests +/-0.05 as the usual cut-offs;
    # +/-0.5 is kept here unchanged -- confirm this is intended.
    compound = tempRespond['compound']
    if 0.5 <= compound <= 1:
        ActualData.loc[index, 'flag'] = 'positive'
    elif -0.5 < compound < 0.5:
        ActualData.loc[index, 'flag'] = 'neutral'
    else:
        ActualData.loc[index, 'flag'] = 'negative'

A: Word model: Decision about positive/negativeness based on words (word by word measure)

In [3]:
def mostCommon(li):
    """Return the element that occurs most often in ``li``.

    Elements must be hashable. When several elements share the highest
    count, the one that appears first in ``li`` is returned, so the result
    is deterministic.

    Raises:
        ValueError: if ``li`` is empty (there is no most common element).
    """
    from collections import Counter

    # Counter tallies in a single pass instead of one li.count() per value.
    counts = Counter(li)
    if not counts:
        raise ValueError('mostCommon() requires a non-empty sequence')
    # Counter keeps first-seen order and max() returns the first maximum,
    # which gives the first-seen tie-break described above.
    return max(counts, key=counts.__getitem__)

In [4]:
# Read the hand-labeling CSV and reduce each row's individual votes to one
# majority vote plus the share of voters who agreed with it.
data = pd.read_csv('../Data/dataset_handlabeling.csv', sep=',')
# list of voters: every column except the advert text itself
voters = list(data.columns.values)
voters.remove('AD_TEXT')
# 'vote' holds strings, so create it with object dtype; assigning strings into
# a float64 column is deprecated in recent pandas releases.
data['vote'] = pd.Series(np.nan, index=data.index, dtype=object)
data['percentage'] = np.nan

# analyze row by row
for index, row in data.iterrows():
    # Lower-case the votes so they compare case-insensitively. Cells a voter
    # left empty are skipped instead of crashing on NaN.lower().
    decisions = [row[voter].lower() for voter in voters
                 if not pd.isnull(row[voter])]
    if not decisions:
        # nobody voted on this row: leave 'vote' and 'percentage' as NaN
        continue
    convergenceValue = mostCommon(decisions)
    # fraction of the cast votes that match the majority vote
    convergencePecentage = decisions.count(convergenceValue) / len(decisions)
    data.loc[index, 'vote'] = convergenceValue
    data.loc[index, 'percentage'] = convergencePecentage
    

In [5]:
# make bag of words vote
# For every word seen in the hand-labelled texts, count how often it occurs
# under each majority vote and remember the label with the highest count so far
# under 'common'.
# Raw string: the former non-raw pattern relied on invalid escape sequences
# (\*, \., \#, \?), which emit warnings on current Python. The set of
# separators matched is unchanged.
wordSeparators = re.compile(r'\t| |;|,|\*|\n|!|\.|:|,|/|"|~|\#|\?')
bagOfWords = {}
for index, row in data.iterrows():
    text = row['AD_TEXT'].lower()
    words = wordSeparators.split(text)
    key = row['vote']
    for word in words:
        # consecutive separators produce empty strings; ignore them
        if '' == word: continue
        if word in bagOfWords:
            bagOfWords[word][key] = bagOfWords[word][key] + 1
            # strict '>' means a tie keeps the label that got there first
            if (bagOfWords[word][key] > bagOfWords[word][bagOfWords[word]['common']]):
                bagOfWords[word]['common'] = key
        else:
            bagOfWords[word] = {'negative':0, 'positive': 0, 'neutral': 0, 'common': key}
            bagOfWords[word][key] = 1

In [6]:
# give a vote for the whole sentence:
# NOTE(review): this cell repeats the word-counting loop of the previous cell
# line for line. It does not compute a per-sentence vote; when run after that
# cell it adds every word's counts to bagOfWords a second time. Confirm whether
# a different computation was intended here. Behaviour is kept as-is.
# Raw string: the former non-raw pattern relied on invalid escape sequences
# (\*, \., \#, \?), which emit warnings on current Python. The set of
# separators matched is unchanged.
sentenceSeparators = re.compile(r'\t| |;|,|\*|\n|!|\.|:|,|/|"|~|\#|\?')
for index, row in data.iterrows():
    text = row['AD_TEXT'].lower()
    words = sentenceSeparators.split(text)
    key = row['vote']
    for word in words:
        # consecutive separators produce empty strings; ignore them
        if '' == word: continue
        if word in bagOfWords:
            bagOfWords[word][key] = bagOfWords[word][key] + 1
            # strict '>' means a tie keeps the current 'common' label
            if (bagOfWords[word][key] > bagOfWords[word][bagOfWords[word]['common']]):
                bagOfWords[word]['common'] = key
        else:
            bagOfWords[word] = {'negative':0, 'positive': 0, 'neutral': 0, 'common': key}
            bagOfWords[word][key] = 1

In [7]:
# evaluate result of word model
# Each known word in an advert casts its 'common' label as one vote; the label
# with the most votes becomes the advert's 'handVote'. Adverts with no known
# words keep the default 'neutral'.
# Raw string: the former non-raw pattern relied on invalid escape sequences
# (\*, \., \#, \?), which emit warnings on current Python. The set of
# separators matched is unchanged.
advertSeparators = re.compile(r'\t| |;|,|\*|\n|!|\.|:|,|/|"|~|\#|\?')
for index, row in ActualData.iterrows():
    # rows without text get no handVote
    if pd.isnull(row['AD_TEXT']):
        continue
    text = row['AD_TEXT'].lower()
    words = advertSeparators.split(text)
    textVote = {'negative':0, 'positive': 0, 'neutral': 0, 'common': 'neutral'}
    for word in words:
        if word in bagOfWords:
            myVote = bagOfWords[word]['common']
            textVote[myVote] = textVote[myVote] + 1
            # strict '>' means a tie keeps the current leading label
            if (textVote[myVote] > textVote[textVote['common']] ):
                textVote['common'] = myVote
    ActualData.loc[index, 'handVote'] = textVote['common']
                

In [8]:
# how much do they agree
# Only adverts that actually have text take part in the comparison.
advertsWithText = ActualData[ActualData['AD_TEXT'].notnull()]
# total: number of adverts with text
total = len(advertsWithText)
# partion: adverts whose dictionary flag equals the hand-label based vote
partion = int((advertsWithText['flag'] == advertsWithText['handVote']).sum())

In [9]:
# Percentage of compared adverts on which both approaches agree; stays 0 when
# nothing was compared, which also avoids dividing by zero.
agreeing = partion / total * 100 if 0 != total else 0

print ('With 118 known adverts out of ', total)
print ('Similar Votes:' , partion , ' out of ' , total)
print ('Percentage of Agreeing' , agreeing , '%')


With 118 known adverts out of  3487
Similar Votes: 1473  out of  3487
Percentage of Agreeing 42.242615428735306 %
