In [6]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

import os
from nltk.tokenize import sent_tokenize
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


First, we create a dictionary that maps every word in the VADER lexicon (positive and negative words alike) to its sentiment score.

In [29]:
 def make_lex_dict(lexicon_file):
     """
     Convert the contents of a lexicon file to a dictionary.

     Parameters
     ----------
     lexicon_file : str
         The full text of the lexicon (not a path). Every non-blank line
         holds tab-separated fields: the first is the word and the second
         its score. Any further fields are ignored.

     Returns
     -------
     dict
         Maps each word to its score as a float. If a word appears more
         than once, the last occurrence wins.

     Raises
     ------
     ValueError
         If a non-blank line has fewer than two tab-separated fields or
         its score cannot be parsed as a float.
     """
     lex_dict = {}
     for line in lexicon_file.split('\n'):
         entry = line.strip()
         # A file that ends with a newline (or contains blank lines) yields
         # empty entries; skip them instead of failing on the unpacking below.
         if not entry:
             continue
         word, measure = entry.split('\t')[0:2]
         lex_dict[word] = float(measure)
     return lex_dict
    
# Read the lexicon inside a context manager so the file handle is closed as
# soon as its text has been loaded, rather than left open for the kernel's life.
with open('src/vader_lexicon.txt', 'r') as lexicon_fh:
    sent_dict = make_lex_dict(lexicon_fh.read())


Sentiment analysis. The analysis is performed on each line of a book's text file (each line is scored as one sentence), and the resulting scores are kept in lists. The sentiment scores of a book are then calculated by averaging the scores over all of its lines.

In [25]:
def return_sentiment_scores(sentence):
    """
    Score a piece of text with the module-level ``analyser``.

    Returns whatever ``analyser.polarity_scores(sentence)`` returns; callers
    in this notebook read its 'neg', 'neu', 'pos' and 'compound' entries.
    """
    return analyser.polarity_scores(sentence)

def sentiment_analysis(directory, csv_path='data/output/tard.csv'):
    """
    Add average sentiment scores for every book to the feature CSV.

    Every ``.txt`` file in ``directory`` is scored line by line with
    ``return_sentiment_scores``; the per-line 'neg', 'pos', 'neu' and
    'compound' values are averaged per file and written to the columns
    'neg score', 'pos score', 'neu score' and 'comp score' of the CSV.

    Parameters
    ----------
    directory : str
        Directory containing the book text files.
    csv_path : str, optional
        CSV file that is read, extended with the score columns and written
        back in place. Its first column is used as the index and must hold
        the book filenames (e.g. '9915.txt').

    Raises
    ------
    ValueError
        If a filename in the CSV index has no matching ``.txt`` file in
        ``directory``.

    Notes
    -----
    Scores are matched to rows by filename. ``os.listdir`` returns entries
    in arbitrary order, so assigning by position could attach a book's
    scores to the wrong row. A file without any lines gets NaN scores.
    """
    data = pd.read_csv(csv_path, index_col=0)
    print(len(data.index))
    print(data.index)

    score_keys = ('neg', 'pos', 'neu', 'compound')
    # filename -> {score key -> average of that score over the file's lines}
    averages = {}

    # for every book
    for filename in os.listdir(directory):
        # only text files are books
        if not filename.endswith(".txt"):
            continue

        totals = dict.fromkeys(score_keys, 0.0)
        n_lines = 0
        # the context manager closes the file once all lines are scored
        with open(os.path.join(directory, filename), 'r', errors='replace') as text:
            for line in text:
                scores = return_sentiment_scores(line)
                for key in score_keys:
                    totals[key] += scores[key]
                n_lines += 1

        # an empty file has nothing to average; record NaN instead of
        # dividing by zero
        averages[filename] = {
            key: (totals[key] / n_lines if n_lines else float('nan'))
            for key in score_keys
        }

    n_scored = len(averages)
    print(n_scored, n_scored, n_scored, n_scored)

    missing = [name for name in data.index if name not in averages]
    if missing:
        raise ValueError(
            "no .txt file in %r for CSV rows: %r" % (directory, missing)
        )

    # fill the right columns with the right data, row by row via the filename
    column_for_key = (
        ('neg', 'neg score'),
        ('pos', 'pos score'),
        ('neu', 'neu score'),
        ('compound', 'comp score'),
    )
    for key, column in column_for_key:
        data[column] = [averages[name][key] for name in data.index]

    data.to_csv(csv_path)
    
# Module-level analyser that return_sentiment_scores looks up when called.
analyser = SentimentIntensityAnalyzer()

# Score every book in the test directory and update the feature CSV.
sentiment_analysis(directory='data/test/')

6
Index(['9915.txt', '9914.txt', '9916.txt', '9915-8.txt', '9917-8.txt',
       '9916-8.txt'],
      dtype='object', name='filename')
6 6 6 6


We also want to count the number of positive and negative words in each book and use these counts as features.

In [36]:
def count_sentiment_words(directory, csv_path='data/output/tard.csv'):
    """
    Add per-book counts of positive and negative words to the feature CSV.

    Every ``.txt`` file in ``directory`` is split into whitespace-separated
    words. A word found in the module-level ``sent_dict`` counts as positive
    when its score is >= 0 and as negative otherwise. The totals are written
    to the columns 'amt pos' and 'amt neg' of the CSV.

    Parameters
    ----------
    directory : str
        Directory containing the book text files.
    csv_path : str, optional
        CSV file that is read, extended with the count columns and written
        back in place. Its first column is used as the index and must hold
        the book filenames (e.g. '9915.txt').

    Raises
    ------
    ValueError
        If a filename in the CSV index has no matching ``.txt`` file in
        ``directory``.

    Notes
    -----
    Counts are matched to rows by filename. ``os.listdir`` returns entries
    in arbitrary order, so assigning by position could attach a book's
    counts to the wrong row. Words are looked up exactly as they appear
    in the text (no case folding or punctuation stripping).
    """
    data = pd.read_csv(csv_path, index_col=0)

    # filename -> (number of positive words, number of negative words)
    counts = {}

    for filename in os.listdir(directory):
        if not filename.endswith(".txt"):
            continue

        pos_count = 0
        neg_count = 0
        # the context manager closes the file once all lines are counted
        with open(os.path.join(directory, filename), 'r', errors='replace') as text:
            for line in text:
                # split() without an argument splits on any whitespace, so the
                # last word of a line no longer carries the trailing newline
                # (which made it miss the lexicon lookup)
                for word in line.split():
                    if word in sent_dict:
                        if sent_dict[word] >= 0:
                            pos_count += 1
                        else:
                            neg_count += 1

        counts[filename] = (pos_count, neg_count)

    missing = [name for name in data.index if name not in counts]
    if missing:
        raise ValueError(
            "no .txt file in %r for CSV rows: %r" % (directory, missing)
        )

    # fill the columns row by row via the filename
    data['amt pos'] = [counts[name][0] for name in data.index]
    data['amt neg'] = [counts[name][1] for name in data.index]

    data.to_csv(csv_path)
     
# Count lexicon words for every book in the test directory and update the CSV.
count_sentiment_words(directory='data/test/')

587 496
461 261
1841 876
587 496
2278 1168
1841 876


In [3]:
# Reload the feature table written by the cells above; the first CSV column
# becomes the index.
data = pd.read_csv(
    'data/output/tard.csv',
    index_col=0,
)
