In [1]:
#with this script, we use VADER to conduct a sentiment analysis of all relevant books by Virginia Woolf. This will be done once with an unedited version of all sentences and once with an edited one - meaning with all words being lemmatized and stopwords being removed

import os
import pandas as pd
import nltk
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [2]:
#the first couple of operations are identical to what we already did in the word_count-script

path_of_folder = "C:\\Users\\Jakob\\Desktop\\Werkzeuge Hausarbeit\\Virginia_Woolf_Files\\Bearbeitete_Downloads"

#maps a name derived from each file name to the full text of that file as one single-line string
all_texts = {}

#scandir is used as a context manager so that the directory handle is released even if reading a file fails
with os.scandir(path_of_folder) as file_iter:
    for f in file_iter:
        #only regular files can be opened as text; anything else in the folder is skipped
        if not f.is_file():
            continue

        #drop the first five and the last four characters of the file name
        #(presumably a fixed prefix and the file extension - confirm against the actual file names)
        f_name = f.name[5:-4]

        #the with-statement guarantees that the file is closed, also when an exception occurs
        with open(f.path, 'r', encoding='utf-8') as myfile:
            #leave out chapter headings and lines that consist of a number only
            kept_lines = [
                line for line in myfile
                if not (line.lower().startswith("chapter") or line.strip().isdigit())
            ]

        #join once instead of concatenating line by line, then turn all line breaks into spaces
        all_texts[f_name] = "".join(kept_lines).replace('\n', ' ')

In [3]:
#just like in the word_count_script, we now tokenize all texts. Here, however, there is no word-tokenization, as we will use VADER for analyzing not single words, but sentences

#for every entry of all_texts, store the list of sentences that nltk finds in its text (same keys as all_texts)
sentences_all_texts = {
    title: nltk.sent_tokenize(book_text)
    for title, book_text in all_texts.items()
}

In [4]:
#now we want to create an alternative dictionary with all sentences, where all words are lemmatized and stopwords are removed
#for a better code structure, stopword-removal and lemmatization will be implemented in a separate function

#cache for the stopword set and the lemmatizer: both are identical for every sentence,
#so they are created on first use only instead of once per call
_STOPWORD_LEMMA_CACHE = {}


def _stopword_lemma_resources():
    """Return the pair (stop_words, lemmatizer), building both on the first call only."""
    if not _STOPWORD_LEMMA_CACHE:
        #build both objects before storing them, so a failure cannot leave a half-filled cache
        stop_words = set(nltk.corpus.stopwords.words('english'))
        lemmatizer = nltk.stem.WordNetLemmatizer()
        _STOPWORD_LEMMA_CACHE.update(stop_words=stop_words, lemmatizer=lemmatizer)
    return _STOPWORD_LEMMA_CACHE['stop_words'], _STOPWORD_LEMMA_CACHE['lemmatizer']


def _wordnet_pos(tag):
    """Translate a tag from nltk.pos_tag into the pos argument passed to the lemmatizer ('' if none applies)."""
    if tag.startswith('JJ'):
        return 'a'
    if tag.startswith('NN'):
        return 'n'
    if tag.startswith('V'):
        return 'v'
    return ''


def stopword_removal_lemmatization(sentence):
    """Remove English stopwords from a sentence and lemmatize the remaining tokens.

    The sentence is word-tokenized, every token whose lowercase form is an
    English stopword is dropped, the remaining tokens are POS-tagged and
    lemmatized, and the result is joined with single spaces.

    Parameters:
        sentence: the sentence to edit, as a string.

    Returns:
        The edited sentence as a string (empty if no token is left).

    Note:
        Only tokens tagged as adjective (JJ*), noun (NN*) or verb (V*) are
        lemmatized; tokens with any other tag are kept unchanged.
    """
    stop_words, lemmatizer = _stopword_lemma_resources()

    #now we need word-tokens again; the stopword comparison is case-insensitive
    filtered_sentence = [
        w for w in nltk.word_tokenize(sentence)
        if w.lower() not in stop_words
    ]

    #the remaining tokens need to be POS-tagged first, because the lemmatizer needs to know the word class
    lemmata = []
    for tok, tag in nltk.pos_tag(filtered_sentence):
        pos = _wordnet_pos(tag)
        lemmata.append(lemmatizer.lemmatize(tok, pos) if pos else tok)

    return " ".join(lemmata)

In [5]:
#define the new dictionary and fill it by calling the stopword_removal_lemmatization-function for every sentence of every book. This takes some time

#same keys as sentences_all_texts, but every sentence has been passed through stopword_removal_lemmatization
no_stop_lemma_all_texts = {
    title: [stopword_removal_lemmatization(raw_sentence) for raw_sentence in book_sentences]
    for title, book_sentences in sentences_all_texts.items()
}

In [6]:
#now we can do the actual sentiment analysis with VADER
#define a function for creating a sentiment-dictionary for a sentence with VADER
#the analyzer is created on first use and then reused, instead of building a new one for every sentence
_VADER_ANALYZER = None


def _get_analyzer():
    """Return the shared SentimentIntensityAnalyzer, creating it on the first call."""
    global _VADER_ANALYZER
    if _VADER_ANALYZER is None:
        _VADER_ANALYZER = SentimentIntensityAnalyzer()
    return _VADER_ANALYZER


def sentiment_scores(sentence):
    """Return the VADER sentiment-dictionary for a single sentence.

    Parameters:
        sentence: the sentence to analyze, as a string.

    Returns:
        The dictionary returned by SentimentIntensityAnalyzer.polarity_scores;
        the rest of this notebook reads its keys 'neg', 'neu', 'pos' and 'compound'.
    """
    return _get_analyzer().polarity_scores(sentence)

In [7]:
#iterate over every sentence of every book and create a sentiment-dictionary out of it by calling the sentiment_scores-function
#extract different kinds of sentiment information out of the dictionary
#for every book, calculate the arithmetic mean of different sentiment-values and save all of this in a tuple which is then returned

def _book_sentiment_summary(sentences):
    """Aggregate the sentiment-dictionaries of all sentences of one book into a 7-tuple."""
    sentence_count = 0

    #number of sentences classified as negative / neutral / positive via their compound score
    sentence_neg_count = 0
    sentence_neu_count = 0
    sentence_pos_count = 0

    #running sums of the four scores, needed for the arithmetic means
    neg_sum = 0
    neu_sum = 0
    pos_sum = 0
    compound_sum = 0

    for sentence in sentences:
        sentiment_dict = sentiment_scores(sentence)
        neg_sum += sentiment_dict['neg']
        neu_sum += sentiment_dict['neu']
        pos_sum += sentiment_dict['pos']
        compound_sum += sentiment_dict['compound']

        sentence_count += 1

        #classify the sentence via its compound score (cut-offs at +0.05 and -0.05)
        if sentiment_dict['compound'] >= 0.05:
            sentence_pos_count += 1
        elif sentiment_dict['compound'] <= -0.05:
            sentence_neg_count += 1
        else:
            sentence_neu_count += 1

    #a book without any sentence has no defined percentages or means;
    #return NaN for all values instead of failing with a ZeroDivisionError
    if sentence_count == 0:
        return (float('nan'),) * 7

    #one percent of the sentences of this book
    one_percent = sentence_count/100

    return (
        sentence_neg_count/one_percent,
        sentence_neu_count/one_percent,
        sentence_pos_count/one_percent,
        neg_sum/sentence_count,
        neu_sum/sentence_count,
        pos_sum/sentence_count,
        compound_sum/sentence_count,
    )


def sentiment_analysis(some_dict):
    """Calculate summary sentiment values for every book.

    Parameters:
        some_dict: dictionary that maps a book name to the list of its sentences.

    Returns:
        A dictionary with the same keys as some_dict. Every value is the tuple
        (neg_sentences_perc, neu_sentences_perc, pos_sentences_perc,
         avg_neg, avg_neu, avg_pos, avg_comp):
        the percentage of sentences with a compound score <= -0.05, between
        -0.05 and 0.05, and >= 0.05, followed by the arithmetic means of the
        'neg', 'neu', 'pos' and 'compound' scores over all sentences.
        For a book with an empty sentence list all seven values are NaN.
    """
    return {key: _book_sentiment_summary(some_dict[key]) for key in some_dict}
        

In [8]:
#run the sentiment analysis on both sentence-dictionaries, first the unedited one, then the edited one
#this will take some time
final_sentiment_dict_unedited, final_sentiment_dict_no_stop_lemma = [
    sentiment_analysis(sentence_dict)
    for sentence_dict in (sentences_all_texts, no_stop_lemma_all_texts)
]

In [21]:
#convert the resulting dictionaries into lists of flat tuples, which makes it easier to create DataFrames out of them
def _flatten_sentiment_items(sentiment_dict):
    """Turn {name: (7 sentiment values)} into a list of flat tuples (name, value1, ..., value7)."""
    rows = []
    for name, (neg_perc, neu_perc, pos_perc, avg_neg, avg_neu, avg_pos, avg_comp) in sentiment_dict.items():
        rows.append((name, neg_perc, neu_perc, pos_perc, avg_neg, avg_neu, avg_pos, avg_comp))
    return rows

final_sentiment_tuple_unedited = _flatten_sentiment_items(final_sentiment_dict_unedited)
final_sentiment_tuple_no_stop_lemma = _flatten_sentiment_items(final_sentiment_dict_no_stop_lemma)

In [23]:
#build one DataFrame per list of tuples; both tables share the same column layout
sentiment_columns = [
    'name',
    'neg_sentences_perc',
    'neu_sentences_perc',
    'pos_sentences_perc',
    'avg_neg',
    'avg_neu',
    'avg_pos',
    'avg_comp',
]
df_unedited = pd.DataFrame(final_sentiment_tuple_unedited, columns=sentiment_columns)
df_no_stop_lemma = pd.DataFrame(final_sentiment_tuple_no_stop_lemma, columns=sentiment_columns)

In [24]:
#check if the DataFrame-setup worked by displaying the table that was built from the unedited sentences
df_unedited

Unnamed: 0,name,neg_sentences_perc,neu_sentences_perc,pos_sentences_perc,avg_neg,avg_neu,avg_pos,avg_comp
0,The Voyage Out,23.964455,40.318188,35.717357,0.05675,0.862865,0.080377,0.077669
1,Night and Day,26.112366,33.550272,40.337362,0.062205,0.850129,0.087663,0.094508
2,Monday or Tuesday,21.792619,47.275923,30.931459,0.06949,0.838007,0.09251,0.053961
3,Jacob's Room,21.629543,50.293083,28.077374,0.058438,0.870877,0.070682,0.043306
4,Mrs Dalloway,22.769064,45.078421,32.152515,0.071012,0.83651,0.092484,0.072555
5,To the Lighthouse,24.189881,42.95054,32.859579,0.062536,0.855915,0.08155,0.062279
6,Orlando,25.871667,38.265456,35.862877,0.063321,0.85326,0.083415,0.071983
7,The Waves,24.701349,48.728324,26.570328,0.06834,0.865676,0.065984,0.016515
8,Flush,29.468822,47.066975,23.464203,0.076055,0.865644,0.058298,-0.008477
9,The Years,15.456293,64.317409,20.226298,0.046482,0.892418,0.061101,0.034301


In [25]:
#display the table that was built from the sentences with stopwords removed and words lemmatized
df_no_stop_lemma

Unnamed: 0,name,neg_sentences_perc,neu_sentences_perc,pos_sentences_perc,avg_neg,avg_neu,avg_pos,avg_comp
0,The Voyage Out,22.502508,36.491329,41.006163,0.073472,0.803337,0.12319,0.112337
1,Night and Day,23.748174,31.45172,44.800106,0.078337,0.787602,0.134059,0.13233
2,Monday or Tuesday,20.56239,45.6942,33.743409,0.073873,0.814946,0.111181,0.07713
3,Jacob's Room,20.867526,49.560375,29.572098,0.067657,0.843924,0.088424,0.055916
4,Mrs Dalloway,20.876149,44.835046,34.288805,0.082544,0.79705,0.120402,0.091306
5,To the Lighthouse,21.461057,42.382035,36.156907,0.078344,0.803622,0.11803,0.088968
6,Orlando,23.469089,37.943159,38.587753,0.079297,0.801791,0.118919,0.100176
7,The Waves,22.273603,49.691715,28.034682,0.079472,0.826746,0.093781,0.036466
8,Flush,27.852194,47.713626,24.43418,0.09344,0.824173,0.082384,0.007876
9,The Years,14.701967,63.29685,22.001183,0.053322,0.869668,0.077009,0.044137


In [26]:
#export each DataFrame to a csv-file to be able to work with them later
#the folder is written as a normal string with escaped backslashes (like path_of_folder above);
#combining the r-prefix with doubled backslashes would put literal double backslashes into the path
results_folder = "C:\\Users\\Jakob\\Desktop\\Werkzeuge Hausarbeit\\Ergebnisse"
df_unedited.to_csv(os.path.join(results_folder, 'sentiment_analysis_unedited.csv'), index=False)
df_no_stop_lemma.to_csv(os.path.join(results_folder, 'sentiment_analysis_no_stop_lemma.csv'), index=False)