# Sentiment Analysis using vaderSentiment

### Importing modules

In [7]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer # Used to analyse the sentiment of sentences - this function is based on a database of words with positive/neutral/negative sentiment as rated by sample groups
import glob # Used for importing files matching a specific pattern
from pathlib import Path #Used to interact more easily with the file system
import nltk # Used for tokenizing texts into sentences
nltk.download('punkt') # The specific module needed for this sentence tokenizing
import pandas as pd # Used to shape data as tables and import that data into Excel
from statistics import mean # Used to calculate a mean without having to do the maths manually

sentimentAnalyser = SentimentIntensityAnalyzer() # Creates one analyser object (under a British-spelling name) so the same instance can be reused for every sentence instead of building a new one each time

[nltk_data] Downloading package punkt to C:\Users\Tia
[nltk_data]     Work\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


### Creating the sentiment analysis function

In [13]:
# Creating a function to do the sentiment analysis for each 5% of the text
def find_sentiment(filepath,title):
    """Average the sentence-level sentiment of a text file over twenty consecutive 5% slices.

    The file is read as UTF-8, split into sentences with NLTK, and the sentences
    are divided into twenty consecutive slices. Each sentence's VADER 'compound'
    score is averaged within its slice.

    Parameters:
        filepath: Location of the .txt file to analyse.
        title: Name of the document; stored (as a string) under the 'text' key.

    Returns:
        A dictionary with the key 'text' (the title) followed by the integer keys
        5, 10, ..., 100, each holding the mean compound score (a float) of the
        sentences in that slice. A slice that contains no sentences scores 0.0.
    """
    with open(filepath, encoding='utf8') as f: # Opens the .txt file as variable f
        text = f.read() # Reads the whole file into a string; the file is closed when the 'with' block ends
    text = text.replace('\n', ' ') # Replaces new-line symbols with spaces
    text_sentences = nltk.sent_tokenize(text) # Splits the text up into a list of sentences using a NLTK (Natural Language Toolkit) method

    no_sentences = len(text_sentences) # Finds the number of sentences by getting the length of the list of sentences
    no_sections = 20 # Twenty sections of five percent each

    section_scores = {'text': f'{title}'} # Starts the results dictionary with the document's name as its first key/value pair

    for i in range(1, no_sections + 1): # For each five percent...
        # The slice edges are worked out proportionally from the total number of sentences
        # rather than from a rounded "sentences per section" figure. Rounding made the
        # sections either stop short of the end of the text (dropping the last sentences)
        # or run past it (leaving the final sections empty and scored as zero).
        start_pos = (i - 1) * no_sentences // no_sections # First sentence of this section
        end_pos = i * no_sentences // no_sections # One past the last sentence of this section; equals no_sentences for the final section

        total_scores = [ # The overall ('compound') score of every sentence in this section
            sentimentAnalyser.polarity_scores(sentence)['compound']
            for sentence in text_sentences[start_pos:end_pos]
        ]

        if total_scores: # If there are sentences in this section...
            scores_count = mean(total_scores) # This works out the average of their scores
        else: # If there aren't (only possible when the text has fewer than twenty sentences)...
            scores_count = 0 # This sets the overall score for that section to zero

        section_scores[i * 100 // no_sections] = float(scores_count) # Key is the percentage reached (5, 10, ..., 100); value is the average sentiment

    return section_scores # Returns the dictionary of a document's sentiment scores

### Processing documents

In [14]:
# Setting up the directory
directory_path = 'resources/doyle_holmes_sep' # Sets the directory path to the separated Holmes corpus
document_files = glob.glob(f'{directory_path}/*.txt') # Creates a list with the path of each .txt file (document) in the directory
document_files = [filepath.replace('\\', '/') for filepath in document_files] # Replaces Windows backslashes with forward slashes so the paths are written the same way on every operating system
document_files = sorted(document_files) # glob returns files in no guaranteed order, so this sorts them to keep the rows of the results in the same order on every run
document_titles = [Path(document).stem for document in document_files] # Creates a list of document titles by taking the file list and removing the folder and the trailing '.txt'

# Running the find_sentiment function on all documents and recording the results
percentage_data = [] # Creates an empty list which will be used to hold one dictionary of sentiment data per document
for document_filepath, document_title in zip(document_files, document_titles): # For every filepath and its associated document title...
    percentage_data.append(find_sentiment(document_filepath,document_title)) # This applies the find_sentiment function on the document and adds the returned data to the list of all the sentiment data
    print(document_title) # This prints the document title when it's done doing the above

1887_12_Study_in_Scarlet
1890_02_Sign_of_the_Four
1891_06_Scandal_in_Bohemia
1891_08_Red-Headed_League
1891_09_Case_of_Identity
1891_10_Boscome_Valley_Mystery
1891_11_Five_Orange_Pips
1891_12_Man_with_the_Twisted_Lip
1892_01_Blue_Carbuncle
1892_02_Speckled_Band
1892_03_Engineers_Thumb
1892_04_Noble_Bachelor
1892_05_Beryl_Coronet
1892_06_Copper_Beeches
1892_12_Silver_Blaze
1893_01_Cardboard_Box
1893_02_Yellow_Face
1893_03_Stockbrokers_Clerk
1893_04_Gloria_Scott
1893_05_Musgrave_Ritual
1893_06_Reigate_Squire
1893_07_Crooked_Man
1893_08_Resident_Patient
1893_09_Greek_Interpreter
1893_11_Naval_Treaty
1893_12_The_Final_Problem
1902_03_Hound_of_the_Baskervilles
1903_10_Empty_House
1903_11_Norwood_Builder
1903_12_Dancing_Men
1904_01_Solitary_Cyclist
1904_02_Priory_School
1904_03_Black_Peter
1904_04_Charles_Augustus_Milverton
1904_05_Six_Napoleons
1904_06_Three_Students
1904_07_Golden_Pince-Nez
1904_08_Missing_Three-Quarter
1904_09_Abbey_Grange
1904_12_Second_Stain
1908_08_Wisteria_Lodge
1908_

### Writing this data to Excel

In [18]:
# Installs xlsxwriter so it can be used
!pip install xlsxwriter
import xlsxwriter # Used as an engine for writing to Excel .xlsx files

Collecting xlsxwriter
  Obtaining dependency information for xlsxwriter from https://files.pythonhosted.org/packages/f7/3e/05ba2194cd5073602422859c949a4f21310a3c49bf8dccde9e03d4522b11/XlsxWriter-3.1.9-py3-none-any.whl.metadata
  Downloading XlsxWriter-3.1.9-py3-none-any.whl.metadata (2.6 kB)
Downloading XlsxWriter-3.1.9-py3-none-any.whl (154 kB)
   ---------------------------------------- 0.0/154.8 kB ? eta -:--:--
   -- ------------------------------------- 10.2/154.8 kB ? eta -:--:--
   --------------- ----------------------- 61.4/154.8 kB 812.7 kB/s eta 0:00:01
   ---------------------------------------- 154.8/154.8 kB 1.5 MB/s eta 0:00:00
Installing collected packages: xlsxwriter
Successfully installed xlsxwriter-3.1.9



[notice] A new release of pip is available: 23.2.1 -> 23.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [19]:
# Turns the collected sentiment dictionaries into a table and saves that table as an Excel workbook, ready to be used for visualisations
percentage_df = pd.DataFrame(percentage_data) # One row per document: a 'text' column for the title plus one column per five-percent section
with pd.ExcelWriter('holmes_sentiment_analysis.xlsx', engine='xlsxwriter') as writer: # Opens the workbook for writing; it is saved and closed when this block ends
    percentage_df.to_excel(excel_writer=writer, sheet_name='sentiment_analysis') # Puts the whole table on a single sheet