Reference: https://github.com/chuachinhon/practical_nlp/blob/master/notebooks/1.0_speech_sentiment_cch.ipynb

In [1]:
import nltk
from nltk.corpus import PlaintextCorpusReader
import matplotlib as mpl
import pandas as pd
import numpy as np
import re
from transformers import pipeline

In [210]:
# Define the file directory
file_directory = '..\\Dataset\\'

# Read a single file 
with open(file_directory + '2022_eng.txt', 'r', encoding="utf8") as file_to_read:
    df = file_to_read.read()

**Issues**
- Hugging Face pipeline can only deal with max 512 tokens. Our paragraphs are too long and hence, need to convert to sentences for hugging face.

In [211]:
# Convert into dataframes, split by sentences
df = (pd.DataFrame(df.split("\n\n"))).rename(columns={0: "Paras"})

In [212]:
# Pre-processing
def clean_text(text):
    text = text.lower()
    text = re.sub(r"\n", " ", text) #replace "\n" with " "
    text = re.sub(r"\W", " ", text) #replaces non-word characters like ',' with " "
    text = re.sub(r"^\d+\s|\s\d+\s|\s\d+$", " ", text) #removes digits e.g. 16 years >> years
    text = text.strip(" ")
    text = re.sub(r"[^\w\s]", "", text)
    return text

df["Clean_Text"] = df['Paras'].map(lambda text: clean_text(text))

**Sentiment Analysis**

In [213]:
corpus = list(df['Clean_Text'].values)

In [214]:
nlp_sentiment = pipeline("sentiment-analysis", model= "distilbert-base-uncased-finetuned-sst-2-english" )  
#using 'sentiment-analysis'same as "text-classification" 

In [215]:
df["Sentiment"] = nlp_sentiment(corpus)

Token indices sequence length is longer than the specified maximum sequence length for this model (605 > 512). Running this sequence through the model will result in indexing errors


RuntimeError: The size of tensor a (605) must match the size of tensor b (512) at non-singleton dimension 1

In [None]:
# Extract pipeline's sentiment analysis output that consists of a label and a score into separate columns

df['Sentiment_Label'] = [x.get('label') for x in df['Sentiment']]
df['Sentiment_Score'] = [x.get('score') for x in df['Sentiment']]

### Sentiment Analysis Attempt 2
- Overcoming the issues on max length

In [148]:
# Define the file directory
file_directory = '..\\Dataset\\'

# Read a single file 
with open(file_directory + '2022_eng.txt', 'r', encoding="utf8") as file_to_read:
    df = file_to_read.read()

In [149]:
# Convert into dataframes, split by paragraphs
df = (pd.DataFrame(df.split("\n\n"))).rename(columns={0: "Paragraphs"})

#Create new column to store Paragraph Numbers 
df['ParasNo'] = df.index

In [150]:
df.head()

Unnamed: 0,Paragraphs,ParasNo
0,My fellow Singaporeans,0
1,Good evening.,1
2,COVID-19\nWe have come a long way in our fight...,2
3,"In many other countries, when a wave happens, ...",3
4,"Thus far, we have had fewer than 1,600 COVID-1...",4


In [151]:
import numpy as np
from itertools import chain

# return list from series of comma-separated strings
def chainer(s):
    return list(chain.from_iterable(s.str.split('.')))

# calculate lengths of splits using full stops as we are looking for sentences
lens = df['Paragraphs'].str.split('.').map(len)

# create new dataframe, repeating or chaining as appropriate
df = pd.DataFrame({'ParasNo': np.repeat(df['ParasNo'], lens),
                    'Sentences': chainer(df['Paragraphs'])}) 

df.reset_index(inplace=True, drop=True)

In [152]:
df

Unnamed: 0,ParasNo,Sentences
0,0,My fellow Singaporeans
1,1,Good evening
2,1,
3,2,COVID-19\nWe have come a long way in our fight...
4,2,We are now learning to live with the virus
...,...,...
628,96,"But with your trust, we can come through what..."
629,96,"With your support, we can turn hopes and drea..."
630,96,"Not just for now, not just for ourselves, but..."
631,96,


In [155]:
# Pre-processing

#Drop rows with empty strings 
df.drop(index=df[df['Sentences'] == ''].index, inplace=True)
df.reset_index(inplace=True, drop=True)

def clean_text(text):
    text = text.lower()
    text = re.sub(r"\n", " ", text) #replace "\n" with " "
    text = re.sub(r"\W", " ", text) #replaces non-word characters like ',' with " "
    text = re.sub(r"^\d+\s|\s\d+\s|\s\d+$", " ", text) #removes digits e.g. 16 years >> years
    text = text.strip(" ")
    text = re.sub(r"[^\w\s]", "", text)
    return text

df["Clean_Text"] = df['Sentences'].map(lambda text: clean_text(text))

In [157]:
df.head()

Unnamed: 0,ParasNo,Sentences,Clean_Text
0,0,My fellow Singaporeans,my fellow singaporeans
1,1,Good evening,good evening
2,2,COVID-19\nWe have come a long way in our fight...,covid we have come a long way in our fight aga...
3,2,We are now learning to live with the virus,we are now learning to live with the virus
4,2,"With each infection wave, we have managed the...",with each infection wave we have managed the ...


Sentiment Analysis

In [158]:
corpus = list(df['Clean_Text'].values)

In [159]:
nlp_sentiment = pipeline("sentiment-analysis", model = "distilbert-base-uncased-finetuned-sst-2-english")

In [160]:
%%time
df["Sentiment"] = nlp_sentiment(corpus)

Wall time: 12.8 s


In [161]:
# Extract pipeline's sentiment analysis output that consists of a label and a score into separate columns

df['Sentiment_Label'] = [x.get('label') for x in df['Sentiment']]
df['Sentiment_Score'] = [x.get('score') for x in df['Sentiment']]

In [162]:
df

Unnamed: 0,ParasNo,Sentences,Clean_Text,Sentiment,Sentiment_Label,Sentiment_Score
0,0,My fellow Singaporeans,my fellow singaporeans,"{'label': 'POSITIVE', 'score': 0.9980775117874...",POSITIVE,0.998078
1,1,Good evening,good evening,"{'label': 'POSITIVE', 'score': 0.9998613595962...",POSITIVE,0.999861
2,2,COVID-19\nWe have come a long way in our fight...,covid we have come a long way in our fight aga...,"{'label': 'POSITIVE', 'score': 0.9577266573905...",POSITIVE,0.957727
3,2,We are now learning to live with the virus,we are now learning to live with the virus,"{'label': 'POSITIVE', 'score': 0.9938626289367...",POSITIVE,0.993863
4,2,"With each infection wave, we have managed the...",with each infection wave we have managed the ...,"{'label': 'POSITIVE', 'score': 0.9479452967643...",POSITIVE,0.947945
...,...,...,...,...,...,...
537,96,I have given you my take of what we can achie...,i have given you my take of what we can achiev...,"{'label': 'POSITIVE', 'score': 0.9860721230506...",POSITIVE,0.986072
538,96,"But with your trust, we can come through what...",but with your trust we can come through whate...,"{'label': 'POSITIVE', 'score': 0.9050359129905...",POSITIVE,0.905036
539,96,"With your support, we can turn hopes and drea...",with your support we can turn hopes and dream...,"{'label': 'POSITIVE', 'score': 0.9996213912963...",POSITIVE,0.999621
540,96,"Not just for now, not just for ourselves, but...",not just for now not just for ourselves but ...,"{'label': 'POSITIVE', 'score': 0.9988539218902...",POSITIVE,0.998854


In [163]:
df['Sentiment_Label'].value_counts()

POSITIVE    328
NEGATIVE    214
Name: Sentiment_Label, dtype: int64

Not sure if the below is alright.
Each sentence has a sentiment label and score. I think that the score tells you how positive or negative it is.
E.g. Sentiment_Label = Positive and Sentiment_Score = 0.957 means it is very very positive ; 
E.g. Sentiment_Label = Negative and Sentiment_Score = 0.788761 means it is very negative.

Basically, if a paragraph has 5 sentences e.g. Paragraph 2, what I am doing is:
1. Calculate how positive each sentence is. This entails:
- Finding out which are the negative sentence 
- Then derive how positive it is
- E.g., Para 2, Sentence 5. Since it is 0.788761 negative, it is 1-0.788.. = 0.21.. positive 

2. Thereafter, I will take the average of the scores to derive how positive of negative a paragraph is.
The existing threshold used is >= 0.5 for positive and <0.5 for negative.

In [201]:
#For illustration - can be removed later on
df[df['ParasNo']==2]

Unnamed: 0,ParasNo,Sentences,Clean_Text,Sentiment,Sentiment_Label,Sentiment_Score,Positive_Score
2,2,COVID-19\nWe have come a long way in our fight...,covid we have come a long way in our fight aga...,"{'label': 'POSITIVE', 'score': 0.9577266573905...",POSITIVE,0.957727,0.957727
3,2,We are now learning to live with the virus,we are now learning to live with the virus,"{'label': 'POSITIVE', 'score': 0.9938626289367...",POSITIVE,0.993863,0.993863
4,2,"With each infection wave, we have managed the...",with each infection wave we have managed the ...,"{'label': 'POSITIVE', 'score': 0.9479452967643...",POSITIVE,0.947945,0.947945
5,2,"The latest, the Omicron BA",the latest the omicron ba,"{'label': 'POSITIVE', 'score': 0.6443324685096...",POSITIVE,0.644332,0.644332
6,2,"5 wave, is now subsiding",wave is now subsiding,"{'label': 'NEGATIVE', 'score': 0.7887611389160...",NEGATIVE,0.788761,0.211239


In [185]:
#Create new column to store positive scores
df['Positive_Score'] = df['Sentiment_Score']

#Update the 'positive scores' for rows of negative sentiment 
mask = (df['Sentiment_Label'] == 'NEGATIVE')
df.loc[mask, 'Positive_Score'] = 1 - df['Sentiment_Score']

In [192]:
#Take the average of the sentence scores and store in a new dataframe
results = df.groupby(['ParasNo']).mean('Positive_Score').drop(columns='Sentiment_Score',inplace=True)
#Assign Sentiment Label for each paragraph where sentiment is positive if score >= 0.50, and negative otherwise. 
results['Paragraph_Sentiment_Label'] = np.where(results['Positive_Score']>=0.50, "POSITIVE","NEGATIVE")

results.head()

In [199]:
results['Paragraph_Sentiment_Label'].value_counts()

POSITIVE    66
NEGATIVE    32
Name: Paragraph_Sentiment_Label, dtype: int64