Reference: https://github.com/chuachinhon/practical_nlp/blob/master/notebooks/1.0_speech_sentiment_cch.ipynb

In [23]:
import nltk
from nltk.corpus import PlaintextCorpusReader
import matplotlib as mpl
import pandas as pd
import numpy as np
import re
from transformers import pipeline

In [72]:
# Define the file directory
file_directory = '..\\Dataset\\'

# Read a single file 
with open(file_directory + '1982.txt', 'r') as file_to_read:
    yr1982 = file_to_read.read()

In [73]:
# Convert into dataframes, split by paragraphs
yr1982 = (pd.DataFrame(yr1982.split("\n"))).rename(columns={0: "Paras"})

In [74]:
yr1982

Unnamed: 0,Paras
0,"16 years, every year it has been a pleasure to..."
1,"Because we have worked hard, but this 17th yea..."
2,"although we have done well, we are not going t..."
3,"Maybe only half as well. And worst, if a reces..."
4,if it continues to be sluggish and a recovery ...
...,...
449,So we decided let's keep the communists out of...
450,"And on that note, I wish you all, not perhaps ..."
451,Maybe next year will be worse. But the opportu...
452,"Those who see the opportunities, seize them. R..."


In [75]:
# Pre-processing
def clean_text(text):
    text = text.lower()
    text = re.sub(r"\n", " ", text) #replace "\n" with " "
    text = re.sub(r"\W", " ", text) #replaces non-word characters like ',' with " "
    text = re.sub(r"^\d+\s|\s\d+\s|\s\d+$", " ", text) #removes digits e.g. 16 years >> years
    text = text.strip(" ")
    text = re.sub(r"[^\w\s]", "", text)
    return text

yr1982["Clean_Text"] = yr1982['Paras'].map(lambda text: clean_text(text))

In [76]:
yr1982

Unnamed: 0,Paras,Clean_Text
0,"16 years, every year it has been a pleasure to...",years every year it has been a pleasure to an...
1,"Because we have worked hard, but this 17th yea...",because we have worked hard but this 17th yea...
2,"although we have done well, we are not going t...",although we have done well we are not going t...
3,"Maybe only half as well. And worst, if a reces...",maybe only half as well and worst if a reces...
4,if it continues to be sluggish and a recovery ...,if it continues to be sluggish and a recovery ...
...,...,...
449,So we decided let's keep the communists out of...,so we decided let s keep the communists out of...
450,"And on that note, I wish you all, not perhaps ...",and on that note i wish you all not perhaps ...
451,Maybe next year will be worse. But the opportu...,maybe next year will be worse but the opportu...
452,"Those who see the opportunities, seize them. R...",those who see the opportunities seize them r...


Sentiment Analysis

In [77]:
corpus = list(yr1982['Clean_Text'].values)

In [78]:
nlp_sentiment = pipeline(
    "sentiment-analysis"
)

No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


In [79]:
%%time
yr1982["Sentiment"] = nlp_sentiment(corpus)

Wall time: 11.1 s


In [80]:
# The pipeline's sentiment analysis output consists of a label and a score
# I prefer to extract them into separate columns

yr1982['Sentiment_Label'] = [x.get('label') for x in yr1982['Sentiment']]

yr1982['Sentiment_Score'] = [x.get('score') for x in yr1982['Sentiment']]

In [82]:
yr1982

Unnamed: 0,Paras,Clean_Text,Sentiment,Sentiment_Label,Sentiment_Score
0,"16 years, every year it has been a pleasure to...",years every year it has been a pleasure to an...,"{'label': 'POSITIVE', 'score': 0.9998446702957...",POSITIVE,0.999845
1,"Because we have worked hard, but this 17th yea...",because we have worked hard but this 17th yea...,"{'label': 'NEGATIVE', 'score': 0.9633998870849...",NEGATIVE,0.963400
2,"although we have done well, we are not going t...",although we have done well we are not going t...,"{'label': 'NEGATIVE', 'score': 0.9974650144577...",NEGATIVE,0.997465
3,"Maybe only half as well. And worst, if a reces...",maybe only half as well and worst if a reces...,"{'label': 'NEGATIVE', 'score': 0.99783855676651}",NEGATIVE,0.997839
4,if it continues to be sluggish and a recovery ...,if it continues to be sluggish and a recovery ...,"{'label': 'NEGATIVE', 'score': 0.9985513091087...",NEGATIVE,0.998551
...,...,...,...,...,...
449,So we decided let's keep the communists out of...,so we decided let s keep the communists out of...,"{'label': 'POSITIVE', 'score': 0.9985767602920...",POSITIVE,0.998577
450,"And on that note, I wish you all, not perhaps ...",and on that note i wish you all not perhaps ...,"{'label': 'POSITIVE', 'score': 0.8563585877418...",POSITIVE,0.856359
451,Maybe next year will be worse. But the opportu...,maybe next year will be worse but the opportu...,"{'label': 'POSITIVE', 'score': 0.9923906922340...",POSITIVE,0.992391
452,"Those who see the opportunities, seize them. R...",those who see the opportunities seize them r...,"{'label': 'POSITIVE', 'score': 0.9997991919517...",POSITIVE,0.999799


In [84]:
yr1982['Sentiment_Label'].value_counts()

NEGATIVE    287
POSITIVE    167
Name: Sentiment_Label, dtype: int64

**Issues**
- Hugging Face pipeline can only deal with max 512 tokens. Our paragraphs are too long and hence, need to convert to sentences for hugging face.

2nd Attempt

In [123]:
# Define the file directory
file_directory = '..\\Dataset\\'

# Read a single file 
with open(file_directory + '2022_eng.txt', 'r', encoding="utf8") as file_to_read:
    df = file_to_read.read()

In [124]:
# Convert into dataframes, split by paragraphs
df = (pd.DataFrame(df.split("\n\n"))).rename(columns={0: "Sentences"})

In [125]:
df['ParasNo'] = df.index

In [126]:
df

Unnamed: 0,Sentences,ParasNo
0,My fellow Singaporeans,0
1,Good evening.,1
2,COVID-19\nWe have come a long way in our fight...,2
3,"In many other countries, when a wave happens, ...",3
4,"Thus far, we have had fewer than 1,600 COVID-1...",4
...,...,...
93,"Thankfully, for 57 years, over three generatio...",93
94,"Never take this trust, nor this competence, fo...",94
95,Leadership succession is therefore of paramoun...,95
96,The next few decades will be bracing but exhil...,96


In [133]:
import numpy as np
from itertools import chain

# return list from series of comma-separated strings
def chainer(s):
    return list(chain.from_iterable(s.str.split('\n')))

# calculate lengths of splits
lens = df['Sentences'].str.split('\n').map(len)

# create new dataframe, repeating or chaining as appropriate
df = pd.DataFrame({'ParasNo': np.repeat(df['ParasNo'], lens),
                    'Sentences': chainer(df['Sentences'])})

df

Unnamed: 0,ParasNo,Sentences
0,0,My fellow Singaporeans
1,1,Good evening.
2,2,COVID-19
2,2,We have come a long way in our fight against C...
3,3,"In many other countries, when a wave happens, ..."
...,...,...
93,93,"Thankfully, for 57 years, over three generatio..."
94,94,"Never take this trust, nor this competence, fo..."
95,95,Leadership succession is therefore of paramoun...
96,96,The next few decades will be bracing but exhil...


In [134]:
# Pre-processing
def clean_text(text):
    text = text.lower()
    text = re.sub(r"\n", " ", text) #replace "\n" with " "
    text = re.sub(r"\W", " ", text) #replaces non-word characters like ',' with " "
    text = re.sub(r"^\d+\s|\s\d+\s|\s\d+$", " ", text) #removes digits e.g. 16 years >> years
    text = text.strip(" ")
    text = re.sub(r"[^\w\s]", "", text)
    return text

df["Clean_Text"] = df['Sentences'].map(lambda text: clean_text(text))

In [135]:
df

Unnamed: 0,ParasNo,Sentences,Clean_Text
0,0,My fellow Singaporeans,my fellow singaporeans
1,1,Good evening.,good evening
2,2,COVID-19,covid
2,2,We have come a long way in our fight against C...,we have come a long way in our fight against c...
3,3,"In many other countries, when a wave happens, ...",in many other countries when a wave happens ...
...,...,...,...
93,93,"Thankfully, for 57 years, over three generatio...",thankfully for years over three generations ...
94,94,"Never take this trust, nor this competence, fo...",never take this trust nor this competence fo...
95,95,Leadership succession is therefore of paramoun...,leadership succession is therefore of paramoun...
96,96,The next few decades will be bracing but exhil...,the next few decades will be bracing but exhil...


Sentiment Analysis

In [136]:
corpus = list(df['Clean_Text'].values)

In [137]:
nlp_sentiment = pipeline(
    "sentiment-analysis"
)

No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


In [138]:
%%time
df["Sentiment"] = nlp_sentiment(corpus)

Token indices sequence length is longer than the specified maximum sequence length for this model (605 > 512). Running this sequence through the model will result in indexing errors


RuntimeError: The size of tensor a (605) must match the size of tensor b (512) at non-singleton dimension 1

In [139]:
# The pipeline's sentiment analysis output consists of a label and a score
# I prefer to extract them into separate columns

df['Sentiment_Label'] = [x.get('label') for x in df['Sentiment']]

df['Sentiment_Score'] = [x.get('score') for x in df['Sentiment']]

KeyError: 'Sentiment'

In [None]:
df

In [None]:
df['Sentiment_Label'].value_counts()