# **Sentiment Analysis**

In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
import os

import matplotlib.pyplot as plt
import plotly.graph_objs as go
import plotly.express as px
import plotly.io as pio

from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.metrics.pairwise import cosine_similarity
import scipy.stats as stats

In [3]:
import re
from textblob import TextBlob
from textblob import WordList

import nltk 
nltk.download('brown')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger') 
nltk.download('vader_lexicon') 
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer

# English stop words from NLTK, materialised as a set for fast membership tests.
# The name rebinds the corpus reader imported just above, so later cells use
# `stopwords` as a plain set of lowercase words.
stopwords = set(stopwords.words('english'))

# Notebook-specific noise tokens removed in addition to the NLTK list.
additional_stopwords = {
    # Malay function words, pronouns and particles
    "tak", "bukan", "ini", "itu", "dia", "kami", "saya", "kita", "mereka",
    "aku", "kau", "dah", "la", "lah", "ka", "ke", "kah", "ayo",
    # Twitter markers
    "rt",   # retweet
    "lrt",  # last retweet
    # fragments left behind after tokenizing contractions / slang
    "s", "im", "gon", "na", "ca", "nt", "wan",
    # interjections, laughter and greetings
    "huh", "oh", "eh", "lol", "lmao", "rofl", "lmfao",
    "haha", "hahaha", "hi", "hello", "guys",
}

# Token -> expansion table used by expand_shortforms().
#
# Keys MUST be lowercase single tokens: the lookup lowercases each
# whitespace-separated word before consulting this dict, so mixed-case keys
# (previously "I'm", "PM", "U", ...) and multi-word keys (previously
# "ought to", "need to") could never match. Duplicate keys were also removed;
# where a key was listed more than once, the value that was effective at
# runtime (the last one) is the one kept here.
short_forms = {
    # --- slang / abbreviations -------------------------------------------
    "irl": "in real life",
    "u": "you",
    "btw": "by the way",
    "omg": "oh my god",
    "pls": "please",
    "wtf": "what the fuck",
    "tf": "the fuck",
    "stfu": "shut the fuck up",
    "idk": "i don't know",
    "smh": "shake my head",
    "fyi": "for your information",
    "imo": "in my opinion",
    "brb": "be right back",
    "n": "and",
    "gotta": "got to",
    "wanna": "want to",
    "kinda": "kind of",
    "sorta": "sort of",
    "outta": "out of",
    "congrats": "congratulations",
    "congratulation": "congratulations",

    # --- Malaysian politics / Malay terms --------------------------------
    "tpm": "dpm",
    "malaysians": "malaysian",
    "ds": "dato seri",
    "pm": "Prime Minister",
    "pm10": "Prime Minister 10",
    "pmx": "Prime Minister 10",
    "tahniah": "congratulations",
    "ni": "this",
    "nt": "not",
    "msia": "malaysia",
    "gov": "government",
    "govt": "government",
    "pru": "General Election",
    "pru15": "General Election 15",
    "ge": "General Election",
    "ge15": "General Election 15",
    "kl": "kuala lumpur",
    "ngos": "ngo",
    "eksyen": "action",

    # --- negative contractions -------------------------------------------
    "dont": "do not",
    "don't": "do not",
    "doesn't": "does not",
    "didn't": "did not",
    "can't": "cannot",
    "couldn't": "could not",
    "won't": "will not",
    "wouldn't": "would not",
    "shan't": "shall not",
    "shouldn't": "should not",
    "isn't": "is not",
    "aren't": "are not",
    "ain't": "is not",
    "wasn't": "was not",
    "weren't": "were not",
    "hasn't": "has not",
    "haven't": "have not",
    "hadn't": "had not",
    "needn't": "need not",
    "oughtn't": "ought not",
    "mightn't": "might not",
    "mustn't": "must not",
    "daren't": "dare not",

    # --- "is" / "are" / "am" / "us" contractions -------------------------
    "it's": "it is",
    "he's": "he is",
    "she's": "she is",
    "i'm": "I am",
    "we're": "we are",
    "you're": "you are",
    "they're": "they are",
    "that's": "that is",
    "there's": "there is",
    "here's": "here is",
    "let's": "let us",
    "who's": "who is",
    "what's": "what is",
    "where's": "where is",
    "when's": "when is",
    "why's": "why is",
    "how's": "how is",

    # --- "have" contractions ---------------------------------------------
    "i've": "I have",
    "you've": "you have",
    "we've": "we have",
    "they've": "they have",
    "should've": "should have",
    "could've": "could have",
    "would've": "would have",
    "might've": "might have",
    "must've": "must have",

    # --- "'d" contractions (ambiguous between "would" and "had") ---------
    "he'd": "he would",
    "she'd": "she would",
    "i'd": "I had",
    "you'd": "you had",
    "we'd": "we had",
    "they'd": "they had",

    # --- "will" contractions ---------------------------------------------
    "he'll": "he will",
    "she'll": "she will",
    "i'll": "I will",
    "you'll": "you will",
    "we'll": "we will",
    "they'll": "they will",
    # Add more short forms/contractions (lowercase, single token) as needed
}

# Merge the notebook-specific extras into the NLTK stop-word set (mutates `stopwords` in place).
stopwords.update(additional_stopwords)


[nltk_data] Downloading package brown to /Users/waizwafiq/nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/waizwafiq/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/waizwafiq/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /Users/waizwafiq/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/waizwafiq/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/waizwafiq/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


## **Data Loader**

In [17]:
# Dataset selection: exactly one CSV is loaded; the other candidates are listed
# here so the path can be swapped quickly.
#   './data/events/KerajaanGagal_until2022-10-12_event.csv'
#   './data/events/KerajaanGagal_event.csv'
#   './data/political_figs/IsmailSabri60_posts.csv'
#   './data/political_figs/anwaribrahim_posts.csv'
posts_csv_path = './data/political_figs/DrZahidHamidi_posts.csv'
df = pd.read_csv(posts_csv_path)
df.head()

Unnamed: 0,Twitter Username,Post,Date Posted
0,@00000q7,Don’t read too deep about it. Most malays reje...,"Dec 2, 2022"
1,@00000q7,Good. But we’ll be more happy if you can rejec...,"Nov 28, 2022"
2,@0002ris,There are reasons why Zahid Hamidi became TPM ...,"Dec 3, 2022"
3,@1Kesidang,I like the idea. But I need to know what is DS...,"Nov 23, 2022"
4,@1negara1,Because they signed SDs supporting Muhyiddin w...,"Nov 23, 2022"


In [18]:
def remove_URL(text):
    """Delete every run of non-whitespace characters that starts with ``http``."""
    url_pattern = r"http\S+"
    return re.sub(url_pattern, "", text)

def remove_hashtags(sample):
    """Delete each ``#`` together with the non-whitespace characters that follow it."""
    hashtag_pattern = r"#\S+"
    return re.sub(hashtag_pattern, "", sample)

def remove_breaklines(text):
    """Replace every newline character in ``text`` with a single space."""
    newline = re.compile(r"\n")
    return newline.sub(" ", text)

def remove_stopwords(text):
    """Tokenize ``text`` and drop tokens whose lowercase form is in ``stopwords``.

    The surviving tokens keep their original casing and are re-joined with
    single spaces.
    """
    kept_tokens = []
    for token in word_tokenize(text):
        if token.lower() in stopwords:
            continue
        kept_tokens.append(token)
    return ' '.join(kept_tokens)

def remove_symbols(text):
    """Delete every character that is neither a word character nor whitespace."""
    non_word_non_space = r"[^\w\s]"
    return re.sub(non_word_non_space, "", text)

def remove_whitespace(text):
    """Trim both ends of ``text`` and collapse each inner whitespace run to one space."""
    trimmed = text.strip()
    return re.sub(r"\s+", " ", trimmed)

def expand_shortforms(text):
    """Expand abbreviations and contractions found in ``short_forms``.

    ``text`` is split on whitespace; each piece is looked up by its lowercase
    form and replaced by the mapped expansion, or kept unchanged when there is
    no entry. Pieces are re-joined with single spaces.
    """
    pieces = []
    for token in text.split():
        pieces.append(short_forms.get(token.lower(), token))
    return ' '.join(pieces)

def lemmatize_word(word, pos):
    """Lemmatize ``word`` with WordNet, choosing the WordNet POS from a tagger tag.

    The first letter of ``pos`` selects the category (J -> adjective,
    V -> verb, N -> noun, R -> adverb); any other tag is treated as a noun.
    """
    first_letter_to_wordnet = {
        'J': wordnet.ADJ,
        'V': wordnet.VERB,
        'N': wordnet.NOUN,
        'R': wordnet.ADV,
    }
    # Unrecognised (or empty) tags fall back to noun.
    wordnet_pos = first_letter_to_wordnet.get(pos[:1], wordnet.NOUN)
    return WordNetLemmatizer().lemmatize(word, pos=wordnet_pos)

def convert_date(date_str, default_year=2023):
    """Normalise a scraped date label to a ``DD-MM-YYYY`` string.

    Parameters:
    - date_str (str): The raw date label.
    - default_year (int): Year assumed for labels that carry no year
      (e.g. ``"Jun 7"``). Defaults to 2023, the value that was previously
      hard-coded.

    Returns:
    - str: The date formatted as ``DD-MM-YYYY``.

    Rules, checked in order:
    1. Longer than 3 characters and containing a comma: parsed as
       ``"%b %d, %Y"`` (e.g. ``"Dec 2, 2022"``).
    2. Ending in ``h``, ``m`` or ``s``: today's date is returned.
    3. Exactly 10 characters with ``-`` at index 2 and 5: returned unchanged.
    4. Otherwise parsed as ``"%b %d"`` in ``default_year``.
    Whenever parsing fails, today's date is returned.
    """
    today = datetime.now().strftime("%d-%m-%Y")

    if len(date_str) > 3 and ',' in date_str:
        try:
            return datetime.strptime(date_str, "%b %d, %Y").strftime("%d-%m-%Y")
        except ValueError:
            return today

    if date_str.endswith(('h', 'm', 's')):
        return today

    if len(date_str) == 10 and date_str[2] == '-' and date_str[5] == '-':
        return date_str

    try:
        # Parse together with the year: a bare "%b %d" is resolved against
        # year 1900, so "Feb 29" can never parse, and day-without-year
        # parsing is deprecated in recent Python versions.
        parsed = datetime.strptime(f"{date_str} {default_year}", "%b %d %Y")
    except ValueError:
        return today
    return parsed.strftime("%d-%m-%Y")
    

def processText(text):
    """Run the cleaning pipeline over one post and return the cleaned string.

    Steps, in order: strip URLs, strip hashtags, expand short forms, drop
    stop words, replace newlines, strip symbols, normalise whitespace.
    Casing is left untouched.
    """
    cleaning_steps = (
        remove_URL,
        remove_hashtags,
        expand_shortforms,
        remove_stopwords,
        remove_breaklines,
        remove_symbols,
        remove_whitespace,
    )
    out = text
    for step in cleaning_steps:
        out = step(out)
    return out
    
# NOTE: a second, byte-for-byte identical definition of `lemmatize_word` used to
# sit here and silently shadowed the one defined earlier in this cell. It has
# been removed; the earlier definition is the single source of truth.

def stem_word(word):
    """Porter-stem ``word`` and re-apply the input's casing to the stem.

    - An all-uppercase input yields an all-uppercase stem.
    - Otherwise, an input starting with an uppercase letter yields a
      capitalized stem.
    - Anything else returns the stem as produced by the stemmer.
    An empty string is returned unchanged.
    """
    if not word:
        # Nothing to stem; also avoids indexing word[0] on an empty string.
        return word

    stemmed_word = PorterStemmer().stem(word)

    # The all-caps check must come first: an all-caps word also starts with
    # an uppercase letter, so testing word[0] first made this branch
    # unreachable and turned e.g. "ZAHID" into a capitalized stem.
    if word.isupper():
        return stemmed_word.upper()
    if word[0].isupper():
        return stemmed_word.capitalize()
    return stemmed_word


# Clean every post: URLs, hashtags, short forms, stop words, symbols, whitespace.
df['Post'] = df['Post'].apply(processText)

# Normalise "Date Posted" to datetime64. When the column is already datetime
# (e.g. this cell is re-run) there is nothing to do; check that explicitly
# instead of relying on a swallowed TypeError.
if not pd.api.types.is_datetime64_any_dtype(df['Date Posted']):
    try:
        df['Date Posted'] = pd.to_datetime(
            df['Date Posted'].apply(convert_date), format='%d-%m-%Y')
    except TypeError as exc:
        # Non-string entries (e.g. missing values) make convert_date fail.
        # Keep the raw column, as before, but report it instead of hiding it.
        print(f"'Date Posted' left unconverted: {exc}")

# Lemmatize each post token by token, using the POS tag to pick the WordNet category.
df['Post'] = df['Post'].apply(lambda x: ' '.join([lemmatize_word(word, pos) for word, pos in nltk.pos_tag(nltk.word_tokenize(x))]))

df.head()

Unnamed: 0,Twitter Username,Post,Date Posted
0,@00000q7,read deep malay reject BN lead Zahid vote PH D...,2022-12-02
1,@00000q7,Good happy reject umno court cluster cabinet e...,2022-11-28
2,@0002ris,reason Zahid Hamidi become dpm reason 2 TPM on...,2022-12-03
3,@1Kesidang,like idea need know DSAI stand Zahid Hamidi 47...,2022-11-23
4,@1negara1,sign SDs support Muhyiddin without party s con...,2022-11-23


In [19]:
# Posts with this many tokens or fewer are considered too short to score.
del_max_token = 4


def _split_on_spaces(post):
    """Split a string on single spaces; pass any non-string value through untouched."""
    return post.split(" ") if isinstance(post, str) else post


df['post_split'] = df['Post'].apply(_split_on_spaces)
df['tokens_num'] = df['post_split'].apply(len)

# Preview the rows that the next cell will drop.
df[df['tokens_num'] <= del_max_token]

Unnamed: 0,Twitter Username,Post,Date Posted,post_split,tokens_num
70,@AhmadMahalil,list quote Zahid Hamidi,2022-11-24,"[list, quote, Zahid, Hamidi]",4
95,@AliffXP,Steady Lord Zahid Hamidi,2022-12-02,"[Steady, Lord, Zahid, Hamidi]",4
108,@Amore_988,Zahid Hamidi real kingmaker,2022-12-02,"[Zahid, Hamidi, real, kingmaker]",4
109,@AmyraZulkarnain,ZAHID HAMIDI joke mannnn,2022-12-02,"[ZAHID, HAMIDI, joke, mannnn]",4
112,@AnbalaganRamas3,importantly Zahid Hamidi BN,2022-12-04,"[importantly, Zahid, Hamidi, BN]",4
...,...,...,...,...,...
3007,@zahidzakariaa,wish,2023-06-07,[wish],1
3008,@zahidzakariaa,good girl Aisyah,2023-06-11,"[good, girl, Aisyah]",3
3039,@zfrnmrn,Zahid hamidi say,2022-12-02,"[Zahid, hamidi, say]",3
3043,@zhafrimulia,key ZahidledUMNO,2023-04-11,"[key, ZahidledUMNO]",2


In [20]:
# Remove the too-short posts in place, then renumber the rows from 0.
too_short = df['tokens_num'] <= del_max_token
df.drop(index=df.index[too_short.to_numpy()], inplace=True)
df.reset_index(drop=True, inplace=True)
len(df)

2954

## **Lexicon-Based**

In [21]:
# Hand-picked polarity offsets (all negative) that are added on top of the
# TextBlob polarity for posts containing these words.
lexicons = dict(
    rid=-0.3,
    culprit=-0.55,
    corrupt=-0.9,
    corruption=-0.3,
    assassinate=-0.9,
    betray=-0.3,
    racist=-0.95,
    riot=-0.68,
)


def custom_SentimentScore(text):
    """Sum the ``lexicons`` weights of every whitespace-separated word in ``text``.

    Matching is case-insensitive and a word counts once per occurrence.
    Returns 0.0 when no word is in the lexicon.
    """
    lowered_words = (word.lower() for word in text.split())
    matched_weights = (lexicons[word] for word in lowered_words if word in lexicons)
    # Start from 0.0 so the result is always a float, even with no matches.
    return sum(matched_weights, 0.0)

# The 'Post' column of `df`; the scoring cells below iterate over it post by post.
df_posts = df['Post']

In [22]:
def _clamp_polarity(score):
    """Limit ``score`` to the closed interval [-1, 1]."""
    return max(min(score, 1), -1)


# One TextBlob polarity and one custom-lexicon adjustment per post.
polarities_textblob = list(map(lambda post: TextBlob(post).sentiment.polarity, df_posts))
polarities_custom = list(map(custom_SentimentScore, df_posts))

# Sum of the absolute lexicon weights (not read by anything in this cell).
max_custom_score = sum(map(abs, lexicons.values()))

# Combined score = TextBlob polarity + custom adjustment, clamped to [-1, 1].
combined_polarities = [
    _clamp_polarity(textblob_score + custom_score)
    for textblob_score, custom_score in zip(polarities_textblob, polarities_custom)
]

polarities_df = pd.DataFrame(combined_polarities, columns=["pol"])
polarities_df.describe()

Unnamed: 0,pol
count,2954.0
mean,0.026103
std,0.30404
min,-1.0
25%,-0.05692
50%,0.0
75%,0.186068
max,1.0


In [28]:
# Histogram of the combined polarity of every retained post.
fig = px.histogram(polarities_df, x="pol", nbins=250)

# Summary statistics shown in the figure title.
pol_values = polarities_df["pol"]
mean, std = pol_values.mean(), pol_values.std()

# Skew-normal fit evaluated on a grid over [-1, 1]; it only feeds the optional
# overlay trace below, which is currently disabled.
x = np.linspace(-1, 1, 1000)
params = stats.skewnorm.fit(pol_values)
pdf = stats.skewnorm.pdf(x, *params)
# fig.add_trace(go.Scatter(x=x, y=pdf*100, mode='lines', name='Bell Curve'))

histogram_title = f'''Sentiment Analysis Histogram: {len(df_posts)} tweets
        <br><sup></sup>
        Mean = {mean}, Standard Deviation = {std}'''
fig.update_layout(title_text=histogram_title)
# fig.write_html("./plots/sentiment_histogram.html")

fig.show()

# Numeric summary of the same polarity values.
polarities_df.describe()

Unnamed: 0,pol
count,2954.0
mean,0.026103
std,0.30404
min,-1.0
25%,-0.05692
50%,0.0
75%,0.186068
max,1.0


In [45]:
p = 0.5  # Remove 'neutral' polarity: only tweets with pol <= -p or pol >= p are kept

polarities_df_p = polarities_df[(polarities_df['pol'] <= -p) | (polarities_df['pol'] >= p)]

# Histogram of the non-neutral tweets only.
fig = px.histogram(polarities_df_p, x="pol", nbins=250)

# Mean and (sample) standard deviation of the retained tweets, computed
# directly on the filtered column so the title agrees with describe() below.
# The previous hand-rolled version added the two group means together
# (instead of weighting them by group size) and indexed the result by position.
mean = polarities_df_p["pol"].mean()
std = polarities_df_p["pol"].std()

# Skew-normal fit evaluated on a grid over [-1, 1]; it only feeds the optional
# overlay trace below, which is currently disabled.
x = np.linspace(-1, 1, 1000)
params = stats.skewnorm.fit(polarities_df_p["pol"])
pdf = stats.skewnorm.pdf(x, *params)
# fig.add_trace(go.Scatter(x=x, y=pdf*100, mode='lines', name='Bell Curve'))

fig.update_layout(
        title_text=f'''Sentiment Analysis (polarity, |p| ≥ {p}): {str(len(polarities_df_p))} tweets
        <br><sup>Mean = {mean}, Standard Deviation = {std}</sup>'''
    )
# fig.write_html(f"sentiment_histogram_p.html")

# Display the plot
fig.show()

# Describe the statistics of the polarity values
polarities_df_p.describe()

Unnamed: 0,pol
count,371.0
mean,-0.026602
std,0.705916
min,-1.0
25%,-0.698571
50%,0.5
75%,0.6
max,1.0


## **Automated**

In [73]:
def _lemmatize_post(text):
    """Return ``text`` with each token replaced by its POS-aware WordNet lemma."""
    tagged_tokens = nltk.pos_tag(nltk.word_tokenize(text))
    return ' '.join(lemmatize_word(word, pos) for word, pos in tagged_tokens)


def _write_polarity_histogram(polarity_frame, title, html_path):
    """Draw a 250-bin histogram of the ``pol`` column and write it to ``html_path``."""
    fig = px.histogram(polarity_frame, x="pol", nbins=250)
    fig.update_layout(title_text=title)
    fig.write_html(html_path)


def SentimentAnalysis(csv_path, del_max_token=4, p=1/3):
    """Run the whole sentiment pipeline on one CSV and save two histograms.

    The CSV must provide the columns ``Post`` and ``Date Posted``. Posts are
    cleaned, lemmatized, filtered by length and scored as
    ``TextBlob polarity + custom_SentimentScore``, clamped to [-1, 1].

    Relies on the notebook-level helpers defined in earlier cells
    (``processText``, ``convert_date``, ``lemmatize_word``,
    ``custom_SentimentScore``) instead of carrying private copies of them.

    Parameters:
    - csv_path (str): Path of the CSV to analyse. Its base name (without
      extension) names the output folder ``./plots/<name>/``.
    - del_max_token (int): Posts with this many space-separated tokens or
      fewer are dropped. Defaults to 4.
    - p (float): Neutral band; the second histogram keeps only posts with
      ``pol <= -p`` or ``pol >= p``. Defaults to 1/3.

    Returns:
    - None. Writes ``sentiment_histogram.html`` (all posts) and
      ``sentiment_histogram_p.html`` (non-neutral posts) into the output folder.
    """
    dataset_name = os.path.splitext(os.path.basename(csv_path))[0]
    plot_dir = f'./plots/{dataset_name}'

    df = pd.read_csv(csv_path)

    # --- text cleaning ---------------------------------------------------
    df['Post'] = df['Post'].apply(processText)

    # --- date normalisation (best effort) --------------------------------
    try:
        df['Date Posted'] = pd.to_datetime(
            df['Date Posted'].apply(convert_date), format='%d-%m-%Y')
    except TypeError as exc:
        # Non-string entries (e.g. missing values) make convert_date fail.
        # Dates are not needed for the plots, so keep going but report it.
        print(f"'Date Posted' left unconverted for {csv_path}: {exc}")

    # --- lemmatization ---------------------------------------------------
    df['Post'] = df['Post'].apply(_lemmatize_post)

    # --- drop posts that are too short to score --------------------------
    token_counts = df['Post'].apply(lambda post: len(post.split(" ")))
    df = df[token_counts > del_max_token].reset_index(drop=True)

    # exist_ok avoids the try/except FileExistsError dance on re-runs.
    os.makedirs(plot_dir, exist_ok=True)

    # --- scoring ---------------------------------------------------------
    df_posts = df['Post']
    combined_polarities = [
        max(min(TextBlob(post).sentiment.polarity + custom_SentimentScore(post), 1), -1)
        for post in df_posts
    ]
    polarities_df = pd.DataFrame(combined_polarities, columns=["pol"])

    # --- histogram over all posts ----------------------------------------
    mean = polarities_df["pol"].mean()
    std = polarities_df["pol"].std()
    _write_polarity_histogram(
        polarities_df,
        f'''Sentiment Analysis Histogram: {len(df_posts)} tweets
        <br><sup>{dataset_name}</sup>
        Mean = {mean}, Standard Deviation = {std}''',
        f"{plot_dir}/sentiment_histogram.html",
    )

    # --- histogram over non-neutral posts only ---------------------------
    polarities_df_p = polarities_df[
        (polarities_df['pol'] <= -p) | (polarities_df['pol'] >= p)]

    # Computed directly on the filtered column. The earlier group-wise version
    # indexed a label-indexed Series by position and always ended up printing
    # a Series repr in the title via its `except Exception` fallback.
    mean_p = polarities_df_p["pol"].mean()
    std_p = polarities_df_p["pol"].std()
    _write_polarity_histogram(
        polarities_df_p,
        f'''Sentiment Analysis (polarity, |p| ≥ {p}): {len(polarities_df_p)} tweets
            <br><sup>{dataset_name}</sup>
            Mean = {mean_p}, Standard Deviation = {std_p}''',
        f"{plot_dir}/sentiment_histogram_p.html",
    )

In [74]:
# Run the end-to-end pipeline on the 1mdb event CSV; the histograms are written under ./plots/1mdb/.
SentimentAnalysis('./data/events/1mdb.csv')