# Sentiment Analyzer  

# Load Data & Data Cleaning

In [2]:
import os
import pandas as pd
import re
import numpy as np
import nltk
import requests
import json

In [3]:

#For data visualization
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
%matplotlib inline

pd.options.plotting.backend = "plotly"

# First Dataset

## Data Processing For LSTM/RNN - CNN

In [25]:
nltk.download(['stopwords','vader_lexicon'])

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\innovation_lab\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\innovation_lab\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [26]:
def get_data(file_name):
    """Load one emotion-labels CSV from ``<cwd>/data`` and add a numeric target.

    Parameters
    ----------
    file_name : str
        Name of the CSV file inside the ``data`` directory. A leading
        backslash or forward slash is tolerated and stripped, so existing
        calls that pass the name with a leading separator keep working.

    Returns
    -------
    pandas.DataFrame
        The CSV contents plus an integer ``target`` column derived from
        ``label``: fear=1, anger=2, sadness=3, joy=4. Any other label is
        encoded as 0 (the ``np.select`` default).

    Side effects
    ------------
    Prints the frequency of each value in the ``label`` column.
    """
    data_go_dir = os.path.join(os.getcwd(), "data")

    # Build the path with os.path.join instead of string concatenation so the
    # lookup does not depend on the caller supplying a platform-specific
    # separator in front of the file name.
    csv_path = os.path.join(data_go_dir, file_name.lstrip("\\/"))
    df = pd.read_csv(csv_path)

    # Get frequency of each value
    frequence = df.label.value_counts()
    print(frequence)

    # Label -> numeric code; insertion order fixes the condition order.
    label_codes = {"fear": 1, "anger": 2, "sadness": 3, "joy": 4}
    conditions = [df["label"] == label for label in label_codes]

    df["target"] = np.select(conditions, list(label_codes.values()))

    return df


In [27]:
# Raw string: "\e" is not a valid escape sequence, so the plain literal
# triggers an invalid-escape warning. The value passed is unchanged.
test_df = get_data(r"\emotion-labels-test.csv")
test_df.head()

fear       995
anger      760
joy        714
sadness    673
Name: label, dtype: int64


Unnamed: 0,text,label,target
0,You must be knowing #blithe means (adj.) Happ...,joy,4
1,Old saying 'A #smile shared is one gained for ...,joy,4
2,Bridget Jones' Baby was bloody hilarious 😅 #Br...,joy,4
3,@Elaminova sparkling water makes your life spa...,joy,4
4,I'm tired of everybody telling me to chill out...,joy,4


In [28]:
# Raw string: "\e" is not a valid escape sequence, so the plain literal
# triggers an invalid-escape warning. The value passed is unchanged.
train_df = get_data(r"\emotion-labels-train.csv")
train_df.head()

fear       1147
anger       857
joy         823
sadness     786
Name: label, dtype: int64


Unnamed: 0,text,label,target
0,Just got back from seeing @GaryDelaney in Burs...,joy,4
1,Oh dear an evening of absolute hilarity I don'...,joy,4
2,Been waiting all week for this game ❤️❤️❤️ #ch...,joy,4
3,"@gardiner_love : Thank you so much, Gloria! Yo...",joy,4
4,I feel so blessed to work with the family that...,joy,4


In [29]:
# Raw string: "\e" is not a valid escape sequence, so the plain literal
# triggers an invalid-escape warning. The value passed is unchanged.
val_df = get_data(r"\emotion-labels-val.csv")
val_df.head()

fear       110
anger       84
joy         79
sadness     74
Name: label, dtype: int64


Unnamed: 0,text,label,target
0,"@theclobra lol I thought maybe, couldn't decid...",joy,4
1,Nawaz Sharif is getting more funnier than @kap...,joy,4
2,Nawaz Sharif is getting more funnier than @kap...,joy,4
3,@tomderivan73 😁...I'll just people watch and e...,joy,4
4,I love my family so much #lucky #grateful #sma...,joy,4


In [30]:
all_df = pd.concat([train_df,test_df,val_df], ignore_index = True)

In [31]:
from wordcloud import WordCloud, STOPWORDS
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

In [32]:
def clean_words(text):
    """Normalise one text into a list of stemmed, stop-word-free tokens.

    Steps: lowercase, replace every character that is not an ASCII letter or
    digit with a space, split on whitespace, drop English stop words, then
    Porter-stem what remains.

    Parameters
    ----------
    text : str
        Raw text to clean.

    Returns
    -------
    list of str
        The stemmed tokens, in their original order.
    """
    # Load the stop-word list and build the stemmer once per call rather than
    # once per token; a set also makes each membership test constant-time.
    stop_words = set(stopwords.words("english"))
    stemmer = PorterStemmer()

    # convert to lowercase
    text = text.lower()
    # replace anything that is not a letter or digit with a space
    text = re.sub(r"[^a-zA-Z0-9]", " ", text)
    # tokenize, remove stopwords, apply stemming
    return [stemmer.stem(w) for w in text.split() if w not in stop_words]

if "text" in all_df.columns:
    all_df["clean_list_words"] = all_df["text"].apply(clean_words)
    


In [33]:
all_df.to_pickle("data/all_sentiment_data.pkl")

## Data Processing For ChatGPT

In [57]:
import json

filename = 'data/all_data_sentiment_simplified.jsonl'

# Two-column prompt/completion view of the data: the tweet text becomes the
# prompt and its emotion label the completion.
all_df_copy = all_df[["text", "label"]].rename(
    columns={"text": "prompt", "label": "completion"}
)

# One JSON record per line is written to `filename`.
sentiment_json_result = all_df_copy.to_json(filename, orient="records", lines=True)
all_df_copy.head()

Unnamed: 0,prompt,completion
0,Just got back from seeing @GaryDelaney in Burs...,joy
1,Oh dear an evening of absolute hilarity I don'...,joy
2,Been waiting all week for this game ❤️❤️❤️ #ch...,joy
3,"@gardiner_love : Thank you so much, Gloria! Yo...",joy
4,I feel so blessed to work with the family that...,joy


In [54]:
# saving the DataFrame as a CSV file
cols = all_df.columns
csv_all_data = all_df.to_csv('data/all_data_sentiment.csv', index = True, columns = cols)

In [55]:
# saving the DataFrame as a CSV file
cols = all_df_copy.columns
csv_all_data = all_df_copy.to_csv('data/all_data_sentiment_simplified.csv', index = True, columns = cols)

In [69]:
all_df.head()

Unnamed: 0,text,label,target,clean_list_words
0,Just got back from seeing @GaryDelaney in Burs...,joy,4,"[got, back, see, garydelaney, burslem, amaz, f..."
1,Oh dear an evening of absolute hilarity I don'...,joy,4,"[oh, dear, even, absolut, hilar, think, laugh,..."
2,Been waiting all week for this game ❤️❤️❤️ #ch...,joy,4,"[wait, week, game, cheer, friday]"
3,"@gardiner_love : Thank you so much, Gloria! Yo...",joy,4,"[gardin, love, thank, much, gloria, sweet, tho..."
4,I feel so blessed to work with the family that...,joy,4,"[feel, bless, work, famili, nanni, noth, love,..."


In [70]:
x = all_df["clean_list_words"]

In [71]:
import plotly.express as px
fig = px.pie(all_df, names='label', title ='Pie chart of different sentiments of tweets')
fig.show()

In [72]:
all_df.isnull().sum().sum()

0

# Second Dataset

In [4]:
from tqdm import tqdm

In [17]:
url1 =  "https://datasets-server.huggingface.co/first-rows?dataset=sentiment140&config=sentiment140&split=train"

In [28]:
url2 = "https://datasets-server.huggingface.co/first-rows?dataset=sentiment140&config=sentiment140&split=test"

In [30]:

#url: API 
def extractDf(url):
    """Fetch rows from a datasets-server style JSON endpoint into a DataFrame.

    The response body is expected to be a JSON object with a ``"rows"`` list
    whose items each hold a ``"row"`` object containing ``"text"`` and
    ``"sentiment"`` fields.

    Parameters
    ----------
    url : str
        Endpoint to request.

    Returns
    -------
    pandas.DataFrame
        Two columns: ``prompt`` (the row text) and ``completion`` (the row
        sentiment), one row per item in ``"rows"``.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    requests.Timeout
        If the server does not answer within the timeout.

    NOTE(review): the progress bars recorded in this notebook show 100 rows
    per call, so the ``first-rows`` URLs appear to return only a preview of
    each split — confirm before treating the result as the full dataset.
    """
    # Fail fast on a hung connection or an HTTP error instead of hanging or
    # hitting a confusing JSON/KeyError further down.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    payload = response.json()

    rows = [item["row"] for item in tqdm(payload["rows"])]

    return pd.DataFrame(
        {
            "prompt": [row["text"] for row in rows],
            "completion": [row["sentiment"] for row in rows],
        }
    )


In [31]:
df2_train = extractDf(url1)
df2_train.head()

100%|█████████████████████████████████████| 100/100 [00:00<00:00, 308404.71it/s]


Unnamed: 0,prompt,completion
0,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",0
1,is upset that he can't update his Facebook by ...,0
2,@Kenichan I dived many times for the ball. Man...,0
3,my whole body feels itchy and like its on fire,0
4,"@nationwideclass no, it's not behaving at all....",0


In [32]:
df2_test = extractDf(url2)
df2_test.head()

100%|█████████████████████████████████████| 100/100 [00:00<00:00, 143444.05it/s]


Unnamed: 0,prompt,completion
0,@stellargirl I loooooooovvvvvveee my Kindle2. ...,4
1,Reading my kindle2... Love it... Lee childs i...,4
2,"Ok, first assesment of the #kindle2 ...it fuck...",4
3,@kenburbary You'll love your Kindle2. I've had...,4
4,@mikefish Fair enough. But i have the Kindle2...,4


In [34]:
# importing the module
import collections

# using Counter to find frequency of elements
frequency = collections.Counter(df2_train["completion"])

# printing the frequency
print("training dataset2")
print(dict(frequency))

frequency = collections.Counter(df2_test["completion"])

# printing the frequency
print("testinging dataset2")
# using Counter to find fr
# printing the frequency
print(dict(frequency))

training dataset2
{0: 100}
testinging dataset2
{4: 45, 0: 37, 2: 18}


# Third Dataset
Source: https://huggingface.co/datasets/Recognai/sentiment-banking

## Method1: NLTK-Sentiment Analyzer

In [38]:
from nltk.sentiment import SentimentIntensityAnalyzer

In [39]:
sia = SentimentIntensityAnalyzer()

In [40]:
sampleText1 = "Welcome to BMW Metaverse world, how can I help today?"
def nltk_sentiment(text):
    """Return the dominant VADER polarity component for ``text``.

    Scores ``text`` with the module-level ``sia`` analyzer, prints the full
    score dict, then picks the highest-scoring entry once the ``compound``
    score is left out.

    Parameters
    ----------
    text : str
        Text to score.

    Returns
    -------
    tuple
        ``(component_name, score)`` for the highest-scoring component. On a
        tie the component that appears first in the score dict wins.
    """
    # Score the text that was passed in, not the global sample sentence.
    senti_score = sia.polarity_scores(text)
    print(senti_score)
    # `compound` is an aggregate, not a class, so it is excluded from the pick.
    components = {name: value for name, value in senti_score.items() if name != "compound"}
    return max(components.items(), key=lambda item: item[1])
nltk_sentiment(sampleText1)

{'neg': 0.0, 'neu': 0.551, 'pos': 0.449, 'compound': 0.6908}


('neu', 0.551)

In [77]:
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

max_words = 5000
max_len=50

def tokenize_pad_sequences(text):
    '''
    Fit a Tokenizer on `text`, convert each entry to a sequence of integers
    and pad the sequences to a common length.

    The vocabulary size and the padded length come from the module-level
    `max_words` and `max_len` settings.

    Returns the padded sequences together with the fitted tokenizer.
    '''
    fitted_tokenizer = Tokenizer(num_words=max_words, lower=True, split=' ')
    fitted_tokenizer.fit_on_texts(text)

    # Integer-encode, then pad with padding='post' up to max_len.
    padded = pad_sequences(
        fitted_tokenizer.texts_to_sequences(text),
        padding='post',
        maxlen=max_len,
    )
    return padded, fitted_tokenizer

print('Before Tokenization & Padding \n', all_df['clean_list_words'][0])
X, tokenizer = tokenize_pad_sequences(all_df['clean_list_words'])
# Print the padded integer sequence (capital X). Lowercase `x` is the
# untokenized `clean_list_words` Series assigned earlier in the notebook.
print('After Tokenization & Padding \n', X[0])

Before Tokenization & Padding 
 ['got', 'back', 'see', 'garydelaney', 'burslem', 'amaz', 'face', 'still', 'hurt', 'laugh', 'much', 'hilari']
After Tokenization & Padding 
 [  34   42   19 3491 3492   93  117   29  308  125   50   99    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0]


### Neural Network-CNN

In [86]:
from sklearn.model_selection import train_test_split
y = all_df['target']
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=1)
x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=0.15, random_state=1)

In [87]:
print("x_train_shape:", x_train.shape)
print("x_test_shape:", x_test.shape)

x_train_shape: (5432, 50)
x_test_shape: (711, 50)


In [93]:
print("y_train_shape:", y_train.shape)
print("y_test_shape:", y_test.shape)

y_train_shape: (5432,)
y_test_shape: (711,)


In [94]:
from keras import models
from keras import layers
from keras.utils import to_categorical

# Keep the tokenized, padded x_train / x_test produced by train_test_split.
# The previous version replaced them with raw text from train_df / test_df via
# `vectorize`, which is not defined earlier in this notebook; those frames also
# hold different rows than y_train / y_test, which come from the split of
# all_df, so features and labels would no longer line up.
y_train = np.array(y_train).astype("float32")
y_test = np.array(y_test).astype("float32")

In [95]:
model = models.Sequential()

In [96]:
# Fully connected network built layer by layer on the Sequential `model`.
# Input - Layer
# Takes vectors of length 50 and maps them to 50 ReLU units.
model.add(layers.Dense(50, activation = "relu", input_shape=(50, )))
# Hidden - Layers
# Dropout with rate 0.1 between the dense layers.
model.add(layers.Dropout(0.1, noise_shape=None, seed=None))
model.add(layers.Dense(100, activation = "relu"))
model.add(layers.Dropout(0.1, noise_shape=None, seed=None))
model.add(layers.Dense(50, activation = "relu"))
# Output- Layer
# NOTE(review): a single sigmoid unit yields one value per sample, but the
# `target` column built in get_data has four classes (1-4). A multi-class
# setup would normally use a 4-unit softmax output; changing it means the
# labels and the compile loss have to be updated at the same time.
model.add(layers.Dense(1, activation = "sigmoid"))
# Print the layer-by-layer summary with parameter counts.
model.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_12 (Dense)            (None, 50)                2550      
                                                                 
 dropout_6 (Dropout)         (None, 50)                0         
                                                                 
 dense_13 (Dense)            (None, 100)               5100      
                                                                 
 dropout_7 (Dropout)         (None, 100)               0         
                                                                 
 dense_14 (Dense)            (None, 50)                5050      
                                                                 
 dense_15 (Dense)            (None, 1)                 51        
                                                                 
Total params: 12,751
Trainable params: 12,751
Non-trai

In [97]:
# Configure training: Adam optimizer, categorical cross-entropy loss, and
# accuracy reported as the tracked metric.
# NOTE(review): categorical_crossentropy is documented for one-hot targets,
# while the labels prepared in this notebook are a flat vector of codes 1-4
# and the model ends in a single sigmoid unit. This mismatch is a likely
# reason for the low recorded accuracy — confirm and fix labels, output layer
# and loss together.
model.compile(
optimizer = "adam",
loss = "categorical_crossentropy",
metrics = ["accuracy"]
)

In [98]:
# Monitor training on the dedicated validation split (x_val / y_val from the
# second train_test_split) so the test set stays unseen until model.evaluate.
# y_val is cast the same way as y_train / y_test.
results = model.fit(
    x_train, y_train,
    epochs=100,
    batch_size=100,
    validation_data=(x_val, np.array(y_val).astype("float32")),
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [99]:
# Evaluate on the held-out test data without progress output. With a single
# compiled metric, `scores` is [loss, accuracy], so scores[1] is the accuracy.
scores = model.evaluate(x_test, y_test, verbose=0)
# Report the accuracy as a percentage with two decimals.
print("Accuracy: %.2f%%" % (scores[1]*100))

Accuracy: 32.63%
