# Sentiment Analysis ENSEMBLE

# Import Libraries

In [None]:
import pandas as pd
pd.set_option('display.max_colwidth', None)
# pd.set_option('display.max_rows', None)
import numpy as np
from numpy import load
import pickle

import tensorflow
from tensorflow.keras.preprocessing import text, sequence
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Activation, Conv1D, Dense, Dropout, Embedding, InputLayer, Flatten, GlobalMaxPool1D, LSTM, MaxPooling1D, Dropout
from tensorflow.keras import regularizers
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import backend as K

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from sklearn.feature_extraction.text import CountVectorizer

import gensim
from gensim.models import Word2Vec, KeyedVectors

import re
import string
from string import punctuation
import seaborn as sb
import matplotlib.pyplot as plt
%matplotlib inline

import time
import statistics

# Open-source Sentiment Analysis libraries
from textblob import TextBlob
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
nltk.download('vader_lexicon')

# Self-created python file
!pip install emoji # Note preprocessing_steps has emoji as dependent library
!pip install ekphrasis
from preprocessing_steps import Preprocess

# To read/write into Google drive file by Share URL keys
from io import BytesIO
from google.colab import auth, files, drive
drive.mount('/content/gdrive')
auth.authenticate_user()
from oauth2client.client import GoogleCredentials
from apiclient.http import MediaFileUpload
from googleapiclient.discovery import build

# Get auth credentials and service
# `creds` are the application-default credentials available after
# auth.authenticate_user() above; `service` is the Drive v3 client that later
# cells use for files().get_media(...) downloads and files().update(...) uploads.
creds = GoogleCredentials.get_application_default()
service = build('drive', 'v3', credentials=creds)

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [None]:
def save_dataframe_to_csv_results_by_sharedurlkey(data_df, csv_file_id):
    """Replace the contents of an existing Drive file with ``data_df`` as CSV.

    The frame is first written (without its index) to a fixed staging file in
    the mounted personal Drive, then that file is uploaded over the Drive file
    identified by ``csv_file_id`` using the notebook-level ``service`` client.

    Args:
        data_df: DataFrame to export.
        csv_file_id: Drive file id (the key from the share URL) of the file
            whose content is overwritten.

    Note:
        The staging file name is fixed, so every call overwrites the previous
        staging copy.
    """
    # Stage the CSV inside the mounted Drive so MediaFileUpload can read it from disk.
    csv_file_name = 'TEMP_file.csv'
    staging_path = f"/content/gdrive/MyDrive/{csv_file_name}"
    data_df.to_csv(staging_path, index=False)

    # Upload the staged file as the new content of the existing shared file.
    upload = MediaFileUpload(staging_path, resumable=True)
    update_request = service.files().update(fileId=csv_file_id, media_body=upload)
    update_request.execute()

# Read and Preprocess Data

In [None]:
# Alternative input files (uncomment exactly one read_csv line to switch dataset):
# data_df = pd.read_csv('tweets_compiled_2020_06_to_2021_02.csv', encoding='utf-8')
# data_df = pd.read_csv('tweets_filtered_compiled_2020_06_to_2021_02.csv', encoding='utf-8')
# data_df = pd.read_csv('tweets_compiled_2020_09_to_2021_02.csv', encoding='utf-8')
# Active dataset; the path is relative to the notebook's working directory.
data_df = pd.read_csv('tweets_filtered_compiled_2020_09_to_2021_02.csv', encoding='utf-8')
# Row count, then a preview of the first 10 rows as the cell's displayed output.
print(len(data_df))
data_df.head(10)

# Preprocess Text (Only need to do this step if not yet preprocessed)

In [None]:
def process_text(text):
    """Clean one raw tweet by running it through the ordered Preprocess steps.

    Each step is a function from the project-local ``Preprocess`` helper that
    takes the current text (plus optional extra arguments) and returns the
    transformed text. The order below is significant and mirrors the intended
    pipeline; what each step does exactly is defined in ``preprocessing_steps``.

    Args:
        text: Raw tweet text.

    Returns:
        The text after every enabled step has been applied in order.
    """
    # (step, extra positional args) pairs, applied top to bottom.
    # Disabled optional steps are kept as comments at the position they would run.
    pipeline = (
        # disabled: Preprocess.replace_unrecognised_open_close_inverted_comma  (replaces “ ” ‘ ’ with " " ' ')
        (Preprocess.remove_urls, ()),
        (Preprocess.remove_digits, ()),

        # Emoji / emoticon handling
        (Preprocess.convert_emojis_to_sentiments, ()),
        (Preprocess.remove_emojis, ()),
        (Preprocess.convert_emoticons_to_sentiments, ()),
        # disabled: Preprocess.convert_emojis_to_words  (note: Sentiment140 dataset has no emojis)
        # disabled: Preprocess.replace_hyphens_or_underscores_with_spaces

        # Entity recognition (NOTE: takes a considerable amount of time on large datasets)
        (Preprocess.entity_recognition_handling, ()),

        # Lowercasing has to happen after entity recognition handling
        (Preprocess.convert_to_lowercase, ()),

        # Slang handling
        (Preprocess.convert_slangs_to_words, ()),

        # Negation handling
        (Preprocess.prepend_NOT_to_handle_negation, ()),
        (Preprocess.remove_stopwords_after_appended_NOT, ()),

        # disabled: Preprocess.perform_word_segmentation_for_hashtags  (hashtags with concatenated words)
        # disabled: Preprocess.correct_spelling_mistakes  (NOTE: takes a considerable amount of time on large datasets)

        # Lemmatization
        (Preprocess.perform_lemmatization, ()),

        # disabled: Preprocess.handle_elongated_words

        # Strip punctuation/special characters, keeping the hyphens and underscores that identify entities
        (Preprocess.remove_special_characters_except_hyphens_and_underscores, ()),

        # Remove words with very short char length (e.g. length 2)
        (Preprocess.remove_characters_of_specified_length, (2,)),
    )

    cleaned = text
    for step, extra_args in pipeline:
        cleaned = step(cleaned, *extra_args)
    return cleaned

# Resolve the raw-tweet column instead of hard-coding it: some of the CSVs this
# notebook is run on expose the tweet as "text", others as "full_text" (the
# VADER cell reads "full_text"). "text" is preferred to keep the previous
# behaviour whenever that column exists.
raw_text_column = next((col for col in ("text", "full_text") if col in data_df.columns), None)
if raw_text_column is None:
    raise KeyError("data_df has neither a 'text' nor a 'full_text' column to preprocess")

start = time.time()
# Apply the full cleaning pipeline row by row; the result is stored next to the raw text.
data_df["processed_text"] = data_df[raw_text_column].apply(process_text)
print("\nPreprocessing of Text COMPLETED")
print("Total time taken in minutes: {:.4f}".format((time.time()-start) / 60))


Total time taken in minutes: 11.2345


# Make a DataFrame Copy (Just in case)

In [None]:
# Work on a copy so the sentiment columns added below never modify data_df,
# which lets the classifier cells be re-run without repeating the preprocessing.
data_copy = data_df.copy()

# TEXTBLOB SA

In [None]:
def textblob_sentiment(processed_text):
    """Label one text as "positive", "negative" or "neutral" using TextBlob polarity.

    Args:
        processed_text: Preprocessed tweet text. Non-string values (e.g. NaN
            produced when an empty processed text is reloaded from CSV) are
            treated as an empty string.

    Returns:
        "positive" if polarity > 0, "negative" if polarity < 0, otherwise "neutral".
    """
    if not isinstance(processed_text, str):
        processed_text = ""
    polarity = TextBlob(processed_text).sentiment.polarity
    if polarity > 0:
        return "positive"
    if polarity < 0:
        return "negative"
    return "neutral"


start = time.time()
# Note: TextBlob runs on processed_text. Iterating the column directly avoids
# materialising a full row Series per record, which .iloc[i][col] does.
sentiment_TextBlob = [textblob_sentiment(value) for value in data_copy["processed_text"]]

# Add in new column called "sentiment_TextBlob" with the sentiment value from TextBlob
data_copy["sentiment_TextBlob"] = sentiment_TextBlob
print(str(len(data_copy)) + " records")
print("\nTotal time taken in minutes: {:.4f}".format((time.time()-start) / 60))

48948 records

Total time taken in minutes: 0.3567


# NLTK VADER SA
Valence Aware Dictionary and sEntiment Reasoner

In [None]:
sia = SentimentIntensityAnalyzer()


def vader_sentiment(raw_text):
    """Label one text as "positive", "negative" or "neutral" using VADER's compound score.

    Args:
        raw_text: Unprocessed tweet text. Non-string values (e.g. NaN) are
            treated as an empty string.

    Returns:
        "positive" if compound > 0, "negative" if compound < 0, otherwise "neutral".
    """
    if not isinstance(raw_text, str):
        raw_text = ""
    compound = sia.polarity_scores(raw_text)["compound"]
    if compound > 0:
        return "positive"
    if compound < 0:
        return "negative"
    return "neutral"


# VADER is run on the unprocessed text because it has its own built-in
# preprocessing. The raw column is "full_text" in some input CSVs and "text" in
# others, so resolve it here; "full_text" is preferred to keep the previous
# behaviour whenever that column exists.
vader_text_column = next((col for col in ("full_text", "text") if col in data_copy.columns), None)
if vader_text_column is None:
    raise KeyError("data_copy has neither a 'full_text' nor a 'text' column for VADER")

start = time.time()
sentiment_Vader = [vader_sentiment(value) for value in data_copy[vader_text_column]]

# Add in new column called "sentiment_Vader" with the sentiment value from VADER
data_copy["sentiment_Vader"] = sentiment_Vader
print(str(len(data_copy)) + " records")
print("\nTotal time taken in minutes: {:.4f}".format((time.time()-start) / 60))

48948 records

Total time taken in minutes: 0.3976


# LSTM SA

## Load saved processed embedding matrix (GloVe twitter 27b 200d)

In [None]:
# Load embedding matrix file by Shared URL key
# https://drive.google.com/file/d/1740LvMrdxGEKFlobMW-4ZuGOzcdh5zlI/view?usp=sharing - V3
# get_media(...).execute() returns the file content, which is wrapped in BytesIO below.
embedding_matrix_file_id = "1740LvMrdxGEKFlobMW-4ZuGOzcdh5zlI"
embedding_matrix_file = service.files().get_media(fileId=embedding_matrix_file_id).execute()

# Load EMBEDDING_MATRIX (numpy array) from the downloaded .npy bytes
EMBEDDING_MATRIX = load(BytesIO(embedding_matrix_file))
print("EMBEDDING_MATRIX SHAPE:", EMBEDDING_MATRIX.shape)

# Rows = vocabulary entries, columns = embedding dimension.
VOCAB_SIZE = EMBEDDING_MATRIX.shape[0]
EMBED_DIM = EMBEDDING_MATRIX.shape[1]
# NOTE(review): the padded sequence length is taken from the embedding dimension,
# so both are the same number. Presumably this matches the length used when the
# LSTM was trained - TODO confirm against the training notebook before changing.
MAX_SEQUENCE_LENGTH = EMBEDDING_MATRIX.shape[1]

print("VOCAB_SIZE - Embedding Matrix:", VOCAB_SIZE)
print("EMBED_DIM: ", EMBED_DIM)
print("MAX_SEQUENCE_LENGTH: ", MAX_SEQUENCE_LENGTH)

(484294, 200)


## Load pretrained LSTM weights

In [None]:
# Model function to construct model
def get_LSTM_Model(embedding_layer):
    """Build and compile the binary-sentiment LSTM classifier.

    Architecture, in order: int32 input of length MAX_SEQUENCE_LENGTH, the
    supplied embedding layer, LSTM(128) returning the full sequence, global
    max pooling over time, then Dense(64) and Dense(32) ReLU layers (each
    surrounded by Dropout(0.1)) and a single sigmoid output unit.

    Args:
        embedding_layer: Keras Embedding layer to place after the input
            (the caller in this notebook passes a non-trainable one).

    Returns:
        A compiled ``Sequential`` model (binary cross-entropy loss, Adam
        optimizer, accuracy metric). Weights are not loaded here.
    """
    # Layers are created in the same order they are stacked.
    layer_stack = [
        InputLayer(input_shape=(MAX_SEQUENCE_LENGTH,), dtype='int32'),
        embedding_layer,
        # return_sequences=True so the pooling layer sees every timestep
        LSTM(128, return_sequences=True),
        GlobalMaxPool1D(),
        Dropout(0.1),
        Dense(64, activation='relu'),
        Dropout(0.1),
        Dense(32, activation='relu'),
        Dropout(0.1),
        Dense(1, activation='sigmoid'),
    ]

    lstm_classifier = Sequential()
    for layer in layer_stack:
        lstm_classifier.add(layer)

    lstm_classifier.compile(
        optimizer='adam',
        loss='binary_crossentropy',
        metrics=['accuracy'],
    )
    return lstm_classifier

In [None]:
# Build the embedding layer from the pretrained matrix, construct the model,
# then load the trained LSTM weights into it.
# trainable=False keeps the pretrained vectors fixed.
embedding_layer = Embedding(VOCAB_SIZE,
                            EMBED_DIM,
                            input_length=MAX_SEQUENCE_LENGTH,
                            weights=[EMBEDDING_MATRIX],
                            trainable=False)
model = get_LSTM_Model(embedding_layer)
# The weights file is read from a local path (relative to the working directory),
# unlike the embedding matrix and tokenizer which are downloaded by Drive file id.
lstm_model_file_path = "lstm_main_model_V3.h5"
model.load_weights(lstm_model_file_path) # Loading from in-memory bytes did not work; load_weights is given a direct file path instead

## Load Tokenizer and define utility functions for model

In [None]:
# Load tokenizer file by Shared URL key
# https://drive.google.com/file/d/106RIuB05hSHQtYYjTV4szFSa5EhlvoaS/view?usp=sharing - V3
tokenizer_file_id = "106RIuB05hSHQtYYjTV4szFSa5EhlvoaS"
tokenizer_file = service.files().get_media(fileId=tokenizer_file_id).execute()
# SECURITY NOTE(review): pickle.load executes arbitrary code embedded in the
# payload. Only keep this if the Drive file above is fully trusted and
# write-protected; otherwise store the tokenizer in a non-executable format.
tokenizer = pickle.load(BytesIO(tokenizer_file))

In [None]:
# Define utility functions to test the model
def decode_sentiment(score, include_neutral=True):
    """Translate a sigmoid score into a sentiment label.

    Args:
        score: Model output in the range 0..1.
        include_neutral: When true, scores strictly between 0.4 and 0.6 are
            labelled "neutral"; when false, the score is split at 0.5.

    Returns:
        "negative", "neutral" or "positive" (never "neutral" when
        ``include_neutral`` is false).
    """
    # Two-class mode: a plain 0.5 cut-off.
    if not include_neutral:
        return "negative" if score < 0.5 else "positive"

    # Three-class mode: "neutral" is the band between the two bounds (exclusive).
    negative_upper_bound, positive_lower_bound = 0.4, 0.6
    if score <= negative_upper_bound:
        return "negative"
    if score >= positive_lower_bound:
        return "positive"
    return "neutral"

def predict(model, text, include_neutral=True): # Original/Base/V3 version
    """Score a single text with the LSTM model and decode the sentiment label.

    Args:
        model: Compiled Keras model with a single sigmoid output.
        text: Text to classify (tokenized with the notebook-level ``tokenizer``).
        include_neutral: Passed through to ``decode_sentiment``.

    Returns:
        dict with "label" (str) and "score" (Python float, 0..1).
    """
    # Tokenize text and pad it to the model's fixed input length
    x_test = pad_sequences(tokenizer.texts_to_sequences([text]), maxlen=MAX_SEQUENCE_LENGTH)
    # Predict: the first row of the output is a one-element array holding the sigmoid score
    raw_score = model.predict([x_test])[0]
    # Decode sentiment (the array is passed exactly as before so labels are unchanged)
    label = decode_sentiment(raw_score, include_neutral=include_neutral)
    # Pull the scalar out explicitly: calling float() on an array with ndim > 0
    # is deprecated since NumPy 1.25.
    return {"label": label, "score": float(np.ravel(raw_score)[0])}
    
# def predict(model, text, include_neutral=True): # V2 version (softmax. 1hot-encoded labels)
#     # Tokenize text
#     # x_test = pad_sequences(tokenizer.texts_to_sequences([text]), maxlen=MAX_SEQUENCE_LENGTH)
#     x_test = pad_sequences(tokenizer.texts_to_sequences([text]), padding='post',  maxlen=MAX_SEQUENCE_LENGTH) # V2
#     # Predict (Note: This method can only have positive or negative)
#     label = "negative"
#     label_val = np.argmax(model.predict(x_test), axis=-1) # Predict 0 or 1 (0=negative, 1=positive)
#     if label_val == 1:
#         label = "positive"
#     return {"label": label}  

In [None]:
# Smoke test: an obviously positive sentence.
predict(model, "i love the music")

{'label': 'positive', 'score': 0.9586430788040161}

In [None]:
# Smoke test: an obviously negative sentence.
predict(model, "i hate the rain")

{'label': 'negative', 'score': 0.02842596173286438}

In [None]:
# Compare the LSTM prediction on the raw tweet vs. its preprocessed form for
# the first record. The raw column is "text" in some input CSVs and "full_text"
# in others, so pick the one that exists ("text" first, as before).
sample_row = data_copy.iloc[0]
raw_text_column = "text" if "text" in data_copy.columns else "full_text"
orig_text = sample_row[raw_text_column]
sample_text = sample_row["processed_text"]
print("orig_text: " + orig_text)
print(predict(model, orig_text))
print("------------------------------------------------------")
print("\nsample_text: " + sample_text)
print(predict(model, sample_text))

orig_text: We have a list of 200 kids that signed up for the backpack giveaway. I’m in need of hand sanitizer to include. If you’re willing to donate you can receive a receipt 🧾 for a tax deduction. I need by Friday September… https://t.co/9e6LQ1h0nx

sample_text: list kid signed backpack giveaway need hand sanitizer include willing donate receive receipt receipt tax deduction need friday_september


{'label': 'positive', 'score': 0.8696259260177612}

## Perform LSTM SA Predictions on Data

In [None]:
start = time.time()

# Score every record in ONE batched model.predict call. Calling predict() once
# per row pays Keras' per-call overhead for each record, which is what made
# this cell take ~40 minutes.
# Non-string entries (e.g. NaN from empty CSV cells) are treated as empty text
# so the tokenizer does not fail on them.
lstm_texts = [value if isinstance(value, str) else "" for value in data_copy["processed_text"]]

sentiment_LSTM = []
if lstm_texts:
    lstm_inputs = pad_sequences(tokenizer.texts_to_sequences(lstm_texts), maxlen=MAX_SEQUENCE_LENGTH)
    lstm_scores = model.predict(lstm_inputs)
    # Each row of the output is the same one-element score array that
    # predict() hands to decode_sentiment, so labels follow the same thresholds.
    sentiment_LSTM = [decode_sentiment(score_row).lower() for score_row in lstm_scores]

# Add in new column called "sentiment_LSTM" with the sentiment value from trained LSTM model
data_copy["sentiment_LSTM"] = sentiment_LSTM

print(str(len(data_copy)) + " records")
print("\nTotal time taken in minutes: {:.4f}".format((time.time()-start) / 60))

48948 records

Total time taken in minutes: 40.9246


# Perform MAX-VOTING SA based on all the classifiers

In [None]:
start = time.time()

# Classifier outputs that may take part in the vote. "sentiment_Google" is not
# produced anywhere in this notebook, so only the columns that actually exist
# are used; selecting a missing column would raise KeyError.
candidate_columns = ["sentiment_TextBlob", "sentiment_Vader", "sentiment_LSTM", "sentiment_Google"]
columns = [column for column in candidate_columns if column in data_copy.columns]
if not columns:
    raise KeyError("No classifier sentiment columns found in data_copy; run the classifier cells first")

data_temp = data_copy[columns].copy()
# Update new column "sentiment_MAX_VOTE" with the most frequent label per row.
# mode() can return several values when labels tie; [0] keeps the first one
# pandas returns (appears to be sort order, i.e. "negative" wins a full tie -
# TODO confirm this is the desired tie-break).
data_copy["sentiment_MAX_VOTE"] = data_temp.mode(axis='columns')[0]

print("\nTotal time taken in minutes: {:.4f}".format((time.time()-start) / 60))

data_copy.head(3)


Total time taken in minutes: 0.2991


In [None]:
# Show the first record with every sentiment column added above.
data_copy.head(1)

Unnamed: 0,created_at,id,id_str,full_text,truncated,display_text_range,entities,source,in_reply_to_status_id,in_reply_to_status_id_str,in_reply_to_user_id,in_reply_to_user_id_str,in_reply_to_screen_name,user,geo,coordinates,place,contributors,is_quote_status,retweet_count,favorite_count,favorited,retweeted,possibly_sensitive,lang,extended_entities,quoted_status_id,quoted_status_id_str,quoted_status_permalink,quoted_status,country,processed_text,sentiment_TextBlob,sentiment_Vader,sentiment_LSTM,sentiment_MAX_VOTE
0,2020-09-01 04:25:47+00:00,1300651040526733314,1300651040526733312,We have a list of 200 kids that signed up for the backpack giveaway. I’m in need of hand sanitizer to include. If you’re willing to donate you can receive a receipt 🧾 for a tax deduction. I need by Friday September… https://t.co/9e6LQ1h0nx,False,"[0, 239]","{'hashtags': [], 'symbols': [], 'user_mentions': [], 'urls': [{'url': 'https://t.co/9e6LQ1h0nx', 'expanded_url': 'https://www.instagram.com/p/CElJeg4s3db/?igshid=1jjktp8i9ifji', 'display_url': 'instagram.com/p/CElJeg4s3db/…', 'indices': [216, 239]}]}","<a href=""http://instagram.com"" rel=""nofollow"">Instagram</a>",,,,,,"{'id': 191658725, 'id_str': '191658725', 'name': '1st Lady Lisa', 'screen_name': '1stLadyLisa1', 'location': 'Sacramento', 'description': ""CEO of FANS Radio 916 📻🎶🎶 Listen https://t.co/J8GIkpsppw & Saturday's from 10pm - 12am on KUBU 96.5 https://t.co/v86AQVfbGN"", 'url': 'https://t.co/lqznIqgv9e', 'entities': {'url': {'urls': [{'url': 'https://t.co/lqznIqgv9e', 'expanded_url': 'http://www.numberonemusic.com/1stLadyLisa', 'display_url': 'numberonemusic.com/1stLadyLisa', 'indices': [0, 23]}]}, 'description': {'urls': [{'url': 'https://t.co/J8GIkpsppw', 'expanded_url': 'http://www.saucedupradio.com', 'display_url': 'saucedupradio.com', 'indices': [33, 56]}, {'url': 'https://t.co/v86AQVfbGN', 'expanded_url': 'http://www.accesssacramento.org', 'display_url': 'accesssacramento.org', 'indices': [100, 123]}]}}, 'protected': False, 'followers_count': 1923, 'friends_count': 1726, 'listed_count': 41, 'created_at': 'Fri Sep 17 00:43:01 +0000 2010', 'favourites_count': 4640, 'utc_offset': None, 'time_zone': None, 'geo_enabled': True, 'verified': False, 'statuses_count': 27123, 'lang': None, 'contributors_enabled': False, 'is_translator': False, 'is_translation_enabled': False, 'profile_background_color': '642D8B', 'profile_background_image_url': 'http://abs.twimg.com/images/themes/theme10/bg.gif', 'profile_background_image_url_https': 
'https://abs.twimg.com/images/themes/theme10/bg.gif', 'profile_background_tile': False, 'profile_image_url': 'http://pbs.twimg.com/profile_images/464632798062460928/GJMGdgbr_normal.jpeg', 'profile_image_url_https': 'https://pbs.twimg.com/profile_images/464632798062460928/GJMGdgbr_normal.jpeg', 'profile_banner_url': 'https://pbs.twimg.com/profile_banners/191658725/1478454203', 'profile_image_extensions_alt_text': None, 'profile_banner_extensions_alt_text': None, 'profile_link_color': 'FF0000', 'profile_sidebar_border_color': 'FFFFFF', 'profile_sidebar_fill_color': '7AC3EE', 'profile_text_color': '3D1957', 'profile_use_background_image': True, 'has_extended_profile': False, 'default_profile': False, 'default_profile_image': False, 'following': False, 'follow_request_sent': False, 'notifications': False, 'translator_type': 'none'}","{'type': 'Point', 'coordinates': [38.577, -121.4947]}","{'type': 'Point', 'coordinates': [-121.4947, 38.577]}","{'id': 'b71fac2ee9792cbe', 'url': 'https://api.twitter.com/1.1/geo/id/b71fac2ee9792cbe.json', 'place_type': 'city', 'name': 'Sacramento', 'full_name': 'Sacramento, CA', 'country_code': 'US', 'country': 'United States', 'contained_within': [], 'bounding_box': {'type': 'Polygon', 'coordinates': [[[-121.576613, 38.43792], [-121.362715, 38.43792], [-121.362715, 38.6855236], [-121.576613, 38.6855236]]]}, 'attributes': {}}",,False,0,0,False,False,0.0,en,,,,,,United States,list kid signed backpack giveaway need hand sanitizer include willing donate receive receipt receipt tax deduction need friday_september,positive,positive,positive,positive


# Save Final Results into Google Drive main file

In [None]:
# save_dataframe_to_csv_results_by_sharedurlkey(data_copy, csv_file_id)