In [2]:
# Mount Google Drive at /content/drive; the review spreadsheet loaded in a
# later cell is read from a path underneath this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [6]:
import matplotlib.pyplot as plt
import numpy as np
import re
import random
import pandas as pd
from sklearn.dummy import DummyClassifier
import xgboost as xgb
pd.options.mode.chained_assignment = None  # default='warn'
import seaborn as sns
from collections import Counter, defaultdict
from sklearn.ensemble import RandomForestClassifier
from sklearn.dummy import DummyClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from tqdm import tqdm
import re
import altair as alt
alt.renderers.enable('default')
import warnings
warnings.filterwarnings('ignore')

# Seed reused by the data splits and by the seeded classifiers further down.
RANDOM_SEED = 42

# Choose the compute device: the first CUDA GPU when one is visible,
# otherwise fall back to the CPU.
import torch
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
device

device(type='cpu')

In [7]:
# Location of the labelled-review workbook on the mounted Drive.
REVIEWS_XLSX_PATH = '/content/drive/MyDrive/Realtime Dreamer/train reviews.xlsx'

# Read the labelled reviews from the 'train' sheet.
train_review_df = pd.read_excel(REVIEWS_XLSX_PATH, engine='openpyxl', sheet_name='train', skiprows=0)

# Sanity check: dataset size and class balance of the `emotion` column.
print(train_review_df.shape)
print(train_review_df['emotion'].value_counts())

# NOTE: a bare `train_review_df.head()` is intentionally not placed here;
# only the last expression of a cell is rendered, so mid-cell it shows nothing.

# Map each emotion name to its integer class id, plus the reverse lookup.
label_dict = {'positive':2,'neutral': 1, 'negative':0}
label_dict_inverse = {v: k for k, v in label_dict.items()}

def add_label_to_df(df):
    """Add an integer ``label`` column derived from the ``emotion`` column.

    Every value of ``df['emotion']`` that is a key of the module-level
    ``label_dict`` is replaced by its integer code; values that are not keys
    of ``label_dict`` are left as they are.

    The frame is modified in place (callers rely on the new column being
    present on the frame they passed in) and is also returned.
    """
    encoded = df['emotion'].replace(label_dict)

    # `replace` yields an object-dtype result when swapping strings for ints;
    # the implicit downcast to an integer dtype is deprecated in recent pandas.
    # Request it explicitly so `label` is a numeric column on every version.
    df['label'] = encoded.infer_objects()

    return df

 

def data_split(df):
    """
    Split the labelled reviews into train / validation / test partitions.

    1. Adds the integer ``label`` column through add_label_to_df.
    2. Holds out 20% of the rows, stratified by label to respect the class
       imbalance, then halves that hold-out (again stratified) into
       validation and test partitions.

    Returns (X_train, X_val, X_test, y_train, y_val, y_test), where the X_*
    arrays hold index values of ``df`` and the y_* arrays hold the labels.
    """
    labelled = add_label_to_df(df)
    row_ids = labelled.index.values
    targets = labelled['label'].values

    # First cut: 80% train, 20% hold-out.
    X_train, holdout_ids, y_train, holdout_targets = train_test_split(
        row_ids,
        targets,
        test_size=0.20,
        random_state=RANDOM_SEED,
        stratify=targets,
    )

    # Second cut: split the hold-out evenly into validation and test.
    X_val, X_test, y_val, y_test = train_test_split(
        holdout_ids,
        holdout_targets,
        test_size=0.50,
        random_state=RANDOM_SEED,
        stratify=holdout_targets,
    )

    return X_train, X_val, X_test, y_train, y_val, y_test


def set_data_category_in_df(df):
    """Tag each row of the labelled reviews with its partition name.

    Runs data_split on ``df`` and writes a ``data_category`` column holding
    'train', 'val' or 'test' ('unset' for any row not assigned to a partition).
    The frame is modified in place and returned.
    """
    train_ids, val_ids, test_ids, *_ = data_split(df)

    df['data_category'] = 'unset'
    partitions = (('train', train_ids), ('val', val_ids), ('test', test_ids))
    for category, ids in partitions:
        df.loc[ids, 'data_category'] = category

    return df

def vectorize_train_val_test_data(vec,df,col):
    """Vectorize column ``col`` of the train, validation and test rows.

    ``vec`` is fitted on the rows whose ``data_category`` is 'train' and then
    applied, without refitting, to the 'val' and 'test' rows.

    Returns (X_train, X_val, X_test) in that order.
    """
    def _texts(category):
        # Values of `col` for one partition, in the frame's row order.
        return df.loc[df['data_category'] == category, col].values

    train_matrix = vec.fit_transform(_texts('train'))
    val_matrix = vec.transform(_texts('val'))
    test_matrix = vec.transform(_texts('test'))

    return train_matrix, val_matrix, test_matrix

def calcualte_F1_scores(vec,clf,X_train,y_train,X_val,y_val):
    """Fit ``clf`` on the training data and compute F1 scores on validation data.

    Parameters
    ----------
    vec : unused; kept so existing callers keep working.
    clf : estimator exposing ``fit`` and ``predict``.
    X_train, y_train : training features and labels.
    X_val, y_val : validation features and labels.

    Returns
    -------
    (f1_score_macro, f1_score_micro, f1_score_weighted) on the validation set.
    """
    # Detect Gaussian naive Bayes by type instead of comparing against the
    # module-level `gnb` instance, so the function works for any GaussianNB
    # and does not depend on a global defined after it.
    if isinstance(clf, GaussianNB):
        # This estimator is fed dense arrays; convert only inputs that are
        # actually sparse (i.e. expose `toarray`).
        X_train = X_train.toarray() if hasattr(X_train, 'toarray') else X_train
        X_val = X_val.toarray() if hasattr(X_val, 'toarray') else X_val

    clf = clf.fit(X_train, y_train)
    predicted_y = clf.predict(X_val)

    f1_score_macro = f1_score(y_val, predicted_y, average='macro')
    f1_score_micro = f1_score(y_val, predicted_y, average='micro')
    f1_score_weighted = f1_score(y_val, predicted_y, average='weighted')

    return f1_score_macro, f1_score_micro, f1_score_weighted

def make_scores_dataset(vec):
    """
    Score every classifier in the module-level ``clf_list`` and tabulate the results.

    Each classifier is evaluated with calcualte_F1_scores on the module-level
    X_train / y_train / X_val / y_val. The returned dataframe has one row per
    classifier (named from ``clf_name_list``) with columns F1_score_macro,
    F1_score_micro and F1_score_weighted, sorted by those columns descending.
    """
    per_clf_scores = [
        calcualte_F1_scores(vec, clf, X_train, y_train, X_val, y_val)
        for clf in clf_list
    ]

    metric_columns = ['F1_score_macro', 'F1_score_micro', 'F1_score_weighted']

    scores_df = pd.DataFrame({'ML Classfier': clf_name_list})
    for position, column in enumerate(metric_columns):
        scores_df[column] = [scores[position] for scores in per_clf_scores]

    return scores_df.sort_values(by=metric_columns, ascending=False)

# Initiate TfidfVectorizer to convert the review texts into a matrix of TF-IDF features.
vec = TfidfVectorizer(ngram_range=(1,3))

# Tag every review as train / val / test (this also adds the integer `label` column).
emotion_df = set_data_category_in_df(train_review_df)

# Vectorize the train, validation and test reviews.
X_train, X_val, X_test = vectorize_train_val_test_data(vec, emotion_df, 'Review Content')

# Read the labels through the same `data_category` mask (and therefore the same
# row order) that produced the feature matrices. The label arrays returned by
# data_split are in train_test_split's shuffled order, which does not match the
# row order of X_train / X_val / X_test and would pair features with the wrong labels.
y_train, y_val, y_test = (
    emotion_df.loc[emotion_df['data_category'] == category, 'label'].values
    for category in ('train', 'val', 'test')
)

# Every feature row must have exactly one label.
assert X_train.shape[0] == len(y_train)
assert X_val.shape[0] == len(y_val)
assert X_test.shape[0] == len(y_test)

# Baseline: DummyClassifier predicting classes uniformly at random.
dummy_clf_uni = DummyClassifier(strategy="uniform", random_state=RANDOM_SEED)

# Baseline: DummyClassifier always predicting the most frequent class.
dummy_clf_mfrq = DummyClassifier(strategy="most_frequent", random_state=RANDOM_SEED)

# LogisticRegression with solver='lbfgs', multi_class='auto'.
lr_clf = LogisticRegression(random_state=RANDOM_SEED, solver='lbfgs',multi_class='auto',n_jobs=-1)

# RandomForestClassifier.
rf_clf = RandomForestClassifier(random_state=RANDOM_SEED, n_jobs=-1)

# GaussianNB (takes no seed).
gnb = GaussianNB()

# XGBClassifier, seeded like the other estimators so runs are comparable.
xgb_clf = xgb.sklearn.XGBClassifier(random_state=RANDOM_SEED)

# Single source of truth pairing each display name with its estimator, so the
# two lists consumed by make_scores_dataset cannot drift out of sync.
classifiers = {
    'dummy_clf_uniform': dummy_clf_uni,
    'dummy_clf_most_frequent': dummy_clf_mfrq,
    'LogisticRegression': lr_clf,
    'RandomForestClassifier': rf_clf,
    'GaussianNB': gnb,
    'XGBClassifier': xgb_clf,
}

# Estimators to evaluate, in insertion order of `classifiers`.
clf_list = list(classifiers.values())

# Matching display names, in the same order as clf_list.
clf_name_list = list(classifiers.keys())

# Build make_scores_df with F1_score_macro, F1_score_micro and F1_score_weighted per classifier.
make_scores_df = make_scores_dataset(vec)

# Classifier names ordered by the sorted score table (best macro F1 first).
sorted_clf = list(make_scores_df['ML Classfier'])

# Long form of make_scores_df: one row per (classifier, F1 average type).
# `var_name` must be a scalar for single-level columns; recent pandas rejects
# a list here.
make_scores_df_long = pd.melt(make_scores_df,
                              id_vars=['ML Classfier'],
                              value_vars=['F1_score_macro','F1_score_micro','F1_score_weighted'],
                              var_name='f1 score average type')

# Bar chart of the F1 scores: one panel per F1 average type, one bar per
# classifier, with classifiers ordered by `sorted_clf` on the x axis.
graph = alt.Chart(make_scores_df_long).mark_bar(size=10).encode(
        x = alt.X('ML Classfier:N', sort=sorted_clf, axis=alt.Axis(labelAngle=-90)),
        y = alt.Y('value:Q', title='',scale=alt.Scale(domain=(0,1))),
        color = alt.Color('f1 score average type:N',
                          scale=alt.Scale(scheme='redyellowgreen'),
                          legend=alt.Legend(orient='bottom',
                                            titleFontSize=11,
                                            titleColor='black',
                                            labelFontSize=10.5,
                                            labelColor='black',
                                            direction='horizontal')),
        # No `sort` on the facet: `sorted_clf` holds classifier names, while
        # this field holds F1 average types, so that list matched nothing.
        column=alt.Column('f1 score average type:N',title=''),
        tooltip = ['ML Classfier',
                   'f1 score average type',
                   'value']
        ).interactive(
        ).properties(width=140,
                     height=150
                     )

# Chart title rendered as a text mark so it can sit above the faceted panels.
title = alt.Chart({"values": [{"text": "Sentiment Analysis Supervised ML Model Evaluation"}]}
                      ).mark_text(size=15, dx=150, dy=0, color="black"
                      ).encode(text='text:N'
                      ).properties(width=140,height=20)

# Stack the title above the bar panels and apply the shared view configuration.
chart = (title & graph
        ).configure_view(stroke=None
        ).configure_concat(spacing=15
        ).configure_title(fontSize=12)

# Last expression of the cell, so the chart is rendered as the cell output.
chart

(6018, 4)
positive    5250
negative     499
neutral      269
Name: emotion, dtype: int64
