<code>
<br>
Model List<br>
<br/>
</code>

| Case | Dataset | Classification | Algorithm |
| --- | --- | --- | --- |
|1-1| Simple-Labeled | Multi-Classification (Single-Output) | SVM |
|1-2| Simple-Labeled | Multi-Classification (Single-Output) | DT |
|2-1| Simple-Labeled | One-Versus-Rest Binary-Classification | SVM |
|2-2| Simple-Labeled | One-Versus-Rest Binary-Classification | DT |
|3-1| Multi-Labeled | Multi-Classification (Multi-Output) | SVM |
|3-2| Multi-Labeled | Multi-Classification (Multi-Output) | DT |
|4-1| Multi-Labeled | One-Versus-Rest Binary-Classification | SVM |
|4-2| Multi-Labeled | One-Versus-Rest Binary-Classification | DT |

- Import Module

In [1]:
# ML package
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC

import re, string, nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer  
stop_words = stopwords.words('english')

# evaluation matrix
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# basic package
import numpy as np, pandas as pd, random
import matplotlib.pyplot as plt
import seaborn as sn
from IPython.display import clear_output
import warnings
warnings.filterwarnings('ignore')

# reproducibility
SEED = 42

- Dataset

In [2]:
def DATA(single_label = True):
    # read review datasets 
    def READ(game_name):
        temp = pd.read_excel(f'data_cleaned\\labeled data\\{game_name}.xlsx')[:1000]
        temp['game'] = game_name
        if single_label:
            return temp[temp.check == 1]
        else:
            return temp

    # combine datasets
    game_list = ['Warhammer 40,000 - Darktide','Brotato','Cult of the Lamb','Teardown','Mount & Blade 2 - Bannerlord']
    df_train = pd.concat([READ(game) for game in game_list])
    df_train = df_train[df_train.Spam == 0] # drop spam review
    df_train.drop(columns = ['Spam', 'date', 'helpful_votes', 'check'], inplace = True) # drop unneccessary column
    df_train.dropna(inplace = True)

    # sampe 
    def SAMPLE(i, num, df = df_train, seed = SEED):
        tag_list = ['Not helpful','Suggestion','Pro','Con','Bug']
        df_temp = df[df[tag_list[i]]==1]
        df_temp['Class'] = tag_list[i]
        return df_temp[df_temp[tag_list[i]]==1].sample(n = num, random_state = SEED)
    
    if single_label:
        df_train = pd.concat([SAMPLE(i, 140) for i in range(5)])
    else:
        df_train = pd.concat([df_train[df_train['Not helpful']!=1], SAMPLE(0, 250)])

    # data preprocessing
    sw = stopwords.words('english')
    lemmatizer = WordNetLemmatizer() 

    def clean_text(text):
        text = re.sub(r"[^a-zA-Z?.!,¿]+", " ", text.lower()) # only keep (a-z, A-Z, ".", "?", "!", ",")
        text = re.sub(r"http\S+", "", text) #Removing URLs     
        html = re.compile(r'<.*?>') 
        text = html.sub(r'', text) #Removing html tags
        for punct in '@#!?+&*[]-%.:/();$=><|{}^' + "'`" + '_':
            text = text.replace(punct, '') #Removing punctuations
        text = [word.lower() for word in text.split() if word.lower() not in sw]
        text = " ".join([lemmatizer.lemmatize(word) for word in text]) #removing stopwords
        emoji_pattern = re.compile("["
                            u"\U0001F600-\U0001F64F"  # emoticons
                            u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                            u"\U0001F680-\U0001F6FF"  # transport & map symbols
                            u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                            u"\U00002702-\U000027B0"
                            u"\U000024C2-\U0001F251"
                            "]+", flags = re.UNICODE) 
        return emoji_pattern.sub(r'', text) #Removing emojis  

    df_train.review_clear = df_train.review_clear.apply(lambda x: clean_text(x))

    return df_train

- Model

In [3]:
# train/test 
def TRAIN_TEST(df):  
    train_dataset, test_dataset = train_test_split(df, test_size = 0.3, random_state = SEED)
    return train_dataset, test_dataset

# vectorization
def REPRESENTATION(df, train_dataset, test_dataset):
    vectorizer = TfidfVectorizer()
    vectorizer.fit(df.review_clear.values.tolist())
    X_train = vectorizer.transform(train_dataset.review_clear.values.tolist())
    X_test = vectorizer.transform(test_dataset.review_clear.values.tolist())
    return X_train, X_test

def CLASSIFIER(X, Y, multi_output = False):
    if multi_output == False:
        DT, SVM = DecisionTreeClassifier(random_state = SEED), SVC(random_state = SEED)
        DT.fit(X, Y), SVM.fit(X, Y)
        return DT, SVM
    else:
        DT, SVM = DecisionTreeClassifier(random_state = SEED), MultiOutputClassifier(SVC(random_state = SEED))
        DT.fit(X, Y), SVM.fit(X, Y)
        return DT, SVM

- Inference

In [4]:
# the validation dataset
def RESULT(y_pred, y_true, case = 0):
    label_list = ['Not helpful','Suggestion','Pro','Con','Bug']
    if case == 1:
        print(classification_report(y_true ,y_pred))
        print(np.transpose(confusion_matrix(y_true, y_pred, labels = label_list)))
        for label in ['Not helpful','Suggestion','Pro','Con','Bug']:
            y_true_01 = np.where(y_true == label, 1, 0)
            y_pred_01 = np.where(y_pred == label, 1, 0)
            print(f'the confusion matrix of {label}')
            tn, fp, fn, tp = confusion_matrix(y_true_01, y_pred_01).ravel()
            print(np.array([[tp,fp],[fn,tn]]))
        print('---'*15 + '\n')
    else:
        print(f"Number of prediction/real: {y_pred.sum()} / {y_true.sum()}")
        print("Accuracy:", accuracy_score(y_pred, y_true))
        print("Precision:", precision_score(y_pred, y_true))
        print("Recall:", recall_score(y_pred, y_true))
        print("F1 score:", f1_score(y_pred, y_true))
        tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
        print('the confusion matrix:', np.array([[tp,fp],[fn,tn]]), sep = '\n')
        print('---'*15 + '\n')

In [5]:
if __name__ == '__main__':
    tag_list, case = ['Not helpful','Suggestion','Pro','Con','Bug'], 4

    if case == 1: # single-label dataset, multi-class
        # training
        df_train = DATA(single_label = True)
        train_dataset, test_dataset = TRAIN_TEST(df_train)
        X_train, X_test = REPRESENTATION(df_train, train_dataset, test_dataset)
        DT, SVM = CLASSIFIER(X_train, train_dataset.Class.values)
        # result
        print('---Case1 DT---\n')
        RESULT(DT.predict(X_test), test_dataset.Class.values, case = 1)
        print('---Case1 SVM---\n')
        RESULT(SVM.predict(X_test), test_dataset.Class.values, case = 1)

    elif case == 2: # single-label dataset, One-vs-Rest
        df_train = DATA(single_label = True)
        for i in range(1,5):
            # training
            train_dataset, test_dataset = TRAIN_TEST(df_train)
            X_train, X_test = REPRESENTATION(df_train, train_dataset, test_dataset)
            DT, SVM = CLASSIFIER(X_train, train_dataset[tag_list[i]].values)
            # result
            print(f'the Case2 DT confusion matrix of {tag_list[i]}')
            RESULT(DT.predict(X_test), test_dataset[tag_list[i]].values)
            print(f'the Case2 SVM confusion matrix of {tag_list[i]}')
            RESULT(SVM.predict(X_test), test_dataset[tag_list[i]].values)


    elif case == 3: # multi-label dataset, multi-class
        df_train = DATA(single_label = False)
        # training
        train_dataset, test_dataset = TRAIN_TEST(df_train)
        X_train, X_test = REPRESENTATION(df_train, train_dataset, test_dataset)
        DT, SVM = CLASSIFIER(X_train, train_dataset[tag_list].values, multi_output = True)
        # result
        for i in range(0,5):
            print(f'the Case3 DT confusion matrix of {tag_list[i]}')
            RESULT(DT.predict(X_test)[:,2], test_dataset[tag_list[i]].values)
            print(f'the Case3 SVM confusion matrix of {tag_list[i]}')
            RESULT(SVM.predict(X_test)[:,2], test_dataset[tag_list[i]].values)
    

    elif case == 4: # multi-label dataset, One-vs-Rest
        df_train = DATA(single_label = False)
        for i in range(1,5):
            # training
            train_dataset, test_dataset = TRAIN_TEST(df_train)
            X_train, X_test = REPRESENTATION(df_train, train_dataset, test_dataset)
            print(X_test.shape)
            DT, SVM = CLASSIFIER(X_train, train_dataset[tag_list[i]].values)
            # result
            print(f'the Case4 DT confusion matrix of {tag_list[i]}')
            RESULT(DT.predict(X_test), test_dataset[tag_list[i]].values)
            print(f'the Case4 SVM confusion matrix of {tag_list[i]}')
            RESULT(SVM.predict(X_test), test_dataset[tag_list[i]].values)

(617, 9516)
the Case4 DT confusion matrix of Suggestion
Number of prediction/real: 60 / 56
Accuracy: 0.893030794165316
Precision: 0.44642857142857145
Recall: 0.4166666666666667
F1 score: 0.43103448275862066
the confusion matrix:
[[ 25  35]
 [ 31 526]]
---------------------------------------------

the Case4 SVM confusion matrix of Suggestion
Number of prediction/real: 3 / 56
Accuracy: 0.9108589951377634
Precision: 0.03571428571428571
Recall: 0.6666666666666666
F1 score: 0.06779661016949153
the confusion matrix:
[[  2   1]
 [ 54 560]]
---------------------------------------------

(617, 9516)
the Case4 DT confusion matrix of Pro
Number of prediction/real: 278 / 303
Accuracy: 0.6904376012965965
Precision: 0.6435643564356436
Recall: 0.7014388489208633
F1 score: 0.6712564543889845
the confusion matrix:
[[195  83]
 [108 231]]
---------------------------------------------

the Case4 SVM confusion matrix of Pro
Number of prediction/real: 278 / 303
Accuracy: 0.8038897893030794
Precision: 0.759