In [1]:
import pandas as pd                            # to analyse data that are stored in a csv file
import numpy as np                             # to provide a large set of numeric datatypes that can be used to construct arrays
import nltk                                    # a platform for building Python programs to work with human language data
from nltk.corpus import stopwords              # to remove stopwords
from nltk.stem import WordNetLemmatizer        # to lemmatize
from nltk.corpus import wordnet                # used to check whether the word is an adjective, noun, verb or adverb
import re   # regex model

In [2]:
dat = pd.read_csv('preprocessed_review.csv')
dat['processed'] = np.nan
dat = dat.drop(columns = ['Unnamed: 0'])      # drop unnecessary column

In [3]:
def get_wordnet_pos(word):
    """    
    This function gets the wordnet postag of each words.
    :param word: word in each review texts
    :returns: the postag of each word 
    """
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}

    return tag_dict.get(tag, wordnet.NOUN)


In [9]:
stop_words = set(stopwords.words('english'))     # define the stop words
lemmatizer = WordNetLemmatizer()                 # define the lemmatizer
def preprocess(review):
    """
    This function takes in a list and preprocess accordingly. 
    :param review: list as input
    :returns: preprocessed words
    """
    result = re.sub(r'\d+','', review)      # Remove numbers/ digits
    result = re.sub(r'[^\w\s]','',result)   # Remove puntuations
    val = result.lower()                    # Convert all the reviews to lowercase
    new_list = []
    for word in val.split():                # tokenize and stop words removal
        if word not in stop_words:
            new_list.append(word)
    
    return new_list
    
    
def lemmatize_it(series_list):
    """
    This function is to carry out lemmatization on the
    tokenized review
    :series_list: series object that contains token to be lemmatized
    :returns: lemmatized word
    """
    stem_it = []
    for i in series_list:
        lem = lemmatizer.lemmatize(i, get_wordnet_pos(i))    # lemmatize based on the POS tag
        stem_it.append(lem)
        
    return stem_it

In [10]:
# Text preprocessing 
dat['processed'] = dat['review'].apply(preprocess)
dat['processed'] = dat['processed'].apply(lemmatize_it)
print(dat['processed'][0])    # to make sure the result is as expected

['part', 'magic', 'grow', 'boy', 'buy', 'give', 'new', 'hornby', 'catalogue', 'every', 'year', 'even', 'include', 'product', 'previous', 'year', 'ive', 'still', 'get', 'old', 'one', 'date', 'back', 'somewhere', 'day', 'catalogue', 'especially', 'informative', 'tell', 'vintage', 'roll', 'stock', 'useful', 'dedicate', 'railway', 'one', 'particular', 'era', 'train', 'company']


In [11]:
# tf-idf using built-in function
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

X_1 = dat['processed'].values
Y_1 = dat['rating'].values

def dummy_func(docs):
    """
    Works as a dummy function as the name implies
    """
    return docs

vectorizer = TfidfVectorizer(analyzer='word',tokenizer=dummy_func, preprocessor=dummy_func, token_pattern=None)
transformer = TfidfTransformer()
X = transformer.fit_transform(vectorizer.fit_transform(X_1))    # do tfidf transformer after tfidf vectorizer
feature_names = vectorizer.get_feature_names()
corpus_index = [n for n in range(len(X_1))]
rows, cols = X.nonzero()
print(X.shape)

(28212, 28732)


In [18]:
"""
Train model with kfold cross-validation
After getting the tfidf for each review, do kfold, undersample/ oversample
and pass the result into the model for training
"""

from sklearn.model_selection import StratifiedKFold         # to perform kfold cross validation
from imblearn.under_sampling import NearMiss                # to perform undersampling
from imblearn.over_sampling import ADASYN                   # to perform oversampling
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score # score for evalution purposes
from sklearn.neural_network import MLPClassifier            # to train with MLP classifier
import time                                                 # to keep track of the time for program to execute
from sklearn.svm import SVC, LinearSVC                      #to train support vector machine models
from sklearn.multiclass import OneVsRestClassifier    #a multiclass strategy that fits one classifier per class
from sklearn.feature_selection import SelectKBest     #a feature selection method
from sklearn.feature_selection import chi2            #compute chi-squared stats between each non-negative feature and class
from imblearn.pipeline import make_pipeline           #construct a pipeline from given estimators. Automates a machine learning workflow
import datetime


# list declaration
Y = Y_1
accuracy = []
precision_micro = []
recall_micro = []
f1_micro =[]
precision_macro = []
recall_macro = []
f1_macro =[]
precision_weighted = []
recall_weighted = []
f1_weighted =[]

def train_model(X, Y):
    """
    We will be experimenting on Support Vector Machine and Multilayer Perceptron Neural Network 
    to determine which algorithm builds a better model. We will also perform both oversampling(ADASYN)
    and undersampling (NearMiss) to determine which resampling method is more accurate. Besides that, 
    we will be using SelectKBest method to select features. We will be testing out OneVsRestClassifier
    on SVC models to see whether it improves the accuracy and precision values. The usage of pipeline is
    to ensure that the workflow is properly followed.
    
    Some lines of code for LinearSVC, SVC and MLP are commented out to prevent confusion. 
    To test out a specific line of code, comment out the current line of code and uncomment the you want to try.
    To run the code without feature selection, remove SelectKBest() from the respective line of code.
    
    Some lines of code for LinearSVC, SVC and MLP are commented out to prevent confusion. 
    
    :params X: the tfidf values to be trained
    :params Y: the ratings to be trained
    """
    # k fold cross validation
    skf = StratifiedKFold(n_splits=10)
    for train_index, test_index in skf.split(X, Y):
        start_fold = datetime.datetime.now()
        X_train, X_test = X[train_index], X[test_index]
        Y_train, Y_test = Y[train_index], Y[test_index]
        
        ''' SVC
        For our research, we switched the kernel between 'rbf' and 'linear' with different numbers of features selected.
        The first and second line of code performs oversampling whereas the third and fourth line of code performs undersampling.
        The first and third line of code contains OneVsRestClassifier whereas the second and fourth line of code does not.
        '''
        #pipeline = make_pipeline(SelectKBest(chi2, k=10000),ADASYN(),OneVsRestClassifier(SVC(kernel='rbf',gamma='auto',cache_size=1000)))
        #pipeline = make_pipeline(SelectKBest(chi2, k=10000),ADASYN(),SVC(kernel='rbf',gamma='auto',cache_size=1000))
        #pipeline = make_pipeline(SelectKBest(chi2, k=10000),NearMiss(),OneVsRestClassifier(SVC(kernel='rbf',gamma='auto',cache_size=1000)))
        #pipeline = make_pipeline(SelectKBest(chi2, k=10000),NearMiss(),SVC(kernel='rbf',gamma='auto',cache_size=1000))
        
        ''' LinearSVC
        The first line of code performs oversampling whereas the second line of code performs undersampling.
        The third line of code does not perform any feature selection.
        '''
        #pipeline = make_pipeline(SelectKBest(chi2, k=25000),ADASYN(),LinearSVC(C=1,max_iter=10000,random_state=42))
        #pipeline = make_pipeline(SelectKBest(chi2, k=10000),NearMiss(),LinearSVC(C=1,max_iter=10000,random_state=42))
        pipeline = make_pipeline(ADASYN(random_state=42),LinearSVC(C=1,random_state=42))
        
        ''' MLPClassifier
        For our research, we switched the solver parameter between 'sgd' and 'adam' and tried different values of hidden_layer_sizes.
        The first line of code performs oversampling whereas the second line of code performs undersampling
        '''
        #pipeline = make_pipeline(SelectKBest(chi2, k=10000),ADASYN(),MLPClassifier(solver='sgd', hidden_layer_sizes= (5,5), max_iter=1000,random_state=1))
        #pipeline = make_pipeline(SelectKBest(chi2, k=10000),NearMiss(),MLPClassifier(solver='sgd', hidden_layer_sizes= (5,5), max_iter=1000,random_state=1))
        
        #fit the model
        pipeline.fit(X_train, Y_train)
        #using testing data to predict the results
        y_pred = pipeline.predict(X_test)
        
        print(confusion_matrix(Y_test, y_pred))
        print(classification_report(Y_test,y_pred))

        #appends the results into respective lists
        accuracy.append(accuracy_score(Y_test,y_pred))
        #micro average
        precision_micro.append(precision_score(Y_test,y_pred,average='micro',labels=np.unique(y_pred)))
        recall_micro.append(recall_score(Y_test,y_pred,average='micro'))
        f1_micro.append(f1_score(Y_test,y_pred,average='micro',labels=np.unique(y_pred)))
        #macro average
        precision_macro.append(precision_score(Y_test,y_pred,average='macro',labels=np.unique(y_pred)))
        recall_macro.append(recall_score(Y_test,y_pred,average='macro'))
        f1_macro.append(f1_score(Y_test,y_pred,average='macro',labels=np.unique(y_pred)))
        #weighted average
        precision_weighted.append(precision_score(Y_test,y_pred,average='weighted',labels=np.unique(y_pred)))
        recall_weighted.append(recall_score(Y_test,y_pred,average='weighted'))
        f1_weighted.append(f1_score(Y_test,y_pred,average='weighted',labels=np.unique(y_pred)))
        
        print("duration for this fold: ",datetime.datetime.now() - start_fold)
        
    # prints out the mean of the result from respective lists    
    print("accuracy testing: {}".format(np.mean(accuracy)))
    print("precision_micro: {}".format(np.mean(precision_micro)))
    print("recall_micro: {}".format(np.mean(recall_micro)))
    print("f1_micro: {}".format(np.mean(f1_micro)))
    print("")
    print("precision_macro: {}".format(np.mean(precision_macro)))
    print("recall_macro: {}".format(np.mean(recall_macro)))
    print("f1_macro: {}".format(np.mean(f1_macro)))
    print("")
    print("precision_weighted: {}".format(np.mean(precision_weighted)))
    print("recall_weighted: {}".format(np.mean(recall_weighted)))
    print("f1_weighted: {}".format(np.mean(f1_weighted)))

In [19]:
# To train the data
train_model(X, Y)

[[  14    9   16   10   12]
 [   4    9   17    9   16]
 [   5   13   61   43   38]
 [   9   13   61  185  227]
 [  17   20  128  462 1425]]
              precision    recall  f1-score   support

           1       0.29      0.23      0.25        61
           2       0.14      0.16      0.15        55
           3       0.22      0.38      0.28       160
           4       0.26      0.37      0.31       495
           5       0.83      0.69      0.76      2052

   micro avg       0.60      0.60      0.60      2823
   macro avg       0.35      0.37      0.35      2823
weighted avg       0.67      0.60      0.63      2823

duration for this fold:  0:00:41.301006
[[  24    6   16    8    7]
 [  11    3   18   14    9]
 [  18    4   72   40   26]
 [  19   14   92  158  212]
 [  48   40  167  431 1366]]
              precision    recall  f1-score   support

           1       0.20      0.39      0.27        61
           2       0.04      0.05      0.05        55
           3       0.20   