In [21]:
import pandas as pd                            # to analyse data that are stored in a csv file
import numpy as np                             # to provide a large set of numeric datatypes that can be used to construct arrays
import nltk                                    # a platform for building Python programs to work with human language data
from nltk.corpus import stopwords              # to remove stopwords
from nltk.stem import WordNetLemmatizer        # to lemmatize
from nltk.corpus import wordnet                # used to check whether the word is an adjective, noun, verb or adverb
import re   # regex model

In [22]:
dat = pd.read_csv('review_ver2.csv')
dat['processed'] = np.nan
dat = dat.drop(columns = ['Unnamed: 0'])      # drop unnecessary column

In [23]:
def get_wordnet_pos(word):
    """    
    This function gets the wordnet postag of each words.
    :param word: word in each review texts
    :returns: the postag of each word 
    """
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}

    return tag_dict.get(tag, wordnet.NOUN)


In [24]:
stop_words = set(stopwords.words('english'))     # define the stop words
lemmatizer = WordNetLemmatizer()                 # define the lemmatizer
def preprocess(review):
    """
    This function takes in a series object and 
    preprocess accordingly. 
    :param review: series object
    :returns: preprocessed words
    """
    result = review.str.replace(r'\d+', '')     # Remove numbers/ digits
    result = result.str.replace(r'\W', ' ')     # Remove puntuations
    val = result.str.lower()                    # Convert all the reviews to lowercase 
    
    return val.apply(lambda row: [word for word in row.split() if word not in stop_words])   # tokenize and stop words removal
    
    
def lemmatize_it(series_list):
    """
    This function is to carry out lemmatization on the
    tokenized review
    :series_list: series object that contains token to be lemmatized
    :returns: lemmatized word
    """
    stem_it = []
    for i in series_list:
        lem = lemmatizer.lemmatize(i, get_wordnet_pos(i))    # lemmatize based on the POS tag
        stem_it.append(lem)
        
    return stem_it

In [25]:
# Text preprocessing 
dat['processed'] = preprocess(dat['review'])
dat['processed'] = dat['processed'].apply(lemmatize_it)
print(dat['processed'][0])    # to make sure the result is as expected

['part', 'magic', 'grow', 'boy', 'buy', 'give', 'new', 'hornby', 'catalogue', 'every', 'year', 'even', 'include', 'product', 'previous', 'year', 'still', 'get', 'old', 'one', 'date', 'back', 'somewhere', 'day', 'catalogue', 'especially', 'informative', 'tell', 'vintage', 'roll', 'stock', 'useful', 'dedicate', 'railway', 'one', 'particular', 'era', 'train', 'company']


In [26]:
# tf-idf using built-in function
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

X_1 = dat['processed'].values
Y_1 = dat['rating'].values

def dummy_func(docs):
    """
    Works as a dummy function as the name implies
    """
    return docs

vectorizer = TfidfVectorizer(analyzer='word',tokenizer=dummy_func, preprocessor=dummy_func, token_pattern=None)
transformer = TfidfTransformer()
X = transformer.fit_transform(vectorizer.fit_transform(X_1))    # do tfidf transformer after tfidf vectorizer
feature_names = vectorizer.get_feature_names()
corpus_index = [n for n in range(len(X_1))]
rows, cols = X.nonzero()
print(X.shape)

(28212, 18905)


In [29]:
"""
Train model with kfold cross-validation
After getting the tfidf for each review, do kfold, undersample/ oversample
and pass the result into the model for training
"""

from sklearn.model_selection import StratifiedKFold         # to perform kfold cross validation
from imblearn.under_sampling import NearMiss                # to perform undersampling
from imblearn.over_sampling import ADASYN                   # to perform oversampling
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score # score for evalution purposes
from sklearn.neural_network import MLPClassifier            # to train with MLP classifier
import time                                                 # to keep track of the time for program to execute
from sklearn import svm                                     # to train with SVC
from sklearn.svm import SVC

# list declaration
Y = Y_1
accuracy = []
precision_micro = []
recall_micro = []
f1_micro =[]
precision_macro = []
recall_macro = []
f1_macro =[]
precision_weighted = []
recall_weighted = []
f1_weighted =[]

def train_model(X, Y):
    """
    We will be experimenting on Support Vector Machine and Multilayer Perceptron Neural Network 
    to determine which algorithm builds a better model. We will also perform both oversampling(ADASYN)
    and undersampling (NearMiss) to determine which resampling method is more accurate. 
    
    Some lines of code for LinearSVC, MLP are commented out to prevent confusion. 
    
    :params X: the tfidf values to be trained
    :params Y: the ratings to be trained
    """
    # k fold cross validation
    skf = StratifiedKFold(n_splits=10)
    for train_index, test_index in skf.split(X, Y):
        print("TRAIN:", train_index, "TEST:", test_index)
        X_train, X_test = X[train_index], X[test_index]
        Y_train, Y_test = Y[train_index], Y[test_index]
        
        """
        # Do undersampling for training data
        start = time.time()
        nm = NearMiss()
        X_res, Y_res = nm.fit_resample(X_train, Y_train)
        done = time.time()
        elapsed = done - start
        print("Undersampling: ",elapsed)
        print(len(Y_res))
        """

        # Do oversampling for training data
        start = time.time()
        ada = ADASYN(random_state = 4, sampling_strategy='auto')
        X_res, Y_res = ada.fit_resample(X_train, Y_train)
        done = time.time()
        elapsed = done - start
        print("Oversampling: ",elapsed)
        print(len(Y_res))

        """
        # Do Linear SVC
        """
        start1 = time.time()
        classifier = svm.LinearSVC()
        classifier.fit(X_res, Y_res) 
        y_pred = classifier.predict(X_test)

        """
        # Do MLP
        start1 = time.time()
        mlp = MLPClassifier(solver='sgd', hidden_layer_sizes= (3,2), random_state=1, max_iter = 1000)
        mlp.fit(X_res, y_res)
        y_pred = mlp.predict(X_test)
        """
        
        print(confusion_matrix(Y_test, y_pred))
        print(classification_report(Y_test,y_pred))

        #appends the results into respective lists
        accuracy.append(accuracy_score(Y_test,y_pred))
        #micro average
        precision_micro.append(precision_score(Y_test,y_pred,average='micro',labels=np.unique(y_pred)))
        recall_micro.append(recall_score(Y_test,y_pred,average='micro'))
        f1_micro.append(f1_score(Y_test,y_pred,average='micro',labels=np.unique(y_pred)))
        #macro average
        precision_macro.append(precision_score(Y_test,y_pred,average='macro',labels=np.unique(y_pred)))
        recall_macro.append(recall_score(Y_test,y_pred,average='macro'))
        f1_macro.append(f1_score(Y_test,y_pred,average='macro',labels=np.unique(y_pred)))
        #weighted average
        precision_weighted.append(precision_score(Y_test,y_pred,average='weighted',labels=np.unique(y_pred)))
        recall_weighted.append(recall_score(Y_test,y_pred,average='weighted'))
        f1_weighted.append(f1_score(Y_test,y_pred,average='weighted',labels=np.unique(y_pred)))
        
        done1 = time.time()
        elapsed1 = done1 - start1
        print("Linear SVM: ", elapsed1)
        
    # prints out the mean of the result from respective lists    
    print("accuracy testing: {}".format(np.mean(accuracy)))
    print("precision_micro: {}".format(np.mean(precision_micro)))
    print("recall_micro: {}".format(np.mean(recall_micro)))
    print("f1_micro: {}".format(np.mean(f1_micro)))
    print("")
    print("precision_macro: {}".format(np.mean(precision_macro)))
    print("recall_macro: {}".format(np.mean(recall_macro)))
    print("f1_macro: {}".format(np.mean(f1_macro)))
    print("")
    print("precision_weighted: {}".format(np.mean(precision_weighted)))
    print("recall_weighted: {}".format(np.mean(recall_weighted)))
    print("f1_weighted: {}".format(np.mean(f1_weighted)))

In [30]:
# To train the data
train_model(X, Y)

TRAIN: [ 2762  2763  2765 ... 28209 28210 28211] TEST: [   0    1    2 ... 3282 3295 3307]
Oversampling:  37.185096740722656
91256
[[  15    8   15   10   13]
 [   5    7   14   13   16]
 [   7   11   57   42   43]
 [   7   12   65  196  215]
 [  16   27  127  452 1430]]
              precision    recall  f1-score   support

           1       0.30      0.25      0.27        61
           2       0.11      0.13      0.12        55
           3       0.21      0.36      0.26       160
           4       0.27      0.40      0.32       495
           5       0.83      0.70      0.76      2052

   micro avg       0.60      0.60      0.60      2823
   macro avg       0.34      0.36      0.35      2823
weighted avg       0.67      0.60      0.63      2823

Linear SVM:  4.822035789489746
TRAIN: [    0     1     2 ... 28209 28210 28211] TEST: [2762 2763 2765 ... 5803 5804 5820]
Oversampling:  36.0153603553772
91239
[[  23    6   16    8    8]
 [   9    3   17   14   12]
 [  20    4   67   42  