In [2]:
import pandas as pd
import numpy as np
import nltk
import re
import datetime
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet
from nltk.corpus import words
from nltk.corpus import stopwords
from contractions import CONTRACTION_MAP
from stopwords import stop_words
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

# Load the review data set; the file is decoded as ISO-8859-1
dat = pd.read_csv('review_ver2.csv', encoding="ISO-8859-1")

# Raise the pandas display limits so that printed frames are not truncated
for _option, _value in (('display.max_columns', 20),
                        ('display.max_rows', 100),
                        ('display.width', 1000)):
    pd.set_option(_option, _value)

In [3]:
def expand_contractions(word):
    """
    Expand the contractions of a review (such as I'll to I will).
    The review is split on single spaces and every piece that is a key of
    CONTRACTION_MAP is swapped for its mapped value; other pieces are kept as they are.
    :param word: a single review
    :returns: the review with the known contractions expanded
    """
    pieces = []
    for piece in word.split(" "):
        # fall back to the piece itself when the map has no entry for it
        pieces.append(CONTRACTION_MAP.get(piece, piece))
    return ' '.join(pieces)

# Cache of word -> WordNet POS constant. Every word is tagged on its own (a one-element
# list), so its tag cannot depend on the surrounding review and can safely be reused.
_WORDNET_POS_CACHE = {}

def get_wordnet_pos(word):
    """
    This function gets the wordnet postag of a word.
    The word is tagged in isolation with nltk.pos_tag and the first letter of the resulting
    tag is mapped to a WordNet constant. Results are memoised per word, so a word that occurs
    in many reviews is only tagged once.
    :param word: word in each review texts
    :returns: the wordnet postag of the word (wordnet.NOUN when the tag is not an
              adjective, noun, verb or adverb tag)
    """
    if word not in _WORDNET_POS_CACHE:
        # keep only the first letter of the tag, e.g. 'VBD' -> 'V'
        tag = nltk.pos_tag([word])[0][1][0].upper()
        tag_dict = {"J": wordnet.ADJ, "N":wordnet.NOUN, "V":wordnet.VERB, "R":wordnet.ADV}
        _WORDNET_POS_CACHE[word] = tag_dict.get(tag,wordnet.NOUN)

    return _WORDNET_POS_CACHE[word]
    
def preprocessing_text(text):
    """
    This function preprocesses a review text by expanding contractions, removing numbers and
    punctuation, lowercasing, tokenizing, lemmatizing and finally removing stopwords.
    :param text: a single text review; a value that is not a string (for example the NaN
                 pandas stores for an empty cell) is treated as an empty review
    :returns: a list of preprocessed words (empty for a non-string input)
    """
    # missing reviews arrive as float NaN and have no string methods
    if not isinstance(text, str):
        return []

    #contractions
    expanded_text = expand_contractions(text)
    #remove numbers
    numbers_removed = re.sub(r'\d+','',expanded_text)
    #remove punctuation (everything that is neither a word character nor whitespace)
    punct_removed = re.sub(r'[^\w\s]','',numbers_removed)
    #tokenization
    tokens = nltk.word_tokenize(punct_removed.lower())

    #lemmatization, then removal of stop words
    #(named english_stop_words so the stop_words imported at the top of the file is not shadowed)
    english_stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()
    lem_words = []
    for word in tokens:
        lemma = lemmatizer.lemmatize(word,get_wordnet_pos(word))
        # the stop word test is applied to the lemma, not to the original token
        if lemma not in english_stop_words:
            lem_words.append(lemma)

    return lem_words


In [4]:
def get_bigram(lem_words):
    """
    This function gets the bigrams of a review text.
    :param lem_words: a list of preprocessed words
    :returns: a list of bigrams in the form ['wordA wordB','wordB wordC',...]; when the review
              has fewer than two words no bigram can be formed and lem_words itself is returned
    """
    #the review is empty or contains a single word only, hence unable to perform bigram
    if len(lem_words) <= 1:
        return lem_words

    #pair every word with its successor: ('wordA','wordB'),('wordB','wordC'),...
    #and join each pair with a single space
    return [' '.join(pair) for pair in zip(lem_words, lem_words[1:])]

In [5]:
#applies the preprocessing_text function on all items in the review column and times it
start_text_preprocessing = datetime.datetime.now()
lem_tokens = dat['review'].apply(preprocessing_text)
print("duration for text preprocessing: ",datetime.datetime.now() - start_text_preprocessing)

#applies the get_bigram function on all the preprocessed reviews and times it
start_bigram = datetime.datetime.now()
bigram_list = lem_tokens.apply(get_bigram)
print("duration for generating bigram: ",datetime.datetime.now() - start_bigram)

duration for text preprocessing:  0:15:28.856058
duration for generating bigram:  0:00:00.617649


In [6]:

def check_rating(rates,bigram):
    """
    This function keeps only the reviews whose rating can be converted to an integer.
    Ratings and bigram lists are walked in parallel; a pair is dropped when int() rejects the
    rating (e.g. text, None, NaN or infinity). The rating is converted before anything is
    appended, so the two returned lists always stay aligned.
    :param rates: a list of ratings
    :param bigram: list of bigram words, in the same order as rates
    :returns: a new list of bigram words and a new list of integer ratings (in that order)
    """
    new_bigram = []
    new_rating = []

    for rate, words in zip(rates, bigram):
        try:
            rating = int(rate)
        except (TypeError, ValueError, OverflowError):
            # not an integer rating: skip the whole review
            continue
        new_rating.append(rating)
        new_bigram.append(words)

    return new_bigram, new_rating


# keep only the reviews whose rating is usable as an integer
rate_list = dat['rating'].values.tolist()
new_bigram, new_rating = check_rating(rate_list, bigram_list)


# one single-column data frame for the bigram lists and one for the ratings
df_bigram = pd.DataFrame({'review': new_bigram})
df_rate = pd.DataFrame({'rating': new_rating})
# place the two columns side by side
result = pd.concat([df_bigram, df_rate], axis=1)
print(result.head(10))

                                              review  rating
0  [part magic, magic grow, grow boy, boy buy, bu...       4
1  [amaze detail, detail every, every credit, cre...       5
2  [purchase behalf, behalf dad, dad always, alwa...       5
3  [everything really, really need, need see, see...       5
4  [collect glossy, glossy picture, picture great...       5
5  [great book, book extremely, extremely useful,...       5
6  [useful info, info someonelike, someonelike st...       5
7  [well produce, produce good, good quality, qua...       5
8     [happy communication, communication funkybuys]       4
9                                        [great buy]       5


In [7]:
#Calculating the tf-idf values

# Pull the two columns of `result` out as arrays:
# X_train_1 holds the reviews (each one a list of bigrams), Y_train_1 the integer ratings.
X_train_1 = result['review'].values
Y_train_1 = result['rating'].values

def identity_tokenizer(text):
    """
    Pass-through hook: hands its argument back untouched.
    :param text: any value
    :returns: the very same value
    """
    return text

# Each review is already a list of bigrams, so the preprocessor and the tokenizer are both
# replaced by the pass-through function and every list item is used as one term.
# NOTE(review): with a custom preprocessor scikit-learn appears to skip its own lowercasing,
# so lowercase=True probably has no effect here - confirm against the installed version.
tfidf_1 = TfidfVectorizer(
    analyzer='word',
    preprocessor=identity_tokenizer,
    tokenizer=identity_tokenizer,
    lowercase=True,
)
X_1 = tfidf_1.fit_transform(X_train_1)

print("shape of the vectorizer: ",X_1.shape)


shape of the vectorizer:  (28212, 288098)


In [9]:
#Resampling methods

from imblearn.over_sampling import ADASYN
from imblearn.under_sampling import NearMiss

def oversampling(X_train,y_train):
    """
    This function performs oversampling using ADASYN.
    :params X_train: the tfidf values to be trained
    :params y_train: the ratings to be trained
    :returns: the oversampled X_train and y_train data
    """
    ada = ADASYN()
    # current imbalanced-learn releases only provide fit_resample; fit_sample is kept as a
    # fallback for old installations that do not have the new name yet
    resample = ada.fit_resample if hasattr(ada, 'fit_resample') else ada.fit_sample
    X_train_smt,y_train_smt = resample(X_train,y_train)
    return X_train_smt,y_train_smt

def undersampling(X_train,y_train):
    """
    This function performs undersampling using NearMiss.
    :params X_train: the tfidf values to be trained
    :params y_train: the ratings to be trained
    :returns: the undersampled X_train and y_train data
    """
    nr = NearMiss()
    # current imbalanced-learn releases only provide fit_resample; fit_sample is kept as a
    # fallback for old installations that do not have the new name yet
    resample = nr.fit_resample if hasattr(nr, 'fit_resample') else nr.fit_sample
    X_train_nr,y_train_nr = resample(X_train,y_train)
    return X_train_nr,y_train_nr

In [8]:
#Plot graph to show top 10 features associated with respective ratings

# import matplotlib.pyplot as plt

# def plot_coefficients(classifier, feature_names, top_features=10):
#     coef = classifier.coef_
#     print(coef)
#     for i in range(len(coef)):
#         top_positive_coefficients = np.argsort(coef[i])[-top_features:]
#         # create plot
#         plt.figure(figsize=(15, 5))
#         colors = ['red' if c < 0 else 'blue' for c in coef[i][top_positive_coefficients]]
#         plt.bar(np.arange(2 * top_features), coef[i][top_positive_coefficients], color=colors)
#         feature_names = np.array(feature_names)
#         plt.xticks(np.arange(1, 1 + 2 * top_features), feature_names[top_positive_coefficients], rotation=60, ha='right')
#         plt.show()

In [9]:
from sklearn.model_selection import KFold,StratifiedKFold
from imblearn.over_sampling import ADASYN
from imblearn.under_sampling import NearMiss
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC,LinearSVC
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, classification_report,confusion_matrix
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from imblearn.pipeline import make_pipeline


def train_model(X,Y):
    """
    Runs a stratified 10-fold cross validation of one feature selection + resampling + classifier
    pipeline, prints the confusion matrix and classification report of every fold and finally
    prints the mean of every score over the folds.

    We will be experimenting on Support Vector Machine and Multilayer Perceptron Neural Network
    to determine which algorithm builds a better model. We will also perform both oversampling (ADASYN)
    and undersampling (NearMiss) to determine which resampling method is more accurate. Besides that,
    we will be using the SelectKBest method to select features. We will be testing out OneVsRestClassifier
    on SVC models to see whether it improves the accuracy and precision values. The pipeline is used
    to prevent any data leakage: selection and resampling are fitted on the training part of a fold only.

    Some lines of code for LinearSVC, SVC and MLP are commented out to prevent confusion.
    To test out a specific line of code, comment out the current line of code and uncomment the one you want to try.
    To run the code without feature selection, remove SelectKBest() from the respective line of code.

    Precision, recall and f1 are all computed over the complete set of classes found in Y, so a class
    that is never predicted in a fold counts with a score of 0 instead of being left out of the average.

    :params X: the tfidf values to be trained
    :params Y: the ratings to be trained
    :returns: a dict that maps every score name ('accuracy', 'precision_micro', 'recall_micro',
              'f1_micro', ... 'f1_weighted') to its mean over the folds
    """
    averages = ('micro', 'macro', 'weighted')
    scorers = (('precision', precision_score), ('recall', recall_score), ('f1', f1_score))

    #every fold is scored against the same full list of classes; restricting precision/f1 to the
    #predicted classes only would drop the classes the model never predicts and inflate the averages
    all_labels = np.unique(Y)

    #one list of per-fold values for every score
    fold_scores = {'accuracy': []}
    for average in averages:
        for metric_name, _ in scorers:
            fold_scores['{}_{}'.format(metric_name, average)] = []

    #using stratified kfold to split the data into 10 folds
    #(no random_state: it has no effect without shuffling and newer scikit-learn versions reject the combination)
    cv = StratifiedKFold(n_splits=10, shuffle=False)

    for train_index, test_index in cv.split(X,Y):
        start_fold = datetime.datetime.now()

        #setting the training and testing data
        X_train,X_test = X[train_index],X[test_index]
        y_train,y_test = Y[train_index],Y[test_index]

        # SVC
        # For our research, we switched the kernel between 'rbf' and 'linear' with different numbers of features selected.
        # The first and second line of code perform oversampling whereas the third and fourth line of code perform undersampling.
        # The first and third line of code contain OneVsRestClassifier whereas the second and fourth line of code do not.
        #pipeline = make_pipeline(SelectKBest(chi2, k=50000),ADASYN(random_state=42),OneVsRestClassifier(SVC(kernel='rbf',gamma='auto',cache_size=1000)))
        #pipeline = make_pipeline(SelectKBest(chi2, k=50000),ADASYN(random_state=42),SVC(kernel='rbf',gamma='auto',cache_size=1000))
        #pipeline = make_pipeline(SelectKBest(chi2, k=50000),NearMiss(),OneVsRestClassifier(SVC(kernel='rbf',gamma='auto',cache_size=1000)))
        #pipeline = make_pipeline(SelectKBest(chi2, k=50000),NearMiss(),SVC(kernel='rbf',gamma='auto',cache_size=1000))

        # LinearSVC
        # The first line of code performs oversampling whereas the second line of code performs undersampling.
        # ADASYN is seeded so that the synthetic samples, and therefore the scores, are repeatable.
        pipeline = make_pipeline(SelectKBest(chi2, k=50000),ADASYN(random_state=42),LinearSVC(C=1,max_iter=10000,random_state=42))
        #pipeline = make_pipeline(SelectKBest(chi2, k=50000),NearMiss(),LinearSVC(C=1,max_iter=10000,random_state=42))

        # MLPClassifier
        # For our research, we switched the solver parameter between 'sgd' and 'adam' and tried different values of hidden_layer_sizes.
        # The first line of code performs oversampling whereas the second line of code performs undersampling.
        #pipeline = make_pipeline(SelectKBest(chi2, k=50000),ADASYN(random_state=42),MLPClassifier(solver='sgd', hidden_layer_sizes= (5,5), max_iter=1000,random_state=1))
        #pipeline = make_pipeline(SelectKBest(chi2, k=50000),NearMiss(),MLPClassifier(solver='sgd', hidden_layer_sizes= (5,5), max_iter=1000,random_state=1))

        #fit the model
        pipeline.fit(X_train, y_train)
        #using testing data to predict the results
        y_pred = pipeline.predict(X_test)

        #prints the confusion matrix
        print(confusion_matrix(y_test,y_pred))
        #prints the classification report
        print(classification_report(y_test,y_pred))

        #appends the results of this fold to the respective lists
        fold_scores['accuracy'].append(accuracy_score(y_test,y_pred))
        for average in averages:
            for metric_name, scorer in scorers:
                score = scorer(y_test,y_pred,average=average,labels=all_labels)
                fold_scores['{}_{}'.format(metric_name, average)].append(score)

        print("duration for this fold: ",datetime.datetime.now() - start_fold)

    #mean of every score over all folds
    mean_scores = {name: np.mean(values) for name, values in fold_scores.items()}

    #prints out the means, one group per averaging method separated by an empty line
    print("accuracy testing: {}".format(mean_scores['accuracy']))
    for position, average in enumerate(averages):
        if position > 0:
            print("")
        for metric_name, _ in scorers:
            score_name = '{}_{}'.format(metric_name, average)
            print("{}: {}".format(score_name, mean_scores[score_name]))

    return mean_scores
    


# Time the whole cross-validation run (all folds) from start to finish.
start = datetime.datetime.now()
train_model(X_1,Y_train_1)
end = datetime.datetime.now()
# wall-clock time of the complete call, as a datetime.timedelta
duration = end - start
print("duration of training: ",duration)

[[   5    0    4   16   36]
 [   1    1    5   22   26]
 [   0    0   16   65   79]
 [   0    0    8  178  309]
 [   0    1   25  329 1697]]
              precision    recall  f1-score   support

           1       0.83      0.08      0.15        61
           2       0.50      0.02      0.04        55
           3       0.28      0.10      0.15       160
           4       0.29      0.36      0.32       495
           5       0.79      0.83      0.81      2052

   micro avg       0.67      0.67      0.67      2823
   macro avg       0.54      0.28      0.29      2823
weighted avg       0.67      0.67      0.66      2823

duration for this fold:  0:00:56.064191
[[   6    0    2   22   31]
 [   0    0    3   28   24]
 [   0    0   15   58   87]
 [   0    0   19  156  320]
 [   1    1   33  281 1736]]
              precision    recall  f1-score   support

           1       0.86      0.10      0.18        61
           2       0.00      0.00      0.00        55
           3       0.21    