In [1]:
import pandas as pd
import numpy as np
import os
import time
# SENTIMENT ANALYSIS USING VADER
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, precision_score, f1_score, roc_auc_score, precision_score, recall_score, classification_report
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.svm import LinearSVC, SVC
from gensim.models import Word2Vec
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from sklearn.linear_model import RidgeClassifier, SGDClassifier, LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, BaggingClassifier
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA

In [2]:
df = pd.read_csv("../Datasets/amazon_reviews_labelled.csv")

In [3]:
len(df)

21000

In [4]:
df.columns

Index(['Unnamed: 0.4', 'Unnamed: 0.3', 'Unnamed: 0.2', 'Unnamed: 0.1',
       'Unnamed: 0', 'RATING', 'VERIFIED_PURCHASE', 'REVIEW_TITLE',
       'REVIEW_TEXT', 'NUM_NOUNS', 'NUM_VERBS', 'NUM_ADJECTIVES',
       'NUM_ADVERBS', 'REVIEW_LENGTH', 'SENTIMENT_SCORE', 'TITLE_LENGTH',
       'AVERAGE_RATING', 'RATING_DEVIATION', 'NUM_REVIEWS', 'READABILITY_FRE',
       'SENTIMENT_CATEGORY_ENCODED', 'RATING_CATEGORY_ENCODED',
       'COHERENT_ENCODED', 'AVG_WORD_LENGTH', 'LABEL_ENCODED',
       'NUM_NAMED_ENTITIES', 'CAPITAL_CHAR_COUNT', 'PUNCTUATION_COUNT',
       'PREPROCESSED_REVIEW_TEXT', 'WORD_COUNT', 'SENTIMENT_SCORE_TITLE',
       'SENTIMENT_LABEL_TITLE', 'AVG_RATING_VERIFIED',
       'AVG_RATING_NON_VERIFIED', 'DEVIATION_VERIFIED',
       'DEVIATION_NON_VERIFIED'],
      dtype='object')

In [5]:
features_text = df['PREPROCESSED_REVIEW_TEXT']
features_numeric = df[[
    'RATING', 'VERIFIED_PURCHASE', 
        'NUM_NOUNS', 'NUM_VERBS', 'NUM_ADJECTIVES',
       'NUM_ADVERBS', 'REVIEW_LENGTH', 'SENTIMENT_SCORE', 'TITLE_LENGTH',
       'AVERAGE_RATING', 'RATING_DEVIATION', 'NUM_REVIEWS', 'READABILITY_FRE',
       'SENTIMENT_CATEGORY_ENCODED', 'RATING_CATEGORY_ENCODED',
       'COHERENT_ENCODED', 'AVG_WORD_LENGTH',
       'NUM_NAMED_ENTITIES', 'CAPITAL_CHAR_COUNT', 'PUNCTUATION_COUNT',
        'WORD_COUNT', 'SENTIMENT_SCORE_TITLE',
       'SENTIMENT_LABEL_TITLE', 'AVG_RATING_VERIFIED',
       'AVG_RATING_NON_VERIFIED', 'DEVIATION_VERIFIED',
       'DEVIATION_NON_VERIFIED'
]]
labels = df['LABEL_ENCODED']

In [6]:
# TRAIN-TEST SPLIT
X_text_train, X_text_test, X_numeric_train, X_numeric_test, y_train, y_test = train_test_split(
    features_text, features_numeric, labels, test_size=0.2, random_state=42
)

In [7]:
#FUNCTION FOR COMBINING WORD2VEC AND TFIDF

def vectorize_text(X_text_train, X_text_test):
    #TOKENIZE THE TEXT
    tokenized_text_train = [t.split() for t in X_text_train]
    tokenized_text_test = [t.split() for t in X_text_test]
    
    #LOAD PRE-TRAINED WORD2VEC MODEL
    model_path = "word2vec_model.bin"
    if os.path.isfile(model_path):
        w2v_model = Word2Vec.load(model_path)
    
    else:
        #TRAIN THE WORD2VEC MODEL
        w2v_model = Word2Vec(sentences=tokenized_text_train,vector_size=100,window=5,min_count=1)
        #SAVE THE MODEL
        w2v_model.save("word2vec_model.bin")
        
    #W2V VECTORISATION
    w2v_vectors_train = []
    
    for review in tokenized_text_train:
        review_vectors = [w2v_model.wv[word] for word in review if word in w2v_model.wv]
        
        #AVERAGE METHOD
        if len(review_vectors) > 0:
            review_vector = np.mean(review_vectors, axis=0)  # Average the word vectors
            w2v_vectors_train.append(review_vector)
            
    #W2V VECTORISATION
    w2v_vectors_test = []
    
    for review in tokenized_text_test:
        review_vectors = [w2v_model.wv[word] for word in review if word in w2v_model.wv]
        
        #AVERAGE METHOD
        if len(review_vectors) > 0:
            review_vector = np.mean(review_vectors, axis=0)  # Average the word vectors
            w2v_vectors_test.append(review_vector)
    final_vectors_train = w2v_vectors_train
    final_vectors_test = w2v_vectors_test
        
    return final_vectors_train, final_vectors_test

In [8]:
X_text_train, X_text_test = vectorize_text(X_text_train, X_text_test)

In [9]:
#APPLY FEATURE SCALING TO MATRICES
sc_numeric = StandardScaler()
X_numeric_train = sc_numeric.fit_transform(X_numeric_train)
X_numeric_test = sc_numeric.transform(X_numeric_test)

sc_text= StandardScaler()
X_text_train = sc_text.fit_transform(X_text_train)
X_text_test = sc_text.transform(X_text_test)

In [10]:
X_text_train.shape

(16800, 100)

In [11]:
X_text_test.shape

(4200, 100)

In [12]:
#DIMENSIONALITY REDUCTION: [NOT NEEDED, ONLY 100 COMPONENTS]

In [13]:
#CONCATENATE THE TEXTUAL FEATURE AND NUMERIC FEATURE MATRIX
X_train = np.hstack((X_text_train, X_numeric_train))
X_test = np.hstack((X_text_test, X_numeric_test))

In [14]:
# DICTIONARY WITH NAME AND COMMAND TO INSTANTIATE DIFFERENT MODELS
classifiers = {}
#classifiers.update({"XGBClassifier": XGBClassifier(eval_metric='logloss',objective='binary:logistic',use_label_encoder=False)})
#classifiers.update({"CatBoostClassifier": CatBoostClassifier(silent=True)})
#classifiers.update({"LinearSVC": LinearSVC(max_iter=10000)})
#classifiers.update({"MultinomialNB": MultinomialNB()})
#classifiers.update({"LGBMClassifier": LGBMClassifier()})
classifiers.update({"RandomForestClassifier": RandomForestClassifier()})
classifiers.update({"DecisionTreeClassifier": DecisionTreeClassifier()})
classifiers.update({"ExtraTreeClassifier": ExtraTreeClassifier()})
classifiers.update({"AdaBoostClassifier": AdaBoostClassifier()})
classifiers.update({"KNeighborsClassifier": KNeighborsClassifier()})
classifiers.update({"RidgeClassifier": RidgeClassifier()})
classifiers.update({"SGDClassifier": SGDClassifier()})
#classifiers.update({"BaggingClassifier": BaggingClassifier()})
#classifiers.update({"BernoulliNB": BernoulliNB()})
classifiers.update({"LogisticRegression": LogisticRegression()})
classifiers.update({"SVM": SVC()})

In [15]:
# CREATE A DATAFRAME OF MODELS WITH RUN TIME AND AUC SCORES
df_models = pd.DataFrame(
    columns=['model', 'run_time', 'accuracy', 'precision', 'f1_score']
)

i = 1
for key in classifiers:
    # STARTING TIME
    start_time = time.time()
    # CURRENT CLASSIFIER
    clf = classifiers[key]
    #TRAIN CLASSIFIER ON TRAINING DATA
    clf.fit(X_train, y_train)
    # MAKE PREDICTIONS USING CURRENT CLASSIFIER
    predictions = clf.predict(X_test)
    # CALCULATE ACCURACY
    accuracy = accuracy_score(y_test, predictions)
    precision = precision_score(y_test, predictions)
    f1score = f1_score(y_test, predictions)
    
    row = {'model': key,
           'run_time': format(round((time.time() - start_time)/60, 2)),
           'accuracy': accuracy,
           'precision': precision,
           'f1_score': f1score
           }

    df_models = df_models._append(row, ignore_index=True)
    print(f"{i}th Model Done")
    i+=1

df_models = df_models.sort_values(by='accuracy', ascending=False)

1th Model Done
2th Model Done
3th Model Done
4th Model Done
5th Model Done
6th Model Done
7th Model Done


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


8th Model Done
9th Model Done


In [16]:
df_models

Unnamed: 0,model,run_time,accuracy,precision,f1_score
0,RandomForestClassifier,0.22,0.816905,0.803786,0.819101
3,AdaBoostClassifier,0.19,0.81,0.797229,0.812235
8,SVM,0.69,0.809524,0.789806,0.81404
7,LogisticRegression,0.01,0.795238,0.779298,0.798971
5,RidgeClassifier,0.0,0.784762,0.75956,0.792661
6,SGDClassifier,0.02,0.78119,0.755926,0.789365
1,DecisionTreeClassifier,0.05,0.732381,0.73382,0.728502
4,KNeighborsClassifier,0.01,0.675238,0.679264,0.666992
2,ExtraTreeClassifier,0.0,0.672143,0.67319,0.666505
