In [1]:
#GOAL - CREATE TEXT EMBEDDINGS OF REVIEW TEXT USING FASTTEXT
#INSTEAD OF PREDICTION THROUGH THE FASTTEXT MODEL; USE LOGREG WITH THE EMBEDDINGS

In [3]:
#IMPORT REQUIRED MODULES
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, f1_score, roc_auc_score, precision_score, recall_score, classification_report
from gensim.models import FastText
import numpy as np
import pandas as pd
import time
from sklearn.preprocessing import StandardScaler, MinMaxScaler
#IMPORT SUPERVISED CLASSIFIER ALGORITHMS
from sklearn.svm import LinearSVC, SVC
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from sklearn.linear_model import RidgeClassifier, SGDClassifier, LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, BaggingClassifier
from sklearn.naive_bayes import BernoulliNB, MultinomialNB

In [4]:
#READ FILE
df = pd.read_csv("Datasets/amazon_reviews_3.csv")

In [5]:
#FEATURES AND LABELS
X = df['PREPROCESSED_REVIEW_TEXT']
Y = df['LABEL_ENCODED']

In [6]:
#TRAIN-TEST SPLIT
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42
)

In [7]:
#TRAINING OF FASTTEXT MODEL
fasttext_model = FastText(sentences=X_train, vector_size=100, window=5, min_count=5, epochs=10)

In [8]:
#CONVERT THE TEXT INTO EMBEDDINGS [USING FASTTEXT VECTORS]
X_train_embeddings = np.array([np.mean([fasttext_model.wv[word] for word in sentence.split() if word in fasttext_model.wv], axis=0) for sentence in X_train])
X_test_embeddings = np.array([np.mean([fasttext_model.wv[word] for word in sentence.split() if word in fasttext_model.wv], axis=0) for sentence in X_test])

In [None]:
#NOTE - NO SCALING AS EMBEDDINGS OF FASTTEXT ARE ALREADY NORMALISED

In [9]:
#CHOOSE MODEL HERE TO COMBINE WITH FASTTEXT [HERE - LOGREG]
clf = LogisticRegression()

#TRAIN THE MODEL USING TEXT EMBEDDINGS
clf.fit(X_train_embeddings, y_train)

In [10]:
X_train_embeddings = X_train_embeddings.astype(float)
X_train_embeddings

array([[-4.03138343e-04,  4.06128704e-04, -5.96475729e-05, ...,
         1.45523483e-03,  5.99556544e-04, -1.11395156e-03],
       [ 5.97906357e-04, -5.77636005e-04, -4.66467063e-05, ...,
         1.54219405e-03, -4.74301603e-04,  5.22994414e-05],
       [-4.12865513e-04, -6.06556016e-04,  4.58826631e-04, ...,
        -3.76118784e-04,  5.84062829e-04, -1.34283720e-04],
       ...,
       [ 4.16092342e-04,  2.53350154e-04, -6.51807350e-04, ...,
         1.73388631e-04, -3.69961344e-04,  1.45691141e-04],
       [-4.90103557e-04, -9.03398250e-05,  1.92788386e-04, ...,
        -5.61288092e-04, -1.32098823e-04,  5.18169138e-04],
       [ 3.14794335e-04, -1.03877916e-03, -5.99701423e-04, ...,
        -5.05275966e-04, -1.98538564e-05,  5.12823462e-04]])

In [11]:
#MAKE PREDICTIONS
predictions = clf.predict(X_test_embeddings)


#PRINT ACCURACY
print(f"Accuracy = {accuracy_score(y_test, predictions)*100}%")

Accuracy = 53.19047619047619%


In [15]:
#TEST WITH ALL TO GET MAXIMUM ACCURACY WITH FASTTEXT
# DICTIONARY WITH NAME AND COMMAND TO INSTANTIATE DIFFERENT MODELS
classifiers = {}
#classifiers.update({"XGBClassifier": XGBClassifier(eval_metric='logloss',objective='binary:logistic',use_label_encoder=False)})
#classifiers.update({"CatBoostClassifier": CatBoostClassifier(silent=True)})
#classifiers.update({"LinearSVC": LinearSVC(max_iter=10000)})
#classifiers.update({"MultinomialNB": MultinomialNB()})
#classifiers.update({"LGBMClassifier": LGBMClassifier()})
#classifiers.update({"RandomForestClassifier": RandomForestClassifier()})
#classifiers.update({"DecisionTreeClassifier": DecisionTreeClassifier()})
#classifiers.update({"ExtraTreeClassifier": ExtraTreeClassifier()})
#classifiers.update({"AdaBoostClassifier": AdaBoostClassifier()})
#classifiers.update({"KNeighborsClassifier": KNeighborsClassifier()})
#classifiers.update({"RidgeClassifier": RidgeClassifier()})
#classifiers.update({"SGDClassifier": SGDClassifier()})
#classifiers.update({"BaggingClassifier": BaggingClassifier()})
classifiers.update({"BernoulliNB": BernoulliNB()})
classifiers.update({"LogisticRegression": LogisticRegression()})
classifiers.update({"SVM": SVC()})

In [16]:
# CREATE A DATAFRAME OF MODELS WITH RUN TIME AND AUC SCORES
df_models = pd.DataFrame(
    columns=['model', 'run_time', 'accuracy', 'precision', 'f1_score'])

for key in classifiers:
    # STARTING TIME
    start_time = time.time()
    # CURRENT CLASSIFIER
    clf = classifiers[key]
    #TRAIN CLASSIFIER ON TRAINING DATA
    clf.fit(X_train_embeddings, y_train)
    # MAKE PREDICTIONS USING CURRENT CLASSIFIER
    predictions = clf.predict(X_test_embeddings)
    # CALCULATE ACCURACY
    accuracy = accuracy_score(y_test, predictions)
    precision = precision_score(y_test, predictions)
    f1score = f1_score(y_test, predictions)
    
    row = {'model': key,
           'run_time': format(round((time.time() - start_time)/60, 2)),
           'accuracy': accuracy,
           'precision': precision,
           'f1_score': f1score
           }

    df_models = df_models._append(row, ignore_index=True)

df_models = df_models.sort_values(by='accuracy', ascending=False)

In [14]:
df_models

Unnamed: 0,model,run_time,accuracy,precision,f1_score
1,SVM,0.81,0.534524,0.612457,0.265866
0,LogisticRegression,0.0,0.531905,0.594295,0.276141
