In [None]:
#GOAL - IMPLEMENT WORD2VEC FOR VECTORISATION AND STANDARD SCALING FOR NUMERICAL FEATURES
#TRAIN THE LIST OF SUPERVISED MODELS ON THIS AND COMPARE WITH TF-IDF

In [24]:
import gensim
import time
from gensim.models import Word2Vec
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score

In [2]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, precision_score, f1_score, roc_auc_score, precision_score, recall_score, classification_report
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.svm import LinearSVC, SVC
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from sklearn.linear_model import RidgeClassifier, SGDClassifier, LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, BaggingClassifier
from sklearn.naive_bayes import BernoulliNB, MultinomialNB

In [6]:
# READ THE DATA FILE
# The path is relative to the directory the notebook is run from.
DATA_PATH = "Datasets/amazon_reviews_3.csv"
df = pd.read_csv(DATA_PATH)

In [7]:
# Show the column index of the loaded frame so the available feature and
# label columns can be picked by name in the cells that follow.
df.columns

Index(['Unnamed: 0.3', 'Unnamed: 0.2', 'Unnamed: 0.1', 'Unnamed: 0', 'RATING',
       'VERIFIED_PURCHASE', 'REVIEW_TITLE', 'REVIEW_TEXT', 'NUM_NOUNS',
       'NUM_VERBS', 'NUM_ADJECTIVES', 'NUM_ADVERBS', 'REVIEW_LENGTH',
       'SENTIMENT_SCORE', 'TITLE_LENGTH', 'AVERAGE_RATING', 'RATING_DEVIATION',
       'NUM_REVIEWS', 'READABILITY_FRE', 'SENTIMENT_CATEGORY_ENCODED',
       'RATING_CATEGORY_ENCODED', 'COHERENT_ENCODED', 'AVG_WORD_LENGTH',
       'LABEL_ENCODED', 'NUM_NAMED_ENTITIES', 'CAPITAL_CHAR_COUNT',
       'PUNCTUATION_COUNT', 'PREPROCESSED_REVIEW_TEXT'],
      dtype='object')

In [8]:
# FEATURES
# Text input (to be vectorised) and the numeric columns (to be scaled) are kept
# in separate objects because they go through different preprocessing.
NUMERICAL_COLUMNS = ['REVIEW_LENGTH', 'VERIFIED_PURCHASE']
text_features = df['PREPROCESSED_REVIEW_TEXT']
numerical_features = df[NUMERICAL_COLUMNS]

In [9]:
# Supervised target: the LABEL_ENCODED column of the loaded frame.
LABEL_COLUMN = 'LABEL_ENCODED'
labels = df[LABEL_COLUMN]

In [14]:
def vectorize_text(text):
    """Embed each review as the mean of its Word2Vec word vectors.

    A Word2Vec model (100 dimensions, window 5, min_count 1) is trained on the
    whitespace-tokenised reviews passed in, and every review is represented by
    the average of the vectors of its tokens.

    Parameters
    ----------
    text : iterable of str
        Review texts. Entries that are not strings (for example NaN for a
        missing review) are treated as empty reviews.

    Returns
    -------
    numpy.ndarray
        One row per input review, in input order. A review that contributes no
        word vectors gets an all-zero row, so the result always has as many
        rows as there are reviews.
    """
    # TOKENIZE THE TEXT (non-string entries become empty token lists)
    tokenized_text = [doc.split() if isinstance(doc, str) else [] for doc in text]

    w2v_model = Word2Vec(sentences=tokenized_text, vector_size=100, window=5, min_count=1)
    word_vectors = w2v_model.wv
    # Placeholder row for reviews without any known token.
    empty_review_vector = np.zeros(word_vectors.vector_size, dtype=np.float32)

    # CREATE WORD VECTORS
    text_vectors = []
    for review in tokenized_text:
        review_vectors = [word_vectors[word] for word in review if word in word_vectors]
        if review_vectors:
            # Average the word vectors of the review
            text_vectors.append(np.mean(review_vectors, axis=0))
        else:
            # Skipping the review here would make the output shorter than the
            # label / numerical-feature columns it is later split together with.
            text_vectors.append(empty_review_vector)

    return np.array(text_vectors)

In [15]:
#VECTORISE THE REVIEW TEXTS
# vectorize_text trains a Word2Vec model on the review corpus it is given and
# returns the averaged word vectors as a NumPy array.
vectorised_texts = vectorize_text(text_features)

In [20]:
#TRAIN-TEST SPLIT FOR THE MODELS

In [18]:
# Word2Vec variant: 80/20 split of the averaged embeddings, the numerical
# columns and the labels, all shuffled together with a fixed seed.
# NOTE: the TF-IDF cell below assigns to the same six names, so on a
# top-to-bottom run this split is replaced by the TF-IDF one.
(
    X_text_train, X_text_test,
    X_num_train, X_num_test,
    y_train, y_test,
) = train_test_split(
    vectorised_texts, numerical_features, labels,
    test_size=0.2, random_state=42,
)

In [28]:
# FOR THE TF-IDF VECTORISATION
# Split the raw review strings (not embeddings) together with the numerical
# columns and labels: 80/20, fixed seed.
tfidf_split = train_test_split(
    text_features, numerical_features, labels,
    test_size=0.2, random_state=42,
)
raw_text_train, raw_text_test, X_num_train, X_num_test, y_train, y_test = tfidf_split

# Fit the vectoriser on the training reviews only, then apply the fitted
# vocabulary / weights to the held-out reviews.
vectoriser = TfidfVectorizer()
X_text_train = vectoriser.fit_transform(raw_text_train)
X_text_test = vectoriser.transform(raw_text_test)

In [34]:
# Inspect the vectorised training text (shape and storage format).
# NOTE: scaling of the numerical features happens in the next cell, not here.
X_text_train

<16800x29758 sparse matrix of type '<class 'numpy.float64'>'
	with 408765 stored elements in Compressed Sparse Row format>

In [30]:
# Standardise the numerical columns. The scaler is fitted on the training
# split only and the same fitted scaler is applied to both splits.
sc = StandardScaler()
sc.fit(X_num_train)
X_num_train, X_num_test = sc.transform(X_num_train), sc.transform(X_num_test)

In [35]:
# COMBINE TEXT AND NUMERICAL FEATURES
# The text matrices are converted to dense arrays and the scaled numerical
# columns are appended to the right. Densifying a large sparse matrix is
# memory-hungry, so no extra reference to the dense text part is kept.
X_train, X_test = (
    np.hstack((text_part.toarray(), num_part))
    for text_part, num_part in (
        (X_text_train, X_num_train),
        (X_text_test, X_num_test),
    )
)

In [36]:
# Mapping from a display name to a ready-to-fit estimator instance.
# Insertion order is the order in which the models are trained later on.
classifiers = {
    "XGBClassifier": XGBClassifier(
        eval_metric='logloss',
        objective='binary:logistic',
        use_label_encoder=False,
    ),
    "CatBoostClassifier": CatBoostClassifier(silent=True),
    "LinearSVC": LinearSVC(max_iter=10000),
    "LGBMClassifier": LGBMClassifier(),
    "RandomForestClassifier": RandomForestClassifier(),
    "DecisionTreeClassifier": DecisionTreeClassifier(),
    "ExtraTreeClassifier": ExtraTreeClassifier(),
    "AdaBoostClassifier": AdaBoostClassifier(),
    "KNeighborsClassifier": KNeighborsClassifier(),
    "RidgeClassifier": RidgeClassifier(),
    "SGDClassifier": SGDClassifier(),
    "BaggingClassifier": BaggingClassifier(),
    "BernoulliNB": BernoulliNB(),
    "LogisticRegression": LogisticRegression(),
    "SVM": SVC(),
}



In [None]:
# CREATE A DATAFRAME OF MODELS WITH RUN TIME (MINUTES), ACCURACY, PRECISION AND F1
result_columns = ['model', 'run_time', 'accuracy', 'precision', 'f1_score']
# One dict per classifier; the frame is built once after the loop instead of
# being grown row by row through the private DataFrame._append.
records = []

for key, clf in classifiers.items():
    # STARTING TIME (perf_counter is monotonic, so the interval cannot be
    # distorted by system clock adjustments)
    start_time = time.perf_counter()
    # TRAIN CLASSIFIER ON TRAINING DATA
    clf.fit(X_train, y_train)
    # MAKE PREDICTIONS USING CURRENT CLASSIFIER
    predictions = clf.predict(X_test)
    # CALCULATE THE TEST-SET METRICS
    accuracy = accuracy_score(y_test, predictions)
    precision = precision_score(y_test, predictions)
    f1score = f1_score(y_test, predictions)

    records.append({
        'model': key,
        # Elapsed minutes for fit + predict + scoring, rounded to 2 decimals
        # and stored as text (same representation as before).
        'run_time': format(round((time.perf_counter() - start_time)/60, 2)),
        'accuracy': accuracy,
        'precision': precision,
        'f1_score': f1score,
    })

# Passing the column list keeps the expected columns even if no model was run.
df_models = pd.DataFrame(records, columns=result_columns)
df_models = df_models.sort_values(by='accuracy', ascending=False)

In [None]:
df_models