In [33]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import numpy as np
import time
import gensim
from gensim.models import Word2Vec
# SENTIMENT ANALYSIS USING VADER
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, precision_score, f1_score, roc_auc_score, precision_score, recall_score, classification_report
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.svm import LinearSVC, SVC
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from sklearn.linear_model import RidgeClassifier, SGDClassifier, LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, BaggingClassifier
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
import pandas as pd

In [6]:
df = pd.read_csv("Datasets/amazon_reviews_3.csv")
X = df['PREPROCESSED_REVIEW_TEXT']
Y = df['LABEL_ENCODED']

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = 0.2,
                                                   random_state = 42)

In [19]:
vect = CountVectorizer()
X_train_dtm = vect.fit_transform(X_train)
X_test_dtm = vect.transform(X_test)

In [23]:
# DICTIONARY WITH NAME AND COMMAND TO INSTANTIATE DIFFERENT MODELS
classifiers = {}
classifiers.update({"XGBClassifier": XGBClassifier(eval_metric='logloss',objective='binary:logistic',use_label_encoder=False)})
classifiers.update({"CatBoostClassifier": CatBoostClassifier(silent=True)})
classifiers.update({"LinearSVC": LinearSVC(max_iter=10000)})
#classifiers.update({"MultinomialNB": MultinomialNB()})
#classifiers.update({"LGBMClassifier": LGBMClassifier()})
classifiers.update({"RandomForestClassifier": RandomForestClassifier()})
classifiers.update({"DecisionTreeClassifier": DecisionTreeClassifier()})
classifiers.update({"ExtraTreeClassifier": ExtraTreeClassifier()})
classifiers.update({"AdaBoostClassifier": AdaBoostClassifier()})
classifiers.update({"KNeighborsClassifier": KNeighborsClassifier()})
classifiers.update({"RidgeClassifier": RidgeClassifier()})
classifiers.update({"SGDClassifier": SGDClassifier()})
classifiers.update({"BaggingClassifier": BaggingClassifier()})
classifiers.update({"BernoulliNB": BernoulliNB()})
classifiers.update({"LogisticRegression": LogisticRegression()})
classifiers.update({"SVM": SVC()})



In [22]:
X_train_dtm

<16800x29758 sparse matrix of type '<class 'numpy.int64'>'
	with 408765 stored elements in Compressed Sparse Row format>

In [24]:
# CREATE A DATAFRAME OF MODELS WITH RUN TIME AND AUC SCORES
df_models = pd.DataFrame(
    columns=['model', 'run_time', 'accuracy', 'precision', 'f1_score'])

for key in classifiers:
    # STARTING TIME
    start_time = time.time()
    # CURRENT CLASSIFIER
    clf = classifiers[key]
    #TRAIN CLASSIFIER ON TRAINING DATA
    clf.fit(X_train_dtm, y_train)
    # MAKE PREDICTIONS USING CURRENT CLASSIFIER
    predictions = clf.predict(X_test_dtm)
    # CALCULATE ACCURACY
    accuracy = accuracy_score(y_test, predictions)
    precision = precision_score(y_test, predictions)
    f1score = f1_score(y_test, predictions)
    
    row = {'model': key,
           'run_time': format(round((time.time() - start_time)/60, 2)),
           'accuracy': accuracy,
           'precision': precision,
           'f1_score': f1score
           }

    df_models = df_models._append(row, ignore_index=True)

df_models = df_models.sort_values(by='accuracy', ascending=False)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [25]:
df_models

Unnamed: 0,model,run_time,accuracy,precision,f1_score
13,SVM,1.49,0.641905,0.639731,0.638809
3,RandomForestClassifier,0.55,0.635238,0.636543,0.627251
1,CatBoostClassifier,0.9,0.625952,0.630855,0.612003
0,XGBClassifier,0.02,0.623571,0.625498,0.613731
12,LogisticRegression,0.01,0.620238,0.622623,0.609356
9,SGDClassifier,0.0,0.611905,0.608488,0.610234
11,BernoulliNB,0.0,0.598095,0.693659,0.457235
6,AdaBoostClassifier,0.02,0.59619,0.616957,0.547492
2,LinearSVC,0.06,0.592143,0.594705,0.576933
8,RidgeClassifier,0.02,0.591429,0.597567,0.56841


In [27]:
#COMPARE WITH TF-IDF
vect = TfidfVectorizer()
X_train_dtm = vect.fit_transform(X_train)
X_test_dtm = vect.transform(X_test)

In [37]:
# DICTIONARY WITH NAME AND COMMAND TO INSTANTIATE DIFFERENT MODELS
classifiers = {}
classifiers.update({"XGBClassifier": XGBClassifier(eval_metric='logloss',objective='binary:logistic',use_label_encoder=False)})
classifiers.update({"CatBoostClassifier": CatBoostClassifier(silent=True)})
classifiers.update({"LinearSVC": LinearSVC(max_iter=10000)})
#classifiers.update({"MultinomialNB": MultinomialNB()})
#classifiers.update({"LGBMClassifier": LGBMClassifier()})
classifiers.update({"RandomForestClassifier": RandomForestClassifier()})
classifiers.update({"DecisionTreeClassifier": DecisionTreeClassifier()})
classifiers.update({"ExtraTreeClassifier": ExtraTreeClassifier()})
classifiers.update({"AdaBoostClassifier": AdaBoostClassifier()})
classifiers.update({"KNeighborsClassifier": KNeighborsClassifier()})
classifiers.update({"RidgeClassifier": RidgeClassifier()})
classifiers.update({"SGDClassifier": SGDClassifier()})
classifiers.update({"BaggingClassifier": BaggingClassifier()})
classifiers.update({"BernoulliNB": BernoulliNB()})
classifiers.update({"LogisticRegression": LogisticRegression()})
classifiers.update({"SVM": SVC()})



In [29]:
# CREATE A DATAFRAME OF MODELS WITH RUN TIME AND AUC SCORES
df_models = pd.DataFrame(
    columns=['model', 'run_time', 'accuracy', 'precision', 'f1_score'])

for key in classifiers:
    # STARTING TIME
    start_time = time.time()
    # CURRENT CLASSIFIER
    clf = classifiers[key]
    #TRAIN CLASSIFIER ON TRAINING DATA
    clf.fit(X_train_dtm, y_train)
    # MAKE PREDICTIONS USING CURRENT CLASSIFIER
    predictions = clf.predict(X_test_dtm)
    # CALCULATE ACCURACY
    accuracy = accuracy_score(y_test, predictions)
    precision = precision_score(y_test, predictions)
    f1score = f1_score(y_test, predictions)
    
    row = {'model': key,
           'run_time': format(round((time.time() - start_time)/60, 2)),
           'accuracy': accuracy,
           'precision': precision,
           'f1_score': f1score
           }

    df_models = df_models._append(row, ignore_index=True)

df_models = df_models.sort_values(by='accuracy', ascending=False)

In [30]:
df_models

Unnamed: 0,model,run_time,accuracy,precision,f1_score
13,SVM,2.36,0.644286,0.636616,0.648305
12,LogisticRegression,0.01,0.638571,0.634043,0.638571
9,SGDClassifier,0.0,0.638095,0.629766,0.64336
1,CatBoostClassifier,1.79,0.636905,0.633715,0.63508
8,RidgeClassifier,0.0,0.631905,0.62864,0.630144
3,RandomForestClassifier,0.44,0.631667,0.637245,0.617553
0,XGBClassifier,0.04,0.627381,0.626829,0.621524
2,LinearSVC,0.0,0.621667,0.620272,0.616831
6,AdaBoostClassifier,0.04,0.607381,0.591983,0.629854
11,BernoulliNB,0.0,0.598095,0.693659,0.457235


In [34]:
#COMPARE WITH WORD2VEC
def vectorize_text(text):
    #TOKENIZE THE TEXT
    tokenized_text = [text.split() for text in text]

    w2v_model = Word2Vec(sentences=tokenized_text, vector_size=100, window=5, min_count=1)
    
    # CREATE WORD VECTORS
    text_vectors = []
    for review in tokenized_text:
        review_vectors = [w2v_model.wv[word] for word in review if word in w2v_model.wv]
        if len(review_vectors) > 0:
            review_vector = np.mean(review_vectors, axis=0)  # Example: Average the word vectors
            text_vectors.append(review_vector)
    
    return np.array(text_vectors)

In [35]:
X = vectorize_text(df['PREPROCESSED_REVIEW_TEXT'])

In [36]:
X_train, X_test, y_train, y_test = train_test_split(
    X, Y,
    test_size = 0.2,
    random_state = 42
    )

In [40]:
#SCALING AFTER VECTORISING TEXT
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [None]:
# CREATE A DATAFRAME OF MODELS WITH RUN TIME AND AUC SCORES
df_models = pd.DataFrame(
    columns=['model', 'run_time', 'accuracy', 'precision', 'f1_score'])

for key in classifiers:
    # STARTING TIME
    start_time = time.time()
    # CURRENT CLASSIFIER
    clf = classifiers[key]
    #TRAIN CLASSIFIER ON TRAINING DATA
    clf.fit(X_train, y_train)
    # MAKE PREDICTIONS USING CURRENT CLASSIFIER
    predictions = clf.predict(X_test)
    # CALCULATE ACCURACY
    accuracy = accuracy_score(y_test, predictions)
    precision = precision_score(y_test, predictions)
    f1score = f1_score(y_test, predictions)
    
    row = {'model': key,
           'run_time': format(round((time.time() - start_time)/60, 2)),
           'accuracy': accuracy,
           'precision': precision,
           'f1_score': f1score
           }

    df_models = df_models._append(row, ignore_index=True)

df_models = df_models.sort_values(by='accuracy', ascending=False)



In [39]:
df_models

Unnamed: 0,model,run_time,accuracy,precision,f1_score
1,CatBoostClassifier,0.25,0.615714,0.612733,0.613321
2,LinearSVC,0.31,0.614048,0.605168,0.622233
8,RidgeClassifier,0.0,0.61381,0.604989,0.621911
13,SVM,1.31,0.613571,0.611165,0.610137
12,LogisticRegression,0.01,0.611667,0.603938,0.617943
3,RandomForestClassifier,0.48,0.608571,0.608889,0.6
0,XGBClassifier,0.16,0.596667,0.592261,0.59705
6,AdaBoostClassifier,0.32,0.595714,0.591317,0.596099
7,KNeighborsClassifier,0.01,0.591905,0.585602,0.596896
10,BaggingClassifier,0.49,0.57381,0.585211,0.530922
