In [1]:
#GOAL - COMBINE WORD2VEC & TF-IDF FOR VECTORISATION OF TEXT
#AND APPLY STANDARD SCALING ON FINAL VECTOR

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from gensim.models import Word2Vec
import numpy as np
import pandas as pd
import time

In [4]:
from sklearn.metrics import accuracy_score, precision_score, f1_score, roc_auc_score, precision_score, recall_score, classification_report
from sklearn.svm import LinearSVC, SVC
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from sklearn.linear_model import RidgeClassifier, SGDClassifier, LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, BaggingClassifier
from sklearn.naive_bayes import BernoulliNB, MultinomialNB

In [5]:
#READ THE FILE
df = pd.read_csv("Datasets/amazon_reviews_3.csv")

In [6]:
#FEATURE AND LABEL
X = df['PREPROCESSED_REVIEW_TEXT']
Y = df['LABEL_ENCODED']

In [7]:
#FUNCTION FOR COMBINING WORD2VEC AND TFIDF

def vectorize_text(text):
    #TOKENIZE THE TEXT
    tokenized_text = [t.split() for t in text]
    
    #TRAIN THE WORD2VEC MODEL
    w2v_model = Word2Vec(sentences=tokenized_text,
                        vector_size=100,
                         window=5,
                         min_count=1
                        )
    
    #W2V VECTORISATION
    w2v_vectors = []
    
    for review in tokenized_text:
        review_vectors = [w2v_model.wv[word] for word in review if word in w2v_model.wv]
        
        #AVERAGE METHOD
        if len(review_vectors) > 0:
            review_vector = np.mean(review_vectors, axis=0)  # Average the word vectors
            w2v_vectors.append(review_vector)
            
    #VECTORISE USING TF-IDF
    vect = TfidfVectorizer()
    tfidf_vectors = vect.fit_transform(text).toarray() #SHOULD ONLY RUN ON TRAIN? FIX LATER
    
    #CONCATENATE
    final_vectors = np.concatenate(
        (w2v_vectors, tfidf_vectors),
        axis = 1 #COMBINE COLUMNS -> HORIZONTAL
    )
        
    return final_vectors

In [8]:
X = vectorize_text(X)

In [9]:
#TRAIN_TEST SPLIT
X_train, X_test, y_train, y_test = train_test_split(
    X, Y,
    test_size = 0.2,
    random_state = 42
)

In [12]:
#SCALE THE VECTORS
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [13]:
#TRAIN 1 MODEL
clf = LogisticRegression()
clf.fit(X_train, y_train)

#MAKE PREDICTIONS
predictions = clf.predict(X_test)
print(classification_report(y_test, predictions))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


              precision    recall  f1-score   support

           0       0.65      0.62      0.63      2115
           1       0.63      0.66      0.64      2085

    accuracy                           0.64      4200
   macro avg       0.64      0.64      0.64      4200
weighted avg       0.64      0.64      0.64      4200



In [14]:
#TRAIN AND EVALUATE ALL MODELS

In [13]:
# DICTIONARY WITH NAME AND COMMAND TO INSTANTIATE DIFFERENT MODELS
classifiers = {}
classifiers.update({"XGBClassifier": XGBClassifier(eval_metric='logloss',objective='binary:logistic',use_label_encoder=False)})
classifiers.update({"CatBoostClassifier": CatBoostClassifier(silent=True)})
classifiers.update({"LinearSVC": LinearSVC(max_iter=10000)})
classifiers.update({"LGBMClassifier": LGBMClassifier()})
classifiers.update({"RandomForestClassifier": RandomForestClassifier()})
classifiers.update({"DecisionTreeClassifier": DecisionTreeClassifier()})
classifiers.update({"ExtraTreeClassifier": ExtraTreeClassifier()})
classifiers.update({"AdaBoostClassifier": AdaBoostClassifier()})
classifiers.update({"KNeighborsClassifier": KNeighborsClassifier()})
classifiers.update({"RidgeClassifier": RidgeClassifier()})
classifiers.update({"SGDClassifier": SGDClassifier()})
classifiers.update({"BaggingClassifier": BaggingClassifier()})
classifiers.update({"BernoulliNB": BernoulliNB()})
classifiers.update({"LogisticRegression": LogisticRegression()})
classifiers.update({"SVM": SVC()})



In [11]:
X

array([[ 0.18196908,  0.66751683,  0.40509704, ...,  0.        ,
         0.        ,  0.        ],
       [-0.20301332,  0.360917  ,  0.21874006, ...,  0.        ,
         0.        ,  0.        ],
       [-0.22365123,  0.92590171,  0.34388262, ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [-0.13463682,  0.64746755,  0.32169113, ...,  0.        ,
         0.        ,  0.        ],
       [-0.1005867 ,  0.56262767,  0.23973559, ...,  0.        ,
         0.        ,  0.        ],
       [-0.09297816,  0.54184949,  0.32076782, ...,  0.        ,
         0.        ,  0.        ]])

In [None]:
# CREATE A DATAFRAME OF MODELS WITH RUN TIME AND AUC SCORES
df_models = pd.DataFrame(
    columns=['model', 'run_time', 'accuracy', 'precision', 'f1_score'])

for key in classifiers:
    # STARTING TIME
    start_time = time.time()
    # CURRENT CLASSIFIER
    clf = classifiers[key]
    #TRAIN CLASSIFIER ON TRAINING DATA
    clf.fit(X_train, y_train)
    # MAKE PREDICTIONS USING CURRENT CLASSIFIER
    predictions = clf.predict(X_test)
    # CALCULATE ACCURACY
    accuracy = accuracy_score(y_test, predictions)
    precision = precision_score(y_test, predictions)
    f1score = f1_score(y_test, predictions)
    
    row = {'model': key,
           'run_time': format(round((time.time() - start_time)/60, 2)),
           'accuracy': accuracy,
           'precision': precisaion,
           'f1_score': f1score
           }

    df_models = df_models._append(row, ignore_index=True)

df_models = df_models.sort_values(by='accuracy', ascending=False)

In [None]:
df_models