In [15]:
import time
# SENTIMENT ANALYSIS USING VADER
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, precision_score, f1_score, roc_auc_score, precision_score, recall_score, classification_report
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.svm import LinearSVC, SVC
from gensim.models import Word2Vec
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from sklearn.linear_model import RidgeClassifier, SGDClassifier, LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, BaggingClassifier
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
import pandas as pd
import numpy as np
import os
from collections import Counter
import matplotlib.pyplot as plt
from sklearn.svm import OneClassSVM
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [3]:
df = pd.read_csv("../Datasets/amazon_reviews_unlabelled.csv")

In [4]:
df.columns

Index(['Unnamed: 0.4', 'Unnamed: 0.3', 'Unnamed: 0.2', 'Unnamed: 0.1',
       'Unnamed: 0', 'UNNAMED: 0', 'REVIEW_TITLE', 'RATINGS', 'REVIEW',
       'VERIFIED', 'USER_NAME', 'USER_ID', 'MAX_REVIEWS_DAY', 'HELPFUL_VOTES',
       'PRODUCT', 'REVIEW_SENTIMENT', 'AVERAGE_RATING', 'RATING_DEVIATION',
       'REVIEW_LENGTH', 'TITLE_LENGTH', 'TOTAL_USER_REVIEWS', 'DATETIME',
       'REVIEW_DATE_DIFF', 'DATE', 'AVG_WORD_LENGTH', 'TOTAL_PRODUCT_REVIEWS',
       'NUM_NOUNS', 'NUM_VERBS', 'NUM_ADJECTIVES', 'NUM_ADVERBS',
       'READABILITY_FRE', 'CAPITAL_CHAR_COUNT', 'PUNCTUATION_COUNT',
       'REVIEW_WORD_COUNT', 'SENTIMENT_SCORE_TITLE', 'NUM_NAMED_ENTITIES',
       'LEXICAL_DIVERSITY', 'WORD_COUNT', 'RATING_CATEGORY',
       'SENTIMENT_CATEGORY', 'COHERENCE', 'TOKENIZED_REVIEW', 'NGRAMS',
       'TOTAL_VERIFIED_REVIEWS', 'TOTAL_USER_HELPFUL_VOTES',
       'PREPROC_REVIEW_TEXT', 'COSINE_DUPLICATE', 'SOM OUTLIER'],
      dtype='object')

In [5]:
df['SOM OUTLIER'].value_counts()

SOM OUTLIER
0.0    7104
1.0     215
Name: count, dtype: int64

In [6]:
#PERCENTAGE OF OUTLIERS
percent_outliers = (215*100)/(215+7104)

In [7]:
percent_outliers

2.937559775925673

In [30]:
X_text = df['PREPROC_REVIEW_TEXT']
X_numeric = df[[
    'RATINGS', 
       'VERIFIED', 'MAX_REVIEWS_DAY', 'HELPFUL_VOTES',
       'REVIEW_SENTIMENT', 'AVERAGE_RATING', 'RATING_DEVIATION',
       'REVIEW_LENGTH', 'TITLE_LENGTH', 'TOTAL_USER_REVIEWS',
       'REVIEW_DATE_DIFF', 'AVG_WORD_LENGTH', 'TOTAL_PRODUCT_REVIEWS',
       'READABILITY_FRE', 'CAPITAL_CHAR_COUNT', 'PUNCTUATION_COUNT',
       'REVIEW_WORD_COUNT', 'SENTIMENT_SCORE_TITLE', 'NUM_NAMED_ENTITIES',
       'LEXICAL_DIVERSITY', 'WORD_COUNT', 'RATING_CATEGORY',
       'SENTIMENT_CATEGORY', 'COHERENCE',
       'TOTAL_VERIFIED_REVIEWS', 'TOTAL_USER_HELPFUL_VOTES',
       'COSINE_DUPLICATE'
]]
labels = df['SOM OUTLIER']

In [31]:
# TRAIN-TEST SPLIT
X_text_train, X_text_test, X_numeric_train, X_numeric_test, y_train, y_test = train_test_split(
    X_text, X_numeric, labels, test_size=0.2, random_state=42
)

In [32]:
def vectorize_text(X_text_train, X_text_test):
    #TOKENIZE THE TEXT
    tokenized_text_train = [t.split() for t in X_text_train]
    tokenized_text_test = [t.split() for t in X_text_test]
    
    #LOAD PRE-TRAINED WORD2VEC MODEL
    model_path = "word2vec_model.bin"
    if os.path.isfile(model_path):
        w2v_model = Word2Vec.load(model_path)
    
    else:
        #TRAIN THE WORD2VEC MODEL
        w2v_model = Word2Vec(sentences=tokenized_text_train,vector_size=100,window=5,min_count=1)
        #SAVE THE MODEL
        w2v_model.save("word2vec_model.bin")
        
    #W2V VECTORISATION
    w2v_vectors_train = []
    
    for review in tokenized_text_train:
        review_vectors = [w2v_model.wv[word] for word in review if word in w2v_model.wv]
        
        #AVERAGE METHOD
        if len(review_vectors) > 0:
            review_vector = np.mean(review_vectors, axis=0)  # Average the word vectors
            w2v_vectors_train.append(review_vector)
            
    #W2V VECTORISATION
    w2v_vectors_test = []
    
    for review in tokenized_text_test:
        review_vectors = [w2v_model.wv[word] for word in review if word in w2v_model.wv]
        
        #AVERAGE METHOD
        if len(review_vectors) > 0:
            review_vector = np.mean(review_vectors, axis=0)  # Average the word vectors
            w2v_vectors_test.append(review_vector)
    final_vectors_train = w2v_vectors_train
    final_vectors_test = w2v_vectors_test
        
    return final_vectors_train, final_vectors_test

In [33]:
X_text_train, X_text_test = vectorize_text(X_text_train, X_text_test)

In [34]:
#APPLY FEATURE SCALING TO MATRICES
sc_numeric = StandardScaler()
X_numeric_train = sc_numeric.fit_transform(X_numeric_train)
X_numeric_test = sc_numeric.transform(X_numeric_test)

sc_text= StandardScaler()
X_text_train = sc_text.fit_transform(X_text_train)
X_text_test = sc_text.transform(X_text_test)

In [35]:
#CONCATENATE THE TEXTUAL FEATURE AND NUMERIC FEATURE MATRIX
X_train = np.hstack((X_text_train, X_numeric_train))
X_test = np.hstack((X_text_test, X_numeric_test))

In [37]:
one_class_svm = OneClassSVM(nu=percent_outliers/100, kernel = 'rbf', gamma = 'auto').fit(X_train)

In [38]:
prediction = one_class_svm.predict(X_test)

In [39]:
prediction

array([1, 1, 1, ..., 1, 1, 1])

In [44]:
prediction = [1 if i==-1 else 0 for i in prediction]


In [45]:
print(classification_report (y_test, prediction))

              precision    recall  f1-score   support

         0.0       0.98      0.97      0.97      1430
         1.0       0.06      0.09      0.07        34

    accuracy                           0.95      1464
   macro avg       0.52      0.53      0.52      1464
weighted avg       0.96      0.95      0.95      1464

