In [33]:
import pandas as pd
from sklearn.ensemble import IsolationForest
from sklearn.metrics import precision_recall_fscore_support, classification_report, accuracy_score, confusion_matrix
from sklearn.preprocessing import StandardScaler

In [2]:
df = pd.read_csv("../Datasets/amazon_reviews_unlabelled.csv")

In [5]:
df.columns

Index(['Unnamed: 0.4', 'Unnamed: 0.3', 'Unnamed: 0.2', 'Unnamed: 0.1',
       'Unnamed: 0', 'UNNAMED: 0', 'REVIEW_TITLE', 'RATINGS', 'REVIEW',
       'VERIFIED', 'USER_NAME', 'USER_ID', 'MAX_REVIEWS_DAY', 'HELPFUL_VOTES',
       'PRODUCT', 'REVIEW_SENTIMENT', 'AVERAGE_RATING', 'RATING_DEVIATION',
       'REVIEW_LENGTH', 'TITLE_LENGTH', 'TOTAL_USER_REVIEWS', 'DATETIME',
       'REVIEW_DATE_DIFF', 'DATE', 'AVG_WORD_LENGTH', 'TOTAL_PRODUCT_REVIEWS',
       'NUM_NOUNS', 'NUM_VERBS', 'NUM_ADJECTIVES', 'NUM_ADVERBS',
       'READABILITY_FRE', 'CAPITAL_CHAR_COUNT', 'PUNCTUATION_COUNT',
       'REVIEW_WORD_COUNT', 'SENTIMENT_SCORE_TITLE', 'NUM_NAMED_ENTITIES',
       'LEXICAL_DIVERSITY', 'WORD_COUNT', 'RATING_CATEGORY',
       'SENTIMENT_CATEGORY', 'COHERENCE', 'TOKENIZED_REVIEW', 'NGRAMS',
       'TOTAL_VERIFIED_REVIEWS', 'TOTAL_USER_HELPFUL_VOTES',
       'PREPROC_REVIEW_TEXT', 'COSINE_DUPLICATE', 'SOM OUTLIER'],
      dtype='object')

In [17]:
features = [
    'RATINGS', 
       'VERIFIED',  'MAX_REVIEWS_DAY', 'HELPFUL_VOTES',
       'REVIEW_SENTIMENT', 'AVERAGE_RATING', 'RATING_DEVIATION',
       'REVIEW_LENGTH', 'TITLE_LENGTH', 'TOTAL_USER_REVIEWS',
       'REVIEW_DATE_DIFF', 'AVG_WORD_LENGTH', 'TOTAL_PRODUCT_REVIEWS',
       'READABILITY_FRE', 'CAPITAL_CHAR_COUNT', 'PUNCTUATION_COUNT',
       'REVIEW_WORD_COUNT', 'SENTIMENT_SCORE_TITLE', 'NUM_NAMED_ENTITIES',
       'LEXICAL_DIVERSITY', 'WORD_COUNT', 'RATING_CATEGORY',
       'SENTIMENT_CATEGORY', 'COHERENCE', 
       'TOTAL_VERIFIED_REVIEWS', 'TOTAL_USER_HELPFUL_VOTES',
       'COSINE_DUPLICATE', 'SOM OUTLIER'
]

In [18]:
X_train = df[df['SOM OUTLIER'] == 0] #USING NORMAL RECORDS FOR TRAINING

In [19]:
X_train = X_train[features]

In [20]:
X_test = df[df['SOM OUTLIER'] == 1]
X_test = X_test[features]

In [21]:
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [4]:
isolation_forest = IsolationForest(n_estimators=100, contamination=0.05)

In [22]:
isolation_forest.fit(X_train)

In [23]:
predictions = isolation_forest.predict(X_test)

In [24]:
predictions

array([ 1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1, -1,  1, -1,  1,  1,  1,
        1,  1,  1,  1, -1,  1, -1,  1, -1, -1,  1, -1,  1, -1,  1, -1, -1,
       -1, -1,  1, -1,  1, -1,  1, -1,  1, -1, -1, -1,  1,  1, -1, -1,  1,
        1,  1,  1,  1,  1,  1, -1,  1, -1,  1,  1,  1,  1,  1, -1,  1,  1,
       -1, -1, -1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1, -1, -1,  1,
        1, -1,  1,  1,  1,  1,  1,  1,  1, -1,  1,  1,  1,  1, -1, -1, -1,
        1, -1, -1, -1, -1,  1,  1,  1,  1,  1, -1,  1,  1, -1,  1,  1, -1,
        1,  1,  1, -1, -1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1, -1,
        1,  1,  1,  1,  1,  1,  1,  1, -1,  1,  1,  1, -1,  1,  1, -1, -1,
        1,  1,  1,  1,  1,  1, -1,  1, -1, -1, -1,  1,  1,  1,  1, -1, -1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1, -1, -1, -1,  1, -1,
       -1,  1, -1,  1,  1, -1,  1,  1,  1,  1,  1,  1,  1, -1,  1,  1,  1,
        1, -1,  1,  1,  1,  1, -1,  1, -1,  1,  1])

In [25]:
actual = df[df['SOM OUTLIER']==1]['SOM OUTLIER']

In [26]:
actual

431     1.0
483     1.0
505     1.0
512     1.0
513     1.0
       ... 
7306    1.0
7307    1.0
7308    1.0
7311    1.0
7312    1.0
Name: SOM OUTLIER, Length: 215, dtype: float64

In [41]:
_, recall, f1, _ = precision_recall_fscore_support(actual, predictions, zero_division=0)
print(f"RECALL FOR PREDICTING OUTLIERS = {recall*100}% \nF1 SCORE FOR PREDICTING OUTLIERS = {f1*100}%")

RECALL FOR PREDICTING OUTLIERS = [ 0.         70.23255814]% 
F1 SCORE FOR PREDICTING OUTLIERS = [ 0.        82.5136612]%


In [32]:
print(confusion_matrix(actual, predictions))

[[  0   0]
 [ 64 151]]
