In [43]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
import spacy
import numpy as np

In [2]:
# Load the reviews table; the path is relative to the notebook's working directory.
# NOTE(review): the 'Unnamed: 0*' columns listed below look like index columns
# persisted by earlier to_csv() calls — confirm and drop them if unused.
df = pd.read_csv("../Datasets/amazon_reviews_unlabelled.csv")

In [3]:
# List the column names to see which fields are available.
df.columns

Index(['Unnamed: 0.2', 'Unnamed: 0.1', 'Unnamed: 0', 'UNNAMED: 0',
       'REVIEW_TITLE', 'RATINGS', 'REVIEW', 'VERIFIED', 'USER_NAME', 'USER_ID',
       'MAX_REVIEWS_DAY', 'HELPFUL_VOTES', 'PRODUCT', 'REVIEW_SENTIMENT',
       'AVERAGE_RATING', 'RATING_DEVIATION', 'REVIEW_LENGTH', 'TITLE_LENGTH',
       'TOTAL_USER_REVIEWS', 'DATETIME', 'REVIEW_DATE_DIFF', 'DATE',
       'AVG_WORD_LENGTH', 'TOTAL_PRODUCT_REVIEWS', 'NUM_NOUNS', 'NUM_VERBS',
       'NUM_ADJECTIVES', 'NUM_ADVERBS', 'READABILITY_FRE',
       'CAPITAL_CHAR_COUNT', 'PUNCTUATION_COUNT', 'REVIEW_WORD_COUNT',
       'SENTIMENT_SCORE_TITLE', 'NUM_NAMED_ENTITIES', 'LEXICAL_DIVERSITY',
       'WORD_COUNT', 'RATING_CATEGORY', 'SENTIMENT_CATEGORY', 'COHERENCE',
       'TOKENIZED_REVIEW', 'NGRAMS', 'TOTAL_VERIFIED_REVIEWS',
       'TOTAL_USER_HELPFUL_VOTES', 'PREPROC_REVIEW_TEXT'],
      dtype='object')

In [4]:
# Discard every row that has a missing value in any column.
# The surviving rows keep their original index labels, so the index may have gaps.
df = df.dropna(axis=0, how="any")

In [5]:
# Text column that the duplicate detection runs on.
X = df.loc[:, 'PREPROC_REVIEW_TEXT']

In [6]:
# Sanity check: count of missing texts (expected to be 0 after dropping NaN rows).
X.isna().sum()

0

In [7]:
def duplicates_tfidf_cosine(X, threshold):
    """Find documents that are near-duplicates of at least one other document.

    Each document is vectorised with a default ``TfidfVectorizer`` and compared
    with every other document by cosine similarity.

    Parameters
    ----------
    X : iterable of str
        The text documents to compare.
    threshold : float
        Minimum cosine similarity (inclusive) for two documents to count as
        duplicates of each other.

    Returns
    -------
    set of int
        Zero-based *positions* (row numbers within ``X``) of every document
        that belongs to at least one pair whose similarity is >= ``threshold``.
        These are positions, not pandas index labels.
    """
    tfidf_matrix = TfidfVectorizer().fit_transform(X)
    similarity_matrix = cosine_similarity(tfidf_matrix)

    # Only the strict upper triangle (j > i) is inspected: this skips each
    # document's similarity with itself and visits every unordered pair once.
    pair_is_duplicate = np.triu(similarity_matrix >= threshold, k=1)
    first_positions, second_positions = np.nonzero(pair_is_duplicate)

    # .tolist() converts to plain Python ints, as the loop-based version returned.
    return set(first_positions.tolist()) | set(second_positions.tolist())

In [10]:
# Flag reviews whose text has a cosine similarity of at least 0.9 with another review.
duplicates = duplicates_tfidf_cosine(X, threshold=0.9)

In [11]:
# Number of distinct reviews flagged as near-duplicates.
len(duplicates)

1091

In [12]:
# Add label: 1 if the review was flagged as a near-duplicate, else 0
# (all duplicates are considered positively fake).
# `duplicates` holds zero-based row positions (row numbers of the TF-IDF
# matrix), not index labels. After dropna() the index labels can have gaps,
# so rows are matched by position rather than by df.index.
df['COSINE_DUPLICATE'] = [1 if position in duplicates else 0 for position in range(len(df))]

In [16]:
# Columns used as model inputs. The order matters: it defines the column order
# of the feature matrix and is used to map importance scores back to names.
features = [
    'RATINGS',
    'VERIFIED',
    'MAX_REVIEWS_DAY',
    'HELPFUL_VOTES',
    'REVIEW_SENTIMENT',
    'AVERAGE_RATING',
    'RATING_DEVIATION',
    'REVIEW_LENGTH',
    'TITLE_LENGTH',
    'TOTAL_USER_REVIEWS',
    'REVIEW_DATE_DIFF',
    'AVG_WORD_LENGTH',
    'TOTAL_PRODUCT_REVIEWS',
    'READABILITY_FRE',
    'CAPITAL_CHAR_COUNT',
    'PUNCTUATION_COUNT',
    'REVIEW_WORD_COUNT',
    'SENTIMENT_SCORE_TITLE',
    'NUM_NAMED_ENTITIES',
    'LEXICAL_DIVERSITY',
    'WORD_COUNT',
    'RATING_CATEGORY',
    'SENTIMENT_CATEGORY',
    'COHERENCE',
    'TOTAL_VERIFIED_REVIEWS',
    'TOTAL_USER_HELPFUL_VOTES',
]

# Cast every feature column to int in one call via a column -> dtype mapping.
# NOTE(review): astype(int) truncates fractional values; several of these
# columns (e.g. 'AVERAGE_RATING', 'LEXICAL_DIVERSITY') look like they may hold
# floats — confirm the truncation is intended.
df = df.astype({column: int for column in features})

In [17]:
# Persist the labelled frame. index=False stops pandas from writing the row
# index as an extra unnamed column, which otherwise reappears as a new
# 'Unnamed: 0' column every time the file is read back and saved again.
# NOTE(review): this overwrites the file the notebook loads from — confirm
# that replacing the input data in place is intended.
df.to_csv("../Datasets/amazon_reviews_unlabelled.csv", index=False)

In [18]:
# Target: the duplicate flag derived from the cosine-similarity check.
Y = df.loc[:, 'COSINE_DUPLICATE']
# Predictors: the selected feature columns, in the order given by `features`.
X = df.loc[:, features]

In [15]:
# Model chosen: random forest with 200 bootstrapped trees.
# A fixed random_state makes training (and the resulting feature importances)
# reproducible across runs; it uses the same seed as the train/test split.
classifier = RandomForestClassifier(n_estimators=200, bootstrap=True, random_state=42)

In [22]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [23]:
# Train the model on the training split only; the test split is not seen here.
classifier.fit(X_train, y_train)

In [37]:
# Copy the fitted forest's per-feature importance scores into a NumPy array.
importances = np.array(classifier.feature_importances_)

In [38]:
# argsort orders ascending; flipping it gives positions from most to least important.
sorted_indices = np.flip(np.argsort(importances))

In [39]:
# Reorder scores and feature names with the same permutation so they stay aligned.
sorted_scores = np.take(importances, sorted_indices)
sorted_features = np.take(np.array(features), sorted_indices)

In [40]:
# Feature names ordered from most to least important.
sorted_features

array(['REVIEW_LENGTH', 'WORD_COUNT', 'REVIEW_WORD_COUNT',
       'READABILITY_FRE', 'TOTAL_VERIFIED_REVIEWS',
       'TOTAL_PRODUCT_REVIEWS', 'TOTAL_USER_REVIEWS', 'REVIEW_DATE_DIFF',
       'MAX_REVIEWS_DAY', 'TOTAL_USER_HELPFUL_VOTES', 'PUNCTUATION_COUNT',
       'CAPITAL_CHAR_COUNT', 'TITLE_LENGTH', 'AVG_WORD_LENGTH',
       'AVERAGE_RATING', 'LEXICAL_DIVERSITY', 'NUM_NAMED_ENTITIES',
       'RATINGS', 'SENTIMENT_CATEGORY', 'COHERENCE', 'HELPFUL_VOTES',
       'VERIFIED', 'RATING_DEVIATION', 'REVIEW_SENTIMENT',
       'RATING_CATEGORY', 'SENTIMENT_SCORE_TITLE'], dtype='<U24')