### Fake Review Detection


#### Plan:
1. Load the training and development data 
2. Clean data (following the repo below)
3. Feature engineering - will create new features for analyzing data 
4. Undersampling/ Oversampling (read up on this more)
5. Model
6. Confusion Matrix, auROC, AP evaluation methods

https://github.com/darshandagly/Fake-Review-Detection/blob/master/Code/main.py

In [2]:
import os
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from datetime import datetime
from sklearn.feature_extraction.text import CountVectorizer
from collections import Counter
from sklearn.datasets import make_classification
from imblearn.under_sampling import RandomUnderSampler
from imblearn.under_sampling import NearMiss

In [28]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/SumedhaRai/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [3]:
# loading csv files
train_data = pd.read_csv("train.csv")

In [15]:
train_data.head(10)

Unnamed: 0,ex_id,user_id,prod_id,rating,label,date,review
0,0,923,0,3.0,1,2014-12-08,The food at snack is a selection of popular Gr...
1,1,924,0,3.0,1,2013-05-16,This little place in Soho is wonderful. I had ...
2,2,925,0,4.0,1,2013-07-01,ordered lunch for 15 from Snack last Friday. ...
3,3,926,0,4.0,1,2011-07-28,This is a beautiful quaint little restaurant o...
4,4,927,0,4.0,1,2010-11-01,Snack is great place for a casual sit down lu...
5,5,928,0,4.0,1,2009-09-02,A solid 4 stars for this greek food spot. If ...
6,7,930,0,4.0,1,2007-05-20,Love this place! Try the Chicken sandwich or ...
7,8,931,0,4.0,1,2005-12-27,My friend and I were intrigued by the nightly ...
8,10,933,0,5.0,1,2014-01-21,pretty cool place...good food...good people
9,12,935,0,5.0,1,2011-01-31,Fabulous Authentic Greek Food!!! This little s...


In [13]:
# summarise columns by unique values
train_data['rating'].value_counts()

5.0    98468
4.0    94547
3.0    33261
2.0    14537
1.0    10061
Name: rating, dtype: int64

In [22]:
# count number of null values in every column of the df
pd.isnull(train_data).sum()

# Result - No null values 

ex_id      0
user_id    0
prod_id    0
rating     0
label      0
date       0
review     0
dtype: int64

In [23]:
temp = train_data

In [31]:
# Data  Cleaning - understand this later

# remove stop words
stop = stopwords.words('english')
temp['review'] = temp['review'].apply(lambda x: ' '.join(word for word in x.split() if word not in stop))

# remove all punctuations
tokenizer = RegexpTokenizer(r'\w+')
temp['review'] = temp['review'].apply(lambda x: ' '.join(word for word in tokenizer.tokenize(x)))

# make the words lowercase
temp['review'] = temp['review'].apply(lambda x: x.lower())

print("Data Cleaning Complete")


Data Cleaning Complete


In [32]:
temp.head(10)

Unnamed: 0,ex_id,user_id,prod_id,rating,label,date,review
0,0,923,0,3.0,1,2014-12-08,the food snack selection popular greek dishes ...
1,1,924,0,3.0,1,2013-05-16,this little place soho wonderful i lamb sandwi...
2,2,925,0,4.0,1,2013-07-01,ordered lunch 15 snack last friday on time not...
3,3,926,0,4.0,1,2011-07-28,this beautiful quaint little restaurant pretty...
4,4,927,0,4.0,1,2010-11-01,snack great place casual sit lunch especially ...
5,5,928,0,4.0,1,2009-09-02,a solid 4 stars greek food spot if fan lamb li...
6,7,930,0,4.0,1,2007-05-20,love place try chicken sandwich roasted chicke...
7,8,931,0,4.0,1,2005-12-27,my friend i intrigued nightly entree special l...
8,10,933,0,5.0,1,2014-01-21,pretty cool place good food good people
9,12,935,0,5.0,1,2011-01-31,fabulous authentic greek food this little spot...


In [None]:
# Feature Engineering techniques

In [37]:
# Bag of Words model
# try both count and tfid
vectorizer = CountVectorizer()

In [41]:
corpus = temp['review']
type(corpus)

pandas.core.series.Series

In [42]:
X = vectorizer.fit_transform(corpus) # understand sparse matrices

In [46]:
type(X)

scipy.sparse.csr.csr_matrix

In [47]:
print(X)

  (0, 95959)	2
  (0, 38119)	1
  (0, 88445)	1
  (0, 84863)	1
  (0, 74151)	1
  (0, 43006)	2
  (0, 29325)	1
  (0, 6876)	1
  (0, 98416)	1
  (0, 42167)	1
  (0, 82548)	1
  (0, 104302)	1
  (0, 100384)	1
  (0, 58278)	1
  (0, 24270)	1
  (0, 96115)	1
  (0, 94039)	1
  (0, 89118)	1
  (0, 44790)	1
  (0, 41084)	1
  (0, 84593)	1
  (1, 95959)	2
  (1, 38119)	1
  (1, 96330)	1
  (1, 56458)	2
  :	:
  (250873, 42626)	1
  (250873, 67877)	1
  (250873, 105024)	1
  (250873, 23716)	1
  (250873, 100735)	2
  (250873, 64297)	1
  (250873, 72926)	3
  (250873, 32701)	6
  (250873, 18221)	1
  (250873, 78756)	1
  (250873, 25348)	1
  (250873, 83353)	2
  (250873, 29757)	1
  (250873, 94389)	1
  (250873, 29256)	1
  (250873, 27096)	1
  (250873, 39601)	1
  (250873, 53732)	1
  (250873, 20603)	1
  (250873, 91674)	1
  (250873, 62972)	3
  (250873, 64646)	1
  (250873, 45678)	2
  (250873, 23639)	1
  (250873, 77438)	1


In [8]:
# trying out undersampling - Random Sampler

temp_X = train_data.drop(['label'], axis=1)
temp_y = train_data['label']


In [9]:
rus = RandomUnderSampler(random_state=0)
X_resampled, y_resampled = rus.fit_resample(temp_X, temp_y)

In [18]:
(train_data['label'].value_counts())/len(train_data) * 100

0    89.70838
1    10.29162
Name: label, dtype: float64

In [20]:
(y_resampled.value_counts())/len(y_resampled) * 100

1    50.0
0    50.0
Name: label, dtype: float64

In [22]:
len(temp_y)

250874