## Environment Setup

In [131]:
!pip install gdown pandas nltk numpy sklearn
import nltk
nltk.download('stopwords')



[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jiachenx/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## Import libraries and download the trained models

In [1]:
import os
from collections import OrderedDict
from datetime import datetime

import gdown
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import pairwise_distances

### Download and load word2vec model

In [135]:
# Download the word-vector archive only when it is not already on disk, so a
# "Restart & Run All" does not re-fetch the large file every time.
if not os.path.exists('word2vec.zip'):
    gdown.download('https://drive.google.com/uc?id=1FgashJqm39epit1taqYa0vYtUR-S8yyO', 'word2vec.zip', quiet=False)

Downloading...
From: https://drive.google.com/uc?id=1FgashJqm39epit1taqYa0vYtUR-S8yyO
To: /Users/jiachenx/Desktop/lin/FakeReviewDetection/word2vec.zip
484MB [00:42, 11.3MB/s] 


'word2vec.zip'

In [137]:
!unzip -qq word2vec.zip -d .

In [2]:
from gensim.models import KeyedVectors

# Location of the word-vector model extracted from word2vec.zip.
W2V_MODEL_PATH = 'word2vec/glove-100d.model'

# Word vectors used by get_contentVector() to embed review text.
w2vmodel = KeyedVectors.load(W2V_MODEL_PATH)

### Load fake review detection model

In [3]:
import pickle

# Pickled classifier used later to flag reviews via model.predict().
# NOTE: pickle.load can execute arbitrary code - only load files you trust.
DETECTION_MODEL_PATH = 'models/semi_sl/random_forest_model_all.pickle'

model = None
with open(DETECTION_MODEL_PATH, 'rb') as model_file:
    model = pickle.load(model_file)

## Data loading and preprocessing

In [4]:
# Raw inputs: one row per eatery and one row per review.
eateries_df = pd.read_csv('data/eateries.csv')
reviews_df = pd.read_csv('data/reviews.csv')

# Attach each eatery's overall rating to its reviews. Both frames carry a
# `rating` column, so the merge yields `rating_x` (from reviews_df) and
# `rating_y` (from eateries_df).
eatery_ratings = eateries_df[['res_id','rating']]
df = pd.merge(reviews_df, eatery_ratings, on='res_id', how='inner')
df.head()

Unnamed: 0,res_id,page,name,review page,review,rating_x,id,comment,helpful,arriving time,review time,rating_y
0,1,1,Le Noir Bar & Lounge,1.0,"I came here with some friends the other night,...",5,EmelineM89,1,0,Feb-20,4 days ago,5.0
1,1,1,Le Noir Bar & Lounge,1.0,Wonderful atmosphere very small place . Excell...,5,Tufayelhu,162,47,Feb-20,1 week ago,5.0
2,1,1,Le Noir Bar & Lounge,1.0,"Nice services, good meal. Small place, but int...",4,sorin g,0,0,Feb-20,2 weeks ago,5.0
3,1,1,Le Noir Bar & Lounge,1.0,Our first bar stop in Singapore and it did not...,5,AshleighMcGrath,1,0,Feb-20,3 weeks ago,5.0
4,1,1,Le Noir Bar & Lounge,1.0,"We had just a great time in this bar, especial...",5,b_rginp2020,1,0,Feb-20,3 weeks ago,5.0


In [5]:
def get_contentVector(content):
    """Embed a piece of text as the mean of its tokens' word vectors.

    The text is split into ``\\w+`` tokens; tokens missing from the global
    ``w2vmodel`` vocabulary are ignored.

    Parameters
    ----------
    content : str
        Text to embed.

    Returns
    -------
    pandas.Series
        One value per embedding dimension. If no token is in the vocabulary
        (including empty text) every value is NaN, so callers can drop the row.
    """
    # A bare KeyedVectors object is indexed directly: its `.wv` alias is
    # deprecated in gensim 3 and absent in gensim 4. A full model object
    # still exposes its vectors through `.wv`.
    vectors = w2vmodel if isinstance(w2vmodel, KeyedVectors) else w2vmodel.wv

    tokens = RegexpTokenizer(r'\w+').tokenize(content)
    known_vectors = [vectors[token] for token in tokens if token in vectors]

    if not known_vectors:
        # Averaging an empty list would emit a RuntimeWarning and collapse to a
        # scalar NaN; return an explicit, correctly sized NaN vector instead.
        return pd.Series(np.full(vectors.vector_size, np.nan))

    return pd.Series(np.mean(known_vectors, axis=0))

In [6]:
def _max_content_similarity(reviews):
    """Return the largest pairwise TF-IDF cosine similarity among ``reviews``.

    Returns 0.0 when there are fewer than two reviews or when no vocabulary
    can be built from them (e.g. every review is empty).
    """
    if len(reviews) < 2:
        return 0.0
    try:
        # Default min_df (1) keeps every term; an integer min_df of 0 is
        # rejected by newer scikit-learn releases.
        tfidf = TfidfVectorizer().fit_transform(reviews)
    except ValueError:
        # Raised by scikit-learn for an empty vocabulary: nothing to compare.
        return 0.0
    cosine = 1 - pairwise_distances(tfidf, metric='cosine')
    # Ignore each review's similarity with itself.
    np.fill_diagonal(cosine, -np.inf)
    return float(cosine.max())


def preprocess(df):
    """Clean review text and build the feature table used by the classifier.

    Parameters
    ----------
    df : pandas.DataFrame
        Reviews merged with eatery ratings. Columns read here: ``review``,
        ``id``, ``arriving time``, ``rating_x``, ``rating_y``, plus the
        descriptive columns dropped from the model frame (``res_id``, ``page``,
        ``name``, ``review page``, ``review time``, ``comment``, ``helpful``).
        The input frame is not modified.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``df``  - cleaned reviews with all engineered columns (embedding
        columns, ``mnr``, ``rl``, ``rd``, ``Maximum Content Similarity``),
        ``rating_x`` renamed to ``rating`` and ``rating_y`` removed. Rows
        containing any null value are dropped.
        ``_df`` - the same rows without the descriptive columns, with
        ``helpful``/``comment`` renamed to ``reviewUsefulCount``/``reviewCount``
        and ``reviewCount`` placed at column position 2.
    """
    # Work on a copy so re-running the cell does not re-clean already cleaned text.
    df = df.copy()

    stop = set(stopwords.words('english'))  # set: O(1) membership tests
    tokenizer = RegexpTokenizer(r'\w+')

    def _clean(text):
        # NOTE(review): stop words are matched before lowercasing, so
        # capitalised ones (e.g. "The") survive. Order kept as-is because it
        # presumably matches the features the model was trained on - confirm
        # before changing.
        without_stop = ' '.join(word for word in text.split() if word not in stop)
        return ' '.join(tokenizer.tokenize(without_stop)).lower()

    # Missing reviews become empty text; their embedding is all-NaN, so the
    # row is removed by the null drop below instead of crashing here.
    df['review'] = df['review'].fillna('').astype(str).apply(_clean)

    # Mean word-vector embedding, one integer-named column per dimension.
    review2v = df['review'].apply(get_contentVector)
    df = pd.concat([df, review2v], axis=1)

    # mnr: number of reviews by the same reviewer in the same `arriving time`
    # bucket, scaled by the largest such count.
    mnr_counts = (
        df.groupby(by=['arriving time', 'id'])
        .size()
        .reset_index(name='mnr')
    )
    mnr_counts['mnr'] = mnr_counts['mnr'] / mnr_counts['mnr'].max()
    df = df.merge(mnr_counts, on=['id', 'arriving time'], how='inner')

    # rl: review length in whitespace-separated tokens.
    df['rl'] = df['review'].apply(lambda text: len(text.split()))

    # rd: absolute gap between the review's rating and the eatery's rating,
    # divided by 4.
    df['rd'] = abs(df['rating_x'] - df['rating_y']) / 4

    # Maximum Content Similarity: per reviewer, the highest similarity between
    # any two of their reviews.
    similarity_records = [
        {
            'id': reviewer_id,
            'Maximum Content Similarity': _max_content_similarity(list(texts)),
        }
        for reviewer_id, texts in df.groupby('id', sort=False)['review']
    ]
    similarity_df = pd.DataFrame(
        similarity_records, columns=['id', 'Maximum Content Similarity'])
    df = pd.merge(df, similarity_df, on="id", how="left")

    # Discard every row that has a null in any column.
    df = df.dropna().copy()
    print("Feature Engineering Complete")

    df = df.drop(columns=['rating_y']).rename(columns={'rating_x': 'rating'})

    # Model input: numeric features only, in the column order the model expects
    # (reviewCount is moved to position 2).
    _df = df.drop(columns=['res_id', 'page', 'name', 'review page', 'id',
                           'arriving time', 'review', 'review time'])
    _df = _df.rename(columns={'helpful': 'reviewUsefulCount',
                              'comment': 'reviewCount'})
    review_count = _df.pop('reviewCount')
    _df.insert(2, 'reviewCount', review_count)

    return df, _df

In [7]:
# result_df: cleaned reviews with all engineered columns (kept for output).
# model_df:  numeric feature table passed to the classifier.
result_df, model_df = preprocess(df)

  after removing the cwd from sys.path.


Feature Engineering Complete


## Fake Review Detection and output

In [17]:
# Pass a plain array rather than the DataFrame: model_df mixes integer-named
# embedding columns with string-named feature columns, which newer scikit-learn
# releases reject during feature-name validation. Column order is unchanged.
predicted_labels = model.predict(model_df.to_numpy())

# One label per row of model_df, which has the same rows as result_df.
flagged = list(predicted_labels)
assert len(flagged) == len(result_df), "prediction count does not match result_df"
result_df['flagged'] = flagged

In [18]:
# How many reviews received each predicted label.
result_df['flagged'].value_counts()

Y    8955
N    7975
Name: flagged, dtype: int64

In [19]:
# Keep only the reviews labelled 'N'.
is_not_flagged = result_df['flagged'] == 'N'
non_fake_review_df = result_df[is_not_flagged]

In [20]:
# Keep columns whose name starts with a letter (this removes the
# integer-named embedding columns), then remove the remaining engineered
# features so only the original review fields and the flag are left.
engineered_columns = ['mnr', 'rl', 'rd', 'Maximum Content Similarity']
non_fake_review_df = (
    non_fake_review_df
    .filter(regex=("^[a-zA-Z]"))
    .drop(columns=engineered_columns)
)
non_fake_review_df

Unnamed: 0,res_id,page,name,review page,review,rating,id,comment,helpful,arriving time,review time,flagged
1,1,1,Le Noir Bar & Lounge,1.0,wonderful atmosphere small place excellent foo...,5,Tufayelhu,162,47,Feb-20,1 week ago,N
6,1,1,Le Noir Bar & Lounge,1.0,if looking venue iconic view singapore skyline...,5,Otium54,48,34,Jan-20,4 weeks ago,N
7,1,1,Le Noir Bar & Lounge,1.0,stumbled across first night key spot clarke qu...,4,PrettyPacha,92,30,Feb-20,4 weeks ago,N
9,1,1,Le Noir Bar & Lounge,1.0,very nice place drink very good customer servi...,5,107xavierd,2,1,Jan-20,26-Jan-20,N
10,1,1,Le Noir Bar & Lounge,1.0,great position marina bay super people watchin...,5,Dillybop,277,138,Jan-20,26-Jan-20,N
...,...,...,...,...,...,...,...,...,...,...,...,...
16902,39,2,Mitzo,11.0,this second visit mitzo while first visit exce...,2,romes88,5,3,Nov-19,22-Nov-19,N
16913,39,2,Mitzo,12.0,while location ambience something quite notewo...,1,josephnguyen116,4,1,Oct-19,14-Nov-19,N
16915,39,2,Mitzo,12.0,chanced upon restaurant i chose ala carte menu...,5,yingshia,3,2,Nov-19,9-Nov-19,N
16921,39,2,Mitzo,13.0,my 2nd visit we like deco classy atmosphere go...,2,Desmondcheng,262,131,Oct-19,26-Oct-19,N


In [21]:
# Every remaining row has the same flag value, so drop the column and write
# the result to disk without the index.
non_fake_review_df = non_fake_review_df.drop(columns=['flagged'])
non_fake_review_df.to_csv('data/filtered_review.csv', index=False)