# Fraudulent User Detection Using Amazon Dataset
### Penghao Xu, Yuan Chen, Jiawei Wu, Haojing Lu

## Part 1. Dataset preprocessing

This notebook cleans the Amazon review dataset (http://jmcauley.ucsd.edu/data/amazon/links.html) and generates the data used by the baseline and the newly proposed model.


In [1]:
import json
import pandas as pd
import gzip
import os
import numpy as np
from collections import Counter

Download data if needed

In [2]:
# Uncomment to download data
# !wget http://snap.stanford.edu/data/amazon/productGraph/kcore_5.json.gz

In [3]:
## 5-core data is used in this study
# DO NOT extract the dataset: it is read directly from the gzip archive.
filename = 'kcore_5.json.gz'
assert filename.endswith('gz'), 'Gzipped dataset is required!'

# Output folder for every generated file. makedirs(exist_ok=True) also creates
# missing parent directories and does not fail when the folder already exists,
# which avoids the check-then-create race of isdir() followed by mkdir().
folder = 'dataset'
os.makedirs(folder, exist_ok=True)

## 1. Generate rating-only dataset
The rating-only dataset has 3 columns: user, item, and rating. This dataset is used for the baseline model REV2.

In [4]:
# Process data and generate helpfulness score
def parse(path):
    """Yield one record per line of a gzip-compressed file.

    Args:
        path: Path to the gzipped file; every line must be a Python
            expression that ``eval`` can evaluate.

    Yields:
        The object obtained by evaluating each line, in file order.
    """
    # The context manager closes the archive as soon as the generator is
    # exhausted or closed; previously the handle was never closed explicitly.
    with gzip.open(path, 'rb') as g:
        for l in g:
            # SECURITY NOTE(review): eval() executes arbitrary code found in the
            # file, so only run this on a trusted download. If the lines turn
            # out to be strict JSON, json.loads(l) would be a safe replacement
            # -- TODO confirm the line format before switching.
            yield eval(l)

def get_df(path, benign=0.8, fraudulent=0.2):
    """Collect every review that received helpfulness votes into a DataFrame.

    Args:
        path: Location of the gzipped review dump, forwarded to ``parse``.
        benign: Not used inside this function.
        fraudulent: Not used inside this function.

    Returns:
        A DataFrame indexed by the 1-based position of each kept review in
        the file, with the columns ``reviewerID``, ``asin``, ``rating`` and
        ``helpfulness``.
    """
    rows = {}
    for position, review in enumerate(parse(path), start=1):
        # progress report every 5 million reviews
        if position % 5000000 == 0:
            print(f'{position} reviews processed!')
        votes = review['helpful']
        # a review nobody voted on has no helpfulness ratio, so it is skipped
        if not votes[1]:
            continue
        rows[position] = {
            'reviewerID': review['reviewerID'],
            'asin': review['asin'],
            # centre on 3 and halve, so a 1-5 star value lands in [-1, 1]
            'rating': (review['overall'] - 3) / 2,
            # share of votes that marked the review as helpful
            'helpfulness': votes[0] / votes[1],
        }
    return pd.DataFrame.from_dict(rows, orient='index')
# Build the review table from the raw dump (reads the whole archive once).
df = get_df(filename)
df

5000000 reviews processed!
10000000 reviews processed!
15000000 reviews processed!
20000000 reviews processed!
25000000 reviews processed!
30000000 reviews processed!
35000000 reviews processed!
40000000 reviews processed!


Unnamed: 0,reviewerID,asin,rating,helpfulness
2,A2SUAM1J3GNN3B,0000013714,1.0,0.666667
6,A14A5Q8VJK5NLR,0000029831,0.5,1.000000
7,A3W2PX96K1BA3M,0000029831,1.0,1.000000
8,A2GKR2Q7MD8DG4,0000029831,1.0,1.000000
9,A1MC4E00RO5E9T,0000029831,1.0,1.000000
...,...,...,...,...
41135696,A3PLMYQCFRHU24,BT00DDVMVQ,0.5,0.333333
41135697,A2TNQ87GWTKOON,BT00DDVMVQ,-1.0,0.166667
41135698,A6D3XGIXKU5HQ,BT00DDVMVQ,1.0,1.000000
41135699,A1CC2HQ8XCA28P,BT00DDVMVQ,1.0,1.000000


Check the benign and fraudulent user counts in the original dataset.

In [5]:
# Mean helpfulness of each reviewer across all of their voted-on reviews.
users = df.groupby('reviewerID')['helpfulness'].mean()
# Threshold the mean: above 0.8 counts as benign, below 0.2 as fraudulent;
# reviewers in between receive no label.
benign = users.loc[users > 0.8]
fraudulent = users.loc[users < 0.2]
print(f'Benign users: {len(benign)}')
print(f'Fraudulent users: {len(fraudulent)}')

Benign users: 1139382
Fraudulent users: 151789


Select only benign and fraudulent users. Discard the other users without label.

In [6]:
# Keep only the reviews written by labelled users and tag every review with
# its author's label; assign() returns a new frame, so `df` is not modified
# until it is rebound below.
df_benign = df[df['reviewerID'].isin(set(benign.index))].assign(label='Benign')
df_fra = df[df['reviewerID'].isin(set(fraudulent.index))].assign(label='Fraudulent')
# Benign reviews first, fraudulent reviews after; original indices are kept.
df = pd.concat([df_benign, df_fra])
df

Unnamed: 0,reviewerID,asin,rating,helpfulness,label
6,A14A5Q8VJK5NLR,0000029831,0.5,1.000000,Benign
7,A3W2PX96K1BA3M,0000029831,1.0,1.000000,Benign
9,A1MC4E00RO5E9T,0000029831,1.0,1.000000,Benign
49,A2RAGC7VLO78QG,0000031887,0.5,1.000000,Benign
52,A12OFS8WQP86O5,0000031887,1.0,0.869565,Benign
...,...,...,...,...,...
41132360,A32PHKD604WRG7,B00LTFG8EC,0.0,0.000000,Fraudulent
41132943,A1KBQ2GO5TN1VH,B00LUXND82,-1.0,0.000000,Fraudulent
41133093,A1PQH8Z7XBRGD8,B00LVEZYOQ,-1.0,0.000000,Fraudulent
41133297,A08001923S5BQH48HJ5FF,B00LWRN8SQ,1.0,0.000000,Fraudulent


Check the number of reviews from benign and fraudulent users.

In [None]:
# Tally the reviews per label (one row of `df` is one review).
counts = Counter(df['label'])
print(f'Reviews from benign users: {counts["Benign"]}')
print(f'Reviews from fraudulent users: {counts["Fraudulent"]}')

Generate k-core dataset

In [7]:
def generate_kcore(df, k):
    """Reduce a review table to its k-core.

    Repeatedly drops users with fewer than ``k`` reviews and then items with
    fewer than ``k`` reviews, until a full pass removes nothing.

    Args:
        df: DataFrame with at least the columns ``reviewerID`` and ``asin``.
        k: Minimum number of reviews a user or an item needs to be kept.

    Returns:
        A copy of the surviving rows; the input frame is not modified.
    """
    def _prune(frame, by, counted):
        # keep the rows whose `by` value occurs in at least k reviews
        sizes = frame.groupby(by)[counted].count()
        survivors = set(sizes[sizes >= k].index)
        return frame[frame[by].isin(survivors)]

    while True:
        n_before = len(df)
        df = _prune(df, 'reviewerID', 'asin')
        df = _prune(df, 'asin', 'reviewerID')
        # a pass that removed nothing means the k-core has been reached
        if len(df) == n_before:
            break
    return df.copy()

In [9]:
k = 3
split_rate = (0.4, 0.3, 0.3)
output_base = f'{folder}/processed_{k}-core_80_20'

# Shuffle the reviews reproducibly. The shuffled frame gets its own name so
# that `df` is left untouched: rebinding `df` to its own permutation would make
# a second run of this cell permute already-shuffled rows and change the split
# despite the fixed seed.
np.random.seed(3407)
df_shuffled = df.iloc[np.random.permutation(len(df))]

# Split by review: the first split_rate[0] share goes to train, the next
# split_rate[1] share to val and the remainder to test; 'all' keeps everything.
dfs = {}
n_train = int(len(df_shuffled) * split_rate[0])
n_val = int(len(df_shuffled) * split_rate[1])
dfs['train'] = df_shuffled.iloc[:n_train]
dfs['val'] = df_shuffled.iloc[n_train:n_train+n_val]
dfs['test'] = df_shuffled.iloc[n_train+n_val:]
dfs['all'] = df_shuffled

# Reduce every subset to its k-core and save the (user, item, rating) triples.
for curr in ('train', 'val', 'test', 'all'):
    dfs[curr] = generate_kcore(dfs[curr], k)
    df_out = dfs[curr][['reviewerID', 'asin', 'rating']]
    df_out.to_csv(f'{output_base}_{curr}.csv', index=False)

dfs

{'train':               reviewerID        asin  rating  helpfulness       label
 26652243  A27KQ8MMLNFD1B  B003Y5H5H4     0.0     1.000000      Benign
 18349343   ALHKJO0ZB0WXV  B000RP4LDU     1.0     1.000000      Benign
 7907781    AMUNMW13QBFVS  1463539657     0.5     1.000000      Benign
 41075694  A273VM3VWW4KNW  B00KNQLU14     1.0     1.000000      Benign
 41071413   A3KYT0XEFRZ71  B00KLPW3K4     0.5     1.000000      Benign
 ...                  ...         ...     ...          ...         ...
 16839414  A2LXC1T8XS23JY  B000HGKF34    -1.0     0.000000  Fraudulent
 41081538  A3L4FSPEGNMP3B  B00KPVQ6E8     1.0     1.000000      Benign
 40124674   AGSVJM3JA24FV  B00FZU1UL8     1.0     1.000000      Benign
 24228014   A7W97TBAJSXB5  B002UUT39M     0.5     1.000000      Benign
 8649108    A3JQCC0ZKYIBM  1491221542     1.0     0.833333      Benign
 
 [888322 rows x 5 columns],
 'val':               reviewerID        asin  rating  helpfulness   label
 40050488  A2QQDV7Y545VZ3  B00FQQ1A

Check the number of reviews from benign and fraudulent users again

In [12]:
# Per-split sets of user IDs for each label. Note that `benign` and
# `fraudulent` are rebound here from per-user Series to dicts of sets.
benign = {}
fraudulent = {}
for t in ['train', 'val', 'test', 'all']:
    df = dfs[t]
    benign[t] = set(df.loc[df['label'] == 'Benign', 'reviewerID'].unique())
    fraudulent[t] = set(df.loc[df['label'] == 'Fraudulent', 'reviewerID'].unique())
    print(t)
    print(f'Benign users: {len(benign[t])}')
    print(f'Fraudulent users: {len(fraudulent[t])}')
    # one row is one review, so counting labels counts reviews
    counts = Counter(df['label'])
    print(f'Reviews from benign users: {counts["Benign"]}')
    print(f'Reviews from fraudulent users: {counts["Fraudulent"]}')

train
Benign users: 131915
Fraudulent users: 2012
Reviews from benign users: 880567
Reviews from fraudulent users: 7755
val
Benign users: 62719
Fraudulent users: 683
Reviews from benign users: 416444
Reviews from fraudulent users: 2791
test
Benign users: 63317
Fraudulent users: 684
Reviews from benign users: 419477
Reviews from fraudulent users: 2731
all
Benign users: 601030
Fraudulent users: 22752
Reviews from benign users: 5071608
Reviews from fraudulent users: 92922


Output user labels

In [None]:
# One label file per split: user ID, the user's mean helpfulness (written in
# the `fairness` column) and the label; benign users are written first.
userfile_base = f'{folder}/user_label'
for t in ['train', 'val', 'test', 'all']:
    with open(userfile_base + f'_{t}.csv', 'w') as fw:
        fw.write('reviewerID,fairness,label\n')
        fw.writelines(f'{u},{users[u]},Benign\n' for u in benign[t])
        fw.writelines(f'{u},{users[u]},Fraudulent\n' for u in fraudulent[t])

## 2. Generate toy datasets for coding
Here, toy datasets are generated to speed up model design and debugging.

In [14]:
# select users: number of benign / fraudulent users drawn for each toy split
n_benign = 50000
n_fraudulent = 5000


# output names
toy_out_base = f'{folder}/toy_{k}-core_80_20'

# Sort the user IDs once, outside the loop: the order does not depend on the
# split, and a fixed order makes the consecutive slices below deterministic.
benign_sorted = sorted(benign['all'])
fraudulent_sorted = sorted(fraudulent['all'])

# generate toy train val and test data
toys = {}
curr_benign = 0
curr_fraudulent = 0
for t in ['train', 'val', 'test']:
    # every split takes the next, non-overlapping slice of users
    toy_users = set(benign_sorted[curr_benign:curr_benign+n_benign] + \
                        fraudulent_sorted[curr_fraudulent:curr_fraudulent+n_fraudulent])
    curr_benign += n_benign
    curr_fraudulent += n_fraudulent
    df = dfs['all']
    toys[t] = df[df.reviewerID.isin(toy_users)].copy()
    # restricting to a user subset breaks the k-core property, so re-apply it
    toys[t] = generate_kcore(toys[t], k)
    df_toy_out = toys[t][['reviewerID', 'asin', 'rating']]
    df_toy_out.to_csv(f'{toy_out_base}_{t}.csv', index=False)

Statistics for toy dataset.

In [15]:
# Distinct users and review counts per label in every toy split.
toy_benign = {}
toy_fraudulent = {}
for t in ['train', 'val', 'test']:
    toy_split = toys[t]
    toy_benign[t] = set(toy_split.loc[toy_split['label'] == 'Benign', 'reviewerID'].unique())
    toy_fraudulent[t] = set(toy_split.loc[toy_split['label'] == 'Fraudulent', 'reviewerID'].unique())
    print(t)
    print(f'Benign users: {len(toy_benign[t])}')
    print(f'Fraudulent users: {len(toy_fraudulent[t])}')
    # one row is one review, so counting labels counts reviews
    counts = Counter(toy_split['label'])
    print(f'Reviews from benign users: {counts["Benign"]}')
    print(f'Reviews from fraudulent users: {counts["Fraudulent"]}')

train
Benign users: 11908
Fraudulent users: 826
Reviews from benign users: 79687
Reviews from fraudulent users: 3146
val
Benign users: 12674
Fraudulent users: 897
Reviews from benign users: 90526
Reviews from fraudulent users: 3442
test
Benign users: 11937
Fraudulent users: 833
Reviews from benign users: 78357
Reviews from fraudulent users: 3161


Output user labels

In [16]:
# One label file per toy split: user ID, the user's mean helpfulness (written
# in the `fairness` column) and the label.
toy_user_base = f'{folder}/toy_label'
for t in ['train', 'val', 'test']:
    with open(toy_user_base + f'_{t}.csv', 'w') as fw:
        fw.write('reviewerID,fairness,label\n')
        # Use the users of the toy split itself. Iterating benign[t] /
        # fraudulent[t] here would write the users of the full processed split
        # into the toy label file.
        for u in toy_benign[t]:
            fw.write(f'{u},{users[u]},Benign\n')
        for u in toy_fraudulent[t]:
            fw.write(f'{u},{users[u]},Fraudulent\n')

## 3. Generate dataset with text reviews

Here, we generate versions of the datasets that also contain the review text for the same reviews, which lets us incorporate text embeddings.

In [17]:
# output name
output_base = f'{folder}/processed_{k}-core_80_20'
output_toy_base = f'{folder}/toy_{k}-core_80_20'


def _text_row(d):
    """Format one raw review as a CSV line: reviewerID,asin,rating,reviewText."""
    # Fields are joined by hand without quoting, so commas and line breaks
    # (including bare carriage returns) inside the text are replaced by spaces
    # to keep exactly one review per line.
    text = d['reviewText'].replace('\r', ' ').replace('\n', ' ').replace(',', ' ')
    return ','.join([d['reviewerID'], d['asin'], str((d['overall']-3)/2), text]) + '\n'


# Select reviews by their identity. The DataFrame index is the 1-based position
# of the review in the raw file (assigned in get_df), and the loop below counts
# reviews the same way. Matching on user and product membership alone would
# also pull in reviews that were assigned to a different split whenever both
# the user and the product occur in this one.
review_ids = {t: set(dfs[t].index) for t in ['train', 'val', 'test', 'all']}
toy_review_ids = {t: set(toys[t].index) for t in ['train', 'val', 'test']}

fws = {}
toy_fws = {}
try:
    # processed dataset; UTF-8 is set explicitly so that non-ASCII review text
    # does not depend on the platform's default encoding
    for t in review_ids:
        fws[t] = open(f'{output_base}_{t}_with_text.csv', 'w', encoding='utf-8')
        fws[t].write('reviewerID,asin,rating,reviewText\n')

    # toy dataset
    for t in toy_review_ids:
        toy_fws[t] = open(f'{output_toy_base}_{t}_with_text.csv', 'w', encoding='utf-8')
        toy_fws[t].write('reviewerID,asin,rating,reviewText\n')

    # check all reviews
    i = 0
    for d in parse(filename):
        i += 1
        # report every 5m
        if not i % 5000000:
            print(f'{i} reviews processed!')
        # reviews without helpfulness votes were never part of any dataset
        if not d['helpful'][1]:
            continue
        row = None
        for wanted_ids, handles in ((review_ids, fws), (toy_review_ids, toy_fws)):
            for t, wanted in wanted_ids.items():
                if i in wanted:
                    # format lazily: most reviews belong to no dataset
                    if row is None:
                        row = _text_row(d)
                    handles[t].write(row)
finally:
    # close files even if reading or writing failed part-way through
    for fw in list(fws.values()) + list(toy_fws.values()):
        fw.close()

5000000 reviews processed!
10000000 reviews processed!
15000000 reviews processed!
20000000 reviews processed!
25000000 reviews processed!
30000000 reviews processed!
35000000 reviews processed!
40000000 reviews processed!
