In [None]:
from google.colab import drive
drive.mount('/content/drive')

path = 'drive/MyDrive/CSE6240' 
%cd $path

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/MyDrive/CSE6240


In [None]:
import json
import pandas as pd
import os
import networkx as nx
import numpy as np
import subprocess
from collections import defaultdict
from collections import Counter
from sklearn.metrics import *
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils import shuffle


# Hyperparameters of the REV2 update rules (alpha/beta/gamma weights)
alpha1, alpha2 = 0, 1
beta1, beta2 = 1, 2
gamma1, gamma2, gamma3 = 0, 2, 1

# Iteration cap for the REV2 loop and the k used in the k-core file names
max_epochs = 1000
k = 3

# Input files (the toy subset is active; the full-data paths are kept for reference)
# ori_data = f'dataset/processed_{k}-core_80_20_all.csv'
toy_data = f'dataset/toy_{k}-core_80_20_all.csv'
# ori_label_data = f'dataset/user_label_all.csv'
toy_label_data = f'dataset/toy_label_all.csv'

# Ground-truth user ids, split by the label stored in the third CSV column
benign, fraudulent = [], []

with open(toy_label_data, 'r') as label_file:
    next(label_file, None)  # skip the header row
    for row in label_file:
        fields = row.rstrip('\n').split(',')
        bucket = benign if fields[2] == 'Benign' else fraudulent
        bucket.append(fields[0])

# Every data row lands in exactly one of the two lists
total_users = len(benign) + len(fraudulent)
print(total_users)

# Convert to networkx
# Prefix ids with 'u' (reviewers) / 'p' (products) so the node type can be
# read off the first character of the node name.
df = pd.read_csv(toy_data).assign(
    reviewerID=lambda frame: 'u' + frame['reviewerID'].astype(str),
    asin=lambda frame: 'p' + frame['asin'].astype(str),
)
X = df[['reviewerID', 'asin', 'rating']]
print(X)

# Directed reviewer -> product graph; the remaining column ('rating') becomes an edge attribute
G = nx.from_pandas_edgelist(
    X,
    source='reviewerID',
    target='asin',
    edge_attr=True,
    create_using=nx.DiGraph(),
)

# Views reused by the later cells
nodes = G.nodes()
edges = G.edges(data=True)
print(f"Amazon Network has {len(nodes)} nodes and {len(edges)} edges")

39075
            reviewerID         asin  rating
0      uA13VEY6BF0P4BP  pB00HOIH342     1.0
1      uA17MU1C2AW54W7  p0002007770     1.0
2      uA195TYXXR7OH6G  p0316014508    -1.0
3      uA15PN0C30P5SFL  pB000MMK448     1.0
4      uA12JO0QYS3AO6B  pB00BB0ZTJA     0.5
...                ...          ...     ...
82828  uA14T55DC1KL7AU  pB0009V1YR8     1.0
82829  uA121X1GOQV01DW  p000224053X     1.0
82830  uA1A404IZK06OKH  pB0037ZG3DS     0.5
82831  uA16NRZ5OV48LKP  pB000087QVV     1.0
82832  uA175D4TARP540W  pB007LCPNSO     0.5

[82833 rows x 3 columns]
Amazon Network has 31259 nodes and 82833 edges


In [None]:
# Split node names by their type prefix and give each type a dense integer index
user_names = [name for name in nodes if name.startswith('u')]
product_names = [name for name in nodes if name.startswith('p')]
num_users = len(user_names)
num_products = len(product_names)
user_map = {name: index for index, name in enumerate(user_names)}
product_map = {name: index for index, name in enumerate(product_names)}

# Initial scores: every user starts fully fair, every product fully good
for node in nodes:
    score_name = "fairness" if node.startswith("u") else "goodness"
    G.nodes[node][score_name] = 1

# ... and every rating (edge) starts fully fair
for reviewer, product, _ in edges:
    G[reviewer][product]["fairness"] = 1

In [None]:
# Update product goodness scores
def Updating_goodness_of_product():
    """Recompute the "goodness" attribute of every product node in G.

    For a product with at least one incoming rating the new value is
    (sum(fairness * rating) + beta1 * median) / (count + beta1), where the
    median is taken over the goodness of all products before this update;
    products without incoming ratings get 0. The value is clipped to [-1, 1].

    Returns:
        The sum of absolute goodness changes over all product nodes.
    """
    print('Updating goodness of product')

    # Product nodes are the ones whose name starts with 'p'
    product_nodes = [name for name in nodes if name[0] == 'p']

    # Median of the goodness values before any of them is overwritten
    prior_goodness = np.median(
        [G.nodes[name]["goodness"] for name in product_nodes])

    total_change = 0
    for product in product_nodes:
        incoming = G.in_edges(product, data=True)
        rating_count = len(incoming)

        if rating_count > 0.0:
            weighted_sum = np.sum([attrs["fairness"] * attrs["rating"]
                                   for _, _, attrs in incoming])
            smoothed = (weighted_sum + beta1 * prior_goodness) / \
                (rating_count + beta1)
        else:
            smoothed = 0.0

        new_goodness = np.clip(smoothed, -1, 1)
        total_change += abs(G.nodes[product]["goodness"] - new_goodness)
        G.nodes[product]["goodness"] = new_goodness

    return total_change

In [None]:
# Update rating fairness scores
def Updating_fairness_of_ratings():
    """Recompute the "fairness" attribute of every edge (rating) in G.

    Each rating's fairness is the gamma1/gamma2-weighted average of the
    reviewer's fairness and the closeness of the rating to the product's
    goodness (1 - |rating - goodness| / 2), clipped to [0, 1].

    Returns:
        The sum of absolute fairness changes over all edges.
    """
    print("Updating fairness of ratings")

    weight_total = gamma1 + gamma2
    total_change = 0

    for reviewer, product, attrs in edges:
        reviewer_fairness = G.nodes[reviewer]["fairness"]  # F(u)
        deviation = abs(attrs["rating"] - G.nodes[product]["goodness"])
        agreement = 1 - (deviation / 2.0)

        blended = (gamma1 * reviewer_fairness +
                   gamma2 * agreement) / weight_total
        new_fairness = np.clip(blended, 0, 1)

        total_change += abs(attrs["fairness"] - new_fairness)
        G.adj[reviewer][product]["fairness"] = new_fairness

    return total_change

In [None]:
# Update user fairness scores
def Updating_fairness_of_users():
    """Recompute the "fairness" attribute of every user node in G.

    A user's new fairness is
    (sum of the fairness of their ratings + alpha1 * median) / (count + alpha1),
    clipped to [0, 1], where the median is taken over the fairness of all
    users before this update (a mean could be used instead).

    Returns:
        The sum of absolute fairness changes over all user nodes.
    """
    print('updating fairness of users')

    # User nodes are the ones whose name starts with 'u'
    user_nodes = [name for name in nodes if name[0] == 'u']

    # Median of the fairness values before any of them is overwritten
    prior_fairness = np.median(
        [G.nodes[name]["fairness"] for name in user_nodes])

    total_change = 0

    for user in user_nodes:
        given = G.out_edges(user, data=True)
        fairness_sum = np.sum([attrs["fairness"] for _, _, attrs in given])

        smoothed = (fairness_sum + alpha1 * prior_fairness) / \
            (len(given) + alpha1)
        new_fairness = np.clip(smoothed, 0, 1)

        total_change += abs(G.nodes[user]["fairness"] - new_fairness)
        G.nodes[user]["fairness"] = new_fairness

    return total_change

In [None]:
# REV2 algorithm
# Alternate the three updates until every total change drops below 0.01,
# a change becomes NaN, or max_epochs is reached.
du = dp = dr = 0

for epoch in range(max_epochs):
    print('-----------------')
    print("Epoch number %d with du = %f, dp = %f, dr = %f, for (%d,%d,%d,%d,%d,%d,%d)" % (
        epoch, du, dp, dr, alpha1, alpha2, beta1, beta2, gamma1, gamma2, gamma3))

    # Stop as soon as the previous epoch produced a NaN change
    if any(np.isnan(change) for change in (du, dp, dr)):
        break

    # Order matters: products first, then ratings, then users
    dp = Updating_goodness_of_product()
    dr = Updating_fairness_of_ratings()
    du = Updating_fairness_of_users()

    # Converged once all three total changes are small
    if all(change < 0.01 for change in (du, dp, dr)):
        break

-----------------
Epoch number 0 with du = 0.000000, dp = 0.000000, dr = 0.000000, for (0,1,1,2,0,2,1)
Updating goodness of product
Updating fairness of ratings
updating fairness of users
-----------------
Epoch number 1 with du = 2170.649781, dp = 5271.014792, dr = 12815.179681, for (0,1,1,2,0,2,1)
Updating goodness of product
Updating fairness of ratings
updating fairness of users
-----------------
Epoch number 2 with du = 268.257641, dp = 1453.616489, dr = 3056.051541, for (0,1,1,2,0,2,1)
Updating goodness of product
Updating fairness of ratings
updating fairness of users
-----------------
Epoch number 3 with du = 115.329273, dp = 576.612030, dr = 1199.806461, for (0,1,1,2,0,2,1)
Updating goodness of product
Updating fairness of ratings
updating fairness of users
-----------------
Epoch number 4 with du = 55.682056, dp = 277.312045, dr = 569.354296, for (0,1,1,2,0,2,1)
Updating goodness of product
Updating fairness of ratings
updating fairness of users
-----------------
Epoch number

In [None]:
# Make a directory to save results (created only when nothing exists at that path yet)
Path = './rev2_results/'
results_dir_missing = not os.path.exists(Path)
if results_dir_missing:
    os.mkdir(Path)

# Show the working directory contents as the cell output
os.listdir('./')

['HW1',
 'HW3',
 'HW5',
 'dataset',
 '2_Review_Embedding.ipynb',
 '4_Baseline.ipynb',
 'rev2_results']

In [None]:
# Make tables to see the score of each user and product

# Median user fairness after convergence (reused when ranking users)
currentfvals = [G.nodes[node]["fairness"] for node in nodes if node[0] == "u"]
median_fvals = np.median(currentfvals)

# One row per user: node name and final fairness
user_fairness_df = pd.DataFrame(
    [(name, attrs['fairness'])
     for name, attrs in G.nodes(data=True) if name.startswith('u')],
    columns=['source', 'fairness'],
)

# One row per product: node name and final goodness
product_goodness_df = pd.DataFrame(
    [(name, attrs['goodness'])
     for name, attrs in G.nodes(data=True) if name.startswith('p')],
    columns=['target', 'goodness'],
)

# One row per edge with its attributes
edge_fairness_df = nx.to_pandas_edgelist(G)

In [None]:
# Per-user ranking rows:
#   all_node_vals  -> [node name, weighted score, fairness, out-degree]
#   fair_node_vals -> [user id without the 'u' prefix, weighted score]
# The weighted score is (fairness - median fairness) * log(out-degree + 1).
all_node_vals = []
fair_node_vals = []

for user in (name for name in nodes if name[0] == "u"):
    fairness = G.nodes[user]["fairness"]
    degree = G.out_degree(user)
    weighted_score = (fairness - median_fvals) * np.log(degree + 1)

    all_node_vals.append([user, weighted_score, fairness, degree])
    fair_node_vals.append([user[1:], weighted_score])

# Highest score first; ties broken by higher fairness, then by lower out-degree
all_node_vals_sorted = sorted(
    all_node_vals,
    key=lambda row: (float(row[1]), float(row[2]), -float(row[3])),
    reverse=True,
)
fair_node_vals_sorted = sorted(
    fair_node_vals,
    key=lambda row: float(row[1]),
    reverse=True,
)

In [None]:
# Write the score results to two separate csv files
def _write_csv_rows(csv_path, rows):
    """Write every row as one comma-separated line (values via str()).

    The file is opened in a `with` block so the handle is closed even if a
    write fails part-way through.
    """
    with open(csv_path, "w") as out_file:
        for row in rows:
            out_file.write(",".join(str(value) for value in row) + "\n")


# Both file names carry the hyperparameter setting that produced the scores
param_tag = f"{alpha1}-{alpha2}-{beta1}-{beta2}-{gamma1}-{gamma2}-{gamma3}"

# All ranked users are stored, whether or not ground truth exists for them.
# Columns: node name, weighted score, fairness, out-degree
_write_csv_rows(Path + f"fng-sorted-users-{param_tag}.csv", all_node_vals_sorted)
# Columns: user id (no 'u' prefix), weighted score
_write_csv_rows(Path + f"only_fairness-sorted-users-{param_tag}.csv", fair_node_vals_sorted)

print("Number of ground truth bad users = %d, good users = %d" % (len(fraudulent), len(benign)))

Number of ground truth bad users = 2556, good users = 36519


In [None]:
# Calculate the average precision score for fraudulent and benign user prediction (Unsupervised)
#
# The ranking file is sorted from the highest to the lowest score, so the
# first K rows are the predicted benign users and the last K rows are the
# predicted fraudulent users. Precision@K is averaged over K = 1..TOTAL-1.

fname = "./rev2_results/fng-sorted-users-%d-%d-%d-%d-%d-%d-%d.csv" % (alpha1, alpha2, beta1, beta2, gamma1, gamma2, gamma3)
TOTAL = 100
# Additive smoothing used in both precision ratios. Note that a window with
# no labelled user at all therefore scores 1.0.
SMOOTHING = 0.001

# Sets give constant-time membership tests (the label lists hold tens of thousands of ids)
benign_ids = set(benign)
fraudulent_ids = set(fraudulent)

# Read the ranking once; the first column is the node name, whose 'u' prefix
# is dropped to match the ids in the label lists.
with open(fname, "r") as ranking_file:
    ranked_ids = [line.strip().split(',')[0][1:]
                  for line in ranking_file if line.strip()]

# The "bottom K" window must be measured against the number of ranked rows.
# Using the number of label rows instead (which is larger than the number of
# ranked users) leaves the window empty, and the smoothed ratio then reports
# a meaningless precision of 1.0.
NLINES = len(ranked_ids)

# Ground-truth label per rank position (benign is checked first; None = no label)
ranked_labels = [
    'benign' if user_id in benign_ids
    else 'fraudulent' if user_id in fraudulent_ids
    else None
    for user_id in ranked_ids
]

bottom_precs = []
top_precs = []
X = []

for K in range(1, TOTAL):
    top_labels = ranked_labels[:K]
    # A row that already belongs to the top window is never counted again
    # in the bottom window (relevant only when the ranking has fewer than 2*K rows).
    bottom_labels = ranked_labels[max(K, NLINES - K):]

    c11 = top_labels.count('benign')
    c12 = top_labels.count('fraudulent')
    c21 = bottom_labels.count('benign')
    c22 = bottom_labels.count('fraudulent')

    X.append(c21 + c22 + 1)
    bottom_precs.append((c22 + SMOOTHING) * 1.0 / (c21 + c22 + SMOOTHING))
    top_precs.append((c11 + SMOOTHING) * 1.0 / (c11 + c12 + SMOOTHING))

print("Mean average precision:\nFor fraudulent user prediction = %f\nFor benign user prediction = %f" % (np.mean(bottom_precs), np.mean(top_precs)))

Mean average precision:
For fraudulent user prediction = 1.000000
For benign user prediction = 1.000000


In [None]:
# Calculate the AUC score for fraudulent user prediction (Supervised)

from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import recall_score

# Fixed seed so the shuffle and the random forests give the same numbers on every run
RANDOM_SEED = 42

# Per-user features read from the ranking file: column 1 (weighted score)
# and column 2 (fairness); "nan" entries are replaced by 0.
scores = defaultdict(list)
scores_fname = "./rev2_results/fng-sorted-users-%d-%d-%d-%d-%d-%d-%d.csv" % (alpha1, alpha2, beta1, beta2, gamma1, gamma2, gamma3)
with open(scores_fname, "r") as scores_file:
    for l in scores_file:
        l = l.strip().split(",")
        scores[l[0]].extend(0.0 if value == "nan" else float(value)
                            for value in l[1:3])

# Build the labelled data set: 0 = benign, 1 = fraudulent.
# Users without a row in the ranking file are skipped; .get() is used so the
# lookup does not insert empty entries into the defaultdict.
X = []
Y = []
for label, user_ids in ((0, benign), (1, fraudulent)):
    for user_id in user_ids:
        features = scores.get('u' + user_id)
        if features:
            Y.append(label)
            X.append(features)

X = np.array(X)
Y = np.array(Y)
X, Y = shuffle(X, Y, random_state=RANDOM_SEED)
skf = StratifiedKFold(n_splits=10)
accuracy_scores = []
f1_scores = []
precision_scores = []
aucscores = []
for train, test in skf.split(X, Y):
    train_X, train_Y = X[train], Y[train]
    test_X, test_Y = X[test], Y[test]

    clf = RandomForestClassifier(n_estimators=500, random_state=RANDOM_SEED)
    clf.fit(train_X, train_Y)
    preds = clf.predict(test_X)
    accuracy_scores.append(accuracy_score(y_true=test_Y, y_pred=preds))
    f1_scores.append(f1_score(y_true=test_Y, y_pred=preds))
    precision_scores.append(precision_score(y_true=test_Y, y_pred=preds))
    try:
        # Probability of the fraudulent class (column 1) is the ranking score for the ROC AUC
        pred_Y = clf.predict_proba(test_X)
        aucscores.append(roc_auc_score(test_Y, pred_Y[:, 1]))
    except (ValueError, IndexError) as err:
        # AUC is undefined when a fold lacks one of the classes. Record NaN so
        # the per-fold lists stay aligned, instead of silently skipping the fold.
        print("AUC could not be computed for this fold:", err)
        aucscores.append(float("nan"))
    # Per-fold progress: accuracy and AUC of the fold just evaluated
    print(accuracy_scores[-1], aucscores[-1])

print("Accuracy scores", accuracy_scores, np.mean(accuracy_scores))
# nanmean: folds whose AUC was undefined do not take part in the average
print("AUC scores", aucscores, np.nanmean(aucscores))
print("F1 scores", f1_scores, np.mean(f1_scores))
print("Precision scores", precision_scores, np.mean(precision_scores))

[] 0.5803314011714363
[] 0.5933760229836221
[] 0.596734545233832
[] 0.6010996125560176
[] 0.5842804775654808
[] 0.5652556777456944
[] 0.5863641948762057
[] 0.6482357518789293
[] 0.6325149336843171
[] 0.6070618608889339
Accuracy scores [0.8971742543171115, 0.8995290423861853, 0.9097331240188383, 0.9128728414442701, 0.8986645718774549, 0.9112333071484682, 0.9018067556952082, 0.9033778476040848, 0.9120188531029065, 0.8994501178318932] 0.9045860715426421
AUC scores [0.5803314011714363, 0.5933760229836221, 0.596734545233832, 0.6010996125560176, 0.5842804775654808, 0.5652556777456944, 0.5863641948762057, 0.6482357518789293, 0.6325149336843171, 0.6070618608889339] 0.5995254478584469
F1 scores [0.15483870967741936, 0.058823529411764705, 0.14814814814814814, 0.21276595744680854, 0.13422818791946306, 0.18705035971223025, 0.07407407407407407, 0.11510791366906475, 0.2328767123287671, 0.15789473684210525] 0.14758083292298455
Precision scores [0.16666666666666666, 0.07547169811320754, 0.192307692307