In [1]:
import numpy as np
import pandas as pd
import networkx as nx
import json
import sys

## Step1: build the initial state of the entire user network, as well as the purchase history of the users
Input: log_input/batch_log.json

In [2]:
# Batch log location; the file is parsed as one JSON object per line.
batchlogfile = 'log_input/batch_log.json'
df_batch = pd.read_json(path_or_buf=batchlogfile, lines=True)

In [3]:
df_batch

Unnamed: 0,D,T,amount,event_type,id,id1,id2,timestamp
0,3.0,50.0,,,,,,NaT
1,,,16.83,purchase,1.0,,,2017-06-13 11:33:01
2,,,59.28,purchase,1.0,,,2017-06-13 11:33:01
3,,,,befriend,,1.0,2.0,2017-06-13 11:33:01
4,,,,befriend,,3.0,1.0,2017-06-13 11:33:01
5,,,11.2,purchase,1.0,,,2017-06-13 11:33:01
6,,,,unfriend,,1.0,3.0,2017-06-13 11:33:01


In [4]:
# Keep only the rows of the batch log whose event_type is 'purchase'
is_purchase = df_batch['event_type'] == 'purchase'
df_purchase = df_batch[is_purchase]
df_purchase

Unnamed: 0,D,T,amount,event_type,id,id1,id2,timestamp
1,,,16.83,purchase,1.0,,,2017-06-13 11:33:01
2,,,59.28,purchase,1.0,,,2017-06-13 11:33:01
5,,,11.2,purchase,1.0,,,2017-06-13 11:33:01


In [5]:
df_purchase['amount'].std()

26.284931678308265

In [6]:
np.std(df_purchase['amount'])

21.461556845257576

In [7]:
df_batch.describe()

Unnamed: 0,D,T,amount,id,id1,id2
count,1.0,1.0,3.0,3.0,3.0,3.0
mean,3.0,50.0,29.103333,1.0,1.666667,2.0
std,,,26.284932,0.0,1.154701,1.0
min,3.0,50.0,11.2,1.0,1.0,1.0
25%,3.0,50.0,14.015,1.0,1.0,1.5
50%,3.0,50.0,16.83,1.0,1.0,2.0
75%,3.0,50.0,38.055,1.0,2.0,2.5
max,3.0,50.0,59.28,1.0,3.0,3.0


In [8]:
# Read D and T from the first batch-log row in which D is set
has_params = df_batch['D'].notnull()
df_DT = df_batch.loc[has_params, ['D', 'T']]
D, T = df_DT.values[0]
print(D)
print(T)

3.0
50.0


In [9]:
# Validate D and T: stop at the first violated lower bound
for violated, message in ((D < 1, 'Program terminated because of D < 1'),
                          (T < 2, 'Program terminated because of T < 2')):
    if violated:
        print(message)
        sys.exit()

In [10]:
#for possible_value in set(df['event_type'].tolist()):
#    print(possible_value)

In [11]:
# Purchase history: purchase events, restricted to the columns used downstream
purchase_mask = df_batch['event_type'] == 'purchase'
df_purchase = df_batch.loc[purchase_mask, ['event_type', 'id', 'timestamp', 'amount']]
# To process purchases in timestamp order, uncomment the next line
# df_purchase = df_purchase.sort_values('timestamp')

In [12]:
# Friendship events (befriend / unfriend), restricted to the columns used to build the graph
friend_mask = df_batch['event_type'].isin(['befriend', 'unfriend'])
df_friend = df_batch.loc[friend_mask, ['event_type', 'id1', 'id2', 'timestamp']]
# To process these events in timestamp order, uncomment the next line
# df_friend = df_friend.sort_values('timestamp')

In [13]:
G = nx.Graph()

In [14]:
# Register every user id that appears on either side of a friendship event as a node of G
id1list = df_friend['id1'].tolist()
id2list = df_friend['id2'].tolist()
idlist = set([*id1list, *id2list])
G.add_nodes_from(idlist)

In [15]:
def Add_edges(data):
    """Replay friendship events onto the global graph G, row by row.

    `data` must provide `id1`, `id2` and `event_type` columns. A 'befriend'
    row adds the edge (id1, id2); an 'unfriend' row removes that edge if it
    is currently present. Rows with any other event type are ignored.
    """
    for user_a, user_b, kind in zip(data.id1, data.id2, data.event_type):
        if kind == 'befriend':
            G.add_edge(user_a, user_b)
        elif kind == 'unfriend' and G.has_edge(user_a, user_b):
            G.remove_edge(user_a, user_b)

In [16]:
Add_edges(df_friend)

In [17]:
len(list(G.edges()))

1

In [18]:
G[2.0]

{1.0: {}}

In [19]:
G.number_of_nodes()

3

In [20]:
G.number_of_edges()

1

In [21]:
list(nx.ego_graph(G, 2, D, center=False))

[1.0]

In [22]:
# define a function to calculate the mean and sd for userid's network
def Get_Mean_SD(userid):
    """Return (mean, sd) of the most recent purchases in `userid`'s network.

    The network is every node within D hops of `userid` in the global graph G,
    excluding `userid` itself (center=False). Purchases are looked up in the
    global `df_purchase`; when the network has more than T purchases, only the
    T latest by timestamp are used.

    Returns:
        A tuple (mean, sd), each rounded to two decimals. `sd` is the
        population standard deviation (ddof=0). Both are NaN when `userid`
        is not a node of G or the network has fewer than 2 purchases.
    """
    # A user with purchases but no befriend/unfriend event is not a node of G,
    # and nx.ego_graph raises for unknown nodes. Such a user has no network.
    if userid not in G:
        return np.nan, np.nan

    network_ids = list(nx.ego_graph(G, userid, D, center=False))
    network_purchases = df_purchase.loc[df_purchase['id'].isin(network_ids)]
    if len(network_purchases) < 2:
        return np.nan, np.nan

    if len(network_purchases) > T:
        # Stable sort: purchases sharing a timestamp keep their log order, so
        # "the last T" is well defined even with ties.
        network_purchases = network_purchases.sort_values(
            'timestamp', kind='mergesort').iloc[-int(T):]

    # Work on a plain float array so np.std computes the population sd
    # (ddof=0); pandas' Series.std() defaults to the sample sd (ddof=1).
    amounts = network_purchases['amount'].astype(float).values
    mean = float('{0:.2f}'.format(np.mean(amounts)))
    sd = float('{0:.2f}'.format(np.std(amounts)))
    return mean, sd

In [23]:
Get_Mean_SD(2.0)

(29.1, 21.46)

## Step2: Determine whether a purchase is anomalous 
Input file: log_input/stream_log.json

In [24]:
# Stream log location; the file is parsed as one JSON object per line.
streamlogfile = 'log_input/stream_log.json'
df_stream = pd.read_json(path_or_buf=streamlogfile, lines=True)
# To process the stream in timestamp order, uncomment the next line
#df_stream = df_stream.sort_values('timestamp')

# Output file for flagged purchases; the handle `f` is written to by the
# stream-processing loop and closed afterwards.
flaggedfile = 'log_output/flagged_purchases.json'
f = open(file=flaggedfile, mode='w')

In [25]:
# Determine whether a purchase is anomalous; update purchase history; update social network
# Events are processed in the row order of df_stream.
purchase_columns = ['event_type', 'id', 'timestamp', 'amount']
for i in range(len(df_stream)):
    datai = df_stream.iloc[i]
    event_type = datai['event_type']
    if event_type == 'purchase':
        # Update purchase history. DataFrame.append was removed in pandas 2.0;
        # concatenating the one-row slice df_stream.iloc[[i]] works on every
        # version and keeps the column dtypes.
        # Get_Mean_SD excludes the buyer from their own network (center=False),
        # so recording the purchase before the check does not affect the baseline.
        df_purchase = pd.concat([df_purchase, df_stream.iloc[[i]][purchase_columns]])
        timestamp = str(datai['timestamp'])
        userid = datai['id']
        amount = datai['amount']
        mean, sd = Get_Mean_SD(userid)
        # NaN compares unequal to everything, itself included, so the former
        # `mean != np.nan` was always True. np.isnan is the real check for
        # "the network has no usable baseline".
        if not np.isnan(mean):
            mean_3sd = mean + (3*sd)
            if amount > mean_3sd:
                f.write('{{"event_type":"{0:s}", "timestamp":"{1:s}", "id": "{2:.0f}", "amount": "{3:.2f}", "mean": "{4:.2f}", "sd": "{5:.2f}"}}\n'.format(event_type, timestamp, userid, amount, mean, sd))
    # update social network
    elif event_type == 'befriend':
        id1 = datai['id1']
        id2 = datai['id2']
        G.add_edge(id1, id2)
    elif event_type == 'unfriend':
        id1 = datai['id1']
        id2 = datai['id2']
        # Removing a non-existent edge would raise, so only remove edges that exist.
        if G.has_edge(id1, id2):
            G.remove_edge(id1, id2)
    


In [26]:
f.close() 

In [27]:
df_stream.describe()

Unnamed: 0,amount,id
count,1.0,1.0
mean,1601.83,2.0
std,,
min,1601.83,2.0
25%,1601.83,2.0
50%,1601.83,2.0
75%,1601.83,2.0
max,1601.83,2.0


In [28]:
df_stream.head()

Unnamed: 0,amount,event_type,id,timestamp
0,1601.83,purchase,2,2017-06-13 11:33:02
