In [10]:
import pandas as pd
import numpy as np
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score, precision_score, f1_score, accuracy_score
import json

from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.ensemble import RandomForestClassifier,AdaBoostRegressor
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import GridSearchCV
from sklearn.inspection import plot_partial_dependence
from joblib import dump, load
import pickle
import matplotlib.pyplot as plt

In [11]:
def convertunix(columns,df):
    '''columns: list of column headers to convert
       df: name of df'''
    for val in columns:
        df[val] = pd.to_datetime(df[val],unit='s')
    return df
def get_num_tickets(val):
    ticket_quant = []
    for i in range(len(val)):
        ticket_quant.append(val[i]['quantity_total'])
    return sum(ticket_quant)
def get_num_tiers(val):
    ticket_quant = []
    for i in range(len(val)):
        ticket_quant.append(val[i]['quantity_total'])
    return len(ticket_quant)
def get_total_value(val):
    ticket_costs = []
    for i in range(len(val)):
        ticket_costs.append(val[i]['cost'])
    ticket_quant = []
    for i in range(len(val)):
        ticket_quant.append(val[i]['quantity_total'])
    total = 0
    for cost, quant in zip(ticket_costs, ticket_quant):
        total += cost*quant
    return total
def get_max_ticket_cost(val):
    ticket_costs = []
    for i in range(len(val)):
        ticket_costs.append(val[i]['cost'])
    if len(ticket_costs) == 0:
        return 0
    else:
        return max(ticket_costs)
def get_min_ticket_cost(val):
    ticket_costs = []
    for i in range(len(val)):
        ticket_costs.append(val[i]['cost'])
    if len(ticket_costs) == 0:
        return 0
    else:
        return min(ticket_costs)

In [75]:
def clean_df(path_to_file):
    df=pd.read_json(path_to_file)
    df['tickets_total'] = df['ticket_types'].apply(get_num_tickets)
    df['tiers'] = df['ticket_types'].apply(get_num_tiers)
    df['max_cost'] = df['ticket_types'].apply(get_max_ticket_cost)
    df['min_cost'] = df['ticket_types'].apply(get_min_ticket_cost)
    df['total'] = df['ticket_types'].apply(get_total_value)
    df['intl_trans'] = df['country'] != df['venue_country']
    df['type_one_user'] = df['user_type'] == 1
    df['org_desc_exists'] = [0 if len(df['org_desc'][i])==0 else 1 for i in range(len(df))]
    df['org_name_exists'] = [0 if len(df['org_name'][i])==0 else 1 for i in range(len(df))]
    df['previous_payout_count'] = [len(df.previous_payouts[i]) for i in range(len(df))]
    df['org_facebook'].fillna(value=0, inplace=True)
    df['org_twitter'].fillna(value=0, inplace=True)
    df['org_facebook_exists'] = [0 if df['org_facebook'][i]==0 else 1 for i in range(len(df))]
    df['org_twitter_exists'] = [0 if df['org_twitter'][i]==0 else 1 for i in range(len(df))]
    convert = ['approx_payout_date','event_created','event_published','event_start','event_end','user_created']
    convertunix(convert,df)
    emaillist = ['ymail.com','lidf.co.uk','live.fr','rocketmail.com','yahoo.fr']
    df.loc[~df["email_domain"].isin(emaillist), "email_domain"] = 0
    df.loc[df["email_domain"].isin(emaillist), "email_domain"] = 1
    countrylist = ['MA','VN','A1','PK','PH','ID','NG','CI','CZ','DZ']
    df.loc[~df["country"].isin(countrylist), "country"] = 0
    df.loc[df["country"].isin(countrylist), "country"] = 1
    df.drop(['object_id', 'name','name_length','num_order','num_payouts','org_facebook','org_twitter','payee_name','payout_type','previous_payouts','previous_payouts','org_name','org_desc','listed','fb_published','event_published','event_end','event_start','event_created','has_logo','has_header','currency','description','approx_payout_date','delivery_method','body_length','channels','gts','sale_duration', 'sale_duration2', 'ticket_types', 'user_created', 'user_type', 'venue_address', 'venue_country', 'venue_latitude', 'venue_longitude', 'venue_name', 'venue_state', 'show_map'],axis=1,inplace=True)
    return df

In [15]:
def clean_new_data(df):
    #df = pd.read_json(path_to_file)
    #df['tickets_total'] = df['ticket_types'].apply(get_num_tickets)
    df['tickets_total'] = sum(df.ticket_types[i]['quantity_total'] for i in range(len(df.ticket_types)))
    #df['tiers'] = df['ticket_types'].apply(get_num_tiers)
    df['tiers'] = len(df.ticket_types)
    #df['max_cost'] = df['ticket_types'].apply(get_max_ticket_cost)
    df['max_cost'] = max(df.ticket_types[i]['cost'] for i in range(len(df.ticket_types)))
    #df['min_cost'] = df['ticket_types'].apply(get_min_ticket_cost)
    df['min_cost'] = max(df.ticket_types[i]['cost'] for i in range(len(df.ticket_types)))
    #df['total'] = df['ticket_types'].apply(get_total_value)
    df['total'] = sum((df.ticket_types[i]['cost']*df.ticket_types[i]['quantity_total'])for i in range(len(df.ticket_types)))    
    df['intl_trans'] = df['country'] != df['venue_country']
    df['type_one_user'] = df['user_type'] == 1
    df['org_desc_exists'] = [0 if len(df['org_desc'][i])==0 else 1 for i in range(len(df))]
    df['org_name_exists'] = [0 if len(df['org_name'][i])==0 else 1 for i in range(len(df))]
    df['previous_payout_count'] = [len(df.previous_payouts[i]) for i in range(len(df))]
    df['org_facebook'].fillna(value=0, inplace=True)
    df['org_twitter'].fillna(value=0, inplace=True)
    df['org_facebook_exists'] = [0 if df['org_facebook'][i]==0 else 1 for i in range(len(df))]
    df['org_twitter_exists'] = [0 if df['org_twitter'][i]==0 else 1 for i in range(len(df))]
    convert = ['approx_payout_date','event_created','event_published','event_start','event_end','user_created']
    convertunix(convert,df)
    emaillist = ['ymail.com','lidf.co.uk','live.fr','rocketmail.com','yahoo.fr']
    df.loc[~df["email_domain"].isin(emaillist), "email_domain"] = 0
    df.loc[df["email_domain"].isin(emaillist), "email_domain"] = 1
    countrylist = ['MA','VN','A1','PK','PH','ID','NG','CI','CZ','DZ']
    df.loc[~df["country"].isin(countrylist), "country"] = 0
    df.loc[df["country"].isin(countrylist), "country"] = 1
    df.drop(['object_id', 'name','name_length','num_order','num_payouts','org_facebook','org_twitter','payee_name','payout_type','previous_payouts','previous_payouts','org_name','org_desc','listed','fb_published','event_published','event_end','event_start','event_created','has_logo','has_header','currency','description','approx_payout_date','delivery_method','body_length','channels','gts','sale_duration', 'sale_duration2', 'ticket_types', 'user_created', 'user_type', 'venue_address', 'venue_country', 'venue_latitude', 'venue_longitude', 'venue_name', 'venue_state', 'show_map'],axis=1,inplace=True)
    column_list = ['country', 'email_domain', 'has_analytics', 'user_age', 'tickets_total',
       'tiers', 'max_cost', 'min_cost', 'total', 'intl_trans', 'type_one_user',
       'org_desc_exists', 'org_name_exists', 'previous_payout_count',
       'org_facebook_exists', 'org_twitter_exists']
    
    return df

In [165]:
#df = clean_df('../data/data.json')
df = pd.read_json('../data/data.json')

In [141]:
ex = pd.read_json('../example.json')

In [193]:
fraud_ex = df.drop('acct_type', axis = 1)

In [204]:
sanity2 = fraud_ex.iloc(0)

In [175]:
fraud_ex.org_desc

''

In [18]:
def predict_fraud(ex, model):
    ex = pd.read_json('../example.json')
    clean_ex = clean_new_data(ex)
    pred = model.predict(clean_ex)
    return pred[0]

In [19]:
pkl_filename = "../data/rf_taylor_model.pkl"

with open(pkl_filename, 'rb') as file:
    model = pickle.load(file)

In [183]:
fraud_ex.shape[0]

50

In [187]:
fraud_ex.T.shape[0]

50

In [170]:
predict_fraud(fraud_ex, model)

IndexError: string index out of range

In [143]:
clean_ex = clean_new_data2(ex)

In [144]:
clean_ex

Unnamed: 0,email_domain,user_age,country,has_analytics,tickets_total,tiers,max_cost,min_cost,total,intl_trans,type_one_user,org_desc_exists,org_name_exists,previous_payout_count,org_facebook_exists,org_twitter_exists
0,0,0,0,0,125,1,50.0,50.0,6250.0,False,True,0,1,9,1,1


In [145]:
df

Unnamed: 0,acct_type,country,email_domain,has_analytics,user_age,tickets_total,tiers,max_cost,min_cost,total,intl_trans,type_one_user,org_desc_exists,org_name_exists,previous_payout_count,org_facebook_exists,org_twitter_exists
0,fraudster_event,0,0,0,36,920,3,550.00,25.00,36000.00,False,True,0,1,0,0,0
1,premium,0,0,0,149,100,2,35.00,35.00,3500.00,False,False,1,1,49,0,1
2,premium,0,0,0,214,48,1,93.51,93.51,4488.48,False,False,1,1,36,0,0
3,premium,0,0,0,889,30000,6,25.00,5.00,410000.00,True,False,0,0,21,0,0
4,premium,0,0,0,35,264,8,200.00,36.00,28596.00,False,False,1,1,49,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14332,fraudster_event,0,0,0,0,400,1,45.00,45.00,18000.00,False,True,0,0,0,0,0
14333,premium,0,0,0,1374,3256,8,90.00,0.00,187600.00,False,False,1,1,5,0,0
14334,premium,0,0,0,84,148,5,150.00,5.00,3450.00,True,False,0,1,1,0,0
14335,tos_warn,0,0,0,812,100,2,79.33,79.33,7933.00,False,False,0,0,128,0,0


In [112]:
df_pkl = pd.read_pickle('../data/pickled_df.pkl')

In [123]:
df_pkl.columns

Index(['country', 'email_domain', 'has_analytics', 'user_age', 'tickets_total',
       'tiers', 'max_cost', 'min_cost', 'total', 'intl_trans', 'type_one_user',
       'org_desc_exists', 'org_name_exists', 'previous_payout_count',
       'org_facebook_exists', 'org_twitter_exists', 'fraud'],
      dtype='object')

In [128]:
clean_ex.columns

Index(['email_domain', 'user_age', 'country', 'has_analytics', 'tickets_total',
       'tiers', 'max_cost', 'min_cost', 'total', 'intl_trans', 'type_one_user',
       'org_desc_exists', 'org_name_exists', 'previous_payout_count',
       'org_facebook_exists', 'org_twitter_exists'],
      dtype='object')

In [20]:
pkl_filename = "../data/rf_taylor_model.pkl"

with open(pkl_filename, 'rb') as file:
    model = pickle.load(file)

In [21]:
model

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [146]:
pred = model.predict(clean_ex)

In [148]:
pred

array([False])

In [22]:
import requests
import json
def get_page():
    """Revieves data from the specified url for prediction
    Returns: data (PandasDataFrame): 1 row DataFrame from url"""
    d = json.loads(requests.get('http://galvanize-case-study-on-fraud.herokuapp.com/data_point').text)
    data = pd.DataFrame(columns=list(d.keys()))
    data.loc[0] = list(d.values())
    return(data)

In [23]:
get_page

<function __main__.get_page()>

In [24]:
x = get_page()

In [33]:
clean_new_data(x)

TypeError: list indices must be integers or slices, not str

In [29]:
def predict_fraud(ex, model):
    clean_ex = clean_new_data(ex)
    pred = model.predict_proba(clean_ex)
    return pred[0]

In [30]:
predict_fraud(x, model)

TypeError: list indices must be integers or slices, not str

In [34]:
x

Unnamed: 0,approx_payout_date,body_length,channels,country,currency,delivery_method,description,email_domain,event_created,event_end,...,ticket_types,user_age,user_created,user_type,venue_address,venue_country,venue_latitude,venue_longitude,venue_name,venue_state
0,1356030000,622,8,AU,AUD,0.0,"<p><img style=""margin: 0px 0px 0px 25px; float...",ilovecomputers.com.au,1354413288,1355598000,...,"[{'availability': 1, 'cost': 8.41, 'event_id':...",137,1342568354,3,162 Brisbane Rd,AU,-26.690899,153.114886,iLove Computers Training Room,QLD


In [41]:
#x['tickets_total'] = sum(x.ticket_types[]['quantity_total'] for i in range(len(x.ticket_types)))
x['tickets_total'] = x.ticket_types

In [47]:
x['tickets_total'][0][0]['quantity_total']

24

In [48]:
clean_df(x)

ValueError: Invalid file path or buffer object type: <class 'pandas.core.frame.DataFrame'>

In [52]:
def clean_new_data2(df):
    #df=pd.read_json(path_to_file)
    df['tickets_total'] = df['ticket_types'].apply(get_num_tickets)
    df['tiers'] = df['ticket_types'].apply(get_num_tiers)
    df['max_cost'] = df['ticket_types'].apply(get_max_ticket_cost)
    df['min_cost'] = df['ticket_types'].apply(get_min_ticket_cost)
    df['total'] = df['ticket_types'].apply(get_total_value)
    df['intl_trans'] = df['country'] != df['venue_country']
    df['type_one_user'] = df['user_type'] == 1
    df['org_desc_exists'] = [0 if len(df['org_desc'][i])==0 else 1 for i in range(len(df))]
    df['org_name_exists'] = [0 if len(df['org_name'][i])==0 else 1 for i in range(len(df))]
    df['previous_payout_count'] = [len(df.previous_payouts[i]) for i in range(len(df))]
    df['org_facebook'].fillna(value=0, inplace=True)
    df['org_twitter'].fillna(value=0, inplace=True)
    df['org_facebook_exists'] = [0 if df['org_facebook'][i]==0 else 1 for i in range(len(df))]
    df['org_twitter_exists'] = [0 if df['org_twitter'][i]==0 else 1 for i in range(len(df))]
    convert = ['approx_payout_date','event_created','event_published','event_start','event_end','user_created']
    convertunix(convert,df)
    emaillist = ['ymail.com','lidf.co.uk','live.fr','rocketmail.com','yahoo.fr']
    df.loc[~df["email_domain"].isin(emaillist), "email_domain"] = 0
    df.loc[df["email_domain"].isin(emaillist), "email_domain"] = 1
    countrylist = ['MA','VN','A1','PK','PH','ID','NG','CI','CZ','DZ']
    df.loc[~df["country"].isin(countrylist), "country"] = 0
    df.loc[df["country"].isin(countrylist), "country"] = 1
    df.drop(['object_id', 'name','name_length','num_order','num_payouts','org_facebook','org_twitter','payee_name','payout_type','previous_payouts','previous_payouts','org_name','org_desc','listed','fb_published','event_published','event_end','event_start','event_created','has_logo','has_header','currency','description','approx_payout_date','delivery_method','body_length','channels','gts','sale_duration', 'sale_duration2', 'ticket_types', 'user_created', 'user_type', 'venue_address', 'venue_country', 'venue_latitude', 'venue_longitude', 'venue_name', 'venue_state', 'show_map'],axis=1,inplace=True)
    return df

In [53]:
clean_new_data2(x)

Unnamed: 0,country,email_domain,has_analytics,user_age,tickets_total,tiers,max_cost,min_cost,total,intl_trans,type_one_user,org_desc_exists,org_name_exists,previous_payout_count,org_facebook_exists,org_twitter_exists
0,0,0,0,137,24,1,8.41,8.41,201.84,False,False,1,1,48,0,0


In [64]:
def predict_fraud(ex, model):
    clean_ex = clean_new_data2(ex)
    pred = model.predict_proba(clean_ex)
    return pred[0]

In [65]:
model

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [78]:
x = get_page()

In [79]:
x2 = get_page()

In [80]:
x2

Unnamed: 0,approx_payout_date,body_length,channels,country,currency,delivery_method,description,email_domain,event_created,event_end,...,ticket_types,user_age,user_created,user_type,venue_address,venue_country,venue_latitude,venue_longitude,venue_name,venue_state
0,1369108800,0,11,US,USD,0.0,,msn.com,1367426861,1368676800,...,"[{'availability': 1, 'cost': 20.0, 'event_id':...",817,1296863450,1,,,,,,


In [72]:
predict_fraud(x2, model)

array([1., 0.])

In [68]:
x

Unnamed: 0,country,email_domain,has_analytics,user_age,tickets_total,tiers,max_cost,min_cost,total,intl_trans,type_one_user,org_desc_exists,org_name_exists,previous_payout_count,org_facebook_exists,org_twitter_exists
0,0,0,0,441,62,4,30.0,0.0,250.0,False,True,0,1,9,0,1


In [67]:
predict_fraud(x, model)

array([1., 0.])

NameError: name 'df' is not defined

In [83]:
df = pd.read_pickle('../data/pickled_df.pkl').drop('fraud', axis = 1)

In [85]:
sum(model.predict(df))

1262

In [86]:
len(df)

14337