In [1]:
# Imports 
import pandas as pd
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

%matplotlib inline 

In [2]:
# That Data
df = pd.read_json('../../../Data/data.json')
df.head()

ValueError: Expected object or value

In [62]:
# Make fraud column 
df['fraud'] = df.acct_type.apply(lambda x: 1 if 'fraud' in x else 0)

In [63]:
df.fraud.value_counts()

0    13044
1     1293
Name: fraud, dtype: int64

In [None]:
# Get columns 
columns = list(df.columns)

# See What they looks like 
df.info()

In [None]:
# Desccribe Int variables 
df.describe().T

In [None]:
# Get missing varaiabels 
missing_values = []
for col in columns:
    if df[col].isna().any():
        missing_values.append(col)
        
print(f'Number: {len(missing_values)} \n\nVariables with missing values:\n{missing_values}')

In [None]:
# get variabels by type 
var_floats = []
var_ints = []
var_objects = []

for col in columns:
    if df[col].dtype == 'int64':
        var_ints.append(col)
    elif df[col].dtype == 'float64':
        var_floats.append(col)
    else:
        var_objects.append(col)

print(f' Floats: {var_floats} \n\n Ints: {var_ints} \n\n Objects: {var_objects}')

# Notes 

**Isolation Forest**

In [None]:
# Show Int data distribution 
def plot(data, vars_, r, c):
    fig, axs = plt.subplots(r, c, figsize = (10,10))

    for i, ax in enumerate(axs.flatten()):
        ax.hist(data[vars_[i]])
        ax.set_xlabel(vars_[i])
        ax.set_ylabel('freq')

    plt.tight_layout()
plot(df, var_ints, 6, 3)

In [None]:
df.user_type.value_counts()

In [None]:
# Show float data distribution 
plot(df, var_floats, 3, 3)

In [None]:
# Peek Object Variables 
for v in var_objects:
    print(f'    {v} \n{df[v].value_counts()} \n\n\n')

In [None]:
# Return all events with no payouts 
nopayouts_mask = df.previous_payouts.apply(lambda x: True if len(x) <= 0 else False)
df[nopayouts_mask]

In [None]:
# No pay out that are farud
df2 = df[~nopayouts_mask]

In [None]:
df.fraud.sum()

In [None]:
df2_fraud = df2[df2.fraud == 1]

In [None]:
df2_fraud

In [None]:
plot(df2_fraud, var_ints, 6, 3)

In [None]:
plot(df2_fraud, var_floats, 3, 3)

In [None]:
# Peek Object Variables 
for v in var_objects:
    print(f'    {v} \n{df2_fraud[v].value_counts()} \n\n\n')
    
# delivery_method     991 non-null    float64
# has_header          339 non-null    float64
# 42  venue_name          203 non-null    object
# 25  org_facebook        289 non-null    float64
# 31  sale_duration       288 non-null    float64

In [None]:
# Peek Object Variables 
for v in ['payee_name', 'org_name']:
    print(f'    {v} \n{df[v].value_counts()} \n\n\n')

In [None]:
df.fraud.sum()

In [None]:
# Check to see if payee is a real person 
df['missing_payee'] = df.payee_name.apply(lambda x: 1 if len(x) < 3 else 0)

## ------------ Fraud Payee and Pay to

In [None]:
# Missing payee
mask_none_missing_payee = df['missing_payee'] == 0

In [None]:
# None Missing payee that are fraud 
temp = df[(df['missing_payee'] == 0) & (df['fraud'] == 1)]

In [None]:
# get all the people who did not pay out to some one but was fraud 
mask = temp.previous_payouts.apply(lambda x: True if len(x) <= 0 else False)

In [None]:
temp.loc[~mask, ['payee_name']]

In [None]:
temp[~mask].previous_payouts[11504]

# ---- Important fraud factors 
1. No previous pay outs 
2. Pay out is to no one 
3. Payout is to payee



mask = df.previous_payouts.apply(lambda x: True if len(x) <= 0 else False)
df[mask]

In [81]:
def not_premium(x):
    return 1 if x.lower() != 'premium' else 0

def no_previous_payout(x):
    return 1 if len(x) <= 0 else 0

def payout_name_flag(x):
    for d in x:
        if len(d['name']) > 3:
            return 1
    return 0

def payout_toself(payee, payouts):
    if payee == '': return 1
    for pay in payouts:
        if payee.lower() in pay['name'].lower():
            return 1
    return 0

In [82]:
df

Unnamed: 0,target,previous_payout,no_payout_name,payout_toself,missing_data
0,1,1,0,0,0
1,0,0,1,1,0
2,0,0,1,1,1
3,0,0,0,1,1
4,0,0,1,1,0
...,...,...,...,...,...
14332,1,1,0,0,1
14333,0,0,1,1,0
14334,0,0,0,1,0
14335,0,0,0,1,0


In [83]:
temp = dict()
temp = {'target': df['fraud'].copy(), 
        #'premium': df.acct_type.apply(lambda x: not_premium(x)),
        'previous_payout': df.previous_payouts.apply(lambda x: no_previous_payout(x)),
        'no_payout_name': df.previous_payouts.apply(lambda x: payout_name_flag(x)),
        'payout_toself': df.apply(lambda x: payout_toself(x.payee_name, x.previous_payouts), axis = 1),
        'missing_data': df.isnull().any(axis = 1) * 1
       }

KeyError: 'fraud'

In [84]:
df = pd.DataFrame(temp)
df.head()

In [66]:
def get_crosstab(X, y, model_type):
    Xtrain, Xtest, ytrain, ytest = train_test_split(X, y)
    model = model_type
    model.fit(Xtrain, ytrain)
    print(model.predict_proba(Xtest))
    predicted = model.predict(Xtest)
    actual = ytest
    crosstab = pd.crosstab(ytest, model.predict(Xtest), rownames=['actual'], colnames=['predicted'])
    return model, crosstab

# ----- Grid Search 

In [67]:
rfc = RandomForestClassifier(random_state=0)
param_grid = { 
    'n_estimators': [200, 500],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth' : [4,5,6,7,8],
    'criterion' :['gini', 'entropy']
}

Xtrain, Xtest, ytrain, ytest = train_test_split(df.drop('target', axis = 1), df['target'])

In [68]:
CV_rfc = GridSearchCV(estimator = rfc, param_grid = param_grid, cv= 5)
CV_rfc.fit(Xtrain, ytrain)

GridSearchCV(cv=5, estimator=RandomForestClassifier(random_state=0),
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': [4, 5, 6, 7, 8],
                         'max_features': ['auto', 'sqrt', 'log2'],
                         'n_estimators': [200, 500]})

In [69]:
params = CV_rfc.best_params_
params

{'criterion': 'gini',
 'max_depth': 4,
 'max_features': 'auto',
 'n_estimators': 200}

In [80]:
get_crosstab(df.drop('target', axis = 1), df['target'], RandomForestClassifier(**params))

[[0.04051674 0.95948326]
 [0.96116233 0.03883767]
 [0.97949803 0.02050197]
 ...
 [0.97949803 0.02050197]
 [0.97949803 0.02050197]
 [0.99296543 0.00703457]]


(RandomForestClassifier(max_depth=4, n_estimators=200),
 predicted     0    1
 actual              
 0          3272   24
 1            69  220)

In [55]:
77 / (77 + 257)

0.23053892215568864

### Columns in API 
> 'body_length', 'channels', 'country', 'currency', 'delivery_method',
   'description', 'email_domain', 'event_created', 'event_end',
   'event_published', 'event_start', 'fb_published', 'has_analytics',
   'has_header', 'has_logo', 'listed', 'name', 'name_length', 'object_id',
   'org_desc', 'org_facebook', 'org_name', 'org_twitter', 'payee_name',
   'payout_type', 'previous_payouts', 'sale_duration', 'show_map',
   'ticket_types', 'user_age', 'user_created', 'user_type',
   'venue_address', 'venue_country', 'venue_latitude', 'venue_longitude',
   'venue_name', 'venue_state', 'sequence_number'