In [58]:
# Imports 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

%matplotlib inline 

In [2]:
# That Data
df = pd.read_json('../../../Data/data.json')

In [8]:
import requests
api_url='https://hxobin8em5.execute-api.us-west-2.amazonaws.com/api/'
api_key='vYm9mTUuspeyAWH1v-acfoTlck-tCxwTw9YfCynC'

def get_data(next_sequence_number):
    """Fetch data from the API."""
    payload = {'api_key': api_key,
               'sequence_number': next_sequence_number}
    response = requests.post(api_url, json=payload)
    data = response.json()
    next_sequence_number = data['_next_sequence_number']
    return data['data']

In [9]:
x = get_data(0)[0]

In [36]:
import math 

def missing_data(x):
    '''
    Param:
        x: dictionary
    Return:
        1 if missing value is found else 0
    '''
    for value in x.values():
        if value == '':
            return 1
    return 0

def no_previous_payout(x):
    '''
    Param:
        list of previous payouts 
    '''
    return 1 if len(x) <= 0 else 0

def payout_name_flag(x):
    '''
    Param:
        list of previous payouts 
    '''
    for d in x:
        if len(d['name']) < 3:
            return 1
    return 0

def payout_toself(payee, payouts):
    '''
    Param:
        payee: payee 
        payouts: list of previous payouts 
    '''
    for pay in payouts:
        if payee.lower() in pay['name'].lower():
            return 1
    return 0

def payee_name_flag(x):
    return 1 if len(x) < 3 else 0

def delivery(x):
    if math.isnan(x): return 0
    return int(x)

In [45]:
def parse_data(data):
    '''
        Param:
            api data call
    '''
    temp = dict()
    
    if type(data) == type(dict()):
        temp = {
            'payout_toself': payout_toself(data['payee_name'], data['previous_payouts']),
            'missing_data': missing_data(data),
            'previous_payout': no_previous_payout(data['previous_payouts']),
            'no_payout_name': payout_name_flag(data['previous_payouts']),
            'no_payee_name': payee_name_flag(data['payee_name']),
            'account_type': data['user_type'],
            'delivery_method': delivery(data['delivery_method']),
            'user_age': data['user_age']}
    else:
        temp = {
            'payout_toself': data.apply(lambda x: payout_toself(x.payee_name, x.previous_payouts), axis = 1),
            'missing_data': data.isnull().any(axis = 1) * 1,
            'previous_payout': data.previous_payouts.apply(lambda x: no_previous_payout(x)),
            'no_payout_name': data.previous_payouts.apply(lambda x: payout_name_flag(x)),
            'no_payee_name': data.payee_name.apply(lambda x: payee_name_flag(x)),
            'account_type': data.user_type,
            'delivery_method': data.delivery_method.apply(lambda x: delivery(x)),
            'user_age': data.user_age }
    
    return temp

# Grid search 2

In [49]:
X = pd.DataFrame(parse_data(df))
y = df.acct_type.apply(lambda x: 1 if 'fraud' in x else 0)

In [51]:
y

0        1
1        0
2        0
3        0
4        0
        ..
14332    1
14333    0
14334    0
14335    0
14336    1
Name: acct_type, Length: 14337, dtype: int64

In [56]:
rfc = RandomForestClassifier(random_state=0)
param_grid = { 
    'n_estimators': [100, 200, 500],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth' : [4,5,6,7,8],
    'criterion' :['gini', 'entropy']
}
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y)

In [59]:
CV_rfc = GridSearchCV(estimator = rfc, param_grid = param_grid, cv= 5)
CV_rfc.fit(Xtrain, ytrain)
params = CV_rfc.best_params_
params

{'criterion': 'gini',
 'max_depth': 5,
 'max_features': 'log2',
 'n_estimators': 100}

In [60]:
def get_crosstab(X, y, model_type):
    Xtrain, Xtest, ytrain, ytest = train_test_split(X, y)
    model = model_type
    model.fit(Xtrain, ytrain)
    print(model.predict_proba(Xtest))
    predicted = model.predict(Xtest)
    actual = ytest
    crosstab = pd.crosstab(ytest, model.predict(Xtest), rownames=['actual'], colnames=['predicted'])
    return model, crosstab

In [69]:
get_crosstab(X, y, RandomForestClassifier(**params))

[[0.97669346 0.02330654]
 [0.9902892  0.0097108 ]
 [0.03328363 0.96671637]
 ...
 [0.99498496 0.00501504]
 [0.98486094 0.01513906]
 [0.98101579 0.01898421]]


(RandomForestClassifier(max_depth=5, max_features='log2'),
 predicted     0    1
 actual              
 0          3238   22
 1            69  256)