In [26]:
import pandas as pd
import numpy as np
from sklearn import cross_validation, svm, preprocessing, metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import average_precision_score

## Train Data Processing
- type 做 one-hot
- 做兩個feature (org_error and dest_error)
- 刪掉一些沒用的欄位（id, name, type）

In [17]:
trainpath="/data/examples/may_the_4_be_with_u/show_me_the_money/train.csv"
testpath="/data/examples/may_the_4_be_with_u/show_me_the_money/test.csv"

In [32]:
df_train = pd.DataFrame(pd.read_csv(trainpath))

In [33]:
type_onehot = pd.get_dummies(df_train.type,prefix=['type'])
type_onehot.columns = ['CASH_IN', 'CASH_OUT', 'DEBIT','PAYMENT', 'TRANSFER']
df_train = pd.concat([df_train, type_onehot], axis=1)
df_train = df_train.drop(['type','nameOrig','nameDest','id'], axis=1)

In [34]:
df_train['org_error'] =  df_train.newbalanceOrig + df_train.amount - df_train.oldbalanceOrg
df_train['dest_error'] = df_train.oldbalanceDest + df_train.amount - df_train.newbalanceDest

In [35]:
Y = df_train.Fraud

In [36]:
df_train = df_train.drop('Fraud', axis=1)

In [37]:
df_train.head()

Unnamed: 0,id,step,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,CASH_IN,CASH_OUT,DEBIT,PAYMENT,TRANSFER,org_error,dest_error
0,0,323,9092.69,0.0,0.0,0.0,0.0,0,0,0,1,0,9092.69,9092.69
1,1,164,23468.75,0.0,23468.75,77300.17,53831.42,1,0,0,0,0,46937.5,46937.5
2,2,228,39688.11,0.0,0.0,168913.26,208601.37,0,1,0,0,0,39688.11,0.0
3,3,229,179016.96,103485.0,0.0,2483034.47,2662051.42,0,1,0,0,0,75531.96,0.01
4,4,709,77155.99,143307.0,66151.01,21213865.83,21291021.82,0,1,0,0,0,0.0,-3.72529e-09


## XGBoost
- 用scale_pos_weigh解決樣本數不平均的問題

In [38]:
from xgboost.sklearn import XGBClassifier
from xgboost import plot_importance, to_graphviz

In [39]:
trainX, testX, trainY, testY = train_test_split(df_train, Y, test_size = 0.2, random_state = 3)

In [40]:
weights = (Y == 0).sum() / (1.0 * (Y == 1).sum())
clf = XGBClassifier(max_depth = 3, scale_pos_weight = weights, n_jobs = 4)
probabilities = clf.fit(trainX, trainY).predict_proba(testX)

In [41]:
print('AUPRC = {}'.format(average_precision_score(testY, probabilities[:, 1])))

AUPRC = 0.997103204273392


## testdata

In [73]:
df_test = pd.DataFrame(pd.read_csv(testpath))

In [74]:
type_onehot = pd.get_dummies(df_test.type,prefix=['type'])
type_onehot.columns = ['CASH_IN', 'CASH_OUT', 'DEBIT','PAYMENT', 'TRANSFER']
df_test = pd.concat([df_test, type_onehot], axis=1)
df_test = df_test.drop(['type','nameOrig','nameDest'], axis=1)

In [75]:
df_test['org_error'] =  df_test.newbalanceOrig + df_test.amount - df_test.oldbalanceOrg
df_test['dest_error'] = df_test.oldbalanceDest + df_test.amount - df_test.newbalanceDest

In [76]:
pr = clf.predict_proba(df_test)

In [82]:
df_pr = pd.DataFrame(pr)

In [86]:
df_test['Fraud'] = df_pr[1]

In [87]:
df_test.columns

Index(['id', 'step', 'amount', 'oldbalanceOrg', 'newbalanceOrig',
       'oldbalanceDest', 'newbalanceDest', 'CASH_IN', 'CASH_OUT', 'DEBIT',
       'PAYMENT', 'TRANSFER', 'org_error', 'dest_error', 'Fraud'],
      dtype='object')

In [88]:
df_sub = df_test.drop(['step', 'amount', 'oldbalanceOrg', 'newbalanceOrig',
       'oldbalanceDest', 'newbalanceDest', 'CASH_IN', 'CASH_OUT', 'DEBIT',
       'PAYMENT', 'TRANSFER', 'org_error', 'dest_error'], axis = 1)

In [93]:
df_sub['id'] = df_sub['id'] -1

In [95]:
df_sub.to_csv('submit_1.csv', index = False)