# 1. Import package

In [26]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import lightgbm as lgb

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score,roc_curve
from sklearn.ensemble import RandomForestClassifier

# 2. Load Data

In [3]:
# Load sample training data
df_train = pd.read_csv('fraud_detection_train.csv')
df_train.head()

Unnamed: 0,visit_id,kdkc,dati2,typeppk,jkpst,umur,jnspelsep,los,cmg,severitylevel,...,proc63_67,proc68_70,proc71_73,proc74_75,proc76_77,proc78_79,proc80_99,proce00_e99,procv00_v89,label
0,1,1107,150,SB,P,64,2,0,F,0,...,0,0,0,0,0,0,0,0,0,1
1,2,1303,200,C,L,45,1,9,E,3,...,0,0,0,0,0,0,4,0,0,1
2,3,1114,172,B,P,34,2,0,Q,0,...,0,0,0,0,0,0,0,0,0,1
3,4,601,90,SC,L,34,2,0,Q,0,...,0,0,0,0,0,0,0,0,0,1
4,5,1006,130,B,L,27,2,0,F,0,...,0,0,0,0,0,0,0,0,0,1


In [4]:
#check data completeness
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200217 entries, 0 to 200216
Data columns (total 53 columns):
visit_id         200217 non-null int64
kdkc             200217 non-null int64
dati2            200217 non-null int64
typeppk          200217 non-null object
jkpst            200217 non-null object
umur             200217 non-null int64
jnspelsep        200217 non-null int64
los              200217 non-null int64
cmg              200217 non-null object
severitylevel    200217 non-null int64
diagprimer       200217 non-null object
dx2_a00_b99      200217 non-null int64
dx2_c00_d48      200217 non-null int64
dx2_d50_d89      200217 non-null int64
dx2_e00_e90      200217 non-null int64
dx2_f00_f99      200217 non-null int64
dx2_g00_g99      200217 non-null int64
dx2_h00_h59      200217 non-null int64
dx2_h60_h95      200217 non-null int64
dx2_i00_i99      200217 non-null int64
dx2_j00_j99      200217 non-null int64
dx2_koo_k93      200217 non-null int64
dx2_l00_l99      200217 non

In [5]:
#check whether the dataset is imbalance or not
df_train.label.value_counts()

1    100255
0     99962
Name: label, dtype: int64

### Dataset is perfectly balance

# 3. Feature Engineer 

In [6]:
def label_encoder(df, categorical_columns=None):
    """Encode categorical values as integers (0,1,2,3...) with pandas.factorize. """
    # if categorical_colunms are not given than treat object as categorical features
    if not categorical_columns:
        categorical_columns = [col for col in df.columns if df[col].dtype == 'object']
    for col in categorical_columns:
        df[col], uniques = pd.factorize(df[col])
    return df, categorical_columns

In [7]:
'''
to_encode = ['typeppk', 'jkpst', 'cmg', 'diagprimer']
             '''

df_train,colname = label_encoder(df_train, categorical_columns=None)

In [8]:
#check whether the data type have all been changed to int
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200217 entries, 0 to 200216
Data columns (total 53 columns):
visit_id         200217 non-null int64
kdkc             200217 non-null int64
dati2            200217 non-null int64
typeppk          200217 non-null int64
jkpst            200217 non-null int64
umur             200217 non-null int64
jnspelsep        200217 non-null int64
los              200217 non-null int64
cmg              200217 non-null int64
severitylevel    200217 non-null int64
diagprimer       200217 non-null int64
dx2_a00_b99      200217 non-null int64
dx2_c00_d48      200217 non-null int64
dx2_d50_d89      200217 non-null int64
dx2_e00_e90      200217 non-null int64
dx2_f00_f99      200217 non-null int64
dx2_g00_g99      200217 non-null int64
dx2_h00_h59      200217 non-null int64
dx2_h60_h95      200217 non-null int64
dx2_i00_i99      200217 non-null int64
dx2_j00_j99      200217 non-null int64
dx2_koo_k93      200217 non-null int64
dx2_l00_l99      200217 non-nul

# 4. Modelling

## 4.1 Modelling to get feature importances
### a. Light Gradient Boosting

### Train Test Split

In [9]:
X = df_train.drop(['label','visit_id'],axis = 1)
#drop visit_id as a not relevant feature in this modelling
y = df_train['label']

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.2)

In [29]:
#try Light Gradient Boosting
# Create a  lgb training set
train_set = lgb.Dataset(X_train, label = y_train.values
                        )

# Find default hyperparameters
model = lgb.LGBMClassifier()
params = model.get_params()

In [30]:
#Cross Validation
cv_results = lgb.cv(params, train_set, num_boost_round = 10000, metrics = 'auc', 
                        early_stopping_rounds = 100, seed = 50, nfold = 5)

print('Cross Validation ROC AUC: {:.5f} with std: {:.5f}.'.format(cv_results['auc-mean'][-1],
                                                                               cv_results['auc-stdv'][-1]))


Please use silent argument of the Dataset constructor to pass this parameter.
  .format(key))


You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 839
[LightGBM] [Info] Number of data points in the train set: 128138, number of used features: 48
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 839
[LightGBM] [Info] Number of data points in the train set: 128138, number of used features: 48
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 839
[LightGBM] [Info] Number of data points in the train set: 128138, number of used features: 48
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 839
[LightGBM] [Info] Number of data points in the train set: 128139, number of used features: 48
You 

In [33]:
#Light Gradient Boosting Modelling with Cross Validation Result
model = lgb.LGBMClassifier(n_estimators = len(cv_results['auc-mean']), random_state=50)
model.fit(X_train, y_train.values)

LGBMClassifier(random_state=50)

### Feature Importance based on LGB

In [34]:
fi = pd.DataFrame({'feature': X_train.columns, 
                   'importance': model.feature_importances_})
fi = fi.sort_values('importance', ascending = False)

In [35]:
fi[fi.importance > 0]

Unnamed: 0,feature,importance
0,kdkc,690
1,dati2,558
7,cmg,285
2,typeppk,277
9,diagprimer,262
6,los,135
48,proc80_99,104
4,umur,102
8,severitylevel,67
5,jnspelsep,63


In [40]:
#column that we will use are feature whose importance more than 100
column_taken_b = fi[fi.importance > 100]
b = column_taken_b.feature
b

0           kdkc
1          dati2
7            cmg
2        typeppk
9     diagprimer
6            los
48     proc80_99
4           umur
Name: feature, dtype: object

### b. Random Forest

In [10]:
#try random forest
RFmodel = RandomForestClassifier ()
RFmodel.fit(X_train,y_train)

RandomForestClassifier()

In [41]:
#feature importance RF
feat_imp_RF = pd.DataFrame(RFmodel.feature_importances_, columns = ['Importances']).join(pd.DataFrame(X_train.columns,columns = ['Features']))
feat_imp_RF.sort_values('Importances',ascending = False)

Unnamed: 0,Importances,Features
4,0.21828,umur
1,0.165295,dati2
0,0.149238,kdkc
9,0.105417,diagprimer
2,0.079654,typeppk
7,0.052183,cmg
48,0.045196,proc80_99
6,0.031368,los
3,0.025888,jkpst
8,0.010544,severitylevel


In [42]:
column_taken = feat_imp_RF[feat_imp_RF.Importances > 0.02]
a = column_taken.Features
a

0           kdkc
1          dati2
2        typeppk
3          jkpst
4           umur
6            los
7            cmg
9     diagprimer
48     proc80_99
Name: Features, dtype: object

## 4.2 Modelling to Predict

### a. Light GB

In [43]:
#akurasi untuk nomor 2a.
model_conf_mat = pd.DataFrame(confusion_matrix(y_test,model.predict(X_test)), index = ['Actual not fraud', 'Actual fraud'], columns = ['Predict not fraud', 'Predict fraud'])
print("\nScore : ", model.score(X_test,y_test))



Score :  0.6763310358605534


### b. Random Forest

In [25]:
RFmodel_conf_mat = pd.DataFrame(confusion_matrix(y_test,RFmodel.predict(X_test)), index = ['Actual not fraud', 'Actual fraud'], columns = ['Predict not fraud', 'Predict fraud'])

print("\nScore : ", RFmodel.score(X_test,y_test))


Score :  0.7082709020077914


### c. Mix (Using Random forest with feature from LGB Feature Importance)

In [38]:
#Train test split
X_3 = df_train[b]
y_3 = df_train['label']
X_train_3, X_test_3, y_train_3, y_test_3 = train_test_split(X_3,y_3,test_size = 0.2)

In [44]:
RFmodel_3 = RandomForestClassifier ()
RFmodel_3.fit(X_train_3,y_train_3)

RandomForestClassifier()

In [45]:
RFmodel_conf_mat_3 = pd.DataFrame(confusion_matrix(y_test_3,RFmodel_3.predict(X_test_3)), index = ['Actual not fraud', 'Actual fraud'], columns = ['Predict not fraud', 'Predict fraud'])

print("\nScore : ", RFmodel_3.score(X_test_3,y_test_3))


Score :  0.7006542802916792


### Hyperparameter tuning of Random Forest

In [47]:
from pprint import pprint
# Look at parameters used by our current forest
print('Parameters currently in use:\n')
pprint(RFmodel_3.get_params())

Parameters currently in use:

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}


In [48]:
from sklearn.model_selection import RandomizedSearchCV
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
pprint(random_grid)

{'bootstrap': [True, False],
 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None],
 'max_features': ['auto', 'sqrt'],
 'min_samples_leaf': [1, 2, 4],
 'min_samples_split': [2, 5, 10],
 'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]}


In [None]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune. it's already done before, model used is named RFmodel_over_3
rf = RFmodel_over_3
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 5, cv = 3, verbose=2, random_state=42, n_jobs = -1)
# Fit the random search model
rf_random.fit(X_3, y_3)

In [21]:
rf_random.best_params_

In [49]:
RFmodel_4 = RandomForestClassifier(n_estimators = 600,
 min_samples_split = 10,
 min_samples_leaf = 4,
 max_features = 'sqrt',
 max_depth = 90,
 bootstrap = False)
RFmodel_4.fit(X_train_3,y_train_3)
RFmodel_conf_mat = pd.DataFrame(confusion_matrix(y_test_3,RFmodel_4.predict(X_test_3)), index = ['Actual not fraud', 'Actual fraud'], columns = ['Predict not fraud', 'Predict fraud'])
print("\nScore : ", RFmodel_4.score(X_test_3,y_test_3))


Score :  0.7170862051743082


## Load Data Validation

In [50]:
df_val = pd.read_csv('fraud_detection_val.csv')

In [54]:
def label_encoder(df_val, categorical_columns=None):
    """Encode categorical values as integers (0,1,2,3...) with pandas.factorize. """
    # if categorical_colunms are not given than treat object as categorical features
    if not categorical_columns:
        categorical_columns = [col for col in df_val.columns if df_val[col].dtype == 'object']
    for col in categorical_columns:
        df_val[col], uniques = pd.factorize(df_val[col])
    return df_val, categorical_columns

In [55]:
'''
to_encode = ['typeppk', 'jkpst', 'cmg', 'diagprimer']
             '''

df_val,colname = label_encoder(df_val, categorical_columns=None)

In [57]:
df_val_2 = df_val[b]

In [59]:
final = RFmodel_4.predict(df_val_2)

In [60]:
submission = pd.DataFrame({'visit_id': df_val['visit_id'], 
                                'predict_label': final})

In [62]:
submission.to_csv('final prediction hitotsu.csv',index = False)