# Load and observe data

Firstly, import necessary modules

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedKFold
from sklearn.tree import DecisionTreeClassifier

Read data and concat them into one df

In [2]:
df_train = pd.read_csv(
    './data/train.csv',
    #dtype={'Field_34': float}
)
df_test = pd.read_csv('./data/test.csv')

df = pd.concat([df_train, df_test])

  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)


Get some information about data

In [3]:
df_train.describe()

Unnamed: 0,id,label,Field_3,Field_10,Field_13,Field_14,Field_16,Field_17,Field_19,Field_20,...,partner5_G,partner5_H,partner5_K,partner5_L,num_of_phone,Field_78,Field_79,Field_80,Field_81,Field_82
count,53030.0,53030.0,25564.0,25564.0,25564.0,25564.0,25564.0,25564.0,25564.0,25564.0,...,48283.0,48283.0,48283.0,48283.0,50518.0,36397.0,27782.0,36320.0,35981.0,53030.0
mean,26514.5,0.316726,1.538687,1.096542,1.0,1.0,1.0,1.0,0.267022,3603181.0,...,4.1e-05,0.0,0.0,0.0,1.087236,20.80144,30.911307,-1.793163,-32.149412,1.668659
std,15308.586724,0.465204,0.498511,0.332593,0.0,0.0,0.0,0.0,0.835968,2282034.0,...,0.006436,0.0,0.0,0.0,0.297815,40.10993,35.968215,32.176041,63.442452,1.09091
min,0.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,-267.0,0.0,-267.0,-290.0,1.0
25%,13257.25,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1490000.0,...,0.0,0.0,0.0,0.0,1.0,4.0,5.326888,-5.6,-69.0,1.0
50%,26514.5,0.0,2.0,1.0,1.0,1.0,1.0,1.0,0.0,3801000.0,...,0.0,0.0,0.0,0.0,1.0,16.0,15.556349,2.086957,0.0,1.0
75%,39771.75,1.0,2.0,1.0,1.0,1.0,1.0,1.0,0.0,4500000.0,...,0.0,0.0,0.0,0.0,1.0,36.0,46.013087,9.5,1.0,2.0
max,53029.0,1.0,2.0,5.0,1.0,1.0,1.0,1.0,6.864,29800000.0,...,1.0,0.0,0.0,0.0,5.0,242.0,325.976226,238.0,238.0,4.0


In [4]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53030 entries, 0 to 53029
Columns: 195 entries, id to Field_82
dtypes: float64(133), int64(3), object(59)
memory usage: 78.9+ MB


In [5]:
df_train['label'].value_counts()

0    36234
1    16796
Name: label, dtype: int64

We can see that data is imbalance

# Pre-process data

In [6]:
def getNullCounts(df_temp):
    null_counts = pd.DataFrame(df_temp.isnull().sum(axis = 0))
    null_counts.reset_index(inplace = True)
    null_counts.columns = ['Field', 'Number of null']
    null_counts['% null'] = null_counts['Number of null'] / len(list(df_temp.index))
    null_counts.sort_values(
        ['Number of null'], 
        ascending = False,
        axis = 0,
        inplace = True
    )
    return null_counts

def getBadFields(df_temp):
    null_counts = getNullCounts(df_temp)
    bad_fields = null_counts['Field'][null_counts['% null'] > 0.7]
    return list(bad_fields)

def dropBadFields(df_temp):
    bad_fields = getBadFields(df_temp)
    return df_temp.drop(
        bad_fields,
        axis = 1,
    )
    

In [7]:
def dropObjectFields(df_temp):
    object_fields = df_temp.loc[:, list(df_temp.dtypes == object)].columns
    object_fields = list(object_fields)
    return df_temp.drop(
        object_fields,
        axis = 1,
    )
    

In [8]:
def preProcess(df_temp):
    df_temp = dropBadFields(df_temp)
    df_temp = dropObjectFields(df_temp)
    df_temp = df_temp.drop(['id'], axis = 1)
    
    df_temp = df_temp.fillna(0)
    df_temp = pd.get_dummies(df_temp)
    
    return df_temp

In [9]:
df_train = preProcess(df_train)

In [10]:
df_train

Unnamed: 0,label,Field_3,Field_10,Field_13,Field_14,Field_16,Field_17,Field_19,Field_20,Field_21,...,partner5_G,partner5_H,partner5_K,partner5_L,num_of_phone,Field_78,Field_79,Field_80,Field_81,Field_82
0,1,1.0,1.0,1.0,1.0,1.0,1.0,0.0,4258600.0,4.5,...,0.0,0.0,0.0,0.0,1.0,0.0,0.000000,0.000000,0.0,1
1,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.000000,0.000000,0.0,1
2,0,2.0,1.0,1.0,1.0,1.0,1.0,0.0,5000000.0,4.5,...,0.0,0.0,0.0,0.0,1.0,33.0,10.769445,6.466667,0.0,2
3,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,33.0,10.769445,6.466667,0.0,3
4,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,33.0,10.769445,6.466667,0.0,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
53025,0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1490000.0,4.5,...,0.0,0.0,0.0,0.0,1.0,202.0,157.837469,14.000000,-183.0,1
53026,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,26.0,10.246951,12.500000,2.0,1
53027,0,2.0,1.0,1.0,1.0,1.0,1.0,0.0,4015000.0,4.5,...,0.0,0.0,0.0,0.0,1.0,0.0,0.000000,0.000000,0.0,1
53028,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.000000,0.000000,0.0,4


# Create and train model

In [11]:
df_train = df_train.sample(frac = 1, random_state = 1).reset_index(drop = True)

In [12]:
X_train = df_train.drop(columns=['label'])
Y_train = df_train['label']
print(X_train.shape)
print(Y_train.shape)

(53030, 134)
(53030,)


In [17]:
creditTree = DecisionTreeClassifier(criterion="entropy", max_depth = 60)
creditTree.fit(X_train, Y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='entropy',
                       max_depth=60, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [18]:
X_test = preProcess(df_test)
print(X_test.shape)

(20381, 134)


In [19]:
Y_test = creditTree.predict_proba(X_test)[:, 1]
Y_test

array([1.        , 0.        , 0.34773869, ..., 0.        , 0.        ,
       0.        ])

In [20]:
res_df = pd.DataFrame({'id': df_test['id'], 'label': Y_test})
res_df.to_csv('submission.csv', index=False)

## Some code to double check pre-processing

In [None]:
print(lr.classes_)
df_train.to_csv('test.csv', index = False)
print(dropObjectFields(df_train).info())
df_train = dropBadFields(df_train)
print(getNullCounts(df_train))