In [22]:
import pandas as pd
import os 
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier,AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
import joblib

In [2]:
training_ds_path = 'Training_Data'
concatenated_data_file = 'concat_data.csv'

def preprocess_columns(df):
    feature_names = list(df.columns)
    final_feature_names = []
    for feature_name in feature_names:
        if ' ' in feature_name:
            feature_name = feature_name.replace(' ','_')
        final_feature_names.append(feature_name)
    return final_feature_names


def combine(training_ds_path):
    dfs = []
    filenames = os.listdir(training_ds_path)
    for filename in filenames:
        df = pd.read_csv(f'{training_ds_path}/{filename}')
        dfs.append(df)
    if concatenated_data_file in os.listdir():
        os.remove(concatenated_data_file)
    concat_df = pd.concat([dfs[i] for i in range(len(dfs))])
    concat_df.columns = preprocess_columns(concat_df)
    concat_df.to_csv('concat_data.csv', index = False)

In [3]:
combine(training_ds_path)
df = pd.read_csv('concat_data.csv')
df.head()

Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default_payment_next_month
0,20000,2,2,1,24,2,2,-1,-1,-2,...,0,0,0,0,689,0,0,0,0,1
1,120000,2,2,2,26,-1,2,0,0,0,...,3272,3455,3261,0,1000,1000,1000,0,2000,1
2,90000,2,2,2,34,0,0,0,0,0,...,14331,14948,15549,1518,1500,1000,1000,1000,5000,0
3,50000,2,2,1,37,0,0,0,0,0,...,28314,28959,29547,2000,2019,1200,1100,1069,1000,0
4,50000,1,2,1,57,-1,0,-1,0,0,...,20940,19146,19131,2000,36681,10000,9000,689,679,0


In [7]:
df.columns = [i.lower() for i in df.columns]
selected_features = ['limit_bal','sex','education','marriage','age','pay_amt6']
X = df[selected_features]
y = df['default_payment_next_month']
X_train,X_test,y_train,y_test = train_test_split(X,y,random_state=1,test_size=0.2)
print('X_train shape :',X_train.shape)
print('y_train shape :',y_train.shape)
print('X_test shape :',X_test.shape)
print('y_test shape :',y_test.shape)

X_train shape : (24824, 6)
y_train shape : (24824,)
X_test shape : (6207, 6)
y_test shape : (6207,)


In [31]:
# random forest model 
model = RandomForestClassifier().fit(X_train,y_train)
model_score = model.score(X_test,y_test)
print('model_score = ',model_score)

model_score =  0.9895279523119059


In [32]:
predictions = model.predict(X_test)
actual = y_test
model_evaluation = pd.DataFrame()
model_evaluation['actual'] = y_test
model_evaluation['predictions'] = predictions
model_evaluation

Unnamed: 0,actual,predictions
10623,0,0
12298,0,0
19382,0,0
26884,1,1
18261,0,0
...,...,...
20193,0,0
27680,0,0
16535,0,0
11749,1,1


In [35]:
joblib.dump(model,'random_forest_model.sav')

['random_forest_model.sav']

In [46]:
# loading the saved model and making predictions
loaded_model = joblib.load('random_forest_model.sav')
loaded_model.predict([[50000,1,3,2,55,994]])
loaded_model.predict([[140000,1,3,1,36,3300]])

array([0], dtype=int64)

In [47]:
X_test

Unnamed: 0,limit_bal,sex,education,marriage,age,pay_amt6
10623,360000,2,2,1,37,4268
12298,280000,2,1,1,36,2000
19382,140000,1,3,1,36,3300
26884,50000,1,3,2,55,994
18261,50000,1,2,2,29,6318
...,...,...,...,...,...,...
20193,180000,2,1,2,24,0
27680,110000,2,2,3,45,2000
16535,60000,1,2,2,25,2435
11749,50000,2,2,2,25,3000
