In [1]:
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler



In [2]:
data = pd.read_csv('train_data.csv')

In [3]:
X = data.drop('default payment next month', axis=1)
y = data['default payment next month']

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [5]:
X_train.shape

(17999, 24)

In [6]:
y_train.shape

(17999,)

In [7]:
X_test.shape

(6000, 24)

In [8]:
y_test.shape

(6000,)

## Inital Models

In [9]:
ss = StandardScaler()
knn = KNeighborsClassifier()
rf = RandomForestClassifier()
lr = LogisticRegression()

In [10]:
X_train_scaled = ss.fit_transform(X_train)
#X_test_scaled = ss.transform(X_test)

In [11]:
X_train.shape

(17999, 24)

In [12]:
knn.fit(X_train_scaled, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

In [13]:
rf.fit(X_train_scaled, y_train)



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [14]:
print(f"training accuracy: {knn.score(X_train_scaled, y_train)}")
print(f"testing accuracy: {knn.score(X_test, y_test)}")
knn_y_hat = knn.predict(X_test)

training accuracy: 0.8396577587643758
testing accuracy: 0.7761666666666667


In [15]:
print(f"training accuracy: {rf.score(X_train_scaled, y_train)}")
print(f"testing accuracy: {rf.score(X_test, y_test)}")
rf_y_hat = rf.predict(X_test)

training accuracy: 0.9813878548808267
testing accuracy: 0.7353333333333333


In [16]:
knn_f1 = f1_score(y_test, knn_y_hat, pos_label= 0, average='binary')
knn_f1

0.8739795439617153

In [17]:
rf_f1 = f1_score(y_test, rf_y_hat, pos_label= 0, average='binary')
rf_f1

0.8407221664994984

## Final Model

In [30]:
final_data = pd.read_csv('final.csv')

In [31]:
final_data

Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,target
0,350000,1,1,2,37,-2,-2,-2,-2,-2,...,466,466,316,316,316,466,466,316,316,0
1,50000,2,2,1,37,2,2,2,0,0,...,13026,13268,13497,5500,0,580,600,600,600,0
2,50000,2,1,2,23,-1,-1,-1,-1,-1,...,4800,9810,660,2548,2321,4800,9810,660,2980,0
3,20000,1,3,1,56,0,0,0,0,2,...,13784,13420,13686,1508,1216,1116,0,490,658,0
4,110000,2,2,2,32,0,0,0,0,0,...,108829,110557,106082,5400,5400,4100,4100,4100,4200,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23983,50000,2,1,2,24,0,0,0,-2,-2,...,0,0,0,7522,0,0,0,0,0,1
23984,250000,1,3,1,44,-1,-1,-1,-1,-1,...,43546,38051,576,3924,4482,43676,38166,576,2389,1
23985,50000,1,3,2,26,2,0,0,2,2,...,44891,47654,48721,1800,4000,0,3500,2000,0,1
23986,110000,2,2,1,38,0,0,0,0,0,...,39854,9293,-1288,5000,10000,5048,2000,39958,52000,1


In [32]:
X = final_data.drop('target', axis=1)
y = final_data['target']

In [33]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [34]:
#smote = SMOTE(random_state=42)
#X_train, y_train = smote.fit_resample(X_train, y_train)

In [35]:
X_train_scaled = ss.fit_transform(X_train)

In [36]:
knn.fit(X_train_scaled, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

In [37]:
rf.fit(X_train_scaled, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [38]:
print(f"training accuracy: {knn.score(X_train_scaled, y_train)}")
print(f"testing accuracy: {knn.score(X_test, y_test)}")
knn_y_hat = knn.predict(X_test)

training accuracy: 0.8401978767161359
testing accuracy: 0.7770551942637985


In [39]:
print(f"training accuracy: {rf.score(X_train_scaled, y_train)}")
print(f"testing accuracy: {rf.score(X_test, y_test)}")
rf_y_hat = rf.predict(X_test)

training accuracy: 0.9782669112333945
testing accuracy: 0.3231615807903952


In [46]:
knn_f1 = f1_score(y_test, knn_y_hat, pos_label= 0, average='binary')
knn_f1

0.8744718805745939

In [43]:
rf_f1 = f1_score(y_test, rf_y_hat, average='binary')
rf_f1

0.22345513678974555

## Prediction

In [47]:
test = pd.read_csv('test_features.csv')

In [56]:
# test

In [50]:
test.drop(columns=['Unnamed: 0','ID'], axis=1, inplace=True)

In [51]:
test = test[test['EDUCATION'] != 0]

In [53]:
test['EDUCATION'].replace(to_replace=6, value=5, inplace=True)

In [57]:
test

Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,...,BILL_AMT3,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6
0,240000,1,1,1,44,-1,-1,-1,-1,-1,...,2701,2427,1104,2362,2188,2701,2427,1104,2362,519
1,50000,1,3,1,41,0,0,0,0,0,...,48468,39203,28913,26636,1816,1753,1433,990,967,1071
2,20000,2,3,1,41,0,0,0,0,0,...,11583,13079,14546,16149,2000,1600,2000,2000,2000,2000
3,320000,2,1,2,34,-2,-2,-2,-2,-2,...,-528,-1336,-1336,-1336,0,4,0,0,0,0
4,120000,2,2,1,23,0,0,0,0,0,...,66825,68820,69776,71297,3000,2500,3100,2700,2800,2800
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5996,200000,2,1,2,30,-1,-1,2,2,2,...,75635,94454,60875,41221,15349,10,32000,0,3000,0
5997,100000,2,2,1,36,2,2,2,2,2,...,77977,79071,76918,81713,3500,3300,3000,0,6500,3000
5998,50000,2,2,1,42,3,3,2,2,0,...,24574,24348,16130,16467,500,1200,516,566,585,608
5999,70000,2,1,1,32,0,0,0,0,0,...,71314,71595,70040,68927,2900,2766,3648,2700,2487,2600


In [58]:
knn_predict = knn.predict(test)

In [59]:
knn_predict

array([0, 0, 0, ..., 0, 0, 0])

In [71]:
prediction = pd.DataFrame(knn_predict)

In [72]:
prediction.to_csv(r'credit_default_preds_arsr.csv', header=False, index=False, index_label=False)

In [73]:
prediction

Unnamed: 0,0
0,0
1,0
2,0
3,0
4,0
...,...
5993,0
5994,0
5995,0
5996,0
