<a href="https://colab.research.google.com/github/sultanardia/competion/blob/main/Credit_Risk_Scoring.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import pandas library

In [983]:
import pandas as pd

# Read CSV

In [984]:
df = pd.read_csv('/content/train.csv')
df.head()

Unnamed: 0,customer_id,customer_bod,gender,phone_flag,student,employment,credit_card,balance,income,tenure,default
0,8300,1993-08-17,Female,1.0,No,Self Employed,1.0,87104.12,5015120.75,4yrs 4mon,0
1,672,2007-12-17,Female,1.0,Yes,,0.0,89236.34,2266076.58,4yrs 1mon,0
2,5670,2000-02-05,Female,1.0,Yes,,0.0,171553.12,1779347.34,0yrs 9mon,0
3,2975,1999-11-16,Female,1.0,Yes,,0.0,85979.04,2014246.24,1yrs 8mon,0
4,3883,1977-08-18,Male,1.0,No,Salaried,0.0,48874.77,5445148.31,0yrs 10mon,0


# Create function to convert ```tenure``` datas column into month datas

In [985]:
def getMonth(data):
    data = data.replace('yrs', '').replace('mon', '').split()
    return (int(data[0]) * 12) + int(data[1])

df['tenure_months'] = df['tenure'].apply(lambda x : getMonth(x))

# Create function to convert ```customer_bod``` datas column into age datas

In [986]:
from datetime import date

def getAge(bod):
    today = date.today()
    age = today.year - bod.year - ((today.month, today.day) < (bod.month, bod.day))
    return age

df['customer_bod'] = pd.to_datetime(df['customer_bod'])
df['age'] = df['customer_bod'].apply(lambda x : getAge(date(x.year, x.month, x.day)))

# Check missing values

In [987]:
df.isnull().sum()

customer_id        0
customer_bod       0
gender           308
phone_flag         0
student            0
employment       992
credit_card       11
balance            0
income             0
tenure             0
default            0
tenure_months      0
age                0
dtype: int64

# Drop the missing row values

In [988]:
df.dropna(subset = ['gender', 'credit_card', 'employment'], how = 'any', axis = 0, inplace = True)

# Make sure to zero missing values

In [989]:
df.isnull().sum()

customer_id      0
customer_bod     0
gender           0
phone_flag       0
student          0
employment       0
credit_card      0
balance          0
income           0
tenure           0
default          0
tenure_months    0
age              0
dtype: int64

# Encoding the object features

In [990]:
from sklearn.preprocessing import LabelEncoder

LE = LabelEncoder()

df['employment'] = LE.fit_transform(df['employment'])
df['student'] = LE.fit_transform(df['student'])
df['gender'] = LE.fit_transform(df['gender'])

# Drop unwanted features

In [991]:
df.drop(['customer_id', 'customer_bod', 'tenure'], axis = 1, inplace = True)

# Make sure the data is ready

In [992]:
df.head()

Unnamed: 0,gender,phone_flag,student,employment,credit_card,balance,income,default,tenure_months,age
0,0,1.0,0,1,1.0,87104.12,5015120.75,0,52,28
4,1,1.0,0,0,0.0,48874.77,5445148.31,0,10,44
7,1,1.0,0,0,0.0,0.0,7112035.3,0,0,37
8,0,1.0,0,0,1.0,103848.03,6455858.31,0,30,22
9,0,1.0,0,0,0.0,0.0,3444624.3,0,0,40


# Split datas into train and test datas

In [993]:
from sklearn.model_selection import train_test_split

X = df.drop('default', axis = 1)
Y = df['default']

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state = 42, test_size = 0.2)

# Decision Tree Classifier

In [994]:
from sklearn.tree import DecisionTreeClassifier

DTC = DecisionTreeClassifier(criterion = 'entropy')
DTC.fit(X_train, Y_train)
Y_pred_DTC = DTC.predict(X_test)

# Logistic Regression

In [995]:
from sklearn.linear_model import LogisticRegression

Log = LogisticRegression()
Log.fit(X_train, Y_train)
Y_pred_Log = Log.predict(X_test)

# Naive Bayes

In [996]:
from sklearn.naive_bayes import GaussianNB

GNB = GaussianNB()
GNB.fit(X_train, Y_train)
Y_pred_GNB = GNB.predict(X_test)

# KNN

In [997]:
from sklearn.neighbors import KNeighborsClassifier

KNN = KNeighborsClassifier()
KNN.fit(X_train, Y_train)
Y_pred_KNN = KNN.predict(X_test)

# Evaluate using F1 Score

In [1017]:
from sklearn.metrics import f1_score

print('DTC :', f1_score(Y_test, Y_pred_DTC))
print('Log :', f1_score(Y_test, Y_pred_Log))
print('GNB :', f1_score(Y_test, Y_pred_GNB))
print('KNN :', f1_score(Y_test, Y_pred_KNN))

DTC : 0.6511627906976744
Log : 0.38095238095238093
GNB : 0.7000000000000001
KNN : 0.45161290322580644


# Processing test datas

In [1009]:
df_test = pd.read_csv('/content/test.csv')
df_test.head()

Unnamed: 0,customer_id,customer_bod,gender,phone_flag,student,employment,credit_card,balance,income,tenure
0,9365,1999-10-22,Male,0.0,No,Salaried,1.0,0.0,4430744.15,0yrs 0mon
1,999,1987-05-03,Female,0.0,No,Salaried,0.0,67431.4,3743149.53,0yrs 7mon
2,2835,2000-10-20,Male,1.0,No,Salaried,1.0,69128.28,4821579.61,1yrs 11mon
3,5821,1994-07-13,Male,1.0,No,Salaried,0.0,151051.78,3159764.69,0yrs 10mon
4,2330,1996-10-03,Male,0.0,No,Self Employed,0.0,92727.85,5659353.2,1yrs 3mon


In [1010]:
df_test['tenure_months'] = df_test['tenure'].apply(lambda x : getMonth(x))

df_test['customer_bod'] = pd.to_datetime(df_test['customer_bod'])
df_test['age'] = df_test['customer_bod'].apply(lambda x : getAge(x))

In [1011]:
df_test.isnull().sum()

customer_id       0
customer_bod      0
gender           14
phone_flag        0
student           0
employment       85
credit_card       2
balance           0
income            0
tenure            0
tenure_months     0
age               0
dtype: int64

In [1012]:
df_test.dropna(subset = ['gender', 'credit_card', 'employment'], how = 'any', axis = 0, inplace = True)

In [1013]:
drop_col = ['customer_id', 'customer_bod', 'tenure']
df_test.drop(drop_col, axis = 1, inplace = True)

In [1014]:
df_test['employment'] = LE.fit_transform(df_test['employment'])
df_test['student'] = LE.fit_transform(df_test['student'])
df_test['gender'] = LE.fit_transform(df_test['gender'])

In [1015]:
df_test.head()

Unnamed: 0,gender,phone_flag,student,employment,credit_card,balance,income,tenure_months,age
0,1,0.0,0,0,1.0,0.0,4430744.15,0,21
1,0,0.0,0,0,0.0,67431.4,3743149.53,7,34
2,1,1.0,0,0,1.0,69128.28,4821579.61,23,20
3,1,1.0,0,0,0.0,151051.78,3159764.69,10,27
4,1,0.0,0,1,0.0,92727.85,5659353.2,15,25


# Result

In [1016]:
GNB.predict(df_test)

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1])