In [1]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler,OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.feature_selection import SelectKBest,chi2
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
import xgboost as xgb
from sklearn.tree import DecisionTreeClassifier

In [2]:
df=pd.read_csv('../../../Downloads/Credit_Score_Data/Credit_Score_Data/train.csv')

In [3]:
df.head()

Unnamed: 0,ID,Customer_ID,Month,Name,Age,SSN,Occupation,Annual_Income,Monthly_Inhand_Salary,Num_Bank_Accounts,...,Credit_Mix,Outstanding_Debt,Credit_Utilization_Ratio,Credit_History_Age,Payment_of_Min_Amount,Total_EMI_per_month,Amount_invested_monthly,Payment_Behaviour,Monthly_Balance,Credit_Score
0,5634,3392,1,Aaron Maashoh,23.0,821000265.0,Scientist,19114.12,1824.843333,3.0,...,Good,809.98,26.82262,265.0,No,49.574949,21.46538,High_spent_Small_value_payments,312.494089,Good
1,5635,3392,2,Aaron Maashoh,23.0,821000265.0,Scientist,19114.12,1824.843333,3.0,...,Good,809.98,31.94496,266.0,No,49.574949,21.46538,Low_spent_Large_value_payments,284.629162,Good
2,5636,3392,3,Aaron Maashoh,23.0,821000265.0,Scientist,19114.12,1824.843333,3.0,...,Good,809.98,28.609352,267.0,No,49.574949,21.46538,Low_spent_Medium_value_payments,331.209863,Good
3,5637,3392,4,Aaron Maashoh,23.0,821000265.0,Scientist,19114.12,1824.843333,3.0,...,Good,809.98,31.377862,268.0,No,49.574949,21.46538,Low_spent_Small_value_payments,223.45131,Good
4,5638,3392,5,Aaron Maashoh,23.0,821000265.0,Scientist,19114.12,1824.843333,3.0,...,Good,809.98,24.797347,269.0,No,49.574949,21.46538,High_spent_Medium_value_payments,341.489231,Good


In [4]:
df.shape

(100000, 28)

In [5]:
df.columns

Index(['ID', 'Customer_ID', 'Month', 'Name', 'Age', 'SSN', 'Occupation',
       'Annual_Income', 'Monthly_Inhand_Salary', 'Num_Bank_Accounts',
       'Num_Credit_Card', 'Interest_Rate', 'Num_of_Loan', 'Type_of_Loan',
       'Delay_from_due_date', 'Num_of_Delayed_Payment', 'Changed_Credit_Limit',
       'Num_Credit_Inquiries', 'Credit_Mix', 'Outstanding_Debt',
       'Credit_Utilization_Ratio', 'Credit_History_Age',
       'Payment_of_Min_Amount', 'Total_EMI_per_month',
       'Amount_invested_monthly', 'Payment_Behaviour', 'Monthly_Balance',
       'Credit_Score'],
      dtype='object')

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 28 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   ID                        100000 non-null  int64  
 1   Customer_ID               100000 non-null  int64  
 2   Month                     100000 non-null  int64  
 3   Name                      100000 non-null  object 
 4   Age                       100000 non-null  float64
 5   SSN                       100000 non-null  float64
 6   Occupation                100000 non-null  object 
 7   Annual_Income             100000 non-null  float64
 8   Monthly_Inhand_Salary     100000 non-null  float64
 9   Num_Bank_Accounts         100000 non-null  float64
 10  Num_Credit_Card           100000 non-null  float64
 11  Interest_Rate             100000 non-null  float64
 12  Num_of_Loan               100000 non-null  float64
 13  Type_of_Loan              100000 non-null  ob

In [7]:
df.isna().sum()

ID                          0
Customer_ID                 0
Month                       0
Name                        0
Age                         0
SSN                         0
Occupation                  0
Annual_Income               0
Monthly_Inhand_Salary       0
Num_Bank_Accounts           0
Num_Credit_Card             0
Interest_Rate               0
Num_of_Loan                 0
Type_of_Loan                0
Delay_from_due_date         0
Num_of_Delayed_Payment      0
Changed_Credit_Limit        0
Num_Credit_Inquiries        0
Credit_Mix                  0
Outstanding_Debt            0
Credit_Utilization_Ratio    0
Credit_History_Age          0
Payment_of_Min_Amount       0
Total_EMI_per_month         0
Amount_invested_monthly     0
Payment_Behaviour           0
Monthly_Balance             0
Credit_Score                0
dtype: int64

In [8]:
#drop the non-required columns
df.drop(columns=['Customer_ID','Name','ID'],inplace=True)

X_train,X_test,y_train,y_test = train_test_split(df.drop(columns=['Credit_Score']), df['Credit_Score'], test_size=0.2, random_state=42)

In [9]:
numeric_features=list(df.select_dtypes(exclude='object'))
numeric_features

['Month',
 'Age',
 'SSN',
 'Annual_Income',
 'Monthly_Inhand_Salary',
 'Num_Bank_Accounts',
 'Num_Credit_Card',
 'Interest_Rate',
 'Num_of_Loan',
 'Delay_from_due_date',
 'Num_of_Delayed_Payment',
 'Changed_Credit_Limit',
 'Num_Credit_Inquiries',
 'Outstanding_Debt',
 'Credit_Utilization_Ratio',
 'Credit_History_Age',
 'Total_EMI_per_month',
 'Amount_invested_monthly',
 'Monthly_Balance']

In [10]:
cat_features=list(df.select_dtypes(include='object'))
#cat_features.remove('Name')
cat_features.remove('Credit_Score')
cat_features

['Occupation',
 'Type_of_Loan',
 'Credit_Mix',
 'Payment_of_Min_Amount',
 'Payment_Behaviour']

In [11]:
df['Credit_Score'].value_counts()

Standard    53174
Poor        28998
Good        17828
Name: Credit_Score, dtype: int64

In [12]:
df['Credit_Score'] = df['Credit_Score'].map({'Poor':0 , 'Standard':1,'Good':2})

In [13]:
#numeric_features.append('Credit_Score')

In [14]:
num_transformer= Pipeline([('imputer_num',SimpleImputer(strategy='median')),
                           ('scalar',MinMaxScaler())])
cat_transformer= Pipeline([('imputer_cat',SimpleImputer(strategy='most_frequent')),
                           ('encoder',OneHotEncoder(handle_unknown='ignore',sparse=False))])

In [15]:
preprocessor=ColumnTransformer([('numeric_pipe',num_transformer,numeric_features),
                               ('cot_pipe',cat_transformer,cat_features)])

In [16]:
select_feature=SelectKBest(score_func=chi2,k=20)

# Build model with randomforest

In [17]:
pipe_rf= Pipeline([('preprocess_pipe',preprocessor),
                      ('features',select_feature),
                     ('model_rf',RandomForestClassifier())])

In [18]:
pipe_rf.fit(X_train,y_train)

Pipeline(steps=[('preprocess_pipe',
                 ColumnTransformer(transformers=[('numeric_pipe',
                                                  Pipeline(steps=[('imputer_num',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('scalar',
                                                                   MinMaxScaler())]),
                                                  ['Month', 'Age', 'SSN',
                                                   'Annual_Income',
                                                   'Monthly_Inhand_Salary',
                                                   'Num_Bank_Accounts',
                                                   'Num_Credit_Card',
                                                   'Interest_Rate',
                                                   'Num_of_Loan',
                                                   'Delay

In [19]:
pipe_rf.score(X_train,y_train)

0.999925

In [20]:
pipe_rf.score(X_test,y_test)

0.81345

# Build model with SVM

In [21]:
pipe_svm= Pipeline([('preprocess_pipe',preprocessor),
                      ('features',select_feature),
                     ('model_svm',RandomForestClassifier())])

In [22]:
pipe_svm.fit(X_train,y_train)

Pipeline(steps=[('preprocess_pipe',
                 ColumnTransformer(transformers=[('numeric_pipe',
                                                  Pipeline(steps=[('imputer_num',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('scalar',
                                                                   MinMaxScaler())]),
                                                  ['Month', 'Age', 'SSN',
                                                   'Annual_Income',
                                                   'Monthly_Inhand_Salary',
                                                   'Num_Bank_Accounts',
                                                   'Num_Credit_Card',
                                                   'Interest_Rate',
                                                   'Num_of_Loan',
                                                   'Delay

In [23]:
pipe_svm.score(X_train,y_train)

0.9998875

In [24]:
pipe_svm.score(X_test,y_test)

0.813

# Build model with KNN

In [25]:
pipe_knn= Pipeline([('preprocess_pipe',preprocessor),
                      ('features',select_feature),
                     ('model_knn',KNeighborsClassifier())])

In [26]:
pipe_knn.fit(X_train,y_train)

Pipeline(steps=[('preprocess_pipe',
                 ColumnTransformer(transformers=[('numeric_pipe',
                                                  Pipeline(steps=[('imputer_num',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('scalar',
                                                                   MinMaxScaler())]),
                                                  ['Month', 'Age', 'SSN',
                                                   'Annual_Income',
                                                   'Monthly_Inhand_Salary',
                                                   'Num_Bank_Accounts',
                                                   'Num_Credit_Card',
                                                   'Interest_Rate',
                                                   'Num_of_Loan',
                                                   'Delay

In [27]:
pipe_knn.score(X_train,y_train)

  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


0.8155625

In [28]:
pipe_knn.score(X_test,y_test)

  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


0.72695

# Build model with Desition Tree

In [29]:
pipe_dct= Pipeline([('preprocess_pipe',preprocessor),
                      ('features',select_feature),
                     ('model_dt',DecisionTreeClassifier())])

In [30]:
pipe_dct.fit(X_train,y_train)

Pipeline(steps=[('preprocess_pipe',
                 ColumnTransformer(transformers=[('numeric_pipe',
                                                  Pipeline(steps=[('imputer_num',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('scalar',
                                                                   MinMaxScaler())]),
                                                  ['Month', 'Age', 'SSN',
                                                   'Annual_Income',
                                                   'Monthly_Inhand_Salary',
                                                   'Num_Bank_Accounts',
                                                   'Num_Credit_Card',
                                                   'Interest_Rate',
                                                   'Num_of_Loan',
                                                   'Delay

In [31]:
pipe_dct.score(X_train,y_train)

0.9999375

In [32]:
pipe_dct.score(X_test,y_test)

0.759

In [34]:
import pickle
pickle.dump(pipe_rf,open('pipe.pkl','wb'))