# 1. Import Libraries

In [1]:
import itertools
import pickle

import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.model_selection import GridSearchCV

# 2. Data Read

In [2]:
# Load the CSV into a DataFrame and preview its first rows.
df = pd.read_csv('cleaned_data.csv')
df.head()

Unnamed: 0,pct_tl_open_L6M,pct_tl_closed_L6M,Tot_TL_closed_L12M,pct_tl_closed_L12M,Tot_Missed_Pmnt,CC_TL,Home_TL,PL_TL,Secured_TL,Unsecured_TL,...,pct_PL_enq_L6m_of_ever,pct_CC_enq_L6m_of_ever,HL_Flag,GL_Flag,MARITALSTATUS,EDUCATION,GENDER,last_prod_enq2,first_prod_enq2,Approved_Flag
0,0.0,0.0,0,0.0,0,0,0,4,1,4,...,0.0,0.0,1,0,Married,12TH,M,PL,PL,P2
1,0.0,0.0,0,0.0,0,0,0,0,0,1,...,0.0,0.0,0,0,Single,GRADUATE,F,ConsumerLoan,ConsumerLoan,P2
2,0.125,0.0,0,0.0,1,0,0,0,2,6,...,0.0,0.0,1,0,Married,SSC,M,ConsumerLoan,others,P2
3,0.0,0.0,0,0.0,0,0,0,0,3,0,...,0.0,0.0,0,0,Married,POST-GRADUATE,M,AL,AL,P1
4,0.0,0.0,1,0.167,0,0,0,0,6,0,...,0.429,0.0,1,0,Married,12TH,M,ConsumerLoan,PL,P3


In [3]:
# (rows, columns) of the loaded frame.
df.shape

(42064, 43)

# 3. Data Preprocessing

In [4]:
# Names of every column whose dtype is 'object'.
cat_var = [col for col in df.columns if df[col].dtype == 'object']

In [5]:
# Display the collected object-dtype column names.
cat_var

['MARITALSTATUS',
 'EDUCATION',
 'GENDER',
 'last_prod_enq2',
 'first_prod_enq2',
 'Approved_Flag']

In [6]:
# Distinct raw values of EDUCATION before encoding.
df['EDUCATION'].unique()

array(['12TH', 'GRADUATE', 'SSC', 'POST-GRADUATE', 'UNDER GRADUATE',
       'OTHERS', 'PROFESSIONAL'], dtype=object)

In [7]:
# Ordinal encoding for EDUCATION: each label is overwritten with its rank.
# Several labels deliberately share a rank (e.g. GRADUATE / UNDER GRADUATE /
# PROFESSIONAL all become 3).
education_rank = {
    'SSC': 1,
    '12TH': 2,
    'GRADUATE': 3,
    'UNDER GRADUATE': 3,
    'POST-GRADUATE': 4,
    'OTHERS': 1,
    'PROFESSIONAL': 3,
}

for level, rank in education_rank.items():
    df.loc[df['EDUCATION'] == level, ['EDUCATION']] = rank

In [8]:
# dtype of EDUCATION right after the in-place rank assignment.
df['EDUCATION'].dtype

dtype('O')

In [9]:
# Cast the rank codes to int first, then end the cell with value_counts() so
# the distribution is actually displayed: an expression that is not the last
# statement of a cell is evaluated and thrown away.
df['EDUCATION'] = df['EDUCATION'].astype(int)
df['EDUCATION'].value_counts()

In [10]:
# dtype of EDUCATION after the astype(int) cast.
df['EDUCATION'].dtype

dtype('int64')

In [11]:
# Distinct values of MARITALSTATUS.
df['MARITALSTATUS'].unique()

array(['Married', 'Single'], dtype=object)

In [12]:
# Distinct values of GENDER.
df['GENDER'].unique()

array(['M', 'F'], dtype=object)

In [13]:
# Distinct values of last_prod_enq2.
df['last_prod_enq2'].unique()

array(['PL', 'ConsumerLoan', 'AL', 'CC', 'others', 'HL'], dtype=object)

In [14]:
# Distinct values of first_prod_enq2.
df['first_prod_enq2'].unique()

array(['PL', 'ConsumerLoan', 'others', 'AL', 'HL', 'CC'], dtype=object)

In [15]:
# One-hot encode the remaining categorical feature columns as integer 0/1
# indicator columns.
nominal_cols = ['MARITALSTATUS', 'GENDER', 'last_prod_enq2', 'first_prod_enq2']
df_encoded = pd.get_dummies(df, columns=nominal_cols, dtype='int')

In [16]:
# Preview the frame after one-hot encoding.
df_encoded.head()

Unnamed: 0,pct_tl_open_L6M,pct_tl_closed_L6M,Tot_TL_closed_L12M,pct_tl_closed_L12M,Tot_Missed_Pmnt,CC_TL,Home_TL,PL_TL,Secured_TL,Unsecured_TL,...,last_prod_enq2_ConsumerLoan,last_prod_enq2_HL,last_prod_enq2_PL,last_prod_enq2_others,first_prod_enq2_AL,first_prod_enq2_CC,first_prod_enq2_ConsumerLoan,first_prod_enq2_HL,first_prod_enq2_PL,first_prod_enq2_others
0,0.0,0.0,0,0.0,0,0,0,4,1,4,...,0,0,1,0,0,0,0,0,1,0
1,0.0,0.0,0,0.0,0,0,0,0,0,1,...,1,0,0,0,0,0,1,0,0,0
2,0.125,0.0,0,0.0,1,0,0,0,2,6,...,1,0,0,0,0,0,0,0,0,1
3,0.0,0.0,0,0.0,0,0,0,0,3,0,...,0,0,0,0,1,0,0,0,0,0
4,0.0,0.0,1,0.167,0,0,0,0,6,0,...,1,0,0,0,0,0,0,0,1,0


In [17]:
# (rows, columns) after one-hot encoding.
df_encoded.shape

(42064, 55)

In [18]:
# Column-by-column dtype and non-null summary of the encoded frame.
df_encoded.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42064 entries, 0 to 42063
Data columns (total 55 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   pct_tl_open_L6M               42064 non-null  float64
 1   pct_tl_closed_L6M             42064 non-null  float64
 2   Tot_TL_closed_L12M            42064 non-null  int64  
 3   pct_tl_closed_L12M            42064 non-null  float64
 4   Tot_Missed_Pmnt               42064 non-null  int64  
 5   CC_TL                         42064 non-null  int64  
 6   Home_TL                       42064 non-null  int64  
 7   PL_TL                         42064 non-null  int64  
 8   Secured_TL                    42064 non-null  int64  
 9   Unsecured_TL                  42064 non-null  int64  
 10  Other_TL                      42064 non-null  int64  
 11  Age_Oldest_TL                 42064 non-null  int64  
 12  Age_Newest_TL                 42064 non-null  int64  
 13  t

In [19]:
# Separate the target column (Approved_Flag) from the feature matrix.
y = df_encoded['Approved_Flag']
X = df_encoded.drop(columns=['Approved_Flag'])

In [20]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
print(X_train.shape, X_test.shape)

(33651, 54) (8413, 54)


# 4. Model Selection

## 4.1 Decision Tree 

In [21]:
# Decision-tree baseline. random_state is pinned so that a Restart & Run All
# reproduces the same tree (and therefore the same metrics) every time.
dtree_model = DecisionTreeClassifier(
    criterion='gini',
    max_depth=8,
    min_samples_split=10,
    random_state=42,
)
dtree_model.fit(X_train, y_train)
y_pred = dtree_model.predict(X_test)

In [22]:
# Overall accuracy, framed by blank lines.
accuracy = accuracy_score(y_test, y_pred)
print(f'\nAccuracy: {accuracy:.2f}\n')

# Per-class precision / recall / F1.
# NOTE(review): the display names below are matched to the score arrays by
# position — confirm that this matches the label order returned by sklearn.
precision, recall, f1_score, _ = precision_recall_fscore_support(y_test, y_pred)

for idx, label in enumerate(['p1', 'p2', 'p3', 'p4']):
    report_lines = [
        f"class {label}:",
        f"Precision: {precision[idx]}",
        f"Recall: {recall[idx]}",
        f"F1-Score: {f1_score[idx]}",
        "",
    ]
    print("\n".join(report_lines))


Accuracy: 0.76

class p1:
Precision: 0.8029612756264237
Recall: 0.6952662721893491
F1-Score: 0.7452431289640592

class p2:
Precision: 0.8013268156424581
Recall: 0.9098116947472745
F1-Score: 0.8521303258145363

class p3:
Precision: 0.4262295081967213
Recall: 0.2747169811320755
F1-Score: 0.3340982101881597

class p4:
Precision: 0.7355718782791185
Recall: 0.6812439261418853
F1-Score: 0.7073662966700303



## 4.2 RandomForest Classifier

In [23]:
# Random-forest baseline. Bootstrapping and feature sub-sampling are random,
# so random_state is pinned to keep the reported metrics reproducible.
rnd_forest = RandomForestClassifier(n_estimators=100, random_state=42)
rnd_forest.fit(X_train, y_train)
y_pred = rnd_forest.predict(X_test)

In [24]:
# Overall accuracy, framed by blank lines.
accuracy = accuracy_score(y_test, y_pred)
print(f'\nAccuracy: {accuracy:.2f}\n')

# Per-class precision / recall / F1.
# NOTE(review): the display names below are matched to the score arrays by
# position — confirm that this matches the label order returned by sklearn.
precision, recall, f1_score, _ = precision_recall_fscore_support(y_test, y_pred)

for idx, label in enumerate(['p1', 'p2', 'p3', 'p4']):
    report_lines = [
        f"class {label}:",
        f"Precision: {precision[idx]}",
        f"Recall: {recall[idx]}",
        f"F1-Score: {f1_score[idx]}",
        "",
    ]
    print("\n".join(report_lines))


Accuracy: 0.76

class p1:
Precision: 0.8337264150943396
Recall: 0.6972386587771203
F1-Score: 0.7593984962406015

class p2:
Precision: 0.7942973523421588
Recall: 0.9276511397423192
F1-Score: 0.8558105513394898

class p3:
Precision: 0.4323076923076923
Recall: 0.2120754716981132
F1-Score: 0.28455696202531644

class p4:
Precision: 0.718475073313783
Recall: 0.7142857142857143
F1-Score: 0.716374269005848



## 4.3 Xgboost

In [25]:
# Baseline XGBoost multi-class classifier; the settings are collected in one
# dict and unpacked into the constructor.
xgb_params = dict(
    objective='multi:softmax',
    num_class=4,
    colsample_bytree=0.3,
    learning_rate=0.1,
    max_depth=8,
    alpha=10,
    n_estimators=100,
)
xgb_classifier = xgb.XGBClassifier(**xgb_params)

In [26]:
# Turn the string class labels in `y` into integer codes; the XGBoost model is
# fitted on these codes.
label_encoder = LabelEncoder()
label_encoder.fit(y)
y_encoded = label_encoder.transform(y)

In [27]:
# Re-split with the integer-encoded target. Same test_size and random_state as
# the earlier split; this rebinds X_train / X_test / y_train / y_test.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_encoded,
    test_size=0.2,
    random_state=42,
)

In [28]:
# Train the baseline XGBoost model and predict the held-out rows.
# fit() returns the fitted estimator, so the two calls can be chained.
y_pred = xgb_classifier.fit(X_train, y_train).predict(X_test)

In [29]:
# Overall accuracy, framed by blank lines.
accuracy = accuracy_score(y_test, y_pred)
print(f'\nAccuracy: {accuracy:.2f}\n')

# Per-class precision / recall / F1 on the integer-encoded labels.
# NOTE(review): the display names below are matched to the score arrays by
# position — confirm that encoded class 0..3 corresponds to P1..P4.
precision, recall, f1_score, _ = precision_recall_fscore_support(y_test, y_pred)

for idx, label in enumerate(['p1', 'p2', 'p3', 'p4']):
    report_lines = [
        f"class {label}:",
        f"Precision: {precision[idx]}",
        f"Recall: {recall[idx]}",
        f"F1-Score: {f1_score[idx]}",
        "",
    ]
    print("\n".join(report_lines))


Accuracy: 0.77

class p1:
Precision: 0.8409610983981693
Recall: 0.7248520710059172
F1-Score: 0.7786016949152542

class p2:
Precision: 0.7995927371457662
Recall: 0.933994053518335
F1-Score: 0.8615834704699213

class p3:
Precision: 0.4535928143712575
Recall: 0.22867924528301886
F1-Score: 0.30406422478675366

class p4:
Precision: 0.7464212678936605
Recall: 0.7094266277939747
F1-Score: 0.7274539113104136



- After comparing the three classifiers, XGBoost gives the best results, so it is selected as the estimator.

In [30]:
# Number of rows per Approved_Flag class.
df_encoded['Approved_Flag'].value_counts()

Approved_Flag
P2    25452
P3     6440
P4     5264
P1     4908
Name: count, dtype: int64

- In our case the smallest class (P1) makes up approximately 12% of the data, so we assumed the data is balanced and therefore focused mainly on `accuracy_score`.
- If the data were imbalanced, we would instead need to maximize the F1-score.

# 5. HyperParameter Tuning

#### A brief explanation of the hyperparameters listed in `param_grid`:

1. colsample_bytree: This parameter specifies the fraction of features (columns) to be randomly sampled for each tree in the model. It helps in reducing overfitting by ensuring that not all features are used for building each tree, thus introducing more randomness.

2. learning_rate: Also known as the shrinkage factor, this parameter scales the contribution of each tree. A lower learning rate means the model will learn more slowly but potentially more accurately, as it helps in fine-tuning the model by updating weights in smaller steps.

3. max_depth: This parameter determines the maximum depth of each tree. Increasing the depth allows the model to capture more complex patterns but also increases the risk of overfitting. Shallower trees are more robust to noise but might underfit the data.

4. alpha: This is the L1 regularization term on weights. It adds a penalty equal to the absolute value of the magnitude of coefficients, encouraging the model to keep coefficients small, thus reducing overfitting and improving generalization.

5. n_estimators: This parameter specifies the number of trees in the ensemble. More trees can improve the model’s performance but also increase computational cost and the risk of overfitting if not properly regulated.

In [31]:
# Candidate values for each tuned XGBoost hyperparameter.
param_grid = dict(
    colsample_bytree=[0.3, 0.5, 0.7, 0.9],
    learning_rate=[0.001, 0.01, 0.1, 1],
    max_depth=[3, 5, 8, 10],
    alpha=[1, 10, 100],
    n_estimators=[50, 100, 200],
)

In [32]:
# Running counter of evaluated hyperparameter combinations.
index = 0

# Column-oriented result store: one independent, initially empty list per column.
result_columns = (
    'combination',
    'train_accuracy',
    'test_accuracy',
    'colsample_bytree',
    'learning_rate',
    'max_depth',
    'alpha',
    'n_estimators',
)
answer_grid = {column: [] for column in result_columns}

In [33]:
# Exhaustive manual search over every combination in `param_grid`
# (4 * 4 * 4 * 3 * 3 = 576 model fits). It is expensive, so it only runs when
# RUN_GRID_SEARCH is switched on; with the flag off this cell does nothing and
# `answer_grid` stays empty.
RUN_GRID_SEARCH = False

if RUN_GRID_SEARCH:
    # Loop-invariant setup, done once instead of once per combination:
    # encode the target and rebuild the same 80/20 split used elsewhere.
    y = df_encoded['Approved_Flag']
    X = df_encoded.drop(['Approved_Flag'], axis=1)

    label_encoder = LabelEncoder()
    y_encoded = label_encoder.fit_transform(y)

    X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

    # NOTE(review): combinations are ranked on the test split, so the best
    # score is an optimistic estimate; consider a validation split or
    # cross-validation for model selection.
    # itertools.product iterates in the same order as the equivalent nested
    # for-loops (rightmost parameter varies fastest).
    search_space = itertools.product(
        param_grid['colsample_bytree'],
        param_grid['learning_rate'],
        param_grid['max_depth'],
        param_grid['alpha'],
        param_grid['n_estimators'],
    )

    for colsample_bytree, learning_rate, max_depth, alpha, n_estimators in search_space:
        index = index + 1

        model = xgb.XGBClassifier(objective='multi:softmax',
                                  num_class=4,
                                  colsample_bytree=colsample_bytree,
                                  learning_rate=learning_rate,
                                  max_depth=max_depth,
                                  alpha=alpha,  # must be passed, otherwise the alpha axis repeats identical fits
                                  n_estimators=n_estimators)

        model.fit(X_train, y_train)

        y_pred_train = model.predict(X_train)
        y_pred_test = model.predict(X_test)

        train_accuracy = accuracy_score(y_train, y_pred_train)
        test_accuracy = accuracy_score(y_test, y_pred_test)

        answer_grid['combination'].append(index)
        answer_grid['train_accuracy'].append(train_accuracy)
        answer_grid['test_accuracy'].append(test_accuracy)
        answer_grid['colsample_bytree'].append(colsample_bytree)
        answer_grid['learning_rate'].append(learning_rate)
        answer_grid['max_depth'].append(max_depth)
        answer_grid['alpha'].append(alpha)
        answer_grid['n_estimators'].append(n_estimators)

        # Print results for this combination
        print(f"Combination {index}")
        print(f"colsample_bytree: {colsample_bytree}, learning_rate: {learning_rate}, max_depth: {max_depth}, alpha: {alpha}, n_estimators: {n_estimators}")
        print(f"Train Accuracy: {train_accuracy:.2f}")
        print(f"Test Accuracy : {test_accuracy :.2f}")
        print("-" * 30)
                


In [34]:
# Turn the collected per-combination lists into a DataFrame and preview it.
# The frame has no rows if nothing was appended to answer_grid.
df_result = pd.DataFrame(answer_grid)
df_result.head()

Unnamed: 0,combination,train_accuracy,test_accuracy,colsample_bytree,learning_rate,max_depth,alpha,n_estimators


In [35]:
# Best test accuracy across all recorded combinations (nan when df_result is empty).
df_result['test_accuracy'].max()

nan

In [36]:
# Show the combination(s) that reach the best test accuracy. The maximum is
# computed from the data rather than pasted in as a float literal, so the
# filter keeps working when the scores change; an empty df_result simply
# yields an empty frame.
best_test_accuracy = df_result['test_accuracy'].max()
df_result[df_result['test_accuracy'] == best_test_accuracy]

Unnamed: 0,combination,train_accuracy,test_accuracy,colsample_bytree,learning_rate,max_depth,alpha,n_estimators


# 6. Tuned Model

In [37]:
# Final XGBoost model built from the chosen hyperparameter values; the
# settings are collected in one dict and unpacked into the constructor.
tuned_params = dict(
    objective='multi:softmax',
    num_class=4,
    colsample_bytree=0.5,
    learning_rate=1,
    max_depth=3,
    alpha=10,
    n_estimators=50,
)
xgb_model = xgb.XGBClassifier(**tuned_params)

In [38]:
# Re-create the label encoder and the integer-encoded target from `y`.
label_encoder = LabelEncoder()
label_encoder.fit(y)
y_encoded = label_encoder.transform(y)

In [39]:
# Train the tuned model and predict the held-out rows.
# fit() returns the fitted estimator, so the two calls can be chained.
y_pred = xgb_model.fit(X_train, y_train).predict(X_test)

In [40]:
# Test accuracy of the tuned model, printed at full precision after a blank line.
accuracy = accuracy_score(y_test, y_pred)
print('', accuracy, sep='\n')


0.7811719957209081


In [47]:
# Persist the tuned model. The context manager closes the file handle even if
# pickling fails, instead of leaving an anonymous open() handle for the
# garbage collector. (`pickle` is imported with the other imports at the top.)
# NOTE(review): label_encoder is not saved with the model, so a consumer of
# the pickle cannot map integer predictions back to the Approved_Flag labels
# — confirm whether that is intended.
with open('xgboost_model.pkl', 'wb') as model_file:
    pickle.dump(xgb_model, model_file)

In [46]:
# Distinct class codes present in the predictions. Stored under its own name
# so the target Series `y`, which the label-encoding cells read, is not
# overwritten by a set.
predicted_classes = set(y_pred)
predicted_classes

{0, 1, 2, 3}