Training data preprocessing and Model training

Create the dataframe and import the necessary libraries

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Read the raw training set into the working DataFrame used by the cells below.
data = pd.read_csv(filepath_or_buffer='train.csv')

Removing features with more than 20% missing values

In [None]:
# Bar chart of the number of missing values per feature, before any pruning.
plt.figure(figsize=(20, 50))
sns.barplot(x=data.isnull().sum(), y=data.columns)
plt.xlabel('Missing Values')
plt.ylabel('Features')
plt.show()

# Drop every feature whose share of missing values exceeds the threshold.
# The cut-off is expressed as a fraction of the rows instead of a hard-coded
# row count, so the "more than 20%" rule stays correct if the number of rows
# in the input file changes.
MISSING_FRACTION_THRESHOLD = 0.20
missing_fraction = data.isnull().mean()
sparse_cols = missing_fraction[missing_fraction > MISSING_FRACTION_THRESHOLD].index
data = data.drop(columns=sparse_cols)

Removing features with the same value for all instances

In [None]:
# Features listed here are removed in a single drop call.
cols = ["disbursement_method", "hardship_flag"]
data = data.drop(columns=cols)

Encoding using one hot encoding and frequency encoding

In [None]:
# One-hot encode these categorical features (one indicator column per value).
oh_en_cols = ["term", "initial_list_status", "debt_settlement_flag"]
data_encoded = pd.get_dummies(data, columns=oh_en_cols)

# Frequency-encode the remaining categorical features: every value is replaced
# by the number of rows in which that value occurs.
f_en_cols = [
    "application_type", "home_ownership", "verification_status", "grade",
    "emp_length", "purpose", "title", "emp_title",
    "issue_d", "zip_code", "addr_state", "earliest_cr_line",
    "sub_grade", "last_pymnt_d", "last_credit_pull_d", "pymnt_plan",
]
for feature in f_en_cols:
    occurrences = data_encoded[feature].value_counts().to_dict()
    # Missing values are not counted by value_counts, so they stay missing.
    data_encoded[feature] = data_encoded[feature].map(occurrences)

Filling missing values of features with a skewed distribution

In [None]:
from scipy.stats import skew

# Identify skewed columns: compute the skewness of every int64/float64 column,
# ignoring missing values.
numeric_cols = data_encoded.select_dtypes(include=['float64', 'int64']).columns
skewed_cols = data_encoded[numeric_cols].apply(lambda x: skew(x.dropna()))

# Keep only the columns whose absolute skewness exceeds the threshold.
skew_threshold = 0.5
skewed_cols = skewed_cols[abs(skewed_cols) > skew_threshold].index

# Replace missing values with the median in skewed columns.
# The filled Series is assigned back to the frame instead of calling
# fillna(inplace=True) on data_encoded[col]: that chained in-place form emits
# a FutureWarning in pandas 2.x and no longer updates the DataFrame once
# Copy-on-Write is active.
for col in skewed_cols:
    median_value = data_encoded[col].median()
    data_encoded[col] = data_encoded[col].fillna(median_value)



Filling the values of features (features with relatively low missing values) with the mean 

In [None]:
# Bar chart of the missing values that remain after the median imputation.
plt.figure(figsize=(20, 50))
sns.barplot(x=data_encoded.isnull().sum(), y=data_encoded.columns)
plt.xlabel('Missing Values')
plt.ylabel('Features')
plt.show()

# Features whose remaining gaps are filled with the column mean.
fill_mean = ["bc_util", "revol_util", "mo_sin_old_il_acct", "percent_bc_gt_75"]

# Assign the result back rather than using fillna(inplace=True) on a column
# selection; the chained in-place form does not update the DataFrame under
# pandas Copy-on-Write and already warns in pandas 2.x.
for col in fill_mean:
    data_encoded[col] = data_encoded[col].fillna(data_encoded[col].mean())

Fill other features with missing values with zero

In [None]:
# Every value that is still missing at this point becomes zero.
data_encoded = data_encoded.fillna(0)

# Re-plot the per-feature missing counts to confirm nothing is left.
remaining_missing = data_encoded.isnull().sum()
plt.figure(figsize=(20, 50))
sns.barplot(x=remaining_missing, y=data_encoded.columns)
plt.xlabel('Missing Values')
plt.ylabel('Features')
plt.show()

Heatmap to see the correlations between a group of features and the loan status

In [None]:
# Features inspected in the heatmap; the target ("loan_status") is listed last
# so its correlations appear in the final row/column.
cols = [
    "installment", "int_rate", "num_sats", "num_tl_op_past_12m",
    "pct_tl_nvr_dlq", "percent_bc_gt_75", "pub_rec_bankruptcies", "tax_liens",
    "tot_hi_cred_lim", "mo_sin_rcnt_rev_tl_op", "num_rev_tl_bal_gt_0", "num_rev_accts",
    "num_bc_tl", "num_bc_sats", "num_actv_rev_tl", "num_actv_bc_tl",
    "mort_acc", "num_accts_ever_120_pd", "mths_since_recent_inq", "mths_since_recent_bc",
    "bc_open_to_buy", "mo_sin_rcnt_tl", "mo_sin_old_rev_tl_op", "mo_sin_old_il_acct",
    "total_rev_hi_lim", "total_rec_late_fee", "total_rec_int", "tot_cur_bal",
    "tot_coll_amt", "collections_12_mths_ex_med", "policy_code", "avg_cur_bal",
    "issue_d", "earliest_cr_line", "last_pymnt_d", "last_credit_pull_d",
    "annual_inc", "dti", "delinq_2yrs", "inq_last_6mths",
    "total_il_high_credit_limit", "loan_status",
]

# Pairwise correlations between the selected columns, drawn as an annotated
# heatmap.
corr_matrix = data_encoded.loc[:, cols].corr()
plt.figure(figsize=(50, 50))
heatmap = sns.heatmap(data=corr_matrix, annot=True, cmap="coolwarm")
plt.show()


Removing features with weak correlation with loan status

In [None]:
# Features removed from the feature matrix in this step.
# Each name is listed exactly once (the original list repeated
# "total_rev_hi_lim").
col_to_drop = [
    "installment", "int_rate", "num_sats", "num_tl_op_past_12m",
    "pct_tl_nvr_dlq", "percent_bc_gt_75", "pub_rec_bankruptcies", "tax_liens",
    "tot_hi_cred_lim", "mo_sin_rcnt_rev_tl_op", "num_rev_tl_bal_gt_0", "num_rev_accts",
    "num_bc_tl", "num_bc_sats", "num_actv_rev_tl", "num_actv_bc_tl",
    "mort_acc", "num_accts_ever_120_pd", "mths_since_recent_inq", "mths_since_recent_bc",
    "bc_open_to_buy", "mo_sin_rcnt_tl", "mo_sin_old_rev_tl_op", "mo_sin_old_il_acct",
    "total_rev_hi_lim", "total_rec_late_fee", "total_rec_int", "tot_cur_bal",
    "tot_coll_amt", "collections_12_mths_ex_med", "policy_code", "avg_cur_bal",
    "issue_d", "earliest_cr_line", "last_pymnt_d", "last_credit_pull_d",
    "annual_inc", "dti", "delinq_2yrs", "inq_last_6mths",
    "total_il_high_credit_limit",
]
data_encoded = data_encoded.drop(columns=col_to_drop)


Extracting loan status from the dataframe

In [None]:
# Separate the target column from the feature matrix.
y = data_encoded.loc[:, "loan_status"]
data_encoded = data_encoded.drop(columns="loan_status")

Standardizing the data

In [None]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training features, then standardise those same
# features with it.
scaler = StandardScaler()
scaled_data = scaler.fit(data_encoded).transform(data_encoded)

PCA transformation

In [None]:
from sklearn.decomposition import PCA

# Fit a 20-component PCA on the standardised training data and project the
# data onto those components.
pca = PCA(n_components=20, random_state=42)
X_pca = pca.fit(scaled_data).transform(scaled_data)

Model training

In [None]:
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

# Train an XGBoost classifier with the library's default settings on the PCA
# features and the extracted target.
model = XGBClassifier()
model.fit(X=X_pca, y=y)

Making predictions with the model and calculating the accuracy of the predictions

In [None]:
# Predict on the same PCA features the model was fitted on. There is no
# hold-out split at this point, so the score printed below is a *training*
# accuracy and will overstate performance on unseen data.
# NOTE(review): consider a train/validation split before fitting to obtain an
# honest estimate.
y_pred = model.predict(X_pca)
predictions = [round(value) for value in y_pred]

# Compare the predictions with the training labels.
accuracy = accuracy_score(y, predictions)
print("Training accuracy: %.2f%%" % (accuracy * 100.0))
print(data_encoded.columns)

SHAP analysis

In [None]:
import shap
!pip install shap
shap.initjs()

# Build an explainer for the fitted model and compute SHAP values for every
# row of the PCA feature matrix.
explainer = shap.Explainer(model)
shap_values = explainer(X_pca)

# Waterfall plot for the first row only.
shap.plots.waterfall(shap_values[0])

# Summary plots over all rows.
for draw_summary in (shap.plots.bar, shap.plots.beeswarm):
    draw_summary(shap_values)

Validation data preprocessing

Create the dataframe and import the necessary libraries

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Read the unlabeled feature file; from here on `data` refers to this frame
# and no longer to the training set.
data = pd.read_csv(filepath_or_buffer='X_test.csv')

Removing features with more than 20% missing values

In [None]:
# Drop every feature whose share of missing values exceeds the threshold.
# The cut-off is a fraction of the rows rather than a hard-coded row count,
# so the "more than 20%" rule does not depend on the size of this file.
MISSING_FRACTION_THRESHOLD = 0.20
missing_fraction = data.isnull().mean()
sparse_cols = missing_fraction[missing_fraction > MISSING_FRACTION_THRESHOLD].index
data = data.drop(columns=sparse_cols)

Removing features with the same value for all instances

In [None]:
# Features listed here are removed in a single drop call.
cols = ["disbursement_method", "hardship_flag"]
data = data.drop(columns=cols)

Encoding using one hot encoding and frequency encoding

In [None]:
# One-hot encode these categorical features (one indicator column per value).
oh_en_cols = ["term", "initial_list_status", "debt_settlement_flag"]
data_encoded = pd.get_dummies(data, columns=oh_en_cols)

f_en_cols = ["application_type", "home_ownership", "verification_status", "grade", "emp_length", "purpose", "title",
         "emp_title", "issue_d", "zip_code", "addr_state", "earliest_cr_line", "sub_grade", "last_pymnt_d",
         "last_credit_pull_d", "pymnt_plan"]

# Frequency-encode with the value counts of the TRAINING file. The model was
# fitted on counts taken from 'train.csv'; recomputing the counts on this file
# would map the same category to a different number (this file has its own
# row count and category mix), so the features would no longer mean what they
# meant during training. Only the needed columns are read back.
train_categories = pd.read_csv('train.csv', usecols=f_en_cols)
for col in f_en_cols:
    dr_frequency_map = train_categories[col].value_counts().to_dict()
    # Categories never seen in training become NaN here and are handled by
    # the missing-value imputation in the following cells.
    data_encoded[col] = data_encoded[col].map(dr_frequency_map)

Filling missing values of features with a skewed distribution

In [None]:
from scipy.stats import skew

# Identify skewed columns: compute the skewness of every int64/float64 column,
# ignoring missing values.
numeric_cols = data_encoded.select_dtypes(include=['float64', 'int64']).columns
skewed_cols = data_encoded[numeric_cols].apply(lambda x: skew(x.dropna()))

# Keep only the columns whose absolute skewness exceeds the threshold.
skew_threshold = 0.5
skewed_cols = skewed_cols[abs(skewed_cols) > skew_threshold].index

# Replace missing values with the median in skewed columns.
# The filled Series is assigned back to the frame instead of calling
# fillna(inplace=True) on data_encoded[col]: that chained in-place form emits
# a FutureWarning in pandas 2.x and no longer updates the DataFrame once
# Copy-on-Write is active.
for col in skewed_cols:
    median_value = data_encoded[col].median()
    data_encoded[col] = data_encoded[col].fillna(median_value)

Filling the values of features (features with relatively low missing values) with the mean 

In [None]:
# Bar chart of the missing values that remain after the median imputation.
plt.figure(figsize=(20, 50))
sns.barplot(x=data_encoded.isnull().sum(), y=data_encoded.columns)
plt.xlabel('Missing Values')
plt.ylabel('Features')
plt.show()

# Features whose remaining gaps are filled with the column mean.
fill_mean = ["bc_util", "revol_util", "mo_sin_old_il_acct", "percent_bc_gt_75"]

# Assign the result back rather than using fillna(inplace=True) on a column
# selection; the chained in-place form does not update the DataFrame under
# pandas Copy-on-Write and already warns in pandas 2.x.
for col in fill_mean:
    data_encoded[col] = data_encoded[col].fillna(data_encoded[col].mean())

Fill other features with missing values with zero

In [None]:
# Every value that is still missing at this point becomes zero.
data_encoded = data_encoded.fillna(0)

# Re-plot the per-feature missing counts to confirm nothing is left.
remaining_missing = data_encoded.isnull().sum()
plt.figure(figsize=(20, 50))
sns.barplot(x=remaining_missing, y=data_encoded.columns)
plt.xlabel('Missing Values')
plt.ylabel('Features')
plt.show()

Removing features with weak correlation with loan status

In [None]:
# Features removed from the feature matrix in this step.
# Each name is listed exactly once (the original list repeated
# "total_rev_hi_lim").
col_to_drop = [
    "installment", "int_rate", "num_sats", "num_tl_op_past_12m",
    "pct_tl_nvr_dlq", "percent_bc_gt_75", "pub_rec_bankruptcies", "tax_liens",
    "tot_hi_cred_lim", "mo_sin_rcnt_rev_tl_op", "num_rev_tl_bal_gt_0", "num_rev_accts",
    "num_bc_tl", "num_bc_sats", "num_actv_rev_tl", "num_actv_bc_tl",
    "mort_acc", "num_accts_ever_120_pd", "mths_since_recent_inq", "mths_since_recent_bc",
    "bc_open_to_buy", "mo_sin_rcnt_tl", "mo_sin_old_rev_tl_op", "mo_sin_old_il_acct",
    "total_rev_hi_lim", "total_rec_late_fee", "total_rec_int", "tot_cur_bal",
    "tot_coll_amt", "collections_12_mths_ex_med", "policy_code", "avg_cur_bal",
    "issue_d", "earliest_cr_line", "last_pymnt_d", "last_credit_pull_d",
    "annual_inc", "dti", "delinq_2yrs", "inq_last_6mths",
    "total_il_high_credit_limit",
]
data_encoded = data_encoded.drop(columns=col_to_drop)

Standardizing the data

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise with the scaler that was fitted on the TRAINING features
# (`scaler` from the training section, still in the kernel) instead of
# fitting a new one here. Re-fitting on this data would centre and scale it
# with different statistics than the model saw during training.
# When the scaler recorded the training column names, align this frame to
# them first: missing columns (e.g. a dummy level absent from this file) are
# added as 0 and unexpected ones are left out. `data_encoded` itself is not
# modified.
train_feature_names = getattr(scaler, "feature_names_in_", None)
if train_feature_names is not None:
    valid_features = data_encoded.reindex(columns=train_feature_names, fill_value=0)
else:
    valid_features = data_encoded
scaled_data = scaler.transform(valid_features)

PCA transformation

In [None]:
from sklearn.decomposition import PCA

# Project with the PCA that was fitted on the TRAINING data (`pca` from the
# training section, still in the kernel). Fitting a new PCA here would yield
# different components (different directions, order and sign), so the 20
# inputs handed to the model would not correspond to the ones it was trained
# on. This raises if the number of input features differs from the training
# data, which signals a real train/validation preprocessing mismatch.
X_pca = pca.transform(scaled_data)

Making predictions with the trained model

In [None]:
# Predict with `model`, the classifier fitted in the training section; it is
# not re-created in this part of the notebook, so the training cells must
# have been run in the same kernel.
y_pred = model.predict(X_pca)

Creating the modified csv file

In [None]:
# Work on an explicit copy: depending on the pandas version,
# pd.DataFrame(data_encoded) can share its underlying data with
# `data_encoded`, in which case the insert below would also add the column to
# `data_encoded` and make this cell fail when it is run a second time.
final_result_df = data_encoded.copy()

# Put the predicted label in the first column and write the result to disk.
final_result_df.insert(0, 'loan_status', y_pred)
final_result_df.to_csv('210099V.csv', index=False)

SHAP analysis

In [None]:
import shap
# Load SHAP's JavaScript support for notebook rendering.
shap.initjs()

# Build an explainer for the fitted model and compute SHAP values for every
# row of the current PCA feature matrix.
explainer = shap.Explainer(model)
shap_values = explainer(X_pca)

# Waterfall plot for the first row only.
shap.plots.waterfall(shap_values[0])

# Summary plots over all rows.
for draw_summary in (shap.plots.bar, shap.plots.beeswarm):
    draw_summary(shap_values)