<a href="https://colab.research.google.com/github/thimmie52/AfricanScoringChallenge/blob/main/Day11AnalysisAfricanCreditScoring.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install catboost optuna dython

Collecting catboost
  Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl.metadata (1.2 kB)
Collecting optuna
  Downloading optuna-4.1.0-py3-none-any.whl.metadata (16 kB)
Collecting dython
  Downloading dython-0.7.8-py3-none-any.whl.metadata (2.9 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.14.0-py3-none-any.whl.metadata (7.4 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading Mako-1.3.8-py3-none-any.whl.metadata (2.9 kB)
Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl (98.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 MB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading optuna-4.1.0-py3-none-any.whl (364 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m364.4/364.4 kB[0m [31m22.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dython-0.7.8-py3-none-any.whl (26 kB)
Down

In [None]:
### Basic csv imports
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

### Analysis
from scipy.stats.contingency import chi2_contingency
from scipy.stats import ttest_ind
import math
from dython.nominal import associations

### Preprocessing
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline

### Warnings
import warnings
warnings.filterwarnings('ignore')

### Model Building
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier

### Post Model
import optuna
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, roc_auc_score, f1_score


### Setup of Data

In [None]:
# Folder on the mounted Google Drive that holds the competition files.
# The path (including the space after "Project") is kept exactly as it was.
DATA_DIR = '/content/drive/MyDrive/Project /AfricanCreditScoring/Data'

train = pd.read_csv(f'{DATA_DIR}/Train.csv')
test = pd.read_csv(f'{DATA_DIR}/Test.csv')
indicators = pd.read_csv(f'{DATA_DIR}/economic_indicators.csv')

# Stack train and test so every feature-engineering step is applied to both.
# They are separated again later by whether `target` is null.
df = pd.concat([train, test], axis=0)

In [None]:
# Keep only the Ghana and Kenya indicator rows, then transpose so each original
# column (country, indicator name, YRxxxx values) becomes a row.
is_target_country = indicators['Country'].isin(['Ghana', 'Kenya'])
add = indicators.loc[is_target_country].transpose()

In [None]:
# Hand-entered 2020-2023 values written into two columns of the transposed
# indicator table, addressed by column position.
# NOTE(review): positions 4 and 10 are presumably Ghana's real interest rate and
# lending interest rate (27 and 20.16 appear under those names for Ghana 2022
# rows further down) - confirm against economic_indicators.csv.
manual_values = {
    4: {'YR2020': 14.5, 'YR2021': 14.5, 'YR2022': 27, 'YR2023': 30},
    10: {'YR2020': 23.06, 'YR2021': 20.97, 'YR2022': 20.16, 'YR2023': 35.85},
}
for col_pos, yearly_values in manual_values.items():
    for year_label, value in yearly_values.items():
        add.iloc[add.index.get_loc(year_label), col_pos] = value

# The first row holds the country names: promote it to the header.
# .copy() makes `add` an independent frame so later edits never act on a slice.
country_header = add.iloc[0]
add = add.iloc[1:].copy()
add.columns = country_header

# The new first row holds the indicator names; build "<Country> <Indicator>" labels.
# zip pairs the values by position, which avoids integer lookups on a Series
# whose index is made of (duplicated) country names.
new_header = add.iloc[0]
add.columns = [f'{country} {indicator}' for country, indicator in zip(add.columns, new_header)]

# The indicator-name row is now redundant.
add = add.drop(index='Indicator')

# Move the YRxxxx labels into a column and turn them into integer years.
# The column is called 'country' here and renamed to 'country_id' in a later cell.
add = add.reset_index()
add = add.rename(columns={'index': 'country'})

add['country'] = add['country'].str.replace('YR', '')
add['country'] = add['country'].astype(int)



In [None]:
# Collect each country's indicator columns by matching the label prefix with a regex.

import re

ghana_prefix = re.compile(r'^Ghana')
kenya_prefix = re.compile(r'^Kenya')

# 'country_id' is appended up front: it is the name the year column receives
# in the rename at the end of this cell.
ghana_names = [label for label in add.columns if ghana_prefix.match(str(label))]
ghana_names.append('country_id')
kenya_names = [label for label in add.columns if kenya_prefix.match(str(label))]
kenya_names.append('country_id')

add = add.rename(columns={'country': 'country_id'})

In [None]:
# Parse both date columns and derive the calendar year of each.
for prefix in ('disbursement', 'due'):
    df[f'{prefix}_date'] = pd.to_datetime(df[f'{prefix}_date'])
    df[f'{prefix}_year'] = df[f'{prefix}_date'].dt.year

In [None]:
# Split the loans by country so each side can be joined to its own indicator columns.
kenya = df.loc[df['country_id'] == 'Kenya']
ghana = df.loc[df['country_id'] == 'Ghana']

# Left-join on the disbursement year. The right-hand key is named 'country_id' but
# holds the year; since both sides have a 'country_id' column, pandas suffixes them
# _x (loan country) and _y (indicator year).
joined_kenya = pd.merge(
    kenya, add[kenya_names],
    how='left', left_on='disbursement_year', right_on='country_id',
)
joined_ghana = pd.merge(
    ghana, add[ghana_names],
    how='left', left_on='disbursement_year', right_on='country_id',
)


In [None]:
# Short, country-agnostic names for the indicator columns, so the Kenya and Ghana
# frames end up with identical column names. The country prefix is added per frame.
# 'Exhange_rate' is spelled this way on purpose: later cells refer to that exact name.
indicator_aliases = {
    'Inflation, consumer prices (annual %)': 'Inflation',
    'Official exchange rate (LCU per US$, period average)': 'Exhange_rate',
    'Real interest rate (%)': 'Interest_rate',
    'Average precipitation in depth (mm per year)': 'Precipitation',
    'Deposit interest rate (%)': 'Deposit_rate',
    'Unemployment rate': 'Unemployment_rate',
    'Lending interest rate (%)': 'Lending_rate',
    'Interest rate spread (lending rate minus deposit rate, %)': 'Interest_rate_spread',
    'Fossil fuel energy consumption (% of total)': 'Fossil_Fuel',
}

# Drop the duplicated year key left over from the merge, then apply the aliases.
joined_kenya = joined_kenya.drop(columns=['country_id_y']).rename(
    columns={f'Kenya {long_name}': short_name for long_name, short_name in indicator_aliases.items()}
)
joined_ghana = joined_ghana.drop(columns=['country_id_y']).rename(
    columns={f'Ghana {long_name}': short_name for long_name, short_name in indicator_aliases.items()}
)

In [None]:
# Fill the gaps in Kenya's indicator columns.
# NOTE(review): 5.1 and 129.06 are hand-entered constants - confirm their source.
joined_kenya['Inflation'] = joined_kenya['Inflation'].fillna(5.1)
joined_kenya['Exhange_rate'] = joined_kenya['Exhange_rate'].fillna(129.06)

# The rate columns are cast to float and their gaps filled with the column median.
# (The former `col != 'target'` guard was always true for this list and is removed.)
numerical_cols = ['Interest_rate', 'Deposit_rate', 'Lending_rate']
for col in numerical_cols:
    as_float = joined_kenya[col].astype(float)
    joined_kenya[col] = as_float.fillna(as_float.median())

In [None]:
# Re-stack the two country frames (row labels are not renumbered here) and drop
# the indicator columns that are not used further.
unused_indicator_cols = ['Precipitation', 'Fossil_Fuel', 'Interest_rate_spread', 'Unemployment_rate']
df1 = pd.concat([joined_kenya, joined_ghana], axis=0).drop(columns=unused_indicator_cols)

### Data Analysis

In [None]:
# Use the magnitude of the loan amount, discarding any negative sign.
df1['Total_Amount'] = df1['Total_Amount'].abs()

In [None]:
# Column names, dtypes and non-null counts of the combined frame.
df1.info()

<class 'pandas.core.frame.DataFrame'>
Index: 87248 entries, 0 to 3524
Data columns (total 23 columns):
 #   Column                       Non-Null Count  Dtype         
---  ------                       --------------  -----         
 0   ID                           87248 non-null  object        
 1   customer_id                  87248 non-null  int64         
 2   country_id_x                 87248 non-null  object        
 3   tbl_loan_id                  87248 non-null  int64         
 4   lender_id                    87248 non-null  int64         
 5   loan_type                    87248 non-null  object        
 6   Total_Amount                 87248 non-null  float64       
 7   Total_Amount_to_Repay        87248 non-null  float64       
 8   disbursement_date            87248 non-null  datetime64[ns]
 9   due_date                     87248 non-null  datetime64[ns]
 10  duration                     87248 non-null  int64         
 11  New_versus_Repeat            87248 non-null  ob

In [None]:
# The 20 most frequent loan ids: shows whether one loan id spans several rows.
df1['tbl_loan_id'].value_counts()[:20]

Unnamed: 0_level_0,count
tbl_loan_id,Unnamed: 1_level_1
364043,3
363955,3
364297,3
364196,3
364483,3
364424,3
364260,3
364361,3
364479,3
364044,3


In [None]:
# Inspect every row of one loan id that appears three times.
df1[df1['tbl_loan_id'] == 364479]

Unnamed: 0,ID,customer_id,country_id_x,tbl_loan_id,lender_id,loan_type,Total_Amount,Total_Amount_to_Repay,disbursement_date,due_date,...,Lender_portion_Funded,Lender_portion_to_be_repaid,target,disbursement_year,due_year,Inflation,Exhange_rate,Interest_rate,Deposit_rate,Lending_rate
477,ID_296803364479296540,296803,Ghana,364479,296540,Type_2,105002.96,116494.95,2022-09-27,2023-03-26,...,0.04121,5751.83,,2022,2023,31.255895,8.2724,27,12.052083,20.16
3138,ID_296803364479297182,296803,Ghana,364479,297182,Type_2,105002.96,116494.95,2022-09-27,2023-03-26,...,0.060667,8470.24,,2022,2023,31.255895,8.2724,27,12.052083,20.16
3429,ID_296803364479245684,296803,Ghana,364479,245684,Type_2,105002.96,116494.95,2022-09-27,2023-03-26,...,0.028978,4048.45,,2022,2023,31.255895,8.2724,27,12.052083,20.16


In [None]:
# Number of rows per lender among the Ghana loans.
ghana['lender_id'].value_counts()

Unnamed: 0_level_0,count
lender_id,Unnamed: 1_level_1
296542,1803
297183,1264
296540,179
297182,163
245684,116


In [None]:
# Number of rows per lender among the Kenya loans.
kenya['lender_id'].value_counts()

Unnamed: 0_level_0,count
lender_id,Unnamed: 1_level_1
267278,78874
251804,4303
267277,317
245684,229


In [None]:
# Distinct customer ids, ordered by how many rows each has (value_counts sorts descending).
customer = df1['customer_id'].value_counts().index

In [None]:
# Print every customer id whose rows do not map to exactly one country
# (no output means each customer belongs to a single country).
# A single groupby replaces the former full-frame filter per customer;
# iterating over `customer` keeps the original print order.
n_countries = df1.groupby('customer_id')['country_id_x'].nunique()
for i in customer:
  if n_countries[i] != 1:
    print(i)

In [None]:
# Spot check for one customer: True means all of that customer's rows share one country.
df1[df1['customer_id'] == 247613]['country_id_x'].value_counts().index.shape == (1,)

True

In [None]:
# Frame summary again; the cells since the previous df1.info() only read df1.
df1.info()

<class 'pandas.core.frame.DataFrame'>
Index: 87248 entries, 0 to 3524
Data columns (total 23 columns):
 #   Column                       Non-Null Count  Dtype         
---  ------                       --------------  -----         
 0   ID                           87248 non-null  object        
 1   customer_id                  87248 non-null  int64         
 2   country_id_x                 87248 non-null  object        
 3   tbl_loan_id                  87248 non-null  int64         
 4   lender_id                    87248 non-null  int64         
 5   loan_type                    87248 non-null  object        
 6   Total_Amount                 87248 non-null  float64       
 7   Total_Amount_to_Repay        87248 non-null  float64       
 8   disbursement_date            87248 non-null  datetime64[ns]
 9   due_date                     87248 non-null  datetime64[ns]
 10  duration                     87248 non-null  int64         
 11  New_versus_Repeat            87248 non-null  ob

In [None]:
# 245684 is a lender that is active in both Kenya and Ghana.
# 296542 is probably the biggest bank in Ghana, while 267278 is the biggest in Kenya.
# For this analysis, we assume that these two have similar characteristics.
# 297183 in Ghana would then have similar characteristics to 251804 in Kenya.


In [None]:
# Per-lender subsets used for the manual analysis summarised in the notes below.
# Each variable name matches the lender id it filters on. Previously `bank296542`
# actually held lender 251804's rows; it now holds 296542, and lender 251804
# has its own, correctly named variable.
bank245684 = df1[df1['lender_id'] == 245684]
bank251804 = df1[df1['lender_id'] == 251804]
bank296542 = df1[df1['lender_id'] == 296542]
bank267278 = df1[df1['lender_id'] == 267278]

In [None]:
# The bank with ID 267278 in Kenya has about 78,874 applications.
# The highest loan received is 4408789.0
# The lowest loan received is 2.
# The average loan requested is 11,395, with a median of 5000.0, which means that most customers borrow small amounts, with a few exceptions.
# Bank 267278 grants loans mostly when the rate is high; they also prefer to lend larger amounts.

# Bank 267278: 113.8252 is the preferred min_rate. Duration = 7 to 360. ---> 296542, 297183

# Bank 251804: 107.35625 is the preferred min_rate. Duration = 7 to 50. ----> 296540

# Bank 267277: you need a loan amount of at least 20,000, interest of about 114, duration of 50 - 360 ---> 297182

# Bank 245684: you need a rate of at least 111, Duration 14-30



In [None]:
# One rule per lender: (lender_id, min_rate, min_duration, max_duration).
# The values come from the manual per-lender analysis noted in the cell above.
lender_rules = [
    (267278, 113.8, 7, 360),
    (296542, 113.8, 7, 360),
    (297183, 113.8, 7, 360),
    (251804, 107.35, 7, 50),
    (296540, 107.35, 7, 50),
    (267277, 114, 50, 360),
    (297182, 114, 50, 360),
    (245684, 111, 14, 30),
]

# Unzip the rules into the per-column lists and assemble the lookup table.
banks, min_rate, min_duration, max_duration = (list(column) for column in zip(*lender_rules))

bank_info = pd.DataFrame({
    'lender_id': banks,
    'min_rate': min_rate,
    'min_duration': min_duration,
    'max_duration': max_duration,
})

In [None]:
# Display the per-lender rule table.
bank_info

Unnamed: 0,lender_id,min_rate,min_duration,max_duration
0,267278,113.8,7,360
1,296542,113.8,7,360
2,297183,113.8,7,360
3,251804,107.35,7,50
4,296540,107.35,7,50
5,267277,114.0,50,360
6,297182,114.0,50,360
7,245684,111.0,14,30


In [None]:
# Attach each lender's rate/duration rules to its loans.
# validate='many_to_one' makes pandas raise if bank_info ever lists a lender twice,
# which would otherwise silently duplicate loan rows.
# NOTE: not idempotent - re-running this cell merges the rule columns a second time.
df1 = df1.merge(bank_info, on='lender_id', how='left', validate='many_to_one')

In [None]:
# Check the frame after the bank_info merge (row count and the added rule columns).
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 87248 entries, 0 to 87247
Data columns (total 26 columns):
 #   Column                       Non-Null Count  Dtype         
---  ------                       --------------  -----         
 0   ID                           87248 non-null  object        
 1   customer_id                  87248 non-null  int64         
 2   country_id_x                 87248 non-null  object        
 3   tbl_loan_id                  87248 non-null  int64         
 4   lender_id                    87248 non-null  int64         
 5   loan_type                    87248 non-null  object        
 6   Total_Amount                 87248 non-null  float64       
 7   Total_Amount_to_Repay        87248 non-null  float64       
 8   disbursement_date            87248 non-null  datetime64[ns]
 9   due_date                     87248 non-null  datetime64[ns]
 10  duration                     87248 non-null  int64         
 11  New_versus_Repeat            87248 non-nu

In [None]:
# Repayment ratio: total amount to repay per unit borrowed.
df1['rate'] = df1['Total_Amount_to_Repay'].div(df1['Total_Amount'])

In [None]:
# Hand-built scoring template to aid prediction. Points as implemented below:
#   minimum_rate_points : 10 if the loan's rate reaches the lender's minimum rate
#   duration_points     : 10 if the duration lies inside the lender's [min, max] window
#   risk                : -5 if the duration exceeds 366 days
#   loan_type_risk      : 5 for loan types Type_14, Type_2 and Type_23
#   Risk2               : 10 if 0.1 < Percent_to_repay < 2.5 and duration > 0
# (The original plan also listed "repay_prob * 10" points; that term is not implemented.)

# Ratio features, all relative to the loan principal except LoanRatio (amount to repay per unit of duration).
df1['Lender_portion_Funded'] = df1['Amount_Funded_By_Lender'] / df1['Total_Amount']
df1['Percentage_lender_tobe_repaid'] = df1['Lender_portion_to_be_repaid'] / df1['Total_Amount']
df1['Percent_to_repay'] = (df1['Total_Amount_to_Repay'] - df1['Total_Amount']) / df1['Total_Amount']
df1['LoanRatio'] = df1['Total_Amount_to_Repay'] / df1['duration']

# `min_rate` is on a percent scale (107-114) while `rate` is a plain ratio
# (amount to repay / principal), so `rate` is scaled by 100 before comparing.
# Without the scaling the condition was practically never met.
df1['minimum_rate_points'] = np.where(df1['rate'] * 100 >= df1['min_rate'], 10, 0)
df1['duration_points'] = np.where(
    (df1['duration'] >= df1['min_duration']) & (df1['duration'] <= df1['max_duration']), 10, 0
)
df1['risk'] = np.where(df1['duration'] > 366, -5, 0)
df1['loan_type_risk'] = np.where(df1['loan_type'].isin(["Type_14", "Type_2", "Type_23"]), 5, 0)
# NOTE(review): the upper bound was written `02.5` (i.e. 2.5); confirm 0.25 was not intended.
df1['Risk2'] = np.where(
    (df1['Percent_to_repay'] < 2.5) & (df1['duration'] > 0) & (df1['Percent_to_repay'] > 0.1), 10, 0
)
df1['total_points'] = (
    df1['minimum_rate_points'] + df1['duration_points'] + df1['risk'] + df1['loan_type_risk'] + df1['Risk2']
)

In [None]:
# Month and day-of-week of both the disbursement date and the due date.
for prefix in ('disbursement', 'due'):
    date_parts = df1[f'{prefix}_date'].dt
    df1[f'{prefix}_month'] = date_parts.month
    df1[f'{prefix}_dow'] = date_parts.day_of_week

In [None]:
# Risk3: disbursed in 2021 during February, March, April or December.
disbursed_in_2021 = df1['disbursement_year'] == 2021
in_flagged_month = df1['disbursement_month'].isin([2, 3, 4, 12])
df1['Risk3'] = (disbursed_in_2021 & in_flagged_month).astype(int)

# Risk4: repayment ratio above 1.1.
df1['Risk4'] = (df1['rate'] > 1.1).astype(int)

In [None]:
# Frame used only for the association analysis against `target`.
# Only labelled rows are kept: label-encoding the NaN targets of the test rows
# would turn "missing" into a third class and distort every association with target.
df2 = df1[df1['target'].notna()].copy()

Le = LabelEncoder()

# country_id_x is the only text column that survives the drop below, so it is the
# only one that needs encoding (loan_type is dropped, so encoding it was wasted work).
df2['country_id_x'] = Le.fit_transform(df2['country_id_x'])
# NOTE(review): assumes the labelled target holds only 0/1 values - confirm.
df2['target'] = df2['target'].astype(int)

df2 = df2.drop(columns=['disbursement_date', 'due_date', 'loan_type', 'ID', 'customer_id'])

In [None]:
# Columns to be treated as nominal (categorical) when computing associations.
# NOTE(review): 'loan_type' is listed although df2 no longer contains it - confirm
# that dython ignores missing names.
nominal_cols = [
    'loan_type', 'New_versus_Repeat', 'loan_type_risk', 'target',
    'Risk2', 'Risk3', 'disbursement_year', 'disbursement_month',
]
# compute_only=True returns the results without drawing the heat-map.
result = associations(df2, nominal_columns=nominal_cols, compute_only=True)

In [None]:
# Association of every column with target, strongest first.
result['corr']['target'].sort_values(ascending=False)

Unnamed: 0,target
target,1.0
Percent_to_repay,0.533191
rate,0.533191
Risk4,0.527944
Risk2,0.527926
total_points,0.501057
country_id_x,0.394279
Interest_rate,0.393236
Exhange_rate,0.390793
Inflation,0.389874


In [None]:
# Frame summary after the engineered columns were added.
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 87248 entries, 0 to 87247
Data columns (total 42 columns):
 #   Column                         Non-Null Count  Dtype         
---  ------                         --------------  -----         
 0   ID                             87248 non-null  object        
 1   customer_id                    87248 non-null  int64         
 2   country_id_x                   87248 non-null  object        
 3   tbl_loan_id                    87248 non-null  int64         
 4   lender_id                      87248 non-null  int64         
 5   loan_type                      87248 non-null  object        
 6   Total_Amount                   87248 non-null  float64       
 7   Total_Amount_to_Repay          87248 non-null  float64       
 8   disbursement_date              87248 non-null  datetime64[ns]
 9   due_date                       87248 non-null  datetime64[ns]
 10  duration                       87248 non-null  int64         
 11  New_versus_Repe

In [None]:
# Make sure every economic-indicator column is stored as float.
indicator_cols = ['Inflation', 'Exhange_rate', 'Interest_rate', 'Deposit_rate', 'Lending_rate']
df1[indicator_cols] = df1[indicator_cols].astype(float)

In [None]:
# Binary flag: 1 for 'Repeat Loan', 0 for anything else.
df1['New_versus_Repeat'] = df1['New_versus_Repeat'].eq('Repeat Loan').astype(int)

In [None]:
# Store customer_id with pandas' categorical dtype.
df1 = df1.astype({'customer_id': 'category'})

In [None]:
# NOTE(review): despite the "Log" name this is tan(log(Lender_portion_Funded * LoanRatio));
# the formula is kept unchanged because the Risk5 thresholds are defined on it.
df1['LogPercentagelendertobeRepaid'] = np.tan(np.log(df1['Lender_portion_Funded'] * df1['LoanRatio']))

# Replace undefined results (NaN) with 0. The filled column is assigned back
# explicitly: calling fillna(..., inplace=True) on a column selection is not
# guaranteed to update df1 under pandas Copy-on-Write.
df1['LogPercentagelendertobeRepaid'] = df1['LogPercentagelendertobeRepaid'].fillna(0)

In [None]:
# Amount to repay per unit borrowed (same formula as the `rate` column).
df1['Percentage_to_repay'] = df1['Total_Amount_to_Repay'].div(df1['Total_Amount'])


In [None]:
# Risk5: repayment ratio of at least 1.5 while the tan-log feature stays within [-2000, 2000].
within_bounds = df1['LogPercentagelendertobeRepaid'].between(-2000, 2000)
high_repayment = df1['Percentage_to_repay'] >= 1.5
df1['Risk5'] = (within_bounds & high_repayment).astype(int)

In [None]:
# Calendar flags (pandas day_of_week: Monday=0, so 5 and 6 are Saturday and Sunday).
weekend_days = [5, 6]
df1['is_weekend_disbursement'] = df1['disbursement_dow'].isin(weekend_days).astype(int)
df1['is_weekend_due'] = df1['due_dow'].isin(weekend_days).astype(int)
df1['is_loan_spanning_years'] = df1['disbursement_year'].ne(df1['due_year']).astype(int)

# Ratio features; the small epsilon keeps the denominators away from exactly zero.
eps = 1e-8
df1['repayment_efficiency'] = df1['Percentage_to_repay'].div(df1['Percent_to_repay'] + eps)
df1['lender_efficiency'] = df1['Percentage_lender_tobe_repaid'].div(df1['Lender_portion_Funded'] + eps)

# Deviation of each indicator from the mean of all loans disbursed in the same year.
df1['inflation_change'] = df1['Inflation'] - df1.groupby('disbursement_year')['Inflation'].transform('mean')
df1['interest_rate_change'] = (
    df1['Interest_rate'] - df1.groupby('disbursement_year')['Interest_rate'].transform('mean')
)
df1['inflation_interest_interaction'] = df1['Inflation'].mul(df1['Interest_rate'])

# Risk features: row-wise mean of the individual risk scores.
risk_cols = ['Risk2', 'Risk4', 'Risk5', 'loan_type_risk']
df1['average_risk_score'] = df1[risk_cols].mean(axis='columns')

# 'EMI' here is simply LoanRatio multiplied by the interest-rate deviation.
df1['EMI'] = df1['LoanRatio'].mul(df1['interest_rate_change'])


In [None]:
# Remove the raw columns that the engineered features were derived from.
raw_cols = [
    'country_id_x', 'disbursement_date', 'due_date',
    'Total_Amount', 'Total_Amount_to_Repay', 'Amount_Funded_By_Lender',
]
df1 = df1.drop(columns=raw_cols)

In [None]:
# Columns to one-hot encode; every other column is carried over unchanged.
categorical_features = ['loan_type_risk', 'loan_type', 'New_versus_Repeat']
numerical_features = df1.columns[~df1.columns.isin(categorical_features)].tolist()

# Dense output; categories unseen at fit time would encode as all zeros.
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
encoder.fit(df1[categorical_features])
encoded_data = encoder.transform(df1[categorical_features])

# Names of the generated indicator columns
encoded_feature_names = encoder.get_feature_names_out(categorical_features)

# Wrap the encoded matrix in a frame aligned with df1's rows
encoded_df = pd.DataFrame(encoded_data, index=df1.index, columns=encoded_feature_names)

# Side-by-side: untouched columns first, one-hot columns after
df_encoded = pd.concat((df1[numerical_features], encoded_df), axis='columns')

### Modelling

In [None]:
# Rows with a target are the training set; rows without one are the test set.
# Explicit copies are taken because test_df is assigned into later (scaling step),
# which should never happen on a slice of df_encoded.
has_target = df_encoded['target'].notna()
train_df = df_encoded[has_target].copy()
test_df = df_encoded[~has_target].copy()

In [None]:
# Target vector and feature matrix (identifier and target removed from the features).
y = train_df['target']
X = train_df.drop(columns=['ID', 'target'])

In [None]:
# Standardise every feature column except those named below.
# NOTE(review): the names in categorical_features are the pre-encoding column names,
# so in practice only 'LogPercentagelendertobeRepaid' is excluded - the one-hot and
# id columns are scaled too. Confirm that this is intended.
categorical_features = ['loan_type_risk', 'loan_type', 'New_versus_Repeat']
not_scaled = categorical_features + ['LogPercentagelendertobeRepaid']
numerical_features = [col for col in X.columns if col not in not_scaled]

# `mms` is a StandardScaler (zero mean, unit variance), fitted on the training rows
# only and then applied to both the training and the test rows.
mms = StandardScaler()
mms.fit(X[numerical_features])
X[numerical_features] = mms.transform(X[numerical_features])
test_df[numerical_features] = mms.transform(test_df[numerical_features])

In [None]:
# Check the feature matrix after scaling: dtypes and non-null counts.
X.info()

<class 'pandas.core.frame.DataFrame'>
Index: 68654 entries, 0 to 68653
Data columns (total 72 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   customer_id                     68654 non-null  float64
 1   tbl_loan_id                     68654 non-null  float64
 2   lender_id                       68654 non-null  float64
 3   duration                        68654 non-null  float64
 4   Lender_portion_Funded           68654 non-null  float64
 5   Lender_portion_to_be_repaid     68654 non-null  float64
 6   disbursement_year               68654 non-null  float64
 7   due_year                        68654 non-null  float64
 8   Inflation                       68654 non-null  float64
 9   Exhange_rate                    68654 non-null  float64
 10  Interest_rate                   68654 non-null  float64
 11  Deposit_rate                    68654 non-null  float64
 12  Lending_rate                    68654

In [None]:
%%time

from sklearn.ensemble import AdaBoostClassifier
from sklearn.neural_network import MLPClassifier



params = {'n_estimators': 200, 'max_depth': 15, 'min_samples_split': 14, 'min_samples_leaf': 1}
model = CatBoostClassifier(verbose = False)
model.fit
tes = test_df.drop(['ID', 'target'], axis=1)

# Define the number of folds for the KFold cross-validation
n_folds = 10

# Create an instance of KFold with 5 splits, shuffled, and with random_state set to 1235
kf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=1235)

# Create arrays to store the out-of-fold and prediction values
oofs = np.zeros((len(X)))
oofs1 = np.zeros((len(X)))
preds = np.zeros((len(tes)))



# Iterate over the folds
for fold, (train_index, test_index) in enumerate(kf.split(X, y)):
    # Split the data into training and testing sets for this fold
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Fit the model on the training data
    model.fit(X_train, y_train)

    # Predict on the validation set
    vp3 = model.predict_proba(X_test)[:,-1]
    vp2 = (vp3 > 0.5).astype(int)
    vp = model.predict_proba(X_test)[:,1]
    vp1 = vp.copy()

    # Calculate the validation score as the root mean squared error between the true values and predictions
    val_score = f1_score((y_test), (vp2))
    roc = roc_auc_score(y_test, vp)
    print(f'\nFold {fold+1} Val score: {val_score}   ROC_Score: {roc}\n')

    # Predict on the test set
    tp = model.predict_proba(tes)[:, 1]

    # Store the validation predictions in the oofs array
    oofs[test_index] = vp2
    oofs1[test_index] = vp3
    # Average the predictions over the 5 folds and store in the preds array
    preds += tp/n_folds

# Calculate the out-of-fold score as the root mean squared error between the true values and predictions 0.704879757179547
oof_score = f1_score((y), (oofs))
print(f'\nOOF F1 score is : {oof_score}')


Fold 1 Val score: 0.8852459016393442   ROC_Score: 0.9990568037303943


Fold 2 Val score: 0.9016393442622951   ROC_Score: 0.9978816353445433


Fold 3 Val score: 0.928   ROC_Score: 0.9985528237011917


Fold 4 Val score: 0.8871595330739299   ROC_Score: 0.9983526447176299


Fold 5 Val score: 0.8979591836734694   ROC_Score: 0.993185756676558


Fold 6 Val score: 0.8979591836734694   ROC_Score: 0.9955916913946588


Fold 7 Val score: 0.8713692946058091   ROC_Score: 0.9973431129388987


Fold 8 Val score: 0.8991596638655462   ROC_Score: 0.9976163389132673


Fold 9 Val score: 0.8979591836734694   ROC_Score: 0.9983853758152615


Fold 10 Val score: 0.9349593495934959   ROC_Score: 0.9985431873694228


OOF F1 score is : 0.90020366598778
CPU times: user 6min 53s, sys: 16.6 s, total: 7min 10s
Wall time: 4min 23s


In [None]:
# Feature importance: keep the most important features for a reduced model.

# Importances of `model` as it currently stands; when the cells are run in order
# this is the fit from the last cross-validation fold only.
feature_importances = model.get_feature_importance(prettified=True)

# Sort in descending order of importance
feature_importances = feature_importances.sort_values(by='Importances', ascending=False)

# NOTE: the variable keeps its historical name `top_20_features`, but the number
# of features actually selected is N_TOP_FEATURES (25).
N_TOP_FEATURES = 25
top_20_features = feature_importances.head(N_TOP_FEATURES)

# Display the selected features
top_20_features

Unnamed: 0,Feature Id,Importances
0,lender_efficiency,9.771391
1,LoanRatio,9.644802
2,due_month,8.375982
3,tbl_loan_id,7.519499
4,Lender_portion_to_be_repaid,7.179453
5,Lender_portion_Funded,6.898363
6,rate,6.872536
7,Percentage_to_repay,6.742721
8,Percent_to_repay,5.033108
9,LogPercentagelendertobeRepaid,4.562921


In [None]:
# Names of the selected features.
top_20_features["Feature Id"].values

array(['lender_efficiency', 'LoanRatio', 'due_month', 'tbl_loan_id',
       'Lender_portion_to_be_repaid', 'Lender_portion_Funded', 'rate',
       'Percentage_to_repay', 'Percent_to_repay',
       'LogPercentagelendertobeRepaid', 'Percentage_lender_tobe_repaid',
       'disbursement_month', 'customer_id', 'disbursement_dow',
       'repayment_efficiency', 'loan_type_Type_7', 'due_dow',
       'loan_type_Type_1', 'duration', 'EMI', 'loan_type_Type_5',
       'Exhange_rate', 'inflation_interest_interaction',
       'average_risk_score', 'disbursement_year'], dtype=object)

In [None]:
%%time

from sklearn.ensemble import AdaBoostClassifier
from sklearn.neural_network import MLPClassifier



params = {'n_estimators': 200, 'max_depth': 15, 'min_samples_split': 14, 'min_samples_leaf': 1}
model = CatBoostClassifier(verbose = False)
model.fit
tes = test_df.drop(['ID', 'target'], axis=1)

# Define the number of folds for the KFold cross-validation
n_folds = 10

# Create an instance of KFold with 5 splits, shuffled, and with random_state set to 1235
kf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=1235)


X1 = X[top_20_features["Feature Id"].values]
tes1 = tes[top_20_features["Feature Id"].values]

# Create arrays to store the out-of-fold and prediction values
oofs = np.zeros((len(X1)))
oofs1 = np.zeros((len(X1)))
preds = np.zeros((len(tes1)))



# Iterate over the folds
for fold, (train_index, test_index) in enumerate(kf.split(X1, y)):
    # Split the data into training and testing sets for this fold
    X_train, X_test = X1.iloc[train_index], X1.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Fit the model on the training data
    model.fit(X_train, y_train)

    # Predict on the validation set
    vp3 = model.predict_proba(X_test)[:,-1]
    vp2 = (vp3 > 0.5).astype(int)
    vp = model.predict_proba(X_test)[:,1]
    vp1 = vp.copy()

    # Calculate the validation score as the root mean squared error between the true values and predictions
    val_score = f1_score((y_test), (vp2))
    roc = roc_auc_score(y_test, vp)
    print(f'\nFold {fold+1} Val score: {val_score}   ROC_Score: {roc}\n')

    # Predict on the test set
    tp = model.predict_proba(tes1)[:, 1]

    # Store the validation predictions in the oofs array
    oofs[test_index] = vp2
    oofs1[test_index] = vp3
    # Average the predictions over the 5 folds and store in the preds array
    preds += tp/n_folds

# Calculate the out-of-fold score as the root mean squared error between the true values and predictions 0.704879757179547
oof_score = f1_score((y), (oofs))
print(f'\nOOF F1 score is : {oof_score}')


Fold 1 Val score: 0.8943089430894309   ROC_Score: 0.9991097922848665


Fold 2 Val score: 0.9090909090909091   ROC_Score: 0.9981136074607885


Fold 3 Val score: 0.9133858267716536   ROC_Score: 0.9982454900852528


Fold 4 Val score: 0.9163346613545816   ROC_Score: 0.9980264707267674


Fold 5 Val score: 0.9024390243902439   ROC_Score: 0.9939026706231454


Fold 6 Val score: 0.8934426229508197   ROC_Score: 0.9951976261127596


Fold 7 Val score: 0.8699186991869918   ROC_Score: 0.9972406531985105


Fold 8 Val score: 0.9037656903765691   ROC_Score: 0.9970522214920493


Fold 9 Val score: 0.8934426229508197   ROC_Score: 0.9980579757252853


Fold 10 Val score: 0.9349593495934959   ROC_Score: 0.997917829643605


OOF F1 score is : 0.903173311635476
CPU times: user 6min 3s, sys: 15.6 s, total: 6min 19s
Wall time: 3min 51s


In [None]:
# Turn the fold-averaged probabilities into 0/1 labels (threshold 0.5) and write
# the submission file: one row per test ID with its predicted target.
y_preds = (preds > 0.5).astype(int)
sub_file = test_df[['ID']].copy()
sub_file['target'] = y_preds
sub_file.to_csv('Day10AnalysisAfricanCreditScoring(3).csv', index=False)