In [3]:
 #Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout

application_df = pd.read_csv("application_record.csv")
credit_df = pd.read_csv("credit_record.csv")
application_df['CNT_CHILDREN'] = application_df['CNT_CHILDREN'].astype(int)
application_df['AMT_INCOME_TOTAL'] = application_df['AMT_INCOME_TOTAL'].astype(float)
application_df['DAYS_BIRTH'] = application_df['DAYS_BIRTH'].astype(int)
application_df['DAYS_EMPLOYED'] = application_df['DAYS_EMPLOYED'].astype(int)
application_df['CNT_FAM_MEMBERS'] = application_df['CNT_FAM_MEMBERS'].astype(int)

application_df['OCCUPATION_TYPE'].fillna('Unknown', inplace=True)

# Drop any remaining rows with missing values
application_df.dropna(inplace=True)

application_df['AGE_YEARS'] = abs(application_df['DAYS_BIRTH']) / 365
application_df['YEARS_EMPLOYED'] = abs(application_df['DAYS_EMPLOYED']) / 365


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  application_df['OCCUPATION_TYPE'].fillna('Unknown', inplace=True)


In [4]:

application_df.drop(['FLAG_MOBIL', 'DAYS_BIRTH', 'DAYS_EMPLOYED'], axis=1, inplace=True)

# Process Credit Records
credit_df['MONTHS_BALANCE'] = credit_df['MONTHS_BALANCE'].astype(int)
credit_df['STATUS'] = credit_df['STATUS'].astype(str)

credit_df = credit_df[~credit_df['STATUS'].isin(['X', 'C'])]


joined_df = pd.merge(application_df, credit_df, on='ID', how='inner')


final_df = joined_df[['ID', 'CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 'CNT_CHILDREN', 'AMT_INCOME_TOTAL',
                      'NAME_INCOME_TYPE', 'NAME_EDUCATION_TYPE', 'NAME_FAMILY_STATUS', 'NAME_HOUSING_TYPE',
                      'AGE_YEARS', 'YEARS_EMPLOYED', 'OCCUPATION_TYPE', 'CNT_FAM_MEMBERS', 'STATUS']]

final_df['label'] = final_df['STATUS'].apply(lambda x: 1 if x == 'approved' else 0)

final_df.drop(['ID', 'STATUS'], axis=1, inplace=True)

categorical_cols = ['CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 'NAME_INCOME_TYPE',
                    'NAME_EDUCATION_TYPE', 'NAME_FAMILY_STATUS', 'NAME_HOUSING_TYPE', 'OCCUPATION_TYPE']
final_df = pd.get_dummies(final_df, columns=categorical_cols, drop_first=True)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_df['label'] = final_df['STATUS'].apply(lambda x: 1 if x == 'approved' else 0)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_df.drop(['ID', 'STATUS'], axis=1, inplace=True)


In [5]:

X = final_df.drop('label', axis=1)
y = final_df['label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)


scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


In [7]:

nn_model = Sequential([
    Dense(64, input_shape=(X_train.shape[1],), activation='relu'),
    Dropout(0.3),
    Dense(32, activation='relu'),
    Dropout(0.3),
    Dense(1, activation='sigmoid')
])



  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [11]:

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier


application_data = pd.read_csv('/content/application_record.csv')
credit_data = pd.read_csv('/content/credit_record.csv')


merged_data = pd.merge(application_data, credit_data, on='ID', how='left')

merged_data['Delinquency'] = merged_data['STATUS'].apply(lambda x: 1 if x in ['2', '3', '4', '5'] else 0)

target_data = merged_data.groupby('ID')['Delinquency'].max().reset_index()
final_data = pd.merge(application_data, target_data, on='ID', how='left')

final_data['Delinquency'] = final_data['Delinquency'].fillna(0)

final_data = final_data.drop(['ID', 'NAME_INCOME_TYPE', 'NAME_EDUCATION_TYPE',
                              'NAME_FAMILY_STATUS', 'NAME_HOUSING_TYPE', 'OCCUPATION_TYPE'], axis=1)


final_data = pd.get_dummies(final_data, drop_first=True)
final_data.dropna(inplace=True)



In [12]:

X = final_data.drop('Delinquency', axis=1)
y = final_data['Delinquency']

smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.3, random_state=42)

xgb_model = XGBClassifier(
    scale_pos_weight=len(y_train[y_train == 0]) / len(y_train[y_train == 1]),
    objective='binary:logistic',
    eval_metric='logloss',
    use_label_encoder=False,
    n_estimators=500,
    learning_rate=0.1,
    max_depth=6,
    random_state=42
)



In [14]:

xgb_model.fit(X_train, y_train)

y_pred_xgb = xgb_model.predict(X_test)

accuracy_xgb = accuracy_score(y_test, y_pred_xgb)
classification_rep_xgb = classification_report(y_test, y_pred_xgb)

print("Random forest acuracy:", accuracy_xgb)
print("Classification Report:\n", classification_rep_xgb)

results = X_test.copy()
results['Actual Delinquency'] = y_test.values
results['Predicted Delinquency'] = y_pred_xgb

Parameters: { "use_label_encoder" } are not used.



Random forest acuracy: 0.9764999143721577
Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.97      0.98    131337
           1       0.97      0.98      0.98    131428

    accuracy                           0.98    262765
   macro avg       0.98      0.98      0.98    262765
weighted avg       0.98      0.98      0.98    262765

Predicted results saved to 'predicted_results_xgb.csv'


In [16]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier


application_data = pd.read_csv('/content/application_record.csv')
credit_data = pd.read_csv('/content/credit_record.csv')

merged_data = pd.merge(application_data, credit_data, on='ID', how='left')

merged_data['Delinquency'] = merged_data['STATUS'].apply(lambda x: 1 if x in ['0','1','2', '3', '4', '5'] else 0)

target_data = merged_data.groupby('ID')['Delinquency'].max().reset_index()
final_data = pd.merge(application_data, target_data, on='ID', how='left')

final_data['Delinquency'] = final_data['Delinquency'].fillna(0)

final_data = final_data.drop(['ID', 'NAME_INCOME_TYPE', 'NAME_EDUCATION_TYPE',
                              'NAME_FAMILY_STATUS', 'NAME_HOUSING_TYPE', 'OCCUPATION_TYPE'], axis=1)

final_data = pd.get_dummies(final_data, drop_first=True)
final_data.dropna(inplace=True)

X = final_data.drop('Delinquency', axis=1)
y = final_data['Delinquency']

smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.3, random_state=0)

xgb_model = XGBClassifier(
    scale_pos_weight=len(y_train[y_train == 0]) / len(y_train[y_train == 1]),
    objective='binary:logistic',
    eval_metric='logloss',
    use_label_encoder=False,
    n_estimators=800,
    learning_rate=0.1,
    max_depth=6,
    random_state=42
)


xgb_model.fit(X_train, y_train)

y_pred_xgb = xgb_model.predict(X_test)

accuracy_xgb = accuracy_score(y_test, y_pred_xgb)
classification_rep_xgb = classification_report(y_test, y_pred_xgb)

print("Random forest Accuracy:", accuracy_xgb)
print("Classification Report:\n", classification_rep_xgb)

results = X_test.copy()
results['Actual Delinquency'] = y_test.values
results['Predicted Delinquency'] = y_pred_xgb


Parameters: { "use_label_encoder" } are not used.



Random forest Accuracy: 0.8614004665215449
Classification Report:
               precision    recall  f1-score   support

           0       0.84      0.89      0.86    121904
           1       0.88      0.83      0.86    122029

    accuracy                           0.86    243933
   macro avg       0.86      0.86      0.86    243933
weighted avg       0.86      0.86      0.86    243933

Predicted results saved to 'predicted_results_xgb.csv'
