In [187]:
import pandas as pd
import numpy as np


train=pd.read_csv('cs-training.csv')
test=pd.read_csv('cs-test.csv')



In [188]:
## Check the columns with outliers or control risks to revise the train data for the modeling.
## The columns to be assessed are those 8 columns
#1.'RevolvingUtilizationOfUnsecuredLines'
#2. 'Age'
#3. 'DebtRatio'
#4. 'MonthlyIncome'
#5. 'NumberOfDependents'
#6. 'NumberOfTime30-59DaysPastDueNotWorse'
#7. 'NumberOfTime60-89DaysPastDueNotWorse'
#8. 'NumberOfTimes90DaysLate'

In [189]:
## Assessment 1 - Revolving Utilization of Unsecured Lines

train['RevolvingUtilizationOfUnsecuredLines'].describe()

count    150000.000000
mean          6.048438
std         249.755371
min           0.000000
25%           0.029867
50%           0.154181
75%           0.559046
max       50708.000000
Name: RevolvingUtilizationOfUnsecuredLines, dtype: float64

In [190]:
test['RevolvingUtilizationOfUnsecuredLines'].describe()

## Both train & test data has Revolving Utilization of Unsecured Lines over 1.
## Though it implies the organization has credit limit control risk,
#  we need to use the data since the risk actually lies in the organization.

count    101503.000000
mean          5.310000
std         196.156039
min           0.000000
25%           0.030131
50%           0.152586
75%           0.564225
max       21821.000000
Name: RevolvingUtilizationOfUnsecuredLines, dtype: float64

In [191]:
## Assessment 2 - Age

train['age'].describe()

count    150000.000000
mean         52.295207
std          14.771866
min           0.000000
25%          41.000000
50%          52.000000
75%          63.000000
max         109.000000
Name: age, dtype: float64

In [192]:
test['age'].describe()

# The train data has an age value under 19, which is 0.
# The minimum age of test data is 21.
# Therefore, it is adequate to remove age = 0 line in the test data for the better tuned modeling.

count    101503.000000
mean         52.405436
std          14.779756
min          21.000000
25%          41.000000
50%          52.000000
75%          63.000000
max         104.000000
Name: age, dtype: float64

In [193]:
## Assessment 3 - Debt Ratio

train['DebtRatio'].describe()

count    150000.000000
mean        353.005076
std        2037.818523
min           0.000000
25%           0.175074
50%           0.366508
75%           0.868254
max      329664.000000
Name: DebtRatio, dtype: float64

In [194]:
test['DebtRatio'].describe()

# Both train & test data has Revolving Utilization of Unsecured Lines over 1.
# Though it implies the organization has Debt to Income control risk,
#  we need to use the data since the risk actually lies in the organization.

count    101503.000000
mean        344.475020
std        1632.595231
min           0.000000
25%           0.173423
50%           0.364260
75%           0.851619
max      268326.000000
Name: DebtRatio, dtype: float64

In [195]:
## Assessment 4 - Monthly Income-missing value

null_counts_MonthlyIncome_train = train.MonthlyIncome.isna().sum()
print(null_counts_MonthlyIncome_train)

null_counts_MonthlyIncome_test = test.MonthlyIncome.isna().sum()
print(null_counts_MonthlyIncome_test)


29731
20103


In [196]:
# Remove rows with null values in 'MonthlyIncome' from train
train_dropna_MonthlyIncome = train.dropna(subset=['MonthlyIncome'])

# Calculate the median of the 'MonthlyIncome' column for train
median_monthly_income_train = train_cleaned['MonthlyIncome'].median()


# Remove rows with null values in 'MonthlyIncome' from test
test_dropna_MonthlyIncome = test.dropna(subset=['MonthlyIncome'])

# Calculate the median of the 'MonthlyIncome' column for test
median_monthly_income_test = test_cleaned['MonthlyIncome'].median()

print("Median Monthly Income for Train:", median_monthly_income_train)
print("Median Monthly Income for Test:", median_monthly_income_test)

Median Monthly Income for Train: 5400.0
Median Monthly Income for Test: 5400.0


In [197]:
#Replace the missing value in the training data with 5400

train['MonthlyIncome'].fillna(5400, inplace=True)

# Calculate the median of the 'MonthlyIncome' column
median_monthly_income_train = train['MonthlyIncome'].median()

print("Median Monthly Income_train:", median_monthly_income_train)

Median Monthly Income_train: 5400.0


In [198]:
#Replace the missing value in the test data with 5400

test['MonthlyIncome'].fillna(5400, inplace=True)

# Calculate the median of the 'MonthlyIncome' column
median_monthly_income_test = test['MonthlyIncome'].median()

print("Median Monthly Income_test:", median_monthly_income_test)

Median Monthly Income_test: 5400.0


In [199]:
## Assessment 5 - NumberOfDependents-missing value

null_counts_NumberOfDependents_train = train.NumberOfDependents.isna().sum()
print(null_counts_NumberOfDependents_train)

null_counts_NumberOfDependents_test = test.NumberOfDependents.isna().sum()
print(null_counts_NumberOfDependents_test)


3924
2626


In [200]:
# Remove rows with null values in 'NumberOfDependents' from train
train_dropna_dependents = train.dropna(subset=['NumberOfDependents'])

# Calculate the median of the 'NumberOfDependents' column for train
median_monthly_income_train = train_cleaned['NumberOfDependents'].median()


# Remove rows with null values in 'NumberOfDependents' from test
test_dropna_dependents = test.dropna(subset=['NumberOfDependents'])

# Calculate the median of the 'NumberOfDependents' column for test
median_monthly_income_test = test_cleaned['NumberOfDependents'].median()

print("Median Number Of Dependents for Train:", median_monthly_income_train)
print("Median Number O fDependents for Test:", median_monthly_income_test)

Median Number Of Dependents for Train: 0.0
Median Number O fDependents for Test: 0.0


In [201]:
#Replace the missing value in the training data with 0

train['NumberOfDependents'].fillna(0, inplace=True)

# Calculate the median of the 'NumberOfDependents' column
median_NumberOfDependents_train = train['NumberOfDependents'].median()

print("Median NumberOfDependents_train:", median_NumberOfDependents_train)

Median NumberOfDependents_train: 0.0


In [202]:
#Replace the missing value in the test data with 0

test['NumberOfDependents'].fillna(0, inplace=True)

# Calculate the median of the 'NumberOfDependents' column
median_NumberOfDependents_test = test['NumberOfDependents'].median()

print("Median NumberOfDependents_test:", median_NumberOfDependents_test)

Median NumberOfDependents_test: 0.0


In [203]:
## Assessment 6 - NumberOfTime30-59DaysPastDueNotWorse

train['NumberOfTime30-59DaysPastDueNotWorse'].describe()

count    150000.000000
mean          0.421033
std           4.192781
min           0.000000
25%           0.000000
50%           0.000000
75%           0.000000
max          98.000000
Name: NumberOfTime30-59DaysPastDueNotWorse, dtype: float64

In [204]:
test['NumberOfTime30-59DaysPastDueNotWorse'].describe()

# Both have extra ordinary often past due history and implies the internal control risks.
# But we need to use the data since the risk actually lies in the organization.

count    101503.000000
mean          0.453770
std           4.538487
min           0.000000
25%           0.000000
50%           0.000000
75%           0.000000
max          98.000000
Name: NumberOfTime30-59DaysPastDueNotWorse, dtype: float64

In [205]:
## Assessment 7 - NumberOfTime60-89DaysPastDueNotWorse

train['NumberOfTime60-89DaysPastDueNotWorse'].describe()

count    150000.000000
mean          0.240387
std           4.155179
min           0.000000
25%           0.000000
50%           0.000000
75%           0.000000
max          98.000000
Name: NumberOfTime60-89DaysPastDueNotWorse, dtype: float64

In [206]:
test['NumberOfTime60-89DaysPastDueNotWorse'].describe()

# Both have extra ordinary often past due history and implies the internal control risks.
# But we need to use the data since the risk actually lies in the organization.

count    101503.000000
mean          0.270317
std           4.503578
min           0.000000
25%           0.000000
50%           0.000000
75%           0.000000
max          98.000000
Name: NumberOfTime60-89DaysPastDueNotWorse, dtype: float64

In [207]:
## Assessment 8 - NumberOfTimes90DaysLate
train['NumberOfTimes90DaysLate'].describe()

count    150000.000000
mean          0.265973
std           4.169304
min           0.000000
25%           0.000000
50%           0.000000
75%           0.000000
max          98.000000
Name: NumberOfTimes90DaysLate, dtype: float64

In [208]:
test['NumberOfTimes90DaysLate'].describe()

# Both have extra ordinary often past due history and implies the internal control risks.
# But we need to use the data since the risk actually lies in the organization.

count    101503.000000
mean          0.296691
std           4.515859
min           0.000000
25%           0.000000
50%           0.000000
75%           0.000000
max          98.000000
Name: NumberOfTimes90DaysLate, dtype: float64

In [209]:
## Remove Age = 0 line in the train data before the modeling.

train_Final = train[train['age'] != 0]
train_Final['age'].describe()

count    149999.000000
mean         52.295555
std          14.771298
min          21.000000
25%          41.000000
50%          52.000000
75%          63.000000
max         109.000000
Name: age, dtype: float64

In [210]:
train_Final.describe()

Unnamed: 0.1,Unnamed: 0,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
count,149999.0,149999.0,149999.0,149999.0,149999.0,149999.0,149999.0,149999.0,149999.0,149999.0,149999.0,149999.0
mean,75000.56203,0.06684,6.048472,52.295555,0.421029,353.007426,6418.458,8.452776,0.265975,1.018233,0.240388,0.737405
std,43301.552202,0.249746,249.756203,14.771298,4.192795,2037.825113,12890.44,5.145964,4.169318,1.129772,4.155193,1.10702
min,1.0,0.0,0.0,21.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,37500.5,0.0,0.029867,41.0,0.0,0.175074,3903.0,5.0,0.0,0.0,0.0,0.0
50%,75001.0,0.0,0.154176,52.0,0.0,0.366503,5400.0,8.0,0.0,1.0,0.0,0.0
75%,112500.5,0.0,0.559044,63.0,0.0,0.868257,7400.0,11.0,0.0,2.0,0.0,1.0
max,150000.0,1.0,50708.0,109.0,98.0,329664.0,3008750.0,58.0,98.0,54.0,98.0,20.0


In [211]:
##### Create Random Forest Model #####

import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score



# Define features and target
features = train_Final.drop('SeriousDlqin2yrs', axis=1)
target = train_Final['SeriousDlqin2yrs']

In [212]:
# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(features, target, test_size=0.2, random_state=42)

In [213]:
# Create and train a Random Forest classifier
rf_classifier = RandomForestClassifier(random_state=42)
rf_classifier.fit(X_train, y_train)

RandomForestClassifier(random_state=42)

In [214]:
# Evaluate the model on the validation set
val_predictions = rf_classifier.predict(X_val)
accuracy = accuracy_score(y_val, val_predictions)
print("Validation Accuracy:", accuracy)

Validation Accuracy: 0.9359


In [215]:
# Use the trained model to predict the 'SeriousDlqin2yrs' column in the test data
test_features = test.drop('SeriousDlqin2yrs', axis=1)
test_predictions = rf_classifier.predict(test_features)

In [216]:
# Add the predictions as a new column in the test data
test['Predicted_SeriousDlqin2yrs'] = test_predictions

In [217]:
# Save the test data with predictions to a CSV file
test.to_csv('cs-test-predictions.csv', index=False)