In [None]:
import pandas as pd

# Read the three CSV parts; each one is kept under its own name
file0, file1, file2 = map(pd.read_csv, ('file0.csv', 'file1.csv', 'file2.csv'))

# Stack the parts row-wise into one frame with a fresh 0..n-1 index
data = pd.concat((file0, file1, file2), ignore_index=True)

# (rows, columns) of the combined dataset
data.shape

## Data Preprocessing

In [None]:
# Per-column count of nulls, and the same count as a share of all rows
missing_values = data.isnull().sum()
missing_percentage = missing_values / len(data) * 100

# Put both measures side by side, one row per column of `data`
missing_data = pd.concat(
    [missing_values, missing_percentage],
    axis=1,
    keys=['Missing Values', 'Percentage'],
)

# Worst-affected columns first
missing_data.sort_values('Percentage', ascending=False)

In [None]:
# Per-column count of nulls, and the same count as a share of all rows
missing_values = data.isnull().sum()
missing_percentage = missing_values / len(data) * 100

# Build the summary table, then show only columns that actually have nulls,
# ordered from most to least affected
missing_data = missing_values.to_frame('Missing Values').assign(
    **{'Percentage (%)': missing_percentage}
)
(
    missing_data
    .loc[missing_data['Missing Values'] > 0]
    .sort_values('Percentage (%)', ascending=False)
)

In [None]:
# Handle missing values
#
# The filled Series is assigned back to the column rather than calling
# `data[col].fillna(..., inplace=True)`: that form operates on an intermediate
# object, emits a chained-assignment warning on pandas 2.x, and leaves `data`
# unchanged when Copy-on-Write is active.

# Missing charge-off date -> explicit placeholder string
data['ChgOffDate'] = data['ChgOffDate'].fillna('Not Written Off')

# Missing state -> most frequent state in the dataset
most_frequent_state = data['State'].mode()[0]
data['State'] = data['State'].fillna(most_frequent_state)

# Verify if all missing values have been handled (total nulls left in `data`)
data.isnull().sum().sum()

In [None]:
# Recount nulls per column and keep only the columns that still have some
missing_values_updated = data.isnull().sum()
missing_data_updated = missing_values_updated.loc[missing_values_updated.gt(0)]
missing_data_updated

In [None]:
# Handle missing values
#
# Every fill is assigned back to its column. Calling
# `data[col].fillna(..., inplace=True)` works on an intermediate object: it
# warns about chained assignment on pandas 2.x and does not modify `data`
# when Copy-on-Write is active.

# Fill 'ChgOffDate' with a placeholder
data['ChgOffDate'] = data['ChgOffDate'].fillna('Not Written Off')

# Fill categorical columns with mode
# NOTE(review): 'MIS_Status' is used as the prediction target further down;
# imputing labels with the mode may not be desirable — confirm whether rows
# with a missing target should be dropped instead.
data['RevLineCr'] = data['RevLineCr'].fillna(data['RevLineCr'].mode()[0])
data['MIS_Status'] = data['MIS_Status'].fillna(data['MIS_Status'].mode()[0])

# Fill 'Name' with 'Unknown'
data['Name'] = data['Name'].fillna('Unknown')

# Convert 'DisbursementDate' to datetime format; values that cannot be parsed
# become NaT because of errors='coerce'
data['DisbursementDate'] = pd.to_datetime(data['DisbursementDate'], errors='coerce')

# Fill 'DisbursementDate' (including anything coerced to NaT) with the median date
median_date = data['DisbursementDate'].median()
data['DisbursementDate'] = data['DisbursementDate'].fillna(pd.Timestamp(median_date))

# Check if there are any missing values left
data.isnull().sum().sum()

In [None]:
# Handle missing values for the identified columns
#
# Fills are assigned back to the column: `data[col].fillna(..., inplace=True)`
# acts on an intermediate object, warns on pandas 2.x and is a no-op under
# Copy-on-Write.
for column in ['BankState', 'MIS_Status', 'City', 'Name']:
    # Replace nulls with the column's most frequent value
    data[column] = data[column].fillna(data[column].mode()[0])

# 'DisbursementDate' is a date column that the feature-engineering step
# subtracts from, so it must stay datetime. Filling it with the string
# 'Unknown' would turn it into a mixed-type column; instead, parse it
# (a no-op if it is already datetime) and fill any gaps with the median date.
disbursement_date = pd.to_datetime(data['DisbursementDate'], errors='coerce')
data['DisbursementDate'] = disbursement_date.fillna(disbursement_date.median())

# Verify if all missing values have been handled
data.isnull().sum().sum()

In [None]:
# Null count per column, narrowed to the columns where the count is non-zero
remaining_missing_values = data.isnull().sum()
remaining_missing_data = remaining_missing_values.loc[lambda counts: counts > 0]
remaining_missing_data

In [None]:
# List the columns that still contain nulls, with their null counts
remaining_missing = data.isnull().sum()
remaining_missing.loc[remaining_missing.gt(0)]

In [None]:
# Handle missing values for the remaining columns
#
# Assign each filled Series back to its column. The chained form
# `data[col].fillna(..., inplace=True)` modifies an intermediate object, warns
# on pandas 2.x and leaves `data` untouched under Copy-on-Write.
for column in ['LowDoc', 'Bank', 'NewExist']:
    # Replace nulls with the column's most frequent value
    data[column] = data[column].fillna(data[column].mode()[0])

# Verify if all missing values have been handled
data.isnull().sum().sum()

In [None]:
# Handle the remaining missing values
#
# Fills are written back with plain assignment; the chained
# `data[col].fillna(..., inplace=True)` form warns on pandas 2.x and has no
# effect on `data` once Copy-on-Write is enabled.

# Fill 'NewExist' and 'LowDoc' with mode
data['NewExist'] = data['NewExist'].fillna(data['NewExist'].mode()[0])
data['LowDoc'] = data['LowDoc'].fillna(data['LowDoc'].mode()[0])

# Fill 'Bank' with 'Unknown'
data['Bank'] = data['Bank'].fillna('Unknown')

# Check if there are any missing values left
data.isnull().sum().sum()

## Data Exploration and Visualization

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Use seaborn's white-grid look for the figures
sns.set(style="whitegrid")

# Horizontal bar chart: one bar per 'MIS_Status' value, length = row count
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(data=data, y='MIS_Status', palette='pastel', ax=ax)
ax.set_title('Distribution of Loan Status')
ax.set_xlabel('Count')
ax.set_ylabel('Loan Status')
plt.show()

In [None]:
# Handle the remaining missing values
#
# Each fill is assigned back to the column instead of using
# `data[col].fillna(..., inplace=True)`, which acts on an intermediate object
# (chained-assignment warning on pandas 2.x, no effect under Copy-on-Write).

# Fill 'NewExist' with mode
data['NewExist'] = data['NewExist'].fillna(data['NewExist'].mode()[0])

# Fill 'Bank' with 'Unknown Bank'
data['Bank'] = data['Bank'].fillna('Unknown Bank')

# Fill 'LowDoc' with mode
data['LowDoc'] = data['LowDoc'].fillna(data['LowDoc'].mode()[0])

# Check if there are any missing values left
data.isnull().sum().sum()

In [None]:
# Preview the top five rows to get a feel for the columns and their values
data.head(n=5)

## Feature Engineering

In [None]:
# Convert 'GrAppv' to numeric format
# - regex=False removes the literal '$' and ','. With regex matching (the
#   default before pandas 2.0) '$' is an end-of-string anchor, so the dollar
#   sign would be left in place and the float conversion would fail.
# - The dtype guard keeps the cell re-runnable: once the column is numeric the
#   `.str` accessor is no longer available.
if not pd.api.types.is_numeric_dtype(data['GrAppv']):
    data['GrAppv'] = (
        data['GrAppv']
        .str.replace('$', '', regex=False)
        .str.replace(',', '', regex=False)
        .astype(float)
    )

# Create 'LoanDuration' feature: whole days from approval to disbursement.
# Both sides are parsed explicitly because no earlier visible cell converts
# 'ApprovalDate', and subtracting unparsed values from datetimes raises.
# Unparseable dates become NaT, which yields NaN durations for those rows.
# NOTE(review): if the dates use two-digit years, check that the parsed
# century is the intended one.
approval_date = pd.to_datetime(data['ApprovalDate'], errors='coerce')
disbursement_date = pd.to_datetime(data['DisbursementDate'], errors='coerce')
data['LoanDuration'] = (disbursement_date - approval_date).dt.days

# Create 'LoanAmountPerEmployee' feature
# Handle division by zero: x/0 gives +/-inf and 0/0 gives NaN, so besides
# replacing infinities every row with NoEmp == 0 is set to 0. The result is
# assigned back rather than replaced in place on an intermediate object.
data['LoanAmountPerEmployee'] = (
    (data['GrAppv'] / data['NoEmp'])
    .replace([float('inf'), -float('inf')], 0)
    .where(data['NoEmp'] != 0, 0)
)

# Create 'JobImpact' feature (jobs created minus jobs retained)
data['JobImpact'] = data['CreateJob'] - data['RetainedJob']

# Display the first few rows of the dataset with the new features
data[['LoanDuration', 'LoanAmountPerEmployee', 'JobImpact']].head()

## Data Visualization

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# White-grid background for the plots
sns.set_style('whitegrid')

# Histogram (50 bins) of approved loan amounts with a KDE curve overlaid
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(data['GrAppv'], bins=50, color='blue', kde=True, ax=ax)
ax.set_title('Distribution of Loan Amounts')
ax.set_xlabel('Loan Amount')
ax.set_ylabel('Frequency')
plt.show()

In [None]:
# Bar chart of how many loans fall under each 'MIS_Status' value
plt.figure(figsize=(8, 6))
status_ax = sns.countplot(data=data, x='MIS_Status', palette='viridis')
status_ax.set_title('Loan Status Distribution')
status_ax.set_xlabel('Loan Status')
status_ax.set_ylabel('Count')
plt.show()

In [None]:
# Histogram (50 bins, KDE overlay) of the engineered 'LoanDuration' column
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(data['LoanDuration'], bins=50, color='green', kde=True, ax=ax)
ax.set_title('Distribution of Loan Durations')
ax.set_xlabel('Loan Duration (days)')
ax.set_ylabel('Frequency')
plt.show()

In [None]:
# Histogram (50 bins, KDE overlay) of the engineered 'JobImpact' column
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(data['JobImpact'], bins=50, color='purple', kde=True, ax=ax)
ax.set_title('Distribution of Job Impact')
ax.set_xlabel('Job Impact (Jobs Created - Jobs Retained)')
ax.set_ylabel('Frequency')
plt.show()

In [None]:
# Count of loans per 'MIS_Status' value, drawn as vertical bars
fig, ax = plt.subplots(figsize=(8, 6))
sns.countplot(x='MIS_Status', data=data, palette='viridis', ax=ax)
ax.set_title('Loan Status Distribution')
ax.set_xlabel('Loan Status')
ax.set_ylabel('Count')
plt.show()

In [None]:
# Distribution of 'LoanDuration' (50-bin histogram with a KDE curve)
plt.figure(figsize=(10, 6))
duration_ax = sns.histplot(data['LoanDuration'], bins=50, color='green', kde=True)
duration_ax.set_title('Distribution of Loan Durations (Days between Approval and Disbursement)')
duration_ax.set_xlabel('Days')
duration_ax.set_ylabel('Frequency')
plt.show()

In [None]:
# Distribution of 'JobImpact' (50-bin histogram with a KDE curve)
plt.figure(figsize=(10, 6))
impact_ax = sns.histplot(data['JobImpact'], bins=50, color='purple', kde=True)
impact_ax.set_title('Distribution of Job Impact (Difference between Jobs Created and Retained)')
impact_ax.set_xlabel('Job Impact')
impact_ax.set_ylabel('Frequency')
plt.show()

## Model Building

In [None]:
# Feature Selection

# Identifier, free-text, location and raw date columns excluded from modelling
drop_columns = [
    'LoanNr_ChkDgt',
    'Name',
    'City',
    'State',
    'Zip',
    'Bank',
    'BankState',
    'NAICS',
    'ApprovalDate',
    'DisbursementDate',
    'ChgOffDate',
]
data_cleaned = data.drop(drop_columns, axis=1)

# 'MIS_Status' is the target; every other remaining column is a feature
y = data_cleaned['MIS_Status']
X = data_cleaned.drop('MIS_Status', axis=1)

# Preview the feature matrix
X.head()

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder

# Encode categorical variables
# Each object-dtype column gets its own LabelEncoder, kept in `label_encoders`
# so the integer codes can be mapped back to the original values later.
label_encoders = {}
for column in X.select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    # Cast to str before fitting: LabelEncoder requires a column to be
    # uniformly strings or uniformly numbers and raises on an object column
    # that mixes the two.
    X[column] = le.fit_transform(X[column].astype(str))
    label_encoders[column] = le

# Split the data into training and testing sets (80/20, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale the features: statistics are learned on the training split only and
# then applied to the test split. Both results are NumPy arrays, so column
# names are no longer attached after this point.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Display the shape of the training and testing sets
X_train.shape, X_test.shape

In [None]:
# Find the feature columns whose values are not all of one Python type
mixed_type_columns = [
    name for name in X.columns
    if X[name].apply(type).nunique() > 1
]

# Make those columns uniformly string-typed
X = X.astype({name: str for name in mixed_type_columns})

# Label-encode every object-dtype column, remembering the fitted encoder
object_columns = X.select_dtypes(include=['object']).columns
for name in object_columns:
    encoder = LabelEncoder()
    X[name] = encoder.fit_transform(X[name])
    label_encoders[name] = encoder

# 80/20 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Standardise features: fit on the training split, reuse on the test split
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Shapes of the two splits
X_train.shape, X_test.shape

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

# Fit a logistic-regression classifier on the training split
logreg = LogisticRegression(random_state=42, max_iter=1000).fit(X_train, y_train)

# Predicted labels for the test split
y_pred_logreg = logreg.predict(X_test)

# Accuracy plus the per-class text report
accuracy_logreg, classification_report_logreg = (
    accuracy_score(y_test, y_pred_logreg),
    classification_report(y_test, y_pred_logreg),
)

accuracy_logreg, classification_report_logreg

In [None]:
# Draw a stratified subsample holding 10% of the training rows
sample_size = int(len(X_train) * 0.1)
X_train_sampled, _, y_train_sampled, _ = train_test_split(
    X_train,
    y_train,
    train_size=sample_size,
    stratify=y_train,
    random_state=42,
)

# Fit logistic regression on the subsample only
logreg_sampled = LogisticRegression(random_state=42, max_iter=1000)
logreg_sampled.fit(X_train_sampled, y_train_sampled)

# Predicted labels for the (full) test split
y_pred_logreg_sampled = logreg_sampled.predict(X_test)

# Accuracy plus the per-class text report
accuracy_logreg_sampled, classification_report_logreg_sampled = (
    accuracy_score(y_test, y_pred_logreg_sampled),
    classification_report(y_test, y_pred_logreg_sampled),
)

accuracy_logreg_sampled, classification_report_logreg_sampled

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Fit a 100-tree random forest on the subsampled training data, using all cores
rf = RandomForestClassifier(n_jobs=-1, random_state=42, n_estimators=100)
rf.fit(X_train_sampled, y_train_sampled)

# Predicted labels for the test split
y_pred_rf = rf.predict(X_test)

# Accuracy plus the per-class text report
accuracy_rf, classification_report_rf = (
    accuracy_score(y_test, y_pred_rf),
    classification_report(y_test, y_pred_rf),
)

accuracy_rf, classification_report_rf

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

# Logistic regression on the full training split
logreg = LogisticRegression(
    max_iter=1000,
    random_state=42,
)
logreg = logreg.fit(X_train, y_train)

# Predicted labels for the test split
y_pred_logreg = logreg.predict(X_test)

# Compare predictions with the true test labels
classification_report_logreg = classification_report(y_test, y_pred_logreg)
accuracy_logreg = accuracy_score(y_test, y_pred_logreg)

accuracy_logreg, classification_report_logreg

In [None]:
from sklearn.ensemble import RandomForestClassifier

# 100-tree random forest on the full training split, using all cores
rf = RandomForestClassifier(n_jobs=-1, random_state=42, n_estimators=100).fit(X_train, y_train)

# Predicted labels for the test split
y_pred_rf = rf.predict(X_test)

# Accuracy plus the per-class text report
accuracy_rf, classification_report_rf = (
    accuracy_score(y_test, y_pred_rf),
    classification_report(y_test, y_pred_rf),
)

accuracy_rf, classification_report_rf

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# Gradient boosting with 100 boosting stages on the full training split
gb = GradientBoostingClassifier(random_state=42, n_estimators=100).fit(X_train, y_train)

# Predicted labels for the test split
y_pred_gb = gb.predict(X_test)

# Accuracy plus the per-class text report
accuracy_gb, classification_report_gb = (
    accuracy_score(y_test, y_pred_gb),
    classification_report(y_test, y_pred_gb),
)

accuracy_gb, classification_report_gb

In [None]:
import shap

# Initialize the SHAP explainer for the fitted random forest
explainer = shap.TreeExplainer(rf)

# Compute SHAP values for a sample of the test set (for computational efficiency)
X_test_sample = X_test[:1000]
shap_values = explainer.shap_values(X_test_sample)

# Pick the attributions for the class at index 1 of the model's outputs.
# Depending on the installed SHAP release a classifier's values come back
# either as a list with one (n_samples, n_features) array per class, or as a
# single (n_samples, n_features, n_classes) array. Plain `shap_values[1]` is
# only right for the list form; on the array form it selects the second row.
if isinstance(shap_values, list):
    class1_shap_values = shap_values[1]
elif getattr(shap_values, 'ndim', 2) == 3:
    class1_shap_values = shap_values[:, :, 1]
else:
    class1_shap_values = shap_values

# Plot the SHAP values for the top features. X_test is a plain array after
# scaling, so the column names are passed explicitly from X.
shap.summary_plot(
    class1_shap_values,
    X_test_sample,
    feature_names=list(X.columns),
    plot_type='bar',
)

In [None]:
!pip install shap

In [None]:
import shap
# Initialize the SHAP explainer again
explainer = shap.TreeExplainer(rf)

# Compute SHAP values for a sample of the test set (for computational efficiency)
X_test_sample = X_test[:1000]
shap_values = explainer.shap_values(X_test_sample)

# Pick the attributions for the class at index 1 of the model's outputs.
# SHAP returns either a per-class list of (n_samples, n_features) arrays or,
# in newer releases, one (n_samples, n_features, n_classes) array. Indexing
# with `[1]` is only correct for the list form, so both shapes are handled.
if isinstance(shap_values, list):
    class1_shap_values = shap_values[1]
elif getattr(shap_values, 'ndim', 2) == 3:
    class1_shap_values = shap_values[:, :, 1]
else:
    class1_shap_values = shap_values

# Plot the SHAP values for the top features. X_test is a plain array after
# scaling, so the column names are passed explicitly from X.
shap.summary_plot(
    class1_shap_values,
    X_test_sample,
    feature_names=list(X.columns),
    plot_type='bar',
)

In [None]:
# Compute SHAP values for a sample of the test set (for computational efficiency)
# (`explainer` is the TreeExplainer created in an earlier cell.)
X_test_sample = X_test[:1000]
shap_values = explainer.shap_values(X_test_sample)

# Pick the attributions for the class at index 1 of the model's outputs.
# SHAP returns either a per-class list of (n_samples, n_features) arrays or,
# in newer releases, one (n_samples, n_features, n_classes) array. Indexing
# with `[1]` is only correct for the list form, so both shapes are handled.
if isinstance(shap_values, list):
    class1_shap_values = shap_values[1]
elif getattr(shap_values, 'ndim', 2) == 3:
    class1_shap_values = shap_values[:, :, 1]
else:
    class1_shap_values = shap_values

# Plot the SHAP values with feature names taken from the feature frame X
shap.summary_plot(
    class1_shap_values,
    X_test_sample,
    feature_names=list(X.columns),
    plot_type='bar',
)