In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Read the three CSV extracts; each one keeps its own name for later inspection.
file0, file1, file2 = (
    pd.read_csv(csv_path)
    for csv_path in ('file0.csv', 'file1.csv', 'file2.csv')
)

# Stack the extracts row-wise, then renumber the rows from zero.
data = pd.concat((file0, file1, file2), axis=0)
data = data.reset_index(drop=True)

# Preview the combined frame.
data.head()

### Data Preprocessing

In [None]:
# Null count per column, and the same count as a percentage of all rows.
missing_data = data.isnull().sum()
missing_percentage = missing_data / len(data) * 100

# Summary table restricted to columns that actually have nulls, worst first.
missing_df = pd.DataFrame(
    {
        'Missing Values': missing_data,
        '% of Total Values': missing_percentage,
    }
)
missing_df = (
    missing_df
    .loc[missing_df['Missing Values'].ne(0)]
    .sort_values(by='% of Total Values', ascending=False)
)
missing_df

In [None]:
# Fill missing values for categorical columns with their mode.
# The filled Series is assigned back to the frame: calling
# fillna(inplace=True) on data[column] is chained assignment, which pandas
# warns about and which leaves `data` untouched under Copy-on-Write.
categorical_columns = ['Bank', 'MIS_Status', 'LowDoc', 'RevLineCr']
for column in categorical_columns:
    data[column] = data[column].fillna(data[column].mode()[0])

# Fill missing values for date columns with a placeholder value.
# NOTE(review): 'Unknown' is not a parseable date; later cells replace it
# with NaN again before computing medians. Confirm the placeholder is wanted.
data['DisbursementDate'] = data['DisbursementDate'].fillna('Unknown')
data['ChgOffDate'] = data['ChgOffDate'].fillna('Unknown')

# Check if there are any missing values left
data.isnull().sum()

In [None]:
# Fill missing values for the 'NewExist' column with its mode.
# Assign the result back instead of using a chained fillna(inplace=True),
# which does not update `data` under pandas Copy-on-Write.
data['NewExist'] = data['NewExist'].fillna(data['NewExist'].mode()[0])

# Total number of missing values left across the whole frame
data.isnull().sum().sum()

In [None]:
# Names of the columns that still contain at least one null ...
columns_with_missing = data.columns[data.isna().any()]

# ... and how many nulls each of those columns holds.
missing_data = data.loc[:, columns_with_missing].isna().sum()
missing_data

In [None]:
# Fill missing values for categorical columns with mode.
# Results are assigned back to the frame; a chained fillna(inplace=True)
# does not update `data` under pandas Copy-on-Write.
categorical_columns = ['Bank', 'MIS_Status', 'LowDoc', 'RevLineCr']
for column in categorical_columns:
    mode_val = data[column].mode()[0]
    data[column] = data[column].fillna(mode_val)

# Fill missing values for date columns with median date.
# errors='coerce' turns unparseable entries (such as an 'Unknown'
# placeholder) into NaT so the median skips them instead of the parse
# raising. Only genuine nulls are filled; placeholder strings are left as is.
date_columns = ['DisbursementDate', 'ChgOffDate']
for column in date_columns:
    median_date = pd.to_datetime(data[column].dropna(), errors='coerce').median()
    data[column] = data[column].fillna(median_date)

# Check if there are any missing values left
data.isnull().sum().sort_values(ascending=False)

In [None]:
# Collect the distinct non-null values of both date columns for a manual look.
disbursement_dates, chgoff_dates = (
    data[date_column].dropna().unique()
    for date_column in ('DisbursementDate', 'ChgOffDate')
)

disbursement_dates, chgoff_dates

In [None]:
# Replace 'Unknown' with NaN in 'ChgOffDate' column.
# Assigned back rather than replace(inplace=True) on a column selection,
# which does not update `data` under pandas Copy-on-Write.
data['ChgOffDate'] = data['ChgOffDate'].replace('Unknown', np.nan)

# Fill missing values for date columns with median date.
# errors='coerce' keeps the median computable for a column that still holds
# unparseable placeholders (they become NaT and are skipped).
for column in date_columns:
    median_date = pd.to_datetime(data[column].dropna(), errors='coerce').median()
    data[column] = data[column].fillna(median_date)

# Check if there are any missing values left
data.isnull().sum().sort_values(ascending=False)

In [None]:
# Check for non-standard date formats in 'DisbursementDate' and 'ChgOffDate' columns
def check_date_format(date_series):
    """Return the distinct non-null values that ``pd.to_datetime`` cannot parse.

    Parameters
    ----------
    date_series : pandas.Series
        Column of candidate date values; nulls are ignored.

    Returns
    -------
    list
        Unparseable values, in order of first appearance in ``date_series``.
        Empty when every value parses.
    """
    non_standard_dates = []
    for date in date_series.dropna().unique():
        try:
            pd.to_datetime(date)
        # Catch only parsing failures; a bare except would also swallow
        # KeyboardInterrupt and SystemExit.
        except (ValueError, TypeError, OverflowError):
            non_standard_dates.append(date)
    return non_standard_dates

# Run the format check on both date columns and show what failed to parse.
non_standard_disbursement_dates, non_standard_chgoff_dates = map(
    check_date_format,
    (data['DisbursementDate'], data['ChgOffDate']),
)

non_standard_disbursement_dates, non_standard_chgoff_dates

In [None]:
# Replace 'Unknown' with NaN in 'DisbursementDate' column.
# Assigned back rather than replace(inplace=True) on a column selection,
# which does not update `data` under pandas Copy-on-Write.
data['DisbursementDate'] = data['DisbursementDate'].replace('Unknown', np.nan)

# Fill missing values for date columns with median date.
# errors='coerce' makes any remaining unparseable entry count as NaT for the
# median instead of aborting the cell.
for column in date_columns:
    median_date = pd.to_datetime(data[column].dropna(), errors='coerce').median()
    data[column] = data[column].fillna(median_date)

# Check if there are any missing values left
data.isnull().sum().sort_values(ascending=False)

### Feature Engineering

In [None]:
def _currency_to_float(amounts):
    """Return ``amounts`` as floats, stripping non-numeric characters from text values.

    Numeric input is only cast to float. Text input has every character other
    than digits and '.' removed before the cast.
    """
    if pd.api.types.is_numeric_dtype(amounts):
        return amounts.astype(float)
    return amounts.str.replace(r'[^\d.]', '', regex=True).astype(float)


# Create 'Loan Duration' feature: days between disbursement and charge-off.
data['DisbursementDate'] = pd.to_datetime(data['DisbursementDate'])
data['ChgOffDate'] = pd.to_datetime(data['ChgOffDate'])
data['Loan Duration'] = (data['ChgOffDate'] - data['DisbursementDate']).dt.days

# Create 'Loan Amount' feature.
# The approval columns may still hold formatted text at this point, and
# subtracting text columns raises. The difference is computed from float
# views, leaving the stored columns unchanged.
data['Loan Amount'] = _currency_to_float(data['GrAppv']) - _currency_to_float(data['SBA_Appv'])

# Create 'Job Impact' feature: jobs created minus jobs retained.
data['Job Impact'] = data['CreateJob'] - data['RetainedJob']

# Display the first few rows of the dataset with the new features
data[['Loan Duration', 'Loan Amount', 'Job Impact']].head()

In [None]:
# Look up the stored dtype of the two approval-amount columns.
data.dtypes.loc[['GrAppv', 'SBA_Appv']]

In [None]:
# Convert 'GrAppv' and 'SBA_Appv' columns to numeric data type.
# - The pattern is a raw string: '\d' in a normal string literal is an
#   invalid escape sequence and triggers a SyntaxWarning on Python 3.12+.
# - Columns that are already numeric are skipped so the cell can be re-run;
#   the .str accessor is only available on text columns.
for column in ['GrAppv', 'SBA_Appv']:
    if not pd.api.types.is_numeric_dtype(data[column]):
        data[column] = data[column].str.replace(r'[^\d.]', '', regex=True).astype(float)

# Create 'Loan Amount' feature
data['Loan Amount'] = data['GrAppv'] - data['SBA_Appv']

# Display the first few rows of the dataset with the new feature
data[['Loan Amount']].head()

### Exploratory Data Analysis (EDA)

In [None]:
# Bar chart of the number of loans per MIS_Status value.
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(data=data, x='MIS_Status', ax=ax)
ax.set_title('Distribution of Loan Statuses')
ax.set_xlabel('Loan Status')
ax.set_ylabel('Number of Loans')
plt.show()

### Data Visualization

In [None]:
# Histogram (50 bins) with a KDE overlay of the 'Loan Amount' feature.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(data['Loan Amount'], bins=50, kde=True, ax=ax)
ax.set(
    title='Distribution of Loan Amounts',
    xlabel='Loan Amount',
    ylabel='Frequency',
)
plt.show()

In [None]:
# Count of loans per MIS_Status value, drawn on an 8x6 figure.
fig, ax = plt.subplots(figsize=(8, 6))
sns.countplot(data=data, x='MIS_Status', ax=ax)
ax.set(
    title='Distribution of Loan Statuses',
    xlabel='Loan Status',
    ylabel='Count',
)
plt.show()

In [None]:
# Histogram (50 bins) with a KDE overlay of the 'Job Impact' feature.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(data['Job Impact'], bins=50, kde=True, ax=ax)
ax.set_title('Net Impact of Loans on Job Creation')
ax.set_xlabel('Job Impact (Created - Retained)')
ax.set_ylabel('Frequency')
plt.show()

In [None]:
# Peek at the leading rows of the 'Job Impact' column (kept as a one-column frame).
data.loc[:, ['Job Impact']].head()

In [None]:
# Rebuild 'Job Impact' as jobs created minus jobs retained.
data['Job Impact'] = data['CreateJob'].sub(data['RetainedJob'])

# Histogram (50 bins) with a KDE overlay of the rebuilt feature.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(data['Job Impact'], bins=50, kde=True, ax=ax)
ax.set(
    title='Net Impact of Loans on Job Creation',
    xlabel='Job Impact (Created - Retained)',
    ylabel='Frequency',
)
plt.show()

In [None]:
# Histogram (50 bins) with a KDE overlay of the 'Loan Duration' feature, in days.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(data['Loan Duration'], bins=50, kde=True, ax=ax)
ax.set_title('Distribution of Loan Duration')
ax.set_xlabel('Loan Duration (Days)')
ax.set_ylabel('Frequency')
plt.show()

### Model Building

In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# Encode categorical variables as integer codes; the fitted encoder for each
# column is kept in label_encoders so codes can be mapped back to labels.
# NOTE: this overwrites the columns of `data` in place, so re-running the
# cell encodes the already-encoded integers again.
label_encoders = {}
categorical_columns = ['City', 'Name', 'State', 'Zip', 'Bank', 'BankState', 'NAICS', 'ApprovalFY', 'NewExist', 'FranchiseCode', 'UrbanRural', 'RevLineCr', 'LowDoc', 'MIS_Status']
for column in categorical_columns:
    le = LabelEncoder()
    data[column] = le.fit_transform(data[column].astype(str))
    label_encoders[column] = le

# Drop columns that are not needed for modeling
data_model = data.drop(columns=['LoanNr_ChkDgt', 'ApprovalDate', 'ChgOffDate', 'DisbursementDate'])

# Any column still stored as text would make the estimators fail on fit, so
# strip everything except digits and '.' and cast to float before splitting.
# NOTE(review): assumes every remaining text column is a formatted amount — confirm.
for column in data_model.select_dtypes(include=['object']).columns:
    data_model[column] = data_model[column].str.replace(r'[^\d.]', '', regex=True).astype(float)

# Split data into training and testing sets
# NOTE(review): 'Loan Duration' is derived from 'ChgOffDate'; check whether
# keeping it as a feature leaks the target.
X = data_model.drop('MIS_Status', axis=1)
y = data_model['MIS_Status']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

X_train.shape, X_test.shape

In [None]:
from sklearn.linear_model import LogisticRegression

# Initialize and train a Logistic Regression model
logreg = LogisticRegression(max_iter=1000, random_state=42)
logreg.fit(X_train, y_train)

# Predict on the test set
y_pred_logreg = logreg.predict(X_test)

# Evaluate the model's performance
accuracy_logreg = accuracy_score(y_test, y_pred_logreg)
classification_report_logreg = classification_report(y_test, y_pred_logreg)

# The report is a preformatted multi-line string: print it so the table
# renders, instead of echoing it inside a tuple where newlines show as '\n'.
print(classification_report_logreg)
accuracy_logreg

In [None]:
from sklearn.linear_model import LogisticRegression

# Initialize and train a Logistic Regression model
logreg = LogisticRegression(max_iter=1000, random_state=42)
logreg.fit(X_train, y_train)

# Predict on the test set
y_pred = logreg.predict(X_test)

# Evaluate the model's performance
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

# Print the multi-line report so it renders as a table; inside a tuple the
# newlines would be shown escaped.
print(classification_rep)
accuracy

In [None]:
# List the training-feature columns whose dtype is still object.
string_columns = X_train.select_dtypes(include='object').columns
string_columns

In [None]:
# Work on explicit copies: the frames returned by train_test_split are
# selections of X, and assigning columns into them can trigger
# SettingWithCopyWarning.
X_train, X_test = X_train.copy(), X_test.copy()

# Convert the identified columns to numeric data type.
# The pattern is a raw string ('\d' is an invalid escape in a normal
# literal). Columns that are already numeric are skipped so the cell can be
# re-run without .str failing on float data.
for column in string_columns:
    if pd.api.types.is_numeric_dtype(X_train[column]):
        continue
    X_train[column] = X_train[column].str.replace(r'[^\d.]', '', regex=True).astype(float)
    X_test[column] = X_test[column].str.replace(r'[^\d.]', '', regex=True).astype(float)

# Train the Logistic Regression model again
logreg.fit(X_train, y_train)

# Predict on the test set
y_pred = logreg.predict(X_test)

# Evaluate the model's performance
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

# Print the multi-line report so it renders as a table.
print(classification_rep)
accuracy

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Initialize and train a Random Forest model
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# Predict on the test set
y_pred_rf = rf.predict(X_test)

# Evaluate the model's performance
accuracy_rf = accuracy_score(y_test, y_pred_rf)
classification_rep_rf = classification_report(y_test, y_pred_rf)

# Print the multi-line report so it renders as a table; inside a tuple the
# newlines would be shown escaped.
print(classification_rep_rf)
accuracy_rf

In [None]:
import shap

# Initialize the SHAP explainer
explainer = shap.TreeExplainer(rf)

# Draw the explanation sample once and reuse it for both the SHAP values and
# the plot. The size is capped at the number of test rows, because
# sample(1000) raises when fewer rows are available.
X_shap_sample = X_test.sample(min(1000, len(X_test)), random_state=42)

# Compute SHAP values for the sample of the test set
shap_values = explainer.shap_values(X_shap_sample)

# Select the values for class index 1.
# NOTE(review): depending on the installed SHAP version, shap_values is either
# a list with one array per class or a single array with the classes on the
# last axis; indexing [1] on the latter would pick a row, not a class. Confirm
# against the installed version.
if isinstance(shap_values, list):
    class1_shap_values = shap_values[1]
elif np.ndim(shap_values) == 3:
    class1_shap_values = shap_values[..., 1]
else:
    class1_shap_values = shap_values

# Visualize the SHAP values for the Random Forest model
shap.summary_plot(class1_shap_values, X_shap_sample)

In [None]:
# Install SHAP with the %pip magic so it lands in the running kernel's
# environment; `!pip` runs whichever pip is first on the shell PATH.
%pip install -q shap

In [None]:
import shap

# Initialize the SHAP explainer
explainer = shap.TreeExplainer(rf)

# Draw the explanation sample once and reuse it for both the SHAP values and
# the plot. The size is capped at the number of test rows, because
# sample(1000) raises when fewer rows are available.
X_shap_sample = X_test.sample(min(1000, len(X_test)), random_state=42)

# Compute SHAP values for the sample of the test set
shap_values = explainer.shap_values(X_shap_sample)

# Visualize the SHAP values for the Random Forest model
shap.summary_plot(shap_values, X_shap_sample, plot_type="bar")