In [None]:
# Import necessary packages
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [None]:
# Lift pandas' display truncation for both columns and rows.
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)

In [None]:
# Load data from pickle
# NOTE(review): unpickling can execute arbitrary code, so only load a .pkl file you
# produced yourself (presumably the cache written by the commented-out
# `df.to_pickle("Loan_status.pkl")` cell further down — confirm).
df = pd.read_pickle("Loan_status.pkl")

In [None]:
# Preview the first rows of the loaded frame
df.head()

In [None]:
# Read the file 
# df = pd.read_csv("Loan_status_2007-2020Q3.gzip", on_bad_lines="skip", low_memory=False)

In [None]:
# Save file to pickle to save time
# df.to_pickle("Loan_status.pkl")

# Split Dataset into Training, Validation, Testing and Untouched Sets

In [None]:
## Split Dataset
from sklearn.model_selection import train_test_split

# Stage 1: hold out 10% of all rows as the untouched set (set_flag = 0).
df_train_val, df_untouched = train_test_split(df, test_size=0.10, random_state=42)
df_train_val = df_train_val.copy()
df_untouched = df_untouched.assign(set_flag=0)

# Stage 2: 60% of the remaining 90% becomes the training set (set_flag = 1),
# i.e. roughly 54% of all rows; the other 40% is divided in stage 3.
df_train, df_temp = train_test_split(df_train_val, test_size=0.40, random_state=42)
df_temp = df_temp.copy()
df_train = df_train.assign(set_flag=1)

# Stage 3: halve the remainder into validation (set_flag = 2) and test (set_flag = 3),
# roughly 18% of all rows each.
df_val, df_test = train_test_split(df_temp, test_size=0.50, random_state=42)
df_val = df_val.assign(set_flag=2)
df_test = df_test.assign(set_flag=3)

# Stack the flagged subsets into a single frame and persist it as Parquet
flagged_subsets = [df_train, df_val, df_test, df_untouched]
df_final = pd.concat(flagged_subsets, ignore_index=True)
df_final.to_parquet("dataset_with_flags.parquet", index=False)

# Report each subset's size and its share of the full dataset
print(f"Training Set: {len(df_train)} rows ({len(df_train)/len(df)*100:.1f}%)")
print(f"Validation Set: {len(df_val)} rows ({len(df_val)/len(df)*100:.1f}%)")
print(f"Test Set: {len(df_test)} rows ({len(df_test)/len(df)*100:.1f}%)")
print(f"Untouched Set: {len(df_untouched)} rows ({len(df_untouched)/len(df)*100:.1f}%)")

print("Data saved with set_flag column.")



In [None]:
# Dimension of training set, shown as a (rows, columns) tuple
df_train.shape

# Data Processing

## 1. Drop features with > 50% missing values

In [None]:
## Drop features with > 50% missing values
# Percentage of missing entries per column of the training set
missing_percent = df_train.isna().sum().div(len(df_train)).mul(100)
# Keep only the labels of columns strictly above the 50% threshold
cols_to_drop = missing_percent.loc[lambda pct: pct > 50].index
print(cols_to_drop)
# Build a new frame without those columns; df_train itself is left untouched
df_train_dropped = df_train.drop(cols_to_drop, axis=1)

# Categorize Target Variable (loan_status) into Loss, Good and Other

In [None]:
# create categories for loan status
# Loss: the loan was charged off or defaulted.
loss = ['Charged Off', 'Does not meet the credit policy. Status:Charged Off', 'Default']
# Good: the loan is paid, current or freshly issued. The credit-policy entry here must be
# the "Fully Paid" variant; listing the "Charged Off" variant again (as before) left
# policy-exception loans that were fully paid classified as 'Other'.
good = ['Fully Paid', 'Current', 'Does not meet the credit policy. Status:Fully Paid', 'Issued']

# A status must never belong to both groups, otherwise the label depends on check order.
assert not set(loss) & set(good), "loan_status value listed as both Loss and Good"

# Vectorised labelling: Loss first, then Good, everything else (including NaN) -> 'Other'
loan_status = df_train_dropped['loan_status']
df_train_dropped['loan_category'] = np.select(
    [loan_status.isin(loss), loan_status.isin(good)],
    ['Loss', 'Good'],
    default='Other',
)

In [None]:
# filter only good and loss
# .copy() yields an independent frame, so the column assignment below does not raise
# SettingWithCopyWarning (same convention as the split cell).
good_or_loss = df_train_dropped['loan_category'].isin(['Loss', 'Good'])
df_train_dropped = df_train_dropped.loc[good_or_loss].copy()
# convert to numerical encoding (0 = Loss, 1 = Good)
df_train_dropped['loan_cat_numerical'] = df_train_dropped['loan_category'].map({'Loss': 0, 'Good': 1})
# class balance of the encoded target
df_train_dropped['loan_cat_numerical'].value_counts()

In [None]:
# Collect the names of every column stored with the object (string) dtype
object_frame = df_train_dropped.select_dtypes(include='object')
string_columns = list(object_frame.columns)

# Show the collected column names
string_columns


# Convert String to Numeric

## 1. Convert id to numeric

In [None]:
## convert id to numeric
# Values that are not numbers become <NA>; the nullable Int64 dtype keeps the rest integral.
df_train_dropped['id'] = pd.to_numeric(df_train_dropped['id'], errors='coerce').astype('Int64')

# check if there are any NAs
missing_id = df_train_dropped['id'].isna()
print(f"Rows without a numeric id: {missing_id.sum()}")

# Drop every row without a valid id. Selecting by mask instead of a hard-coded index
# label (previously 39786) keeps the cell working when that label is absent, e.g. after
# the rows were filtered earlier or the split changes.
df_train_dropped = df_train_dropped.loc[~missing_id].copy()
df_train_dropped.shape



In [None]:
# check result: preview the frame after the id clean-up
df_train_dropped.head()

## 2. Convert emp_title to Numeric (Drop)

Reasons to drop feature emp_title:

1. High cardinality -- 372,749 unique values in 1,579,764 rows (~24% unique), making it difficult to extract meaningful patterns.
2. Encoding challenge -- One-hot encoding is impractical due to excessive feature expansion; label encoding introduces arbitrary ordinal relationships

In [None]:
## emp_title has too many distinct values to encode, so the column is removed
df_train_dropped = df_train_dropped.drop("emp_title", axis=1)

df_train_dropped.head()

## 3. Convert home_ownership to Numerical (One-hot Encoding)

In [None]:
## Convert home_ownership into numerical
# Inspect the categories and the number of missing entries before encoding
df_train_dropped['home_ownership'].unique()
df_train_dropped['home_ownership'].isnull().sum()

# One-hot encode, keeping an indicator column for every category
df_train_dropped = pd.get_dummies(df_train_dropped, columns=['home_ownership'], drop_first=False)

# Cast the indicator columns from True/False to 1/0
is_home_ownership = df_train_dropped.columns.str.startswith('home_ownership_')
home_ownership_cols = df_train_dropped.columns[is_home_ownership].tolist()
df_train_dropped = df_train_dropped.astype({col: int for col in home_ownership_cols})
df_train_dropped.head()

## 4. Convert verification_status to Numerical (One-hot Encoding)

In [None]:
## convert verification_status
# Inspect the categories and the number of missing entries before encoding
df_train_dropped['verification_status'].unique()
df_train_dropped['verification_status'].isnull().sum()

# One-hot encode, keeping an indicator column for every category
df_train_dropped = pd.get_dummies(df_train_dropped, columns=['verification_status'], drop_first=False)

# Cast the indicator columns from True/False to 1/0
is_verification_status = df_train_dropped.columns.str.startswith('verification_status_')
verification_status_cols = df_train_dropped.columns[is_verification_status].tolist()
df_train_dropped = df_train_dropped.astype({col: int for col in verification_status_cols})
df_train_dropped.head()

## 5. Convert issue_d to Numerical 

In [None]:
## convert issue_d
# Quick look at the raw values, the missing count and the status labels
df_train_dropped['issue_d'].head()
df_train_dropped['issue_d'].isnull().sum()
df_train_dropped['loan_status'].unique()

# Parse the "Mon-YYYY" strings into datetimes
df_train_dropped['issue_d'] = pd.to_datetime(df_train_dropped['issue_d'], format='%b-%Y')

# Share of "Loss" loans per issue year (grouping is yearly, not monthly)
issue_year_period = df_train_dropped['issue_d'].dt.to_period('Y')
is_loss = df_train_dropped['loan_category'] == 'Loss'
loss_rate = is_loss.groupby(issue_year_period).mean()

# Plot the yearly trend
fig, ax = plt.subplots(figsize=(14, 6))
ax.plot(loss_rate.index.astype(str), loss_rate.values, marker='o', linestyle='-')
plt.setp(ax.get_xticklabels(), rotation=45)
ax.set_xlabel("Loan Issue Date (Year)")
ax.set_ylabel("Proportion of Loss Loans")
ax.set_title("Trend of Loan Default Rate Over Time (Yearly)")
ax.grid(True)
plt.show()

In [None]:
# Encode issue_d as the number of years since the earliest issue year in the data
issue_year = df_train_dropped['issue_d'].dt.year
df_train_dropped['issue_d_ordinal'] = issue_year - issue_year.min()
# The original datetime column issue_d is deliberately kept (not dropped) here.

## 6. Convert url to Numeric (Drop)
Reason to drop url:
1. The URL is inaccessible: the loan-detail page requires an investor account login.

In [None]:
## The url points to a loanDetails page that needs an investor login (inaccessible),
## so the column is removed
df_train_dropped = df_train_dropped.drop("url", axis=1)

df_train_dropped.head()



## 7. Convert Term to Numeric

In [None]:
# Check feature Term and convert to numeric
df_train_dropped["term"].unique()

# Extract the leading run of digits as the number of months.
# - The raw string avoids the invalid "\d" escape warning of a plain literal.
# - expand=False makes str.extract return a Series rather than a one-column DataFrame.
# - astype(str) keeps the cell re-runnable once the column is already numeric
#   (36.0 -> "36.0" -> 36.0; NaN -> "nan" -> no match -> NaN).
df_train_dropped["term"] = (
    df_train_dropped["term"].astype(str).str.extract(r"(\d+)", expand=False).astype(float)
)

# Plot distribution of loan terms
plt.figure(figsize=(6,4))
sns.countplot(x=df_train_dropped["term"])
plt.title("Distribution of Loan Terms")
plt.xlabel("Loan Term (Months)")
plt.ylabel("Count")
plt.xticks(rotation=0)
plt.show()

## 8. Convert int_rate to Numeric

In [None]:
# Check feature int_rate and convert to numeric
df_train_dropped["int_rate"].unique()

# Convert before plotting so the chart has a numeric axis.
# The guard keeps the cell re-runnable: .str only works while the column holds strings.
if not pd.api.types.is_numeric_dtype(df_train_dropped["int_rate"]):
    # "nan" placeholder strings -> real NaN, then strip "%" and rescale to a fraction
    int_rate_text = df_train_dropped["int_rate"].replace("nan", np.nan)
    df_train_dropped["int_rate"] = (
        int_rate_text.str.replace("%", "", regex=False).astype(float) / 100
    )

# Plot distribution of int_rate. A histogram suits a continuous rate; a count plot of
# the raw strings drew one bar per distinct value with an unreadable axis.
plt.figure(figsize=(6,4))
plt.hist(df_train_dropped["int_rate"].dropna(), bins=50)
plt.title("Distribution of int_rate")
plt.xlabel("int_rate")
plt.ylabel("Count")
plt.xticks(rotation=0)
plt.show()

df_train_dropped["int_rate"].head()

## 9. Convert sub_grade to Numeric (Drop grade)

In [None]:
# Check feature grade and subgrade
df_train_dropped["grade"].unique()
df_train_dropped["sub_grade"].unique()

# sub_grade already starts with the grade letter, so only the converted sub_grade
# is kept and the grade column is dropped.
df_train_dropped = df_train_dropped.drop(["grade"], axis=1)

# Base value per grade letter (lower = better credit, higher = higher risk)
grade_mapping = {"A" :1, "B" : 2, "C" : 3, "D" : 4, "E" : 5, "F" : 6, "G" : 7}

# Convert "nan" placeholder strings to real missing values
df_train_dropped["sub_grade"] = df_train_dropped["sub_grade"].replace("nan", np.nan)

# Number of missing sub_grade values
df_train_dropped["sub_grade"].isna().sum()

# Confirm no "nan" placeholder string is left (expected: 0)
print((df_train_dropped["sub_grade"] == "nan").sum())

# Plot the distribution of sub_grade ranked from A1 to G5.
# Missing values are excluded from the ordering: sorting a mix of strings and NaN
# raises TypeError.
sub_grade_order = sorted(df_train_dropped["sub_grade"].dropna().unique())
plt.figure(figsize=(12, 6))
sns.countplot(y=df_train_dropped["sub_grade"], order=sub_grade_order, palette="Blues_r")
plt.title("Distribution of Sub Grade (Ranked Low to High)")
plt.xlabel("Count")
plt.ylabel("Sub Grade")
plt.show()


def _sub_grade_to_number(sub_grade):
    """Map a sub grade such as "A1" to 11 and "G5" to 75; missing values stay NaN."""
    if pd.isna(sub_grade):
        return np.nan
    text = str(sub_grade)
    # tens digit = grade letter rank, ones digit = sub grade number
    return grade_mapping[text[0]] * 10 + int(text[1])


# Convert sub_grade into an ordered numeric feature where A1 is lowest risk and G5 is highest risk
df_train_dropped["sub_grade"] = df_train_dropped["sub_grade"].apply(_sub_grade_to_number)



## 10. Convert emp_length to Numeric

In [None]:
# Check feature emp_length: list the distinct raw values before conversion
df_train_dropped["emp_length"].unique()

def convert_emp_length(emp):
    """Convert an employment-length label to a number of years.

    Parameters
    ----------
    emp : str, number or missing
        Label such as "10+ years", "< 1 year", "1 year" or "3 years".

    Returns
    -------
    int or float
        10 for "10+ years", 0 for "< 1 year", the leading integer for "X year(s)".
        Missing values and unparseable placeholders ("nan", "n/a", "") give NaN.
        Values that are already numeric are returned unchanged, so applying the
        function twice does not fail.
    """
    if pd.isna(emp):  # Handle missing values
        return np.nan
    if isinstance(emp, (int, float, np.integer, np.floating)):
        # Already converted (e.g. the cell was re-run)
        return emp
    if emp == "10+ years":
        return 10
    if emp == "< 1 year":
        return 0
    try:
        return int(str(emp).split()[0])  # Extract the number from "X years"
    except (ValueError, IndexError):
        # Placeholder strings carry no usable length; treat them as missing,
        # in line with how "nan" strings are handled for int_rate and sub_grade.
        return np.nan
## Check below for conversion criterion


## Run the helper on every emp_length value to obtain the numeric column
df_train_dropped["emp_length"] = df_train_dropped["emp_length"].map(convert_emp_length)

In [None]:
# Handling 'purpose', 'title', 'zip_code', 'addr_state':
# start by listing the distinct values of 'purpose'

df_train_dropped["purpose"].unique()



## 11. Convert Purpose to Numeric


In [None]:
# Convert "purpose" from string to numeric
from sklearn.preprocessing import LabelEncoder

# Learn the purpose categories, then replace each string by its integer code
le = LabelEncoder()
purpose_codes = le.fit(df_train_dropped["purpose"]).transform(df_train_dropped["purpose"])
df_train_dropped["purpose"] = purpose_codes

# Show which integer code was given to which category
label_mapping = {label: code for label, code in zip(le.classes_, le.transform(le.classes_))}
print(label_mapping)

## 12. Convert Title to Numeric (Drop)

In [None]:
# 'title' is similar to "purpose", so it is dropped rather than converted to numeric.
# Summary figures kept for inspection before the column is removed.
title_counts = df_train_dropped["title"].value_counts()
num_unique_titles = df_train_dropped["title"].nunique()
purpose_counts = df_train_dropped["purpose"].value_counts()

df_train_dropped = df_train_dropped.drop("title", axis=1)



## 13. Convert zip_code, addr_state to Numeric (Drop)

In [None]:
# Inspect zip_code and addr_state before removing them
df_train_dropped["zip_code"].unique()
df_train_dropped["addr_state"].unique()

# Drop both location columns. addr_state has to be dropped explicitly as well;
# inspecting it with .unique() alone leaves an un-encoded string column in the frame.
df_train_dropped = df_train_dropped.drop(columns=["zip_code", "addr_state"])


In [None]:
# Check df after column dropping: preview the remaining columns
df_train_dropped.head()

## 14. Convert hardship_flag to Numeric

In [None]:
# Encode the flag numerically: 'N' becomes 0 and 'Y' becomes 1
hardship_codes = {"N": 0, "Y": 1}
df_train_dropped["hardship_flag"] = df_train_dropped["hardship_flag"].map(hardship_codes)

# Distinct values after encoding
df_train_dropped["hardship_flag"].unique()

## 15. Convert debt_settlement_flag to Numeric

In [None]:
# Encode the flag numerically: 'N' becomes 0 and 'Y' becomes 1
debt_settlement_codes = {"N": 0, "Y": 1}
df_train_dropped["debt_settlement_flag"] = df_train_dropped["debt_settlement_flag"].map(debt_settlement_codes)

# Distinct values after encoding
df_train_dropped["debt_settlement_flag"].unique()

## 16. Convert application_type to Numeric

In [None]:
# Encode the application type numerically: 'Individual' becomes 0, 'Joint App' becomes 1
application_type_codes = {"Individual": 0, "Joint App": 1}
df_train_dropped["application_type"] = df_train_dropped["application_type"].map(application_type_codes)

# Distinct values after encoding
df_train_dropped["application_type"].unique()

## 17. Convert last_credit_pull_d to Numeric

Convert date to a numeric measure of recency in days (reference date - credit pull date; smaller is worse)

Reference Date: September 30, 2020 because the dataset appears to be as recent as Q3 2020.

In [None]:
# Define reference date as the end of Q3 2020
reference_date = pd.to_datetime("2020-09-30")

# Convert only while the column still holds the raw dates. Once it contains day counts,
# re-running this cell would feed integers to pd.to_datetime, which would reinterpret
# them as timestamps and silently overwrite the feature with wrong values.
if not pd.api.types.is_numeric_dtype(df_train_dropped["last_credit_pull_d"]):
    last_pull_date = pd.to_datetime(df_train_dropped["last_credit_pull_d"])
    # Recency in days: reference date minus the credit pull date
    df_train_dropped["last_credit_pull_d"] = (reference_date - last_pull_date).dt.days

df_train_dropped["last_credit_pull_d"].head()