In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [None]:
# Lift pandas' display truncation so that all columns and all rows are rendered.
pd.options.display.max_columns = None
pd.options.display.max_rows = None

In [None]:
# Load the cached DataFrame from a local pickle file.
# NOTE(review): unpickling can execute arbitrary code, so only load pickle files
# produced by a trusted source (e.g. this notebook's own to_pickle step).
df = pd.read_pickle("Loan_status.pkl")

In [None]:
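# One-time load of the raw source file; kept for reference.
# Uncomment to rebuild `df` from the raw file instead of the pickle cache.
# df = pd.read_csv("Loan_status_2007-2020Q3.gzip", on_bad_lines="skip", low_memory=False)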

In [None]:
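# Cache the loaded DataFrame as a pickle so later runs can skip the slow raw-file parse.
# Uncomment after reading the raw file to (re)create the cache.
# df.to_pickle("Loan_status.pkl")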

In [None]:
from sklearn.model_selection import train_test_split

# Step 1: Hold out 10% of all rows as an untouched set; the remaining 90% feeds step 2.
df_train_val, df_untouched = train_test_split(df, test_size=0.10, random_state=42)
df_train_val = df_train_val.copy()  # Work on independent copies to avoid SettingWithCopyWarning
df_untouched = df_untouched.copy()
df_untouched["set_flag"] = 0  # Mark untouched

# Step 2: Split the remaining 90% into 60% train / 40% temp, then halve temp below.
# Relative to the FULL dataset this gives roughly 54% train, 18% validation and 18% test
# (0.90 * 0.60, 0.90 * 0.20, 0.90 * 0.20).
# NOTE(review): an earlier version of this comment stated 50% / 20% / 20%; to get those
# exact shares the first test_size here would need to be 0.40 / 0.90 — confirm the intent.
df_train, df_temp = train_test_split(df_train_val, test_size=0.40, random_state=42)
df_train = df_train.copy()
df_temp = df_temp.copy()
df_train["set_flag"] = 1  # Mark train (df_train now carries this constant column)

# Halve the temp portion into validation and test.
df_val, df_test = train_test_split(df_temp, test_size=0.50, random_state=42)
df_val = df_val.copy()
df_test = df_test.copy()
df_val["set_flag"] = 2  # Mark validation
df_test["set_flag"] = 3  # Mark test

# Combine all subsets back into one dataframe (original index is discarded)
df_final = pd.concat([df_train, df_val, df_test, df_untouched], ignore_index=True)

# Save as Parquet so the flagged dataset can be reloaded without re-splitting
df_final.to_parquet("dataset_with_flags.parquet", index=False)

# Report each subset's size as a share of the full dataset
print(f"Training Set: {len(df_train)} rows ({len(df_train)/len(df)*100:.1f}%)")
print(f"Validation Set: {len(df_val)} rows ({len(df_val)/len(df)*100:.1f}%)")
print(f"Test Set: {len(df_test)} rows ({len(df_test)/len(df)*100:.1f}%)")
print(f"Untouched Set: {len(df_untouched)} rows ({len(df_untouched)/len(df)*100:.1f}%)")

print("Data saved with set_flag column.")



In [None]:
# (rows, columns) of the training split; the column count includes the added set_flag column
df_train.shape

In [None]:
# Remove every feature whose share of missing values in the training split exceeds 50%.
missing_percent = df_train.isna().sum().div(len(df_train)).mul(100)
cols_to_drop = missing_percent[missing_percent.gt(50)].index
print(cols_to_drop)
df_train_dropped = df_train.drop(columns=cols_to_drop)

In [None]:
# Gather the names of all object-dtype (string-like) columns that remain
string_columns = list(df_train_dropped.select_dtypes(include=['object']).columns)

# Show the collected column names
string_columns


In [None]:
## `url` points at a loan-details page that requires an investor account login
## (inaccessible), so the column is removed.
df_train_dropped = df_train_dropped.drop(columns=["url"])

df_train_dropped.head()


In [None]:
## emp_title has too many distinct values to be useful as-is, so the column is removed.
df_train_dropped = df_train_dropped.drop(columns=["emp_title"])

df_train_dropped.head()

In [None]:
# Inspect the raw `term` values before converting them
# (presumably strings such as "36 months" — confirm against the printed output).
print(df_train_dropped["term"].unique())

# Extract the first run of digits from each value and store it as a float
# (rows without any digits become NaN).
# - The raw-string prefix keeps "\d" from being parsed as an invalid escape sequence.
# - expand=False makes str.extract return a Series instead of a one-column DataFrame.
# NOTE: this cell is not re-runnable once `term` is numeric (.str needs string data).
df_train_dropped["term"] = (
    df_train_dropped["term"].str.extract(r"(\d+)", expand=False).astype(float)
)

# Plot distribution of loan terms
plt.figure(figsize=(6,4))
sns.countplot(x=df_train_dropped["term"])
plt.title("Distribution of Loan Terms")
plt.xlabel("Loan Term (Months)")
plt.ylabel("Count")
plt.xticks(rotation=0)
plt.show()

In [None]:
# Peek at a few raw `int_rate` values; the conversion below assumes percent strings
# such as "13.56%" — confirm against the printed sample.
print(df_train_dropped["int_rate"].unique()[:10])

# Convert to a numeric fraction first, so the plot below is drawn on an ordered numeric axis:
#   1. turn the literal string "nan" into a real missing value,
#   2. strip the percent sign (literal match, not a regex),
#   3. cast to float and rescale from percent to a 0-1 fraction.
# NOTE: this cell is not re-runnable once `int_rate` is numeric (.str needs string data).
df_train_dropped["int_rate"] = (
    df_train_dropped["int_rate"]
    .replace("nan", np.nan)
    .str.replace("%", "", regex=False)
    .astype(float)
    / 100
)

# Plot the distribution of the numeric interest rate. A histogram is used because the
# rate is continuous; a count plot over the raw strings would draw one bar per distinct
# string in arbitrary order.
fig, ax = plt.subplots(figsize=(6, 4))
ax.hist(df_train_dropped["int_rate"].dropna(), bins=50)
ax.set_title("Distribution of int_rate")
ax.set_xlabel("int_rate")
ax.set_ylabel("Count")
plt.show()

df_train_dropped["int_rate"].head()



In [None]:
# Look at the raw grade / sub_grade values (not displayed: neither is the cell's last expression)
df_train_dropped["grade"].unique()
df_train_dropped["sub_grade"].unique()

# Feature engineering: sub_grade already contains the grade letter, so only an encoded
# sub_grade is kept and the separate grade column is dropped.
df_train_dropped = df_train_dropped.drop(columns=["grade"])

# Grade letter -> rank, A=1 ... G=7 (a smaller number means lower risk)
grade_mapping = {letter: rank for rank, letter in enumerate("ABCDEFG", start=1)}


def _sub_grade_to_score(value):
    """Encode a sub_grade such as "B3" as grade rank * 10 + trailing digit (-> 23).

    Missing values are returned as NaN. The first character is looked up in
    grade_mapping and the second character is parsed as an integer.
    """
    if pd.isna(value):
        return np.nan
    code = str(value)
    return grade_mapping[code[0]] * 10 + int(code[1])


# Turn the literal string "nan" into a real missing value, then encode every sub_grade
# as an ordered number where A1 (11) is the lowest risk and G5 (75) the highest.
df_train_dropped["sub_grade"] = (
    df_train_dropped["sub_grade"].replace("nan", np.nan).apply(_sub_grade_to_score)
)

# See the following cell for a check of the encoding


In [None]:
# Number of missing sub_grade values (the original note recorded 1 NaN).
# Printed explicitly because a bare expression in the middle of a cell is not displayed.
print(df_train_dropped["sub_grade"].isna().sum())

# Count entries still equal to the literal string "nan".
# If sub_grade has already been encoded as numbers, no entry can equal a string and this prints 0.
print((df_train_dropped["sub_grade"] == "nan").sum())

# Plot the distribution of sub_grade ranked from low to high.
# NaN is dropped before sorting: NaN does not compare with other values, so sorted()
# over data containing it gives an unreliable order and passes NaN into `order`.
sub_grade_order = sorted(df_train_dropped["sub_grade"].dropna().unique())
plt.figure(figsize=(12, 6))
sns.countplot(y=df_train_dropped["sub_grade"], order=sub_grade_order, palette="Blues_r")
plt.title("Distribution of Sub Grade (Ranked Low to High)")
plt.xlabel("Count")
plt.ylabel("Sub Grade")
plt.show()

In [None]:
# Inspect the distinct raw emp_length values.
# NOTE: this is not the last expression of the cell, so the result is not displayed.
df_train_dropped["emp_length"].unique()

def convert_emp_length(emp):
    """Convert an employment-length label to a number of years.

    Parameters
    ----------
    emp : str or missing value
        Label such as "10+ years", "< 1 year", "1 year" or "3 years".

    Returns
    -------
    int or float
        10 for "10+ years", 0 for "< 1 year", otherwise the leading integer of
        the label. NaN is returned for missing values, blank strings and the
        literal string "nan".

    Raises
    ------
    ValueError
        If the first whitespace-separated token of any other label is not an integer.
    """
    if pd.isna(emp):  # Real missing values (NaN / None)
        return np.nan

    label = str(emp).strip()
    # The notebook treats the literal string "nan" as missing elsewhere; do the same here
    # instead of letting int("nan") raise.
    if label == "" or label.lower() == "nan":
        return np.nan

    if label == "10+ years":
        return 10
    if label == "< 1 year":
        return 0
    return int(label.split()[0])  # Extract the number from "X years"
## See the cells below for a check of the conversion criteria


## Convert emp_length to numeric years, stored in a new column (the original is kept here)
df_train_dropped["emp_length_numeric"] = df_train_dropped["emp_length"].apply(convert_emp_length)

In [None]:
# The numeric emp_length_numeric column replaces the original text column, so remove it
df_train_dropped = df_train_dropped.drop(columns=["emp_length"])

In [None]:
# Handling 'purpose', 'title', 'zip_code', 'addr_state'

# These two expressions are evaluated but not displayed: the cell ends with a print()
# call, so there is no trailing bare expression to echo.
df_train_dropped["purpose"].unique()
df_train_dropped["purpose"].value_counts()
# Print the distinct raw loan_status labels
print(df_train_dropped["loan_status"].unique())

In [None]:
# Map raw loan_status labels to a binary target: 1 = fully paid, 0 = charged off / default / late.
# Any label not listed here maps to NaN and the row is removed below.
loan_status_map = {
    "Fully Paid": 1,
    "Does not meet the credit policy. Status:Fully Paid": 1,
    "Charged Off": 0,
    "Default": 0,
    "Late (31-120 days)": 0,
    "Late (16-30 days)": 0,
    "Does not meet the credit policy. Status:Charged Off": 0
}

# Guard against re-running the cell: once loan_status holds integers, mapping it through
# the string dictionary again would turn every value into NaN and silently drop all rows.
if not pd.api.types.is_integer_dtype(df_train_dropped["loan_status"]):
    # Apply the mapping
    df_train_dropped["loan_status"] = df_train_dropped["loan_status"].map(loan_status_map)

    # Remove rows where loan_status is NaN (e.g., 'Issued' or unknown statuses)
    df_train_dropped = df_train_dropped.dropna(subset=["loan_status"])

    # Convert to integer type
    df_train_dropped["loan_status"] = df_train_dropped["loan_status"].astype(int)

In [None]:
# Convert "purpose" from string to numeric
from sklearn.preprocessing import LabelEncoder

# Encoder that assigns an integer code to each distinct `purpose` value
le = LabelEncoder()

# Learn the codes from the training data and store them in a new column
purpose_codes = le.fit_transform(df_train_dropped["purpose"])
df_train_dropped["purpose_encoded"] = purpose_codes

# Show which code was assigned to which purpose label
label_mapping = {
    label: code for label, code in zip(le.classes_, le.transform(le.classes_))
}
print(label_mapping)

In [None]:
# 'title' is free text that is similar to 'purpose', so it is dropped instead of encoded.
# Summary statistics are computed first for comparison.
num_unique_titles = df_train_dropped["title"].nunique()
title_counts, purpose_counts = (
    df_train_dropped[column].value_counts() for column in ("title", "purpose")
)

df_train_dropped = df_train_dropped.drop("title", axis=1)



In [None]:
# Inspect the distinct "zip_code" values (not displayed: not the cell's last expression)
df_train_dropped["zip_code"].unique()

# The column is not converted; it is removed from the feature set
df_train_dropped = df_train_dropped.drop("zip_code", axis=1)


In [None]:
# Inspect the distinct "addr_state" values.
# NOTE(review): this cell was headed 'Drop "addr_state"', but the column is only
# inspected here, not dropped — confirm whether it should be dropped or encoded.
df_train_dropped["addr_state"].unique()

In [None]:
# Preview the first rows of the training frame after the column drops and conversions
df_train_dropped.head()