## Setup
Import the required packages and load the Lending Club dataset from a CSV file.

In [None]:
import os
import sys
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Add the parent directory to the Python path to import the core module.
# The membership check keeps sys.path free of duplicate entries when this cell is re-run.
parent_dir = os.path.abspath(os.path.join(".."))
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

from core import get_data_path

# Resolve the CSV location through the project helper and load it into a DataFrame.
csv_file_path = get_data_path("assignment2/lc_14to16.csv")
data = pd.read_csv(csv_file_path)

## 1. Summary (Exploratory Data Analysis)
Perform EDA to analyze the differences between the two datasets (before and after the 2015 controversy).

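Histograms are used to compare the distributions of loan amount and interest rate; count plots compare loan grade, loan purpose, employment length, and home ownership; and a heatmap summarizes the correlations among the numerical features.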

In [None]:
import numpy as np

# Parse the issue date so rows can be compared against a cutoff timestamp
data["issue_d"] = pd.to_datetime(data["issue_d"])

# Cutoff that separates the two periods
controversy_date = pd.to_datetime("2015-01-01")

# Label every loan: issued strictly before the cutoff -> "Before 2015",
# everything else -> "After 2015"
issued_before_cutoff = data["issue_d"] < controversy_date
data["Period"] = np.where(issued_before_cutoff, "Before 2015", "After 2015")

# Size of the full dataset and of each period
n_rows, n_cols = data.shape
n_before = int((data["Period"] == "Before 2015").sum())
n_after = int((data["Period"] == "After 2015").sum())
print("Total columns in original dataset: ", n_cols)
print("Total rows in original dataset: ", n_rows)
print("Total rows in dataset before 2015:", n_before)
print("Total rows in dataset after 2015:", n_after)

# Preview of the first rows
print("First few rows of the dataset: ")
display(data.head())

# Descriptive statistics
print("Summary Statistics: ")
display(data.describe())

# Per-column count of missing values; only columns with at least one are shown
missing_values = data.isna().sum()
print("Missing values count:")
display(missing_values.loc[missing_values.gt(0)])

# Histograms (with KDE overlay, 30 bins) of loan amount and interest rate, split by period.
# Each entry is (column, figure title, x-axis label).
histogram_specs = [
    ("loan_amnt", "Loan Amount Distribution Before and After Controversy", "Loan Amount"),
    ("int_rate", "Interest Rate Distribution Before and After Controversy", "Interest Rate (%)"),
]
for hist_column, hist_title, hist_xlabel in histogram_specs:
    fig, ax = plt.subplots(figsize=(10, 5))
    sns.histplot(data=data, x=hist_column, hue="Period", kde=True, bins=30, ax=ax)
    ax.set_title(hist_title)
    ax.set_xlabel(hist_xlabel)
    ax.set_ylabel("Frequency")
    plt.show()

# Loan grade counts per period, with grades shown in sorted order
grade_order = sorted(data["grade"].unique())
fig, ax = plt.subplots(figsize=(8, 6))
sns.countplot(data=data, x="grade", hue="Period", order=grade_order, ax=ax)
ax.set_title("Loan Grade Distribution Before and After Controversy")
ax.set_xlabel("Grade")
ax.set_ylabel("Count")
plt.show()

import re


def _emp_length_sort_key(label):
    """Sort key that orders employment-length labels by their number of years.

    Plain string sorting is lexicographic, so a label such as "10+ years" would
    land between "1 year" and "2 years" and "< 1 year" would come last. The key
    uses the first integer found in the label; a label containing "<" is placed
    just before the exact label with the same number. Labels without digits
    sort first.

    NOTE(review): assumes labels look like "< 1 year", "1 year", ...,
    "10+ years" -- confirm against the values in the CSV.
    """
    text = str(label)
    match = re.search(r"\d+", text)
    years = int(match.group()) if match else -1
    return (years, 0 if "<" in text else 1)


# Analyze Differences Between Periods
# Loan Purpose: bars ordered from the most to the least frequent purpose
plt.figure(figsize=(12,6))
sns.countplot(data=data, y="purpose", hue="Period", order=data["purpose"].value_counts().index)
plt.title("Loan Purpose Distribution Before and After Controversy")
plt.xlabel("Count")
plt.ylabel("Purpose")
plt.show()

# Employment Length: categories ordered by years rather than alphabetically
emp_length_order = sorted(data["emp_length"].dropna().unique(), key=_emp_length_sort_key)
plt.figure(figsize=(10,6))
sns.countplot(data=data, x="emp_length", hue="Period", order=emp_length_order)
plt.title("Employment Length Before and After Controversy")
plt.xlabel("Employment Length (Years)")
plt.ylabel("Count")
plt.show()

# Home Ownership
plt.figure(figsize=(8,6))
sns.countplot(data=data, x="home_ownership", hue="Period")
plt.title("Home Ownership Status Before and After Controversy")
plt.xlabel("Home Ownership")
plt.ylabel("Count")
plt.show()

# Correlation analysis, restricted to the float64 / int64 columns
numerical_features = data.select_dtypes(include=["float64", "int64"]).columns

# Pairwise correlation matrix of those columns
corr_matrix = data.loc[:, numerical_features].corr()

# Heatmap of the correlation matrix
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(corr_matrix, cmap="coolwarm", linewidths=0.5, ax=ax)
ax.set_title("Correlation Matrix")
plt.show()

## 2. Preprocessing
### a. Standardize the numerical features and encode the categorical features

In [None]:
# LabelEncoder lives in sklearn.preprocessing; it was only importable from
# sklearn.calibration because that module happens to import it for its own use.
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler

# Separate numerical and categorical columns
numerical_cols = data.select_dtypes(include=["float64", "int64"]).columns
categorical_cols = data.select_dtypes(include=["object"]).columns

# Fill numerical missing values with the column median
data[numerical_cols] = data[numerical_cols].fillna(data[numerical_cols].median())

# Fill categorical missing values with the column mode
for col in categorical_cols:
    col_modes = data[col].mode()
    # mode() returns an empty Series for an all-missing column; skip such a
    # column instead of failing on the [0] lookup.
    if not col_modes.empty:
        data[col] = data[col].fillna(col_modes.iloc[0])

# Numerical columns that get standardized
num_features = [
    "loan_amnt",
    "int_rate",
    "installment",
    "annual_inc",
    "dti",
    "delinq_2yrs",
    "open_acc",
    "revol_bal",
    "total_acc",
]

# Fit a StandardScaler on these columns and write the scaled values back into `data`
scaler = StandardScaler()
scaled_values = scaler.fit_transform(data[num_features])
data[num_features] = scaled_values

# Replace the "grade" labels with integer codes.
# After this step data["grade"] no longer holds the original labels.
label_enc = LabelEncoder().fit(data["grade"])
data["grade"] = label_enc.transform(data["grade"])

# Nominal categorical columns that are expanded into indicator columns
nominal_features = [
    "home_ownership",
    "verification_status",
    "purpose",
    "addr_state",
    "application_type",
]

# One-hot encode them; drop_first=True omits the first level of each column
data = pd.get_dummies(data, columns=nominal_features, drop_first=True)

display(data.head())


### b. Identify and remove up to 1% of rows as outliers based on standardized `dti`, `annual_inc`, and `delinq_2yrs` variables

In [None]:

# Initialize the scaler
scaler = StandardScaler()

# Features to check for outliers
outlier_features = ["dti", "annual_inc", "delinq_2yrs"]

# Standardize these features into a separate frame of z-scores.
# `data` itself is left untouched, so re-running this cell gives the same result
# and no temporary column is left behind in `data`.
z_scores = pd.DataFrame(
    scaler.fit_transform(data[outlier_features]),
    columns=outlier_features,
    index=data.index,
)

# Outlier score per row: sum of the absolute standardized values
outlier_score = z_scores.abs().sum(axis=1)

# Calculate IQR of the score
Q1 = outlier_score.quantile(0.25)
Q3 = outlier_score.quantile(0.75)
IQR = Q3 - Q1

# Only the upper fence is used: the score is a sum of absolute values, so a LOW
# score marks a row close to the mean on all three features, not an outlier.
upper_bound = Q3 + 1.5 * IQR

# Identify outliers
is_outlier = outlier_score > upper_bound
outliers = data[is_outlier]

# Determine the number of outliers to remove (up to 1% of total data)
max_outliers = int(0.01 * len(data))

# Remove the MOST extreme rows first (highest scores), not simply the first
# flagged rows in file order.
rows_to_remove = outlier_score[is_outlier].nlargest(max_outliers).index
outliers_to_remove = data.loc[rows_to_remove]

# Remove outliers
data_cleaned = data.drop(index=rows_to_remove).reset_index(drop=True)

# Calculate the percentage of data retained
retained_data_percent = (data_cleaned.shape[0] / data.shape[0]) * 100

# Check the shape of the cleaned data
print("Original Data Shape:", data.shape)
print("Cleaned Data Shape:", data_cleaned.shape)
print("Number of Outliers Removed:", outliers_to_remove.shape[0])
print("Percentage of Data Retained: {:.2f}%".format(retained_data_percent))

# Remove the temporary column if an earlier run of this cell left it in `data`
data_cleaned = data_cleaned.drop(columns=["outlier_score"], errors="ignore")

## 3. Classification Task
### Define response variable and features

In [None]:
# Work on an independent (deep) copy so that data_cleaned is not modified below
data_ml = data_cleaned.copy(deep=True)

ml_shape = data_ml.shape
print("Shape of data_ml:", ml_shape)

# Grade letters by position; an alphabetical integer encoding of all seven
# grades gives A=0, B=1, ..., G=6.
_GRADE_LETTERS = ("A", "B", "C", "D", "E", "F", "G")


def _grade_letter(grade):
    """Normalize a grade value to its upper-case letter.

    Accepts either the letter itself (surrounding whitespace and case are
    ignored) or an integer code 0-6, decoded as a position in A..G.
    NOTE(review): decoding integer codes assumes the encoder was fitted on all
    seven grades in sorted order -- confirm against the encoder used upstream.

    Returns None for anything else (NaN, out-of-range codes, other types).
    """
    if isinstance(grade, str):
        return grade.strip().upper()
    if isinstance(grade, (int, np.integer)) and not isinstance(grade, bool):
        if 0 <= grade < len(_GRADE_LETTERS):
            return _GRADE_LETTERS[int(grade)]
    return None


# Map "grade" to High-Low response
def map_high_low(grade):
    """Return "High" for grades A-B, "Low" for D-G, and NaN otherwise.

    Grade C (and any unrecognized value) maps to NaN. The grade may be given as
    a letter or as an integer code (see _grade_letter).
    """
    letter = _grade_letter(grade)
    if letter in ("A", "B"):
        return "High"
    if letter in ("D", "E", "F", "G"):
        return "Low"
    return np.nan  # Exclude "C"


# Map "grade" to High-Medium-Low response
def map_high_med_low(grade):
    """Return "High" for grades A-B, "Medium" for C, "Low" for D-G, else NaN.

    The grade may be given as a letter or as an integer code (see _grade_letter).
    """
    letter = _grade_letter(grade)
    if letter in ("A", "B"):
        return "High"
    if letter == "C":
        return "Medium"
    if letter in ("D", "E", "F", "G"):
        return "Low"
    return np.nan
    
# Strip surrounding whitespace, drop the " months" suffix and cast "term" to integer
term_text = data_ml["term"].str.strip()
data_ml["term"] = term_text.str.replace(" months", "").astype(int)

# Derive both response columns from "grade"
grade_column = data_ml["grade"]
data_ml["High_Low"] = grade_column.map(map_high_low)
data_ml["High_Med_Low"] = grade_column.map(map_high_med_low)

# Keep only the rows that received a High/Low label (map_high_low returned a value)
has_high_low = data_ml["High_Low"].notna()
data_high_low = data_ml.loc[has_high_low].reset_index(drop=True)

# Keep only the rows that received a High/Medium/Low label
has_high_med_low = data_ml["High_Med_Low"].notna()
data_high_med_low = data_ml.loc[has_high_med_low].reset_index(drop=True)

# Report how many samples each frame holds (neither should be empty)
print("Number of samples in data_high_low:", len(data_high_low))
print("Number of samples in data_high_med_low:", len(data_high_med_low))

### a. Split the Data into Train-Validate-Test Sets
I split the data into:

- Training Set: 70%
- Validation Set: 15%
- Test Set: 15%

In [None]:
from sklearn.model_selection import train_test_split

# Features and target for High-Low response
X = data_high_low.drop(["High_Low", "High_Med_Low"], axis=1)
y = data_high_low["High_Low"]

# Verify that X and y have the same number of samples
print("Features shape:", X.shape)
print("Target shape:", y.shape)

# Encode string-valued feature columns as integer codes BEFORE splitting, so the
# train / validation / test sets share one mapping per column.
# The encoding is applied to X itself: data_ml is a different frame, so encoding
# it would leave X_train / X_val / X_test holding the raw strings.
object_cols = X.select_dtypes(include=["object"]).columns.tolist()
print("Categorical columns:", object_cols)

label_enc = LabelEncoder()

for col in object_cols:
    # astype(str) gives the encoder uniformly typed labels to sort
    X[col] = label_enc.fit_transform(X[col].astype(str))

# First split: Train (70%) and temp (validation + test, 30%)
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

# Second split: temp is halved into Validation (15%) and Test (15%)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp)

print("Training set size:", X_train.shape)
print("Validation set size:", X_val.shape)
print("Test set size:", X_test.shape)


### b. Build a logistic model to accurately predict the High-Low response

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score, recall_score, accuracy_score

print("Data types in X_train:")
display(X_train.dtypes)

# The High_Low response is derived from "grade", so "grade" must not be used as a
# predictor (target leakage). "sub_grade" presumably refines "grade" and is
# excluded for the same reason -- NOTE(review): confirm.
leakage_cols = {"grade", "sub_grade"}

# LogisticRegression needs numeric input without missing values: keep the
# numeric / boolean columns that are complete in both the training and the
# validation features.
feature_cols = [
    col
    for col in X_train.select_dtypes(include=["number", "bool"]).columns
    if col not in leakage_cols
    and X_train[col].notna().all()
    and X_val[col].notna().all()
]
ignored_cols = [col for col in X_train.columns if col not in feature_cols]
print("Columns not used as features:", ignored_cols)

X_train_model = X_train[feature_cols]
X_val_model = X_val[feature_cols]

# Kept from the original cell: encode "grade" / "sub_grade" in data_high_low in place.
# This does not affect the model below (X_train / X_val were split off earlier).
label_enc = LabelEncoder()
data_high_low["grade"] = label_enc.fit_transform(data_high_low["grade"])
data_high_low["sub_grade"] = label_enc.fit_transform(data_high_low["sub_grade"])
display(data_high_low.head())

# Initialize the Logistic Regression model
log_reg = LogisticRegression(max_iter=1000, random_state=42)

# Fit the model on the training data only, so the validation rows stay unseen
log_reg.fit(X_train_model, y_train)

# Predict on validation set
y_val_pred = log_reg.predict(X_val_model)

# Calculate evaluation metrics ("High" is treated as the positive class)
accuracy = accuracy_score(y_val, y_val_pred)
precision = precision_score(y_val, y_val_pred, pos_label="High")
recall = recall_score(y_val, y_val_pred, pos_label="High")

print("Logistic Regression (High-Low) Validation Metrics:")
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)