In [9]:
import os
import pickle

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from imblearn.over_sampling import RandomOverSampler
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.preprocessing import StandardScaler

In [10]:

# Load and clean data
# The CSV location can be overridden with the LOAN_DATA_CSV environment
# variable. The default is resolved against the current user's home directory
# instead of being tied to one machine's absolute path.
DATA_PATH = os.environ.get(
    "LOAN_DATA_CSV",
    os.path.expanduser("~/Desktop/BURE/Datasets/loan_data.csv"),
)
df = pd.read_csv(DATA_PATH)
# Keep only rows with person_age <= 100. Rows whose age is missing are dropped
# as well, because a NaN comparison evaluates to False.
df = df[df['person_age'] <= 100]

# Define features and target
# X holds every column except the label; y is the label column alone.
target_col = 'loan_status'
X = df.drop(columns=target_col)
y = df[target_col]

# Column types
# Categorical features: handed to the OneHotEncoder branch of the preprocessor.
categorical_cols = [
    'person_gender',
    'person_education',
    'person_home_ownership',
    'loan_intent',
    'previous_loan_defaults_on_file',
]
# Numerical features: handed to the StandardScaler branch of the preprocessor.
numerical_cols = [
    'person_age',
    'person_income',
    'person_emp_exp',
    'loan_amnt',
    'loan_int_rate',
    'loan_percent_income',
    'cb_person_cred_hist_length',
    'credit_score',
]

# Preprocessor
# Standardizes the numerical columns and one-hot encodes the categorical ones.
# Categories not seen during fit are ignored at transform time
# (handle_unknown='ignore'), and any column not listed in either group is
# dropped (remainder='drop').
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
    ],
    remainder='drop',
    # Always return a dense array. With the default threshold the output type
    # (dense vs. scipy sparse) depends on the overall density of the encoded
    # data, while the transformed output is later wrapped in pd.DataFrame with
    # explicit column names, which requires a dense 2-D array.
    sparse_threshold=0
)

# Train-test split
# 80/20 split, stratified on the target so both partitions keep the same class
# proportions; the seed is fixed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)

# Preprocess FIRST
# The transformer is fitted on the training partition only and then applied,
# unchanged, to the test partition.
X_train_preprocessed = preprocessor.fit_transform(X_train)
X_test_preprocessed = preprocessor.transform(X_test)

# Get feature names
# Output columns follow the transformer order: the numerical columns first,
# then the one-hot columns generated by the fitted categorical encoder.
cat_encoder = preprocessor.named_transformers_['cat']
cat_features = cat_encoder.get_feature_names_out(categorical_cols)
all_features = [*numerical_cols, *cat_features]

# Apply oversampling AFTER preprocessing
# Only the training partition is resampled; the test partition keeps its
# original class distribution.
ros = RandomOverSampler(random_state=42)
X_train_balanced, y_train_balanced = ros.fit_resample(
    X_train_preprocessed,
    y_train,
)

# Convert to DataFrames
# Attach the feature names to the transformed arrays.
train_df = pd.DataFrame(data=X_train_balanced, columns=all_features)
test_df = pd.DataFrame(data=X_test_preprocessed, columns=all_features)

# Save files
# Features and labels are written to separate CSVs without the index column.
print("\nSaving processed data...")
train_df.to_csv('train_preprocessed_balanced.csv', index=False)
pd.Series(y_train_balanced).to_csv('y_train_balanced.csv', index=False)
test_df.to_csv('test_preprocessed.csv', index=False)
y_test.to_csv('y_test.csv', index=False)

# Persist the fitted preprocessor so the same transform can be re-applied later.
with open('preprocessor.pkl', mode='wb') as pkl_file:
    pickle.dump(preprocessor, pkl_file)

# Verification
# Report the frame shapes; the training row count before oversampling is shown
# in parentheses.
print("\n=== Final Data Shapes ===")
print(f"Balanced training: {train_df.shape} (was {X_train.shape[0]})")
print(f"Test set: {test_df.shape}")

# Report the relative class frequencies of the resampled training labels and of
# the test labels.
print("\n=== Class Distribution ===")
print("Training:", pd.Series(y_train_balanced).value_counts(normalize=True))
print("Test:", y_test.value_counts(normalize=True))


Saving processed data...

=== Final Data Shapes ===
Balanced training: (55988, 27) (was 35994)
Test set: (8999, 27)

=== Class Distribution ===
Training: loan_status
0    0.5
1    0.5
Name: proportion, dtype: float64
Test: loan_status
0    0.777753
1    0.222247
Name: proportion, dtype: float64


In [11]:
# Quick check for any preprocessing artifacts
# Total count of missing cells across the whole training frame.
print("Missing values in training:", train_df.isna().to_numpy().sum())
# Per-feature minimum and maximum after preprocessing.
print("Feature ranges:")
print(train_df.describe().loc[['min', 'max']])

Missing values in training: 0
Feature ranges:
     person_age  person_income  person_emp_exp  loan_amnt  loan_int_rate  \
min   -1.310952      -1.116102       -0.910595  -1.441765      -1.874019   
max   11.195475      36.586270       11.893174   4.038462       3.018224   

     loan_percent_income  cb_person_cred_hist_length  credit_score  \
min            -1.603884                   -0.999141     -4.819047   
max             5.986942                    6.219822      3.005230   

     person_gender_female  person_gender_male  ...  person_home_ownership_OWN  \
min                   0.0                 0.0  ...                        0.0   
max                   1.0                 1.0  ...                        1.0   

     person_home_ownership_RENT  loan_intent_DEBTCONSOLIDATION  \
min                         0.0                            0.0   
max                         1.0                            1.0   

     loan_intent_EDUCATION  loan_intent_HOMEIMPROVEMENT  loan_intent_ME

In [12]:
from sklearn.linear_model import LogisticRegression

# Fit a logistic regression on the balanced training frame; fit() returns the
# estimator itself, so `model` is the fitted classifier.
model = LogisticRegression(class_weight='balanced', max_iter=1000).fit(
    train_df,
    y_train_balanced,
)

# Verify it learned
# score() reports mean accuracy on the given features and labels.
print("\nTraining accuracy:", model.score(train_df, y_train_balanced))
print("Test accuracy:", model.score(test_df, y_test))


Training accuracy: 0.880813745802672
Test accuracy: 0.8489832203578175


In [14]:
# Get high-confidence samples for forget set
# `confidence` is the model's largest predicted class probability for each
# training row.
CONFIDENCE_THRESHOLD = 0.9
probs = model.predict_proba(train_df)
confidence = probs.max(axis=1)
# NOTE: this is an absolute probability cutoff, not a "top 10%" percentile cut.
# On the recorded run it selected 34608 of the 55988 training rows. To keep
# only the top 10% most confident rows, compare against
# np.quantile(confidence, 0.9) instead.
forget_set = train_df[confidence > CONFIDENCE_THRESHOLD]

print(f"Forget set size: {len(forget_set)} samples")

# Save as compressed format for large files
# This writes the full balanced training frame, not forget_set.
# NOTE(review): forget_set itself is not persisted in this cell; confirm
# whether it should be written out as well.
train_df.to_csv('train_balanced.csv.gz', index=False, compression='gzip')

Forget set size: 34608 samples
