## 1. Setup: Imports and Configuration

In [60]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [36]:
# Set plotting style
plt.rcParams['figure.figsize'] = (14, 8)
plt.rcParams['grid.linestyle'] = ':'
sns.set_style('whitegrid')

In [37]:
PROJECT_ROOT = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
RAW_DIR = os.path.join(PROJECT_ROOT, "data", "raw")
PROCESSED_DIR = os.path.join(PROJECT_ROOT, "data", "processed")
os.makedirs(PROCESSED_DIR, exist_ok=True)

# File paths
train_path = os.path.join(RAW_DIR, "cs-training.csv")
test_path = os.path.join(RAW_DIR, "cs-test.csv")

# Check if files exist
assert os.path.exists(train_path), f"File not found: {train_path}"
assert os.path.exists(test_path), f"File not found: {test_path}"

## 2. Load Data

In [38]:
train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)

# Remove Unnamed: 0 column if exists
if 'Unnamed: 0' in train_df.columns:
    train_df = train_df.drop(columns=['Unnamed: 0'])
if 'Unnamed: 0' in test_df.columns:
    test_df = test_df.drop(columns=['Unnamed: 0'])

print("Training data shape:", train_df.shape)
print("Test data shape:", test_df.shape)
train_df.head()

Training data shape: (150000, 11)
Test data shape: (101503, 11)


Unnamed: 0,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
0,1,0.766127,45,2,0.802982,9120.0,13,0,6,0,2.0
1,0,0.957151,40,0,0.121876,2600.0,4,0,0,0,1.0
2,0,0.65818,38,1,0.085113,3042.0,2,1,0,0,0.0
3,0,0.23381,30,0,0.03605,3300.0,5,0,0,0,0.0
4,0,0.907239,49,1,0.024926,63588.0,7,0,1,0,0.0


## 3. Data Cleaning

In [39]:
# Identify columns with late payment information
late_cols = [c for c in train_df.columns if 'NumberOfTime' in c or 'NumberOfTimes90DaysLate' in c]
# Replace magic values (96, 98) with NaN
magic_vals = [96, 98]
for col in late_cols:
    train_df[col] = train_df[col].replace(magic_vals, np.nan)
    test_df[col] = test_df[col].replace(magic_vals, np.nan)


## 4. Handle Missing Values Using Training Data Parameters

In [40]:
print("Missing values in training data:")
print(train_df.isna().sum().sort_values(ascending=False))
print("\nMissing values in test data:")
print(test_df.isna().sum().sort_values(ascending=False))

Missing values in training data:
MonthlyIncome                           29731
NumberOfDependents                       3924
NumberOfTime30-59DaysPastDueNotWorse      269
NumberOfTimes90DaysLate                   269
NumberOfTime60-89DaysPastDueNotWorse      269
SeriousDlqin2yrs                            0
RevolvingUtilizationOfUnsecuredLines        0
age                                         0
DebtRatio                                   0
NumberOfOpenCreditLinesAndLoans             0
NumberRealEstateLoansOrLines                0
dtype: int64

Missing values in test data:
SeriousDlqin2yrs                        101503
MonthlyIncome                            20103
NumberOfDependents                        2626
NumberOfTime30-59DaysPastDueNotWorse       214
NumberOfTimes90DaysLate                    214
NumberOfTime60-89DaysPastDueNotWorse       214
RevolvingUtilizationOfUnsecuredLines         0
age                                          0
DebtRatio                                 

In [41]:
# For late payment columns, fill with 0 (assuming missing = no late payments)
for col in late_cols:
    train_df[col] = train_df[col].fillna(0)
    test_df[col] = test_df[col].fillna(0)

In [42]:
# For MonthlyIncome, use median from training data
train_income_median = train_df['MonthlyIncome'].median()
train_df['MonthlyIncome'] = train_df['MonthlyIncome'].fillna(train_income_median)
test_df['MonthlyIncome'] = test_df['MonthlyIncome'].fillna(train_income_median)

In [43]:
# For NumberOfDependents, use mode from training data
train_dependents_mode = train_df['NumberOfDependents'].mode()[0]
train_df['NumberOfDependents'] = train_df['NumberOfDependents'].fillna(train_dependents_mode)
test_df['NumberOfDependents'] = test_df['NumberOfDependents'].fillna(train_dependents_mode)

In [44]:
print("\nMissing values after imputation:")
print("Training:", train_df.isna().sum().sum())
print("Test:", test_df.isna().sum().sum())


Missing values after imputation:
Training: 0
Test: 101503


## 5. Handle Outliers and Skewed Distributions

In [45]:
# Clip extreme values in RevolvingUtilizationOfUnsecuredLines and DebtRatio
for col in ['RevolvingUtilizationOfUnsecuredLines', 'DebtRatio']:
    q99 = train_df[col].quantile(0.99)
    train_df[col] = np.clip(train_df[col], 0, q99)
    test_df[col] = np.clip(test_df[col], 0, q99)  # Use training quantile

In [46]:
# Handle MonthlyIncome - log transformation and clipping
income_q99 = train_df['MonthlyIncome'].quantile(0.99)
train_df['MonthlyIncome'] = np.clip(train_df['MonthlyIncome'], 0, income_q99)
test_df['MonthlyIncome'] = np.clip(test_df['MonthlyIncome'], 0, income_q99)

In [47]:
# Apply log transformation to reduce skewness
train_df['LogMonthlyIncome'] = np.log1p(train_df['MonthlyIncome'])
test_df['LogMonthlyIncome'] = np.log1p(test_df['MonthlyIncome'])

In [48]:
# Remove invalid age values (0 or too high)
train_df = train_df[(train_df['age'] >= 18) & (train_df['age'] <= 100)]
test_df = test_df[(test_df['age'] >= 18) & (test_df['age'] <= 100)]

## 6. Feature Engineering

In [49]:
# Create age bins
age_bins = [18, 25, 35, 45, 55, 65, 75, 100]
age_labels = ['18-25', '26-35', '36-45', '46-55', '56-65', '66-75', '76+']

train_df = train_df.copy()
test_df = test_df.copy()

train_df.loc[:, 'AgeGroup'] = pd.cut(train_df['age'], bins=age_bins, labels=age_labels, right=False)
test_df.loc[:, 'AgeGroup'] = pd.cut(test_df['age'], bins=age_bins, labels=age_labels, right=False)

In [50]:
# Create binary flags for late payments
for col in late_cols:
    new_col_name = f'Has_{col}'
    train_df.loc[:, new_col_name] = (train_df[col] > 0).astype(int)
    test_df.loc[:, new_col_name] = (test_df[col] > 0).astype(int)

In [51]:
# Create total late payments feature
train_df.loc[:, 'TotalLatePayments'] = train_df[late_cols].sum(axis=1)
test_df.loc[:, 'TotalLatePayments'] = test_df[late_cols].sum(axis=1)

# Create income per dependent feature
train_df.loc[:, 'IncomePerDependent'] = train_df['MonthlyIncome'] / (train_df['NumberOfDependents'] + 1)
test_df.loc[:, 'IncomePerDependent'] = test_df['MonthlyIncome'] / (test_df['NumberOfDependents'] + 1)

# Create debt to income ratio
train_df.loc[:, 'DebtToIncomeRatio'] = train_df['DebtRatio'] * train_df['MonthlyIncome']
test_df.loc[:, 'DebtToIncomeRatio'] = test_df['DebtRatio'] * test_df['MonthlyIncome']

## 7. Encode Categorical Variables

In [52]:
# One-hot encode AgeGroup
train_df = pd.get_dummies(train_df, columns=['AgeGroup'], prefix='Age')
test_df = pd.get_dummies(test_df, columns=['AgeGroup'], prefix='Age')

In [53]:
train_df

Unnamed: 0,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,...,TotalLatePayments,IncomePerDependent,DebtToIncomeRatio,Age_18-25,Age_26-35,Age_36-45,Age_46-55,Age_56-65,Age_66-75,Age_76+
0,1,0.766127,45,2.0,0.802982,9120.0,13,0.0,6,0.0,...,2.0,3040.000000,7.323197e+03,False,False,False,True,False,False,False
1,0,0.957151,40,0.0,0.121876,2600.0,4,0.0,0,0.0,...,0.0,1300.000000,3.168781e+02,False,False,True,False,False,False,False
2,0,0.658180,38,1.0,0.085113,3042.0,2,1.0,0,0.0,...,2.0,3042.000000,2.589149e+02,False,False,True,False,False,False,False
3,0,0.233810,30,0.0,0.036050,3300.0,5,0.0,0,0.0,...,0.0,3300.000000,1.189640e+02,False,True,False,False,False,False,False
4,0,0.907239,49,1.0,0.024926,23000.0,7,0.0,1,0.0,...,1.0,23000.000000,5.732910e+02,False,False,False,True,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
149995,0,0.040674,74,0.0,0.225131,2100.0,4,0.0,1,0.0,...,0.0,2100.000000,4.727749e+02,False,False,False,False,False,True,False
149996,0,0.299745,44,0.0,0.716562,5584.0,4,0.0,1,0.0,...,0.0,1861.333333,4.001283e+03,False,False,True,False,False,False,False
149997,0,0.246044,58,0.0,3870.000000,5400.0,18,0.0,1,0.0,...,0.0,5400.000000,2.089800e+07,False,False,False,False,True,False,False
149998,0,0.000000,30,0.0,0.000000,5716.0,4,0.0,0,0.0,...,0.0,5716.000000,0.000000e+00,False,True,False,False,False,False,False


## 8. Prepare Data for Modeling

In [54]:
# Separate features and target
X = train_df.drop('SeriousDlqin2yrs', axis=1)
y = train_df['SeriousDlqin2yrs']

# Split training data into train and validation sets
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Prepare test features (excluding target)
X_test = test_df.drop('SeriousDlqin2yrs', axis=1)

print("Data split completed:")
print(f"Training set: {X_train.shape}")
print(f"Validation set: {X_val.shape}")
print(f"Test set: {X_test.shape}")

Data split completed:
Training set: (119988, 24)
Validation set: (29998, 24)
Test set: (101500, 24)


## 9. Scale Features

In [55]:
scaler = StandardScaler()

# Fit on training data
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

# Convert back to DataFrames
X_train_scaled = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_val_scaled = pd.DataFrame(X_val_scaled, columns=X_val.columns, index=X_val.index)
X_test_scaled = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)

## 10. Save Processed Data

In [56]:
os.makedirs(PROCESSED_DIR, exist_ok=True)

In [57]:
train_processed_path = os.path.join(PROCESSED_DIR, 'train_processed.csv')
val_processed_path = os.path.join(PROCESSED_DIR, 'val_processed.csv')
test_processed_path = os.path.join(PROCESSED_DIR, 'test_processed.csv')

# Save with target variable
train_df.to_csv(train_processed_path, index=False)
pd.concat([X_val, y_val], axis=1).to_csv(val_processed_path, index=False)
test_df.to_csv(test_processed_path, index=False)

# Save scaled versions
train_scaled_path = os.path.join(PROCESSED_DIR, 'train_scaled.csv')
val_scaled_path = os.path.join(PROCESSED_DIR, 'val_scaled.csv')
test_scaled_path = os.path.join(PROCESSED_DIR, 'test_scaled.csv')

pd.concat([X_train_scaled, y_train], axis=1).to_csv(train_scaled_path, index=False)
pd.concat([X_val_scaled, y_val], axis=1).to_csv(val_scaled_path, index=False)
X_test_scaled.to_csv(test_scaled_path, index=False)

# Save scaler for future use
import joblib
scaler_path = os.path.join(PROCESSED_DIR, 'scaler.pkl')
joblib.dump(scaler, scaler_path)

['/Users/tdavi/Desktop/Se/ml studying/credit_scoring/data/processed/scaler.pkl']