Step 1: Select a small dataset from the large dataset

In [None]:
import pandas as pd

# Load only the first 500 rows of the original dataset.
# `nrows=500` makes pandas stop parsing after 500 data rows, instead of reading
# the whole (large) file into memory and then throwing almost all of it away.
# NOTE: `df` therefore holds just the 500-row sample here, and column dtypes
# are inferred from these rows only.
df = pd.read_csv("financial_risk_data.csv", nrows=500)

# The sample is already limited to 500 rows; keep the `df_small` name for it.
df_small = df

# Save it as a new file (index=False: do not write the row index as a column)
df_small.to_csv("small_dataset.csv", index=False)


Step 2: Show an overview of the small dataset

In [None]:
import pandas as pd

# Print a quick overview of the reduced dataset: shape, sample rows, dtypes,
# summary statistics, missing values, duplicates and column names.
# The printed labels previously contained mis-encoded (mojibake) characters;
# they are restored to the intended symbols so the output is readable.
# NOTE: `display` is not imported in this cell - presumably the notebook
# (IPython) built-in; confirm if this code is run as a plain script.

# Load your dataset
df = pd.read_csv("small_dataset.csv")

# Basic shape of the data as (rows, columns)
print("🧾 Dataset Shape:", df.shape)

# First 5 rows
print("\n🔍 First 5 Rows:")
display(df.head())

# Info about columns and data types (df.info() prints directly)
print("\nℹ️ Dataset Info:")
df.info()

# Summary statistics for numeric columns
print("\n📊 Summary Statistics:")
display(df.describe())

# Check for missing values (count of nulls per column)
print("\n❗ Missing Values in Each Column:")
print(df.isnull().sum())

# Check for duplicate rows
print("\n📋 Number of Duplicate Rows:", df.duplicated().sum())

# Display column names
print("\n🪶 Column Names:")
print(df.columns.tolist())


Step 3: Data cleaning and preprocessing (including filling missing values)

In [None]:
# ---------------- STEP 3: CLEANING & PREPROCESSING (paste into your notebook) ----------------
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Cleaning & preprocessing pipeline.
#
# Input : `df` (the DataFrame loaded in the previous cell).
# Output: `processed`, a dict with X_train / X_test / y_train / y_test plus the
#         full scaled matrix (X_all) and the full target vector (y_all).
# Raises: ValueError if the target column is missing or fewer than 2 usable
#         rows remain.
#
# Differences from the earlier version of this cell:
#   * the raw target column is never used as a feature (no target leakage);
#   * rows with a missing target are dropped instead of crashing or being
#     silently labelled as class 1;
#   * the train/test split is decided BEFORE the imputer and scaler are fitted,
#     and both are fitted on the training rows only;
#   * numeric columns with no observed value in the training rows are removed
#     before imputation;
#   * a text column is converted to numeric only if almost all values parse.

# ---------- CONFIG ----------
TARGET_COL = 'loan_status'   # change if different
ID_COLS = ['id', 'member_id', 'url', 'title', 'zip_code', 'desc']  # drop these if present
MAX_MISSING_RATIO = 0.80     # drop columns with >80% missing
CAT_CARDINALITY_LIMIT = 15   # only one-hot encode categoricals with unique <= this
TEST_SIZE_FRACTION = 0.20    # fraction of rows held out for testing
RANDOM_STATE = 42            # seed so the split is reproducible
PERCENT_NAMED_COLS = ['int_rate', 'revol_util', 'sec_app_revol_util']  # always coerced to numeric
MIN_PERCENT_PARSE_RATIO = 0.90  # other '%' columns are converted only if >= this share parses
# ----------------------------

# Copy df to avoid accidental mutation
df_clean = df.copy()

print("Initial shape:", df_clean.shape)

# 1) Drop columns with too many missing values
missing_ratio = df_clean.isna().mean()
cols_to_drop = missing_ratio[missing_ratio > MAX_MISSING_RATIO].index.tolist()
print(f"Dropping {len(cols_to_drop)} columns with >{int(MAX_MISSING_RATIO*100)}% missing.")
df_clean.drop(columns=cols_to_drop, inplace=True, errors='ignore')

# 2) Drop obvious identifier columns if present
existing_ids = [c for c in ID_COLS if c in df_clean.columns]
if existing_ids:
    print("Dropping identifier cols:", existing_ids)
    df_clean.drop(columns=existing_ids, inplace=True, errors='ignore')

print("Shape after column drops:", df_clean.shape)

# 3) Normalize / prepare some known columns
# 3a) term -> months (e.g. '36 months' -> 36)
if 'term' in df_clean.columns:
    # expand=False returns a Series (the default returns a one-column DataFrame)
    df_clean['term_months'] = (
        df_clean['term'].astype(str).str.extract(r'(\d+)', expand=False).astype(float)
    )
    df_clean.drop(columns=['term'], inplace=True, errors='ignore')

# 3b) percent-like text columns ('13.5%' -> 13.5).
# The target column is never touched. Apart from the explicitly named columns,
# a column is converted only when nearly all of its non-missing values become
# numbers; otherwise one stray '%' inside a free-text column would wipe the
# whole column out (everything coerced to NaN).
percent_candidates = [
    c for c in df_clean.columns
    if c != TARGET_COL and df_clean[c].dtype == object
]
for c in percent_candidates:
    raw = df_clean[c]
    as_text = raw.astype(str)
    is_named = c in PERCENT_NAMED_COLS
    if not is_named and not as_text.str.contains('%', regex=False).any():
        continue
    try:
        converted = pd.to_numeric(
            as_text.str.replace('%', '', regex=False).str.strip(), errors='coerce'
        )
    except (TypeError, ValueError) as e:
        print("Could not convert percent col:", c, "->", e)
        continue
    present = raw.notna()
    n_present = int(present.sum())
    parse_ratio = converted[present].notna().sum() / n_present if n_present else 0.0
    if is_named or parse_ratio >= MIN_PERCENT_PARSE_RATIO:
        df_clean[c] = converted
        print(f"Converted percent-col -> numeric: {c}")
    else:
        print(f"Skipped percent conversion for '{c}': only {parse_ratio:.0%} of values are numeric.")

# 3c) If int_rate is still not numeric (e.g. a categorical dtype), coerce it.
if 'int_rate' in df_clean.columns and not pd.api.types.is_numeric_dtype(df_clean['int_rate'].dtype):
    df_clean['int_rate'] = pd.to_numeric(df_clean['int_rate'], errors='coerce')

# 4) Prepare target (binary)
if TARGET_COL not in df_clean.columns:
    raise ValueError(f"Target column '{TARGET_COL}' not found in dataframe.")

# Rows without a target cannot be used for supervised learning. Dropping them
# first avoids an int-cast failure on NaN and avoids `NaN != 0` turning a
# missing target into class 1.
rows_before = len(df_clean)
df_clean = df_clean[df_clean[TARGET_COL].notna()].copy()
n_missing_target = rows_before - len(df_clean)
if n_missing_target:
    print(f"Dropped {n_missing_target} rows with missing target values.")

# If already numeric and 0/1 -> use directly; otherwise map common labels
if pd.api.types.is_numeric_dtype(df_clean[TARGET_COL].dtype):
    # Compare the actual values (no int truncation, so 0.5 is not mistaken for 0)
    observed_values = set(df_clean[TARGET_COL].unique().tolist())
    if observed_values.issubset({0, 1}):
        df_clean['target'] = df_clean[TARGET_COL].astype(int)
        print("Target already binary numeric (0/1).")
    else:
        # fallback: treat non-zero as 1
        df_clean['target'] = (df_clean[TARGET_COL] != 0).astype(int)
        print("Target numeric but not 0/1; mapped non-zero -> 1.")
else:
    # common label mapping (adjust if your dataset uses different labels)
    label_map = {
        'Fully Paid': 0,
        'Current': 0,
        'In Grace Period': 0,
        'Late (16-30 days)': 1,
        'Late (31-120 days)': 1,
        'Charged Off': 1,
        'Default': 1,
        'charged off': 1,
        'Fully Paid.': 0
    }
    unknown_labels = set(df_clean[TARGET_COL].unique()) - set(label_map)
    if unknown_labels:
        print("Warning: Unknown target labels found (will be dropped):", unknown_labels)
    df_clean['target'] = df_clean[TARGET_COL].map(label_map)
    # drop rows where mapping failed
    before_rows = len(df_clean)
    df_clean = df_clean[df_clean['target'].notna()].copy()
    print(f"Dropped {before_rows - len(df_clean)} rows with unmapped target labels.")

# convert to int
df_clean['target'] = df_clean['target'].astype(int)
print("Target distribution:\n", df_clean['target'].value_counts())

# 5) Feature selection: numeric features + select low-cardinality categorical features
# Both the derived 'target' and the raw TARGET_COL are excluded: keeping the raw
# label column would hand the answer to the model.
non_feature_cols = {TARGET_COL, 'target'}
numeric_cols = [
    c for c in df_clean.select_dtypes(include=[np.number]).columns
    if c not in non_feature_cols
]

# candidate categorical columns (object / category types)
cat_cols = [
    c for c in df_clean.select_dtypes(include=['object', 'category']).columns
    if c not in non_feature_cols
]

# keep only low-cardinality categorical columns for one-hot
# (a single unique value carries no information, so it is skipped too)
cat_keep = [
    c for c in cat_cols
    if 1 < df_clean[c].nunique(dropna=True) <= CAT_CARDINALITY_LIMIT
]

# 6) Decide the train/test split BEFORE fitting any statistics, so imputation
#    medians and scaling parameters are learned from training rows only.
y = df_clean['target'].reset_index(drop=True)

n_samples = len(y)
if n_samples == 0:
    raise ValueError("No samples available after preprocessing.")
if n_samples < 2:
    raise ValueError("At least 2 samples are required to create a train/test split.")

n_test = max(1, int(round(TEST_SIZE_FRACTION * n_samples)))
if n_test >= n_samples:
    n_test = n_samples - 1
print(f"Using absolute test size = {n_test} of {n_samples} samples.")

# Stratify only when it is feasible: every class needs >= 2 samples and both
# partitions must be large enough to hold one sample of each class.
stratify_arg = None
vc = y.value_counts()
n_classes = len(vc)
if vc.min() >= 2 and n_test >= n_classes and (n_samples - n_test) >= n_classes:
    stratify_arg = y
    print("Using stratified split.")
else:
    print("Not enough samples per class to stratify; doing random split.")

# Split row POSITIONS; the feature frames below are sliced with .iloc
train_pos, test_pos = train_test_split(
    np.arange(n_samples), test_size=n_test, random_state=RANDOM_STATE, stratify=stratify_arg
)

# 7) Build the raw feature frames
X_num = df_clean[numeric_cols].copy()
X_cat = df_clean[cat_keep].copy() if cat_keep else pd.DataFrame(index=df_clean.index)

# A numeric column with no observed value in the training rows has no median;
# SimpleImputer would drop it and the column labels would no longer match.
has_train_values = X_num.iloc[train_pos].notna().any(axis=0)
empty_numeric = has_train_values[~has_train_values].index.tolist()
if empty_numeric:
    print("Dropping numeric cols with no values in the training rows:", empty_numeric)
    X_num = X_num.drop(columns=empty_numeric)
    numeric_cols = [c for c in numeric_cols if c not in empty_numeric]

print(f"Selected {len(numeric_cols)} numeric features and {len(cat_keep)} categorical features for encoding.")
print("Numeric (sample):", numeric_cols[:10])
print("Categorical to encode (sample):", cat_keep[:10])

# 8) Impute numeric with the median of the training rows
num_imputer = SimpleImputer(strategy='median')
if X_num.shape[1] > 0:
    num_imputer.fit(X_num.iloc[train_pos])
    X_num_imputed = pd.DataFrame(num_imputer.transform(X_num), columns=X_num.columns, index=X_num.index)
else:
    X_num_imputed = X_num

# 9) Fill categorical missing with 'MISSING' and one-hot encode
if not X_cat.empty:
    # astype(object) first: filling a 'category' column with a value that is
    # not one of its categories would raise.
    X_cat_filled = X_cat.astype(object).fillna('MISSING').astype(str)
    X_cat_encoded = pd.get_dummies(X_cat_filled, drop_first=True, dtype=float)
else:
    X_cat_encoded = pd.DataFrame(index=X_num_imputed.index)

# 10) Combine numeric + categorical
X = pd.concat([X_num_imputed, X_cat_encoded], axis=1)
print("Final feature matrix shape (before scaling):", X.shape)

# 11) Safety check - X should not contain NaN at this point; if it does, fill
#     with training-row medians (and 0 where even that is undefined).
if X.isna().any().any():
    print("Warning: NaNs found in X after imputation. Applying median fallback for remaining NaNs.")
    X = X.fillna(X.iloc[train_pos].median()).fillna(0)

# 12) Scaling (helpful for many models); parameters come from training rows only
scaler = StandardScaler()
if X.shape[1] > 0:
    scaler.fit(X.iloc[train_pos])
    X_scaled = pd.DataFrame(scaler.transform(X), columns=X.columns, index=X.index)
else:
    X_scaled = X

# 13) Materialise the split using the positions chosen in step 6
X_scaled = X_scaled.reset_index(drop=True)
X_train, X_test = X_scaled.iloc[train_pos], X_scaled.iloc[test_pos]
y_train, y_test = y.iloc[train_pos], y.iloc[test_pos]

print("Train shape:", X_train.shape, "Test shape:", X_test.shape)
print("Train target distribution:\n", y_train.value_counts())
print("Test target distribution:\n", y_test.value_counts())

# 14) Save processed datasets into workspace variables for next step
processed = {
    'X_train': X_train, 'X_test': X_test,
    'y_train': y_train, 'y_test': y_test,
    'X_all': X_scaled, 'y_all': y
}
print("Preprocessing complete. Processed dict created for model training (use processed['X_train'] etc.).")
# ----------------------------------------------------------------------------------------------


Step 4: Model Training & Evaluation (Logistic Regression)


In [None]:
# 🧠 Step 4: Model Training & Evaluation (Logistic Regression)

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Train a logistic-regression classifier on the preprocessed split, report
# accuracy / per-class metrics, plot the confusion matrix and persist the model.
# Expects `processed` (dict with X_train / X_test / y_train / y_test) to exist
# in the workspace from the preprocessing step.
import pickle

# Extract processed data from the preprocessing dictionary
X_train = processed["X_train"]
X_test = processed["X_test"]
y_train = processed["y_train"]
y_test = processed["y_test"]

print(f"Training shape: {X_train.shape}, Testing shape: {X_test.shape}")

# 1) Initialize Logistic Regression model
#    class_weight='balanced' reweights classes inversely to their frequency.
model = LogisticRegression(max_iter=2000, solver='lbfgs', class_weight='balanced', random_state=42)

# 2) Train the model
model.fit(X_train, y_train)

# 3) Make predictions
y_pred = model.predict(X_test)

# 4) Evaluate performance
accuracy = accuracy_score(y_test, y_pred)
print("\n✅ Model Evaluation Results:")
print(f"Accuracy: {accuracy:.3f}\n")
print("Classification Report:\n", classification_report(y_test, y_pred))

# 5) Confusion Matrix Visualization
#    Passing the classes seen during training fixes the matrix size and the
#    row/column order even if a class is absent from this test split, and lets
#    the axes be labelled with the actual class values.
cm = confusion_matrix(y_test, y_pred, labels=model.classes_)
plt.figure(figsize=(6, 5))
sns.heatmap(
    cm, annot=True, fmt='d', cmap='Blues',
    xticklabels=model.classes_, yticklabels=model.classes_,
)
plt.title("Confusion Matrix - Logistic Regression")
plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.show()

# 6) Save model for later use
#    The context manager guarantees the file is flushed and closed; the earlier
#    `pickle.dump(model, open(...))` left the handle open.
with open("model.pkl", "wb") as model_file:
    pickle.dump(model, model_file)
print("Model saved as model.pkl ✅")
