In [None]:
# Diagnostic checks for leakage / perfect predictors
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from imblearn.over_sampling import SMOTE
from sklearn.metrics import roc_auc_score

In [None]:
# --- Read the loan-default CSV into `df` (edit the path below for your machine) ---
df = pd.read_csv(
    filepath_or_buffer="C:/Users/DELL/Desktop/MSc/1st Sem/AML/Loan_Default - Copy.csv",
)

In [3]:
# Dataset dimensions, followed by (at most) the first 30 column names
print(f"Data shape: {df.shape}")
print(f"Columns: {df.columns[:30].tolist()}")

Data shape: (49999, 34)
Columns: ['ID', 'year', 'loan_limit', 'Gender', 'approv_in_adv', 'loan_type', 'loan_purpose', 'Credit_Worthiness', 'open_credit', 'business_or_commercial', 'loan_amount', 'rate_of_interest', 'Interest_rate_spread', 'Upfront_charges', 'term', 'Neg_ammortization', 'interest_only', 'lump_sum_payment', 'property_value', 'construction_type', 'occupancy_type', 'Secured_by', 'total_units', 'income', 'credit_type', 'Credit_Score', 'co-applicant_credit_type', 'age', 'submission_of_application', 'LTV']


In [4]:
# Target sanity check: dtype and up to 10 distinct values of 'Status'
status_col = df['Status']
print("\nTarget 'Status' dtype and unique values:", status_col.dtype, status_col.unique()[:10])

# Per-column count of missing cells, largest first (20 worst columns shown)
missing_counts = df.isna().sum()
print("\nMissing value counts (top 20):")
print(missing_counts.sort_values(ascending=False).head(20))


Target 'Status' dtype and unique values: int64 [1 0]

Missing value counts (top 20):
Upfront_charges              13287
Interest_rate_spread         12281
rate_of_interest             12209
dtir1                         8072
property_value                5104
LTV                           5104
income                        3008
loan_limit                    1109
approv_in_adv                  277
age                             72
submission_of_application       72
Neg_ammortization               40
loan_purpose                    38
term                            16
year                             0
ID                               0
business_or_commercial           0
open_credit                      0
lump_sum_payment                 0
interest_only                    0
dtype: int64


In [5]:
# Preprocess the same way as the main notebook (impute + encode); nothing is dropped yet.
# Columns are split by dtype: int64/float64 are treated as numeric, object as categorical.
num_cols = list(df.select_dtypes(include=['int64', 'float64']).columns)
cat_cols = list(df.select_dtypes(include=['object']).columns)

In [6]:
# Fill missing values in place: column median for numeric columns,
# most frequent value for categorical columns.
# NOTE(review): both imputers are fitted on the full frame, i.e. before the
# train/test split performed later — this mirrors the main notebook on purpose,
# but confirm that is the intended behaviour for the diagnostic.
num_imputer = SimpleImputer(strategy="median").fit(df[num_cols])
df[num_cols] = num_imputer.transform(df[num_cols])

cat_imputer = SimpleImputer(strategy="most_frequent").fit(df[cat_cols])
df[cat_cols] = cat_imputer.transform(df[cat_cols])

In [7]:
# One-hot encode the categorical columns into `encoded_df`, dropping the first
# level of each feature. With no categorical columns, `encoded_df` is an empty
# frame that still carries df's index so the later concat lines up.
if not cat_cols:
    encoded_df = pd.DataFrame(index=df.index)
else:
    encoder = OneHotEncoder(drop="first", sparse_output=False, handle_unknown="ignore")
    encoded_array = encoder.fit_transform(df[cat_cols])
    encoded_df = pd.DataFrame(
        data=encoded_array,
        index=df.index,
        columns=encoder.get_feature_names_out(cat_cols),
    )


In [8]:
# Assemble the feature matrix X (numeric columns minus the target, plus the
# one-hot columns) and the target y. Identifier columns are intentionally kept
# so the diagnostics below can flag them.
numeric_features = df[num_cols].drop(columns="Status", errors="ignore")
X = pd.concat([numeric_features, encoded_df], axis="columns")
y = df['Status']

print(f"\nX shape: {X.shape} y shape: {y.shape}")
print(f"Is 'Status' in X columns? {'Status' in X.columns}")


X shape: (49999, 50) y shape: (49999,)
Is 'Status' in X columns? False


In [9]:
# 1) Flag columns whose number of distinct values is at least 95% of the row
#    count — such near-unique columns are likely identifiers.
n = len(X)
distinct_per_column = X.nunique()
id_like = distinct_per_column[distinct_per_column >= 0.95 * n].index.tolist()
print("\nColumns with very high cardinality (possible IDs) -> drop or investigate:")
print(id_like)


Columns with very high cardinality (possible IDs) -> drop or investigate:
['ID']


In [10]:
# 2) Stratified 80/20 train/test split (fixed seed); the checks that follow
#    compare features against the target on the held-out part.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)
print(f"\nTrain/test shapes: {X_train.shape} {X_test.shape}")


Train/test shapes: (39999, 50) (10000, 50)


In [11]:
# 3) Count index labels present in both the train and the test split (should be zero)
shared_index_labels = set(X_train.index) & set(X_test.index)
index_overlap = len(shared_index_labels)
print(f"Index overlap between train & test: {index_overlap}")

Index overlap between train & test: 0


In [12]:
# 4) Quick row overlap check using hashing on a sample if dataset large
# Identifier-like columns (>= 95% distinct values, the same rule as check 1) are
# excluded before hashing: a per-row unique ID makes every row tuple unique, so
# the intersection below would be empty no matter how many feature rows repeat.
feature_cols = [c for c in X_train.columns if X_train[c].nunique() < 0.95 * len(X_train)]
sample_size = min(5000, len(X_train))
sample_train = X_train[feature_cols].sample(sample_size, random_state=42)
sample_test = X_test[feature_cols].sample(min(5000, len(X_test)), random_state=42)
# Rows become hashable tuples so that set intersection finds identical feature rows.
train_tuples = set(map(tuple, sample_train.values))
test_tuples = set(map(tuple, sample_test.values))
overlap_sample = len(train_tuples.intersection(test_tuples))
print(f"Sampled row-level overlap (train vs test) among {sample_size} rows: {overlap_sample} (should be 0)")

Sampled row-level overlap (train vs test) among 5000 rows: 0 (should be 0)


In [13]:
# 5) Look for single features that separate the classes perfectly on the test set.
#    Each column is scored as if it were a classifier output: an AUC of ~1.0
#    (or ~0.0, i.e. perfectly inverted) gets the column flagged.
perfect_predictors = []
for col in X.columns:
    feature = X_test[col]
    if feature.nunique() > 1:  # constant columns are skipped
        try:
            auc = roc_auc_score(y_test, feature)
            if np.isclose(auc, 1.0) or np.isclose(auc, 0.0):
                perfect_predictors.append((col, float(auc)))
        except Exception:
            # Scoring failed (e.g. non-numeric column): fall back to an exact equality test
            try:
                if feature.equals(y_test):
                    perfect_predictors.append((col, "exact_match"))
            except Exception:
                pass

print("\nPerfect predictors (AUC ~1 or exact match) on test set (if any):")
print(perfect_predictors)


Perfect predictors (AUC ~1 or exact match) on test set (if any):
[]


In [14]:
# 6) For every flagged column, show its first 20 test-set values next to the target.
#    Nothing is printed when the list of perfect predictors is empty.
for col, val in perfect_predictors:
    print(f"\nInspecting column: {col}, value={val}")
    side_by_side = pd.DataFrame({col: X_test[col].iloc[:20], "y_test": y_test.iloc[:20]})
    print(side_by_side)

In [15]:
# 7) Print the 20 largest absolute correlations with the target (numeric columns only)
# Constant columns have zero variance, so their correlation with the target is
# undefined (NaN) and computing it emits NumPy divide RuntimeWarnings; they are
# removed up front. They could never rank in the top of the list anyway.
numeric_test = X_test.select_dtypes(include=[np.number])
varying_test = numeric_test.loc[:, numeric_test.nunique() > 1]
correlations = varying_test.corrwith(y_test).abs().sort_values(ascending=False)
print("\nTop numeric correlations with target (abs):")
print(correlations.head(20))

  c /= stddev[:, None]
  c /= stddev[None, :]



Top numeric correlations with target (abs):
credit_type_EQUI                     0.610473
lump_sum_payment_not_lpsm            0.172626
Neg_ammortization_not_neg            0.169903
co-applicant_credit_type_EXP         0.147359
credit_type_EXP                      0.134748
credit_type_CRIF                     0.124802
submission_of_application_to_inst    0.119259
LTV                                  0.107309
Upfront_charges                      0.096680
property_value                       0.095971
business_or_commercial_nob/c         0.083461
loan_type_type2                      0.083461
Gender_Joint                         0.077803
dtir1                                0.076011
income                               0.065698
loan_limit_ncf                       0.053913
Gender_Sex Not Available             0.053695
Interest_rate_spread                 0.047940
Region_south                         0.045702
approv_in_adv_pre                    0.044530
dtype: float64


In [None]:
from sklearn.model_selection import cross_val_score, learning_curve, ShuffleSplit
import matplotlib.pyplot as plt
import numpy as np

# 1. Shuffled Target Test
def shuffled_target_test(model, X_train, y_train, X_test, y_test, random_state=42):
    """Fit ``model`` on randomly permuted training labels and score it on the real test set.

    Permuting the labels breaks any genuine feature/target relationship, so the
    resulting score serves as a no-signal reference for the real model's score.

    Parameters
    ----------
    model : estimator exposing ``fit(X, y)`` and ``score(X, y)``.
        It is fitted in place (on the shuffled labels) by this call.
    X_train, y_train : training features and labels; ``y_train`` itself is not modified.
    X_test, y_test : held-out features and true labels used for scoring.
    random_state : int or None, default 42
        Seed for the label permutation so the result is reproducible.
        Pass ``None`` for a different permutation on every call.

    Returns
    -------
    The value returned by ``model.score(X_test, y_test)`` (also printed).
    """
    rng = np.random.default_rng(random_state)
    # Work on an array copy so the caller's labels keep their original order.
    y_shuffled = rng.permutation(np.asarray(y_train))
    model.fit(X_train, y_shuffled)
    acc = model.score(X_test, y_test)
    print(f"Accuracy with shuffled target: {acc:.4f}")
    return acc

# 2. Shallow Tree Test (only for tree-based models)
from sklearn.tree import DecisionTreeClassifier
def shallow_tree_test(X_train, y_train, X_test, y_test):
    """Train a depth-1 decision tree (a single split) and print its test accuracy.

    The tree is fitted on ``X_train``/``y_train`` with a fixed seed and scored
    on ``X_test``/``y_test``. Nothing is returned; the accuracy is printed.
    """
    stump = DecisionTreeClassifier(max_depth=1, random_state=42).fit(X_train, y_train)
    stump_accuracy = stump.score(X_test, y_test)
    print(f"Shallow tree test accuracy: {stump_accuracy:.4f}")

# 3. Cross-validation
def cross_validation_test(model, X, y):
    """Run 5-fold cross-validation of ``model`` on ``X``/``y`` and print the results.

    Prints the per-fold scores returned by ``cross_val_score`` followed by
    their mean. Nothing is returned.
    """
    fold_scores = cross_val_score(model, X, y, cv=5)
    mean_score = fold_scores.mean()
    print(f"Cross-validation scores: {fold_scores}")
    print(f"Mean accuracy: {mean_score:.4f}")

# 4. Learning Curve
def plot_learning_curve(model, X, y, title="Learning Curve"):
    """Plot mean training and cross-validation scores against training-set size.

    ``learning_curve`` is evaluated with 5-fold CV at five training sizes spaced
    evenly from 10% to 100% of the available data, using all CPU cores
    (``n_jobs=-1``). Scores are averaged over the folds and drawn as two lines
    on a 6x4 inch figure, which is then shown. Nothing is returned.
    """
    size_fractions = np.linspace(0.1, 1.0, 5)
    train_sizes, train_scores, test_scores = learning_curve(
        model, X, y, cv=5, n_jobs=-1, train_sizes=size_fractions
    )

    # Average over the CV folds (axis 1) to get one value per training size.
    train_scores_mean = train_scores.mean(axis=1)
    test_scores_mean = test_scores.mean(axis=1)

    fig, ax = plt.subplots(figsize=(6, 4))
    ax.plot(train_sizes, train_scores_mean, 'o-', color="r", label="Training score")
    ax.plot(train_sizes, test_scores_mean, 'o-', color="g", label="Cross-validation score")
    ax.set_title(title)
    ax.set_xlabel("Training examples")
    ax.set_ylabel("Score")
    ax.legend(loc="best")
    ax.grid()
    plt.show()
