In [7]:
!pip install --upgrade pandas



In [1]:
#Imports

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

In [2]:
file_path = '../Data/Raw_data/application_train.csv'
df_train = pd.read_csv(file_path)

baseline_features = ['AMT_INCOME_TOTAL','AMT_CREDIT','AMT_ANNUITY','DAYS_BIRTH']

df_baseline = df_train.copy()

for col in baseline_features:
    df_baseline[col].fillna(df_baseline[col].median(), inplace=True)
df_baseline[baseline_features]

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_baseline[col].fillna(df_baseline[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_baseline[col].fillna(df_baseline[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate o

Unnamed: 0,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,DAYS_BIRTH
0,202500.0,406597.5,24700.5,-9461
1,270000.0,1293502.5,35698.5,-16765
2,67500.0,135000.0,6750.0,-19046
3,135000.0,312682.5,29686.5,-19005
4,121500.0,513000.0,21865.5,-19932
...,...,...,...,...
307506,157500.0,254700.0,27558.0,-9327
307507,72000.0,269550.0,12001.5,-20775
307508,153000.0,677664.0,29979.0,-14966
307509,171000.0,370107.0,20205.0,-11961


In [3]:
X = df_baseline[baseline_features]
y = df_baseline['TARGET']

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=10, stratify=y)

model = LogisticRegression(random_state=10)
model.fit(X_train, y_train)

In [4]:
val_preds = model.predict_proba(X_val)[:, 1]
baseline_auc = roc_auc_score(y_val, val_preds)

print(f"Baseline Model Validation AUC: {baseline_auc:.4f}")

Baseline Model Validation AUC: 0.5901


In [5]:
numeric_cols = df_train.select_dtypes(include=np.number).columns.tolist()
numeric_cols.remove('TARGET')
numeric_cols.remove('SK_ID_CURR')

X = df_train[numeric_cols]
y = df_train['TARGET']

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=10, stratify=y)

imputer = SimpleImputer(strategy='median')
X_train_imputed = imputer.fit_transform(X_train)
X_val_imputed = imputer.transform(X_val) 

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_imputed)
X_val_scaled = scaler.transform(X_val_imputed)

model = LogisticRegression(random_state=10)
model.fit(X_train_scaled, y_train)

val_preds = model.predict_proba(X_val_scaled)[:, 1]
numeric_auc = roc_auc_score(y_val, val_preds)

print(f"Baseline Model AUC: {baseline_auc:.4f}")
print(f"Model with all numeric features AUC: {numeric_auc:.4f}")

Baseline Model AUC: 0.5901
Model with all numeric features AUC: 0.7346


In [6]:
X = df_train.drop(columns=['TARGET', 'SK_ID_CURR'])
y = df_train['TARGET']

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=10, stratify=y)

numeric_cols = X_train.select_dtypes(include=np.number).columns.tolist()
categorical_cols = X_train.select_dtypes(include=['object']).columns.tolist()

X_train_num = X_train[numeric_cols].copy()
X_val_num = X_val[numeric_cols].copy()

imputer = SimpleImputer(strategy='median')
X_train_num_imputed = pd.DataFrame(imputer.fit_transform(X_train_num), columns=numeric_cols, index=X_train.index)
X_val_num_imputed = pd.DataFrame(imputer.transform(X_val_num), columns=numeric_cols, index=X_val.index)

scaler = StandardScaler()
X_train_num_scaled = pd.DataFrame(scaler.fit_transform(X_train_num_imputed), columns=numeric_cols, index=X_train.index)
X_val_num_scaled = pd.DataFrame(scaler.transform(X_val_num_imputed), columns=numeric_cols, index=X_val.index)

X_train_cat = X_train[categorical_cols].copy()
X_val_cat = X_val[categorical_cols].copy()

X_train_cat_encoded = pd.get_dummies(X_train_cat, handle_unknown='ignore')
X_val_cat_encoded = pd.get_dummies(X_val_cat, handle_unknown='ignore')

X_train_cat_final, X_val_cat_final = X_train_cat_encoded.align(X_val_cat_encoded, join='inner', axis=1)

X_train_final = pd.concat([X_train_num_scaled, X_train_cat_final], axis=1)
X_val_final = pd.concat([X_val_num_scaled, X_val_cat_final], axis=1)

model = LogisticRegression(random_state=42, C=0.01)
model.fit(X_train_final, y_train)

val_preds = model.predict_proba(X_val_final)[:, 1]
final_auc = roc_auc_score(y_val, val_preds)

print(f"Baseline Model AUC: {baseline_auc:.4f}")
print(f"Model with all numeric features AUC: {numeric_auc:.4f}")
print(f"Model with all features (numeric + categorical) AUC: {final_auc:.4f}")

TypeError: get_dummies() got an unexpected keyword argument 'handle_unknown'