In [2]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import roc_auc_score, classification_report, confusion_matrix,accuracy_score
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split,GridSearchCV,RandomizedSearchCV
from sklearn.feature_selection import SelectKBest, f_classif,chi2
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier

# Load the training table, then keep every column except the label
# ('target') and the row identifier ('id') as model features.
df = pd.read_csv('./train1.csv')
X_train = df.drop(columns=['target', 'id'])


In [3]:
# Label column for every row of `df`; it is split into train/test parts
# together with the features in the following cell.
y_train = df['target']

In [4]:
# Split from the untouched `df` instead of from `X_train` / `y_train`.
# The cell rebinds those two names, so feeding them back in as inputs would
# make a second execution of this cell split the already-reduced 70% again.
# Deriving the inputs from `df` gives the same result on a fresh run and
# keeps the cell idempotent.
features_all = df.drop(['target', 'id'], axis=1)
target_all = df['target']

# Stratify on the label so both parts keep the same class proportions;
# the fixed random_state makes the partition reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    features_all,
    target_all,
    test_size=0.3,
    random_state=29173,
    stratify=target_all,
)

In [5]:
# Sanity check: (rows, columns) of the held-out part created with test_size=0.3.
X_test.shape

(88863, 65)

In [6]:
# Disabled code: the loader for the separate 'test.csv' file is wrapped in a
# string literal so it does not execute. Because the literal is the last
# expression of the cell, the notebook echoes it back as the cell output.
"""df_test = pd.read_csv('test.csv')
X_test = df_test.drop('id',axis=1) 
test_ids = df_test['id']"""

"df_test = pd.read_csv('test.csv')\nX_test = df_test.drop('id',axis=1) \ntest_ids = df_test['id']"

In [7]:
# Column roles are encoded in the name suffix: '_cat' = categorical, '_bin' = binary flag.
categorical_cols = [name for name in X_train.columns if name.endswith('_cat')]
binary_cols = [name for name in X_train.columns if name.endswith('_bin')]

# Apply the same dtype casts to both splits.
# NOTE(review): astype(bool) converts NaN to True - confirm that the '_bin'
# columns contain no missing values, since they are not imputed later.
for split_frame in (X_train, X_test):
    split_frame[categorical_cols] = split_frame[categorical_cols].astype('category')
    split_frame[binary_cols] = split_frame[binary_cols].astype(bool)

In [8]:
def _fill_missing(frame, col, value):
    """Fill the missing entries of ``frame[col]`` with ``value``, writing the column back."""
    series = frame[col]
    if isinstance(series.dtype, pd.CategoricalDtype) and value not in series.cat.categories:
        # A categorical column rejects fill values outside its category set
        # (TypeError). The training mode may be unknown to a frame whose
        # categories were inferred independently, so register it first.
        series = series.cat.add_categories([value])
    # Assign the result back instead of `frame[col].fillna(..., inplace=True)`:
    # that chained in-place call works on a temporary object under pandas
    # Copy-on-Write and would leave `frame` unchanged.
    frame[col] = series.fillna(value)


def simple_imputer(X_train, X_val, categorical_columns=None, numerical_columns=None):
    """
    Impute missing values in train and validation/test sets using only training data statistics.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Training features. Every fill value is computed from this frame.
    X_val : pandas.DataFrame
        Validation/test features, filled with the values learned on ``X_train``.
    categorical_columns : list of str, optional
        Columns to impute with the training mode (most frequent value).
    numerical_columns : list of str, optional
        Columns to impute with the training mean.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        Imputed copies ``(X_train_imputed, X_val_imputed)``; the inputs are not modified.

    Notes
    -----
    A listed column is skipped when it is absent from ``X_train`` or when its
    training statistic cannot be computed (no non-missing values). A column
    missing from ``X_val`` only is filled in the training frame alone.
    """
    X_train_imputed = X_train.copy()
    X_val_imputed = X_val.copy()

    # Categorical columns: most frequent training value.
    for col in categorical_columns or []:
        if col not in X_train_imputed.columns:
            continue
        modes = X_train_imputed[col].mode(dropna=True)
        if modes.empty:
            # Column is entirely missing in training data: nothing to learn.
            continue
        mode_value = modes.iloc[0]
        _fill_missing(X_train_imputed, col, mode_value)
        if col in X_val_imputed.columns:
            _fill_missing(X_val_imputed, col, mode_value)

    # Numerical columns: training mean.
    for col in numerical_columns or []:
        if col not in X_train_imputed.columns:
            continue
        mean_value = X_train_imputed[col].mean(skipna=True)
        if pd.isna(mean_value):
            # Mean is undefined when the training column has no valid values.
            continue
        _fill_missing(X_train_imputed, col, mean_value)
        if col in X_val_imputed.columns:
            _fill_missing(X_val_imputed, col, mean_value)

    return X_train_imputed, X_val_imputed


In [9]:
# Everything that is neither categorical ('_cat') nor binary ('_bin') is treated as numeric.
non_numeric_suffixes = ('_cat', '_bin')
num_cols = [name for name in X_train.columns if not name.endswith(non_numeric_suffixes)]

In [10]:
# Fill missing values in both splits with statistics learned on the training split only.
X_train_imputed, X_test_imputed = simple_imputer(
    X_train,
    X_test,
    categorical_columns=categorical_cols,
    numerical_columns=num_cols,
)

In [11]:
# Hard-coded feature subset used for modelling.
# NOTE(review): the name suggests the list came from an ANOVA / chi-square
# selection step that is not part of this notebook - confirm its provenance.
selected_features_anova_chi = [
    # numeric features
    'ps_car_13', 'ps_reg_02', 'ps_car_12', 'feature4', 'ps_reg_03',
    'feature2', 'ps_car_15', 'ps_ind_15', 'ps_reg_01', 'ps_ind_01',
    'feature5', 'ps_car_14', 'feature7', 'ps_ind_03', 'ps_calc_01',
    # categorical features
    'ps_car_04_cat', 'ps_ind_05_cat', 'ps_car_11_cat', 'ps_car_06_cat',
    'ps_car_01_cat', 'ps_car_02_cat', 'ps_ind_04_cat', 'ps_car_08_cat',
    'ps_car_09_cat', 'ps_car_05_cat',
    # binary features
    'ps_ind_17_bin', 'ps_ind_07_bin', 'ps_ind_06_bin', 'ps_ind_16_bin',
    'ps_ind_08_bin', 'ps_ind_09_bin', 'ps_ind_12_bin',
]


In [12]:
# Restrict both imputed splits to the chosen feature subset (same column order in each).
X_train_selected_features = X_train_imputed.loc[:, selected_features_anova_chi]
X_test_selected_features = X_test_imputed.loc[:, selected_features_anova_chi]

In [13]:
# Categorical columns that are one-hot encoded in the following cells.
significant_cat_features = [
    'ps_car_04_cat',
    'ps_ind_05_cat',
    'ps_car_11_cat',
    'ps_car_06_cat',
    'ps_car_01_cat',
    'ps_car_02_cat',
    'ps_ind_04_cat',
    'ps_car_08_cat',
    'ps_car_09_cat',
    'ps_car_05_cat',
]

In [14]:
# One-hot encode the categorical features of the training split.
# drop_first=False keeps an indicator column for every level.
X_train_encoded = pd.get_dummies(
    data=X_train_selected_features,
    columns=significant_cat_features,
    drop_first=False,
)
print("Encoded DataFrame shape:", X_train_encoded.shape)

Encoded DataFrame shape: (207346, 186)


In [15]:
# One-hot encode the test split, then force it onto the training column layout.
# The two splits are encoded independently, so a category level present in only
# one of them would otherwise produce a different set (or order) of indicator
# columns and break the scaler / PCA fitted on the training columns.
# reindex() adds all-zero columns for levels unseen in the test split and drops
# indicator columns for levels the training split never contained.
X_test_encoded = pd.get_dummies(
    X_test_selected_features,
    columns=significant_cat_features,
    drop_first=False,
).reindex(columns=X_train_encoded.columns, fill_value=0)
print("Encoded DataFrame shape:", X_test_encoded.shape)

Encoded DataFrame shape: (88863, 186)


In [16]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Standardise with statistics learned on the training split, then reuse them for test.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_encoded)
X_test_scaled = scaler.transform(X_test_encoded)

# A fractional n_components asks PCA to keep as many components as are needed
# to explain that share (here 90%) of the variance.
pca = PCA(n_components=.9)
X_train_pca = pca.fit_transform(X_train_scaled)
X_val_pca = pca.transform(X_test_scaled)

# Wrap the projected arrays in DataFrames with PC1..PCn column labels.
component_labels = [f'PC{i+1}' for i in range(pca.n_components_)]
X_train_pca_df = pd.DataFrame(X_train_pca, columns=component_labels)
X_test_pca_df = pd.DataFrame(X_val_pca, columns=component_labels)


In [17]:
from sklearn.neighbors import KNeighborsClassifier

# Baseline: k-NN with default hyper-parameters, trained on the PCA features.
knn_baseline = KNeighborsClassifier()
knn_baseline.fit(X_train_pca_df, y_train)

# Hard class predictions on the held-out split.
y_pred = knn_baseline.predict(X_test_pca_df)

baseline_accuracy = accuracy_score(y_test, y_pred)
baseline_report = classification_report(y_test, y_pred)
print("Baseline Accuracy:", baseline_accuracy)
print(baseline_report)

Baseline Accuracy: 0.9474809538278024
              precision    recall  f1-score   support

           0       0.95      1.00      0.97     84307
           1       0.07      0.00      0.00      4556

    accuracy                           0.95     88863
   macro avg       0.51      0.50      0.49     88863
weighted avg       0.90      0.95      0.92     88863



In [18]:
# Score the baseline by ranking quality: take the predicted probability of the
# class stored at index 1 of predict_proba's output and compute the ROC AUC.
baseline_class_proba = knn_baseline.predict_proba(X_test_pca_df)
y_pred_proba = baseline_class_proba[:, 1]
print(f"AUROC: {roc_auc_score(y_test, y_pred_proba):.4f}")

AUROC: 0.5211


In [20]:
# Search space: 7 neighbourhood sizes x 2 weighting schemes x 2 distance metrics.
param_grid = dict(
    n_neighbors=[3, 5, 7, 9, 11, 15, 21],
    weights=['uniform', 'distance'],
    metric=['euclidean', 'manhattan'],
)

# Exhaustive 5-fold cross-validated search on the training split, ranked by ROC AUC.
grid_search = GridSearchCV(
    knn_baseline,
    param_grid,
    scoring='roc_auc',
    cv=5,
    n_jobs=-1,
    verbose=2,
)
grid_search.fit(X_train_pca_df, y_train)

print("Best Parameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)

Fitting 5 folds for each of 28 candidates, totalling 140 fits
Best Parameters: {'metric': 'euclidean', 'n_neighbors': 21, 'weights': 'distance'}
Best Score: 0.5497876271742241


In [19]:
# Final k-NN model. The hyper-parameters are written out by hand and match the
# best combination printed by the grid-search cell.
tuned_knn_params = {
    'n_neighbors': 21,
    'weights': 'distance',
    'metric': 'euclidean',
}
knn = KNeighborsClassifier(n_jobs=-1, **tuned_knn_params)

knn.fit(X_train_pca_df, y_train)




0,1,2
,n_neighbors,21
,weights,'distance'
,algorithm,'auto'
,leaf_size,30
,p,2
,metric,'euclidean'
,metric_params,
,n_jobs,-1


In [20]:
# ROC AUC of the tuned model on the held-out split, using the predicted
# probability of the class stored at index 1 of predict_proba's output.
tuned_class_proba = knn.predict_proba(X_test_pca_df)
y_pred_proba = tuned_class_proba[:, 1]
print(f"AUROC: {roc_auc_score(y_test, y_pred_proba):.4f}")

AUROC: 0.5462
