In [67]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import roc_auc_score, classification_report, confusion_matrix,accuracy_score
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split,GridSearchCV,RandomizedSearchCV
from sklearn.feature_selection import SelectKBest, f_classif,chi2
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier

# Raw training table; the `target` and `id` columns are removed so that only
# feature columns remain in the matrix.
df = pd.read_csv('./train1.csv')
X_train = df.drop(columns=['target', 'id'])


In [68]:
# Labels: the `target` column of the raw frame.
y_train = df['target']

In [48]:
# Split straight from the raw frame so this cell is idempotent: feeding
# `X_train` / `y_train` back into the very split that rebinds them would
# re-split an already reduced training set on every re-run of the cell.
features_all = df.drop(['target', 'id'], axis=1)
labels_all = df['target']
X_train, X_test, y_train, y_test = train_test_split(
    features_all,
    labels_all,
    test_size=0.3,        # 30% hold-out
    random_state=29173,
    stratify=labels_all,  # keep the class ratio identical in both splits
)

In [49]:
# Sanity check: (rows, columns) of the hold-out feature matrix.
X_test.shape

(88863, 65)

In [None]:
# Disabled cell: the code below is wrapped in a string literal, so none of it
# runs and `df_test` / `test_ids` are not created here.
"""df_test = pd.read_csv('test.csv')
X_test = df_test.drop('id',axis=1) 
test_ids = df_test['id']"""

"df_test = pd.read_csv('test.csv')\nX_test = df_test.drop('id',axis=1) "

In [51]:
# Column roles are read from the name suffix: *_cat -> categorical, *_bin -> boolean flag.
categorical_cols = [name for name in X_train.columns if name.endswith('_cat')]
binary_cols = [name for name in X_train.columns if name.endswith('_bin')]

# Apply the same dtype casts to the training and hold-out frames.
for frame in (X_train, X_test):
    frame[categorical_cols] = frame[categorical_cols].astype('category')
    frame[binary_cols] = frame[binary_cols].astype(bool)

In [52]:
# Number of columns whose name ends in `_cat`.
print(len(categorical_cols))

14


In [53]:
def _fill_missing(series, fill_value):
    """Return `series` with its missing entries replaced by `fill_value`."""
    # A categorical column rejects values outside its category set, so register
    # the fill value first (the validation column may never have contained it).
    if isinstance(series.dtype, pd.CategoricalDtype) and fill_value not in series.cat.categories:
        series = series.cat.add_categories([fill_value])
    return series.fillna(fill_value)


def simple_imputer(X_train, X_val, categorical_columns=None, numerical_columns=None):
    """
    Impute missing values in train and validation/test sets using only training data statistics.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Frame the fill values are computed from. It is not modified.
    X_val : pandas.DataFrame
        Validation/test frame filled with the training fill values. It is not modified.
    categorical_columns : list of str, optional
        Columns imputed with the most frequent training value (mode).
    numerical_columns : list of str, optional
        Columns imputed with the training mean.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        Imputed copies of `X_train` and `X_val`, in that order.

    Notes
    -----
    Columns missing from `X_train` are skipped, and columns missing from
    `X_val` are filled in the training copy only. A column whose training
    values are all missing has no mode/mean and is left untouched.
    """
    X_train_imputed = X_train.copy()
    X_val_imputed = X_val.copy()

    def _impute(col, fill_value):
        # Assign the filled column back instead of calling
        # `frame[col].fillna(..., inplace=True)`: that chained in-place form acts
        # on a temporary object under pandas Copy-on-Write and changes nothing.
        X_train_imputed[col] = _fill_missing(X_train_imputed[col], fill_value)
        if col in X_val_imputed.columns:
            X_val_imputed[col] = _fill_missing(X_val_imputed[col], fill_value)

    # Categorical columns: most frequent training value.
    for col in categorical_columns or []:
        if col not in X_train_imputed.columns:
            continue
        modes = X_train_imputed[col].mode(dropna=True)
        if not modes.empty:
            # Several values can tie for the mode; take the first one returned.
            _impute(col, modes.iloc[0])

    # Numerical columns: training mean.
    for col in numerical_columns or []:
        if col not in X_train_imputed.columns:
            continue
        mean_value = X_train_imputed[col].mean(skipna=True)
        if pd.notna(mean_value):  # an all-missing column has no usable mean
            _impute(col, mean_value)

    return X_train_imputed, X_val_imputed


In [54]:
# Every column without a `_cat` or `_bin` suffix is treated as numerical.
num_cols = [col for col in X_train.columns if not col.endswith(('_cat', '_bin'))]

In [55]:
# Fill values (mode / mean) are computed from the training split and reused for the hold-out split.
X_train_imputed,X_test_imputed = simple_imputer(X_train, X_test, categorical_cols,num_cols)

In [58]:
# Spot check: count of missing values left in one numeric training column.
X_train_imputed['ps_car_12'].isnull().sum()

np.int64(0)

In [59]:
# Remaining NaN counts after imputation, restricted to the hold-out columns
# that had at least one missing value before imputation.
X_test_imputed.isna().sum()[X_test.isna().sum() > 0]


ps_ind_02_cat    0
ps_ind_04_cat    0
ps_ind_05_cat    0
ps_car_01_cat    0
ps_car_03_cat    0
ps_car_05_cat    0
ps_car_07_cat    0
ps_car_09_cat    0
ps_reg_03        0
ps_car_11        0
ps_car_12        0
ps_car_14        0
feature4         0
dtype: int64

In [61]:
# Baseline: AdaBoost over depth-1 decision trees (weak learners), scored by
# AUROC on the hold-out split.
baseline_model = AdaBoostClassifier(
    estimator=DecisionTreeClassifier(max_depth=1),
    n_estimators=50,
    learning_rate=1.0,
    random_state=29173,
).fit(X_train_imputed, y_train)

# Column 1 of predict_proba holds the probabilities for the positive class.
y_pred_proba = baseline_model.predict_proba(X_test_imputed)[:, 1]
roc_auc = roc_auc_score(y_test, y_pred_proba)

print("Baseline AUROC:", roc_auc)

Baseline AUROC: 0.6247495565586924


In [64]:
# Search space for tuning AdaBoost. Keys prefixed with `estimator__` address
# parameters of the wrapped base estimator rather than of AdaBoost itself.
param_grid = dict(
    n_estimators=[100, 200],
    learning_rate=[0.01, 0.05, 0.1, 0.5, 1.0],
    estimator__max_depth=[1, 2],
    estimator__min_samples_split=[2, 5, 10],
    estimator__min_samples_leaf=[1, 2, 5],
)


In [66]:

# Exhaustive search over `param_grid` for AdaBoost, scored by AUROC with
# 3-fold cross-validation on the imputed training split.
base_estimator = DecisionTreeClassifier(random_state=29173)
ada = AdaBoostClassifier(estimator=base_estimator, random_state=29173)

grid_search = GridSearchCV(
    estimator=ada,
    param_grid=param_grid,
    cv=3,
    scoring='roc_auc',
    n_jobs=-1,   # use every available core
    verbose=2,
).fit(X_train_imputed, y_train)

print("Best parameters:", grid_search.best_params_)
print("Best roc_auc score:", grid_search.best_score_)


Fitting 3 folds for each of 180 candidates, totalling 540 fits
Best parameters: {'estimator__max_depth': 2, 'estimator__min_samples_leaf': 1, 'estimator__min_samples_split': 2, 'learning_rate': 0.5, 'n_estimators': 200}
Best roc_auc score: 0.6254187961129053


In [None]:
# One-hot encode the imputed training frame: the raw `X_train` may still hold
# NaNs in its numeric columns, and the PCA step further down rejects NaN input.
X_train_encoded = pd.get_dummies(X_train_imputed, columns=categorical_cols, drop_first=False)
print("Encoded DataFrame shape:", X_train_encoded.shape)

In [None]:
# Encode the imputed hold-out frame (the raw `X_test` still contains NaNs), then
# force the training column layout: a category level present in only one split
# would otherwise give the two frames different dummy columns and break the
# scaler/PCA transforms that are fitted on the training frame.
X_test_encoded = pd.get_dummies(
    X_test_imputed, columns=categorical_cols, drop_first=False
).reindex(columns=X_train_encoded.columns, fill_value=0)
print("Encoded DataFrame shape:", X_test_encoded.shape)

In [None]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Standardise with statistics fitted on the training frame only, then apply the
# same transform to the hold-out frame.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_encoded)
X_test_scaled = scaler.transform(X_test_encoded)

# A fractional n_components asks PCA to keep as many components as are needed
# to explain that share (here 80%) of the training variance.
pca = PCA(n_components=.8)
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

# Carry the source row index over so the component frames stay aligned with
# `y_train` / `y_test`; a default RangeIndex would not match their row labels.
pc_names = [f'PC{i+1}' for i in range(pca.n_components_)]
X_train_pca_df = pd.DataFrame(X_train_pca, columns=pc_names, index=X_train_encoded.index)
X_test_pca_df = pd.DataFrame(X_test_pca, columns=pc_names, index=X_test_encoded.index)


In [None]:

# Shallow decision tree on the PCA components, seeded like the other models in
# this notebook so that reruns give the same tree.
dtPCA = DecisionTreeClassifier(
    class_weight=None,
    criterion='gini',
    max_depth=5,
    min_samples_leaf=5,
    min_samples_split=2,
    random_state=29173,
)
dtPCA.fit(X_train_pca_df, y_train)
y_pred_proba_pca = dtPCA.predict_proba(X_test_pca_df)[:, 1]
print("PCA tree hold-out AUROC:", roc_auc_score(y_test, y_pred_proba_pca))

# saving the results
# `X_test_pca_df` is derived from the hold-out split of train1.csv and no
# executed cell defines `test_ids`, so the ids of those same rows are looked up
# in `df` through the hold-out index.
# NOTE(review): the file therefore holds hold-out predictions; a real submission
# needs test.csv pushed through the same impute / encode / scale / PCA steps.
holdout_ids = df.loc[X_test.index, 'id'].to_numpy()
submission = pd.DataFrame({'id': holdout_ids, 'target': y_pred_proba_pca})
submission.to_csv('submission_pca.csv', index=False)


In [None]:
# Disabled cell: the decision-tree grid search below is wrapped in a string
# literal, so none of it runs.
# NOTE(review): it references `dt`, which no visible cell defines — define the
# estimator before re-enabling this search.
""" param_grid = { 
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 5, 10, 15, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 5],
    'class_weight': [None, 'balanced']
}
grid_search = GridSearchCV(
    estimator=dt,
    param_grid=param_grid,
    scoring='roc_auc',
    cv=5,               
    n_jobs=-1,
    verbose=2
)

grid_search.fit(X_train_pca_df, y_train)
print("Best Parameters:", grid_search.best_params_)
print("Best AUROC (CV):", grid_search.best_score_)  """



In [None]:
# Disabled cell: the randomized decision-tree search below (20 sampled
# candidates, 5-fold CV) is wrapped in a string literal, so none of it runs.
# NOTE(review): it references `dt`, which no visible cell defines — define the
# estimator before re-enabling this search.
""" param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 5, 10, 15, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 5],
    'class_weight': [None, 'balanced']
}
grid_search = RandomizedSearchCV(
    estimator=dt,
    param_distributions=param_grid,
    scoring='roc_auc',
    cv=5,
    n_jobs=-1,
    verbose=2,
    n_iter=20, 
    random_state=29173
)

grid_search.fit(X_train_pca_df, y_train)
print("Best Parameters:", grid_search.best_params_)
print("Best AUROC (CV):", grid_search.best_score_)
"""