In [None]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import roc_auc_score, classification_report, confusion_matrix,accuracy_score
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split,GridSearchCV,RandomizedSearchCV
from sklearn.feature_selection import SelectKBest, f_classif,chi2
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier

df = pd.read_csv('./train1.csv')
X_train = df.drop(['target','id'], axis=1)


In [51]:
y_train = df['target']

In [40]:
df_test = pd.read_csv('test.csv')
X_test = df_test.drop('id',axis=1)

In [41]:
categorical_cols = [col for col in X_train.columns if col.endswith('_cat')]
X_train[categorical_cols] = X_train[categorical_cols].astype('category')
X_test[categorical_cols] = X_test[categorical_cols].astype('category')
binary_cols = [col for col in X_train.columns if col.endswith('_bin')]
X_train[binary_cols] = X_train[binary_cols].astype(bool)
X_test[binary_cols] = X_test[binary_cols].astype(bool)

In [42]:
print(len(categorical_cols))

14


In [43]:
def simple_imputer(X_train, X_val, categorical_columns=None, numerical_columns=None):
    """
    Imputation using only training data statistics.
    No target variable needed - works for test data!
    """
    X_train_imputed = X_train.copy()
    X_val_imputed = X_val.copy()
    
    # Categorical: use mode from training set
    if categorical_columns:
        for col in categorical_columns:
            if X_train_imputed[col].isnull().sum() > 0:
                mode_value = X_train_imputed[col].mode()[0]
                X_train_imputed[col].fillna(mode_value, inplace=True)
                X_val_imputed[col].fillna(mode_value, inplace=True)
    
    # Numerical: use mean from training set
    if numerical_columns:
        for col in numerical_columns:
            if X_train_imputed[col].isnull().sum() > 0:
                mean_value = X_train_imputed[col].mean()
                X_train_imputed[col].fillna(mean_value, inplace=True)
                X_val_imputed[col].fillna(mean_value, inplace=True)
    
    return X_train_imputed, X_val_imputed

In [44]:
num_cols = [col for col in X.columns if not col.endswith(('_cat', '_bin'))]

In [45]:
X_train,X_test = simple_imputer(X_train, X_test, categorical_cols,num_cols)

In [49]:
X_train.shape

(296209, 65)

In [54]:
dt = DecisionTreeClassifier(criterion = 'gini', max_depth= 10, min_samples_leaf= 1, min_samples_split= 10)

In [56]:
dt.fit(X_train,y_train)
y_pred = dt.predict(X_train)
print("="*60)
print("DECISION TREE RESULTS")
print("="*60)
print(f"Accuracy: {accuracy_score(y_train, y_pred):.4f}")
print("\nConfusion Matrix:")
print(confusion_matrix(y_train, y_pred))
print("\nClassification Report:")
print(classification_report(y_train, y_pred))
y_pred_proba = dt.predict_proba(X_train)[:, 1]
roc_auc = roc_auc_score(y_train, y_pred_proba)
print(f"Validation AUROC: {roc_auc:.4f}")

DECISION TREE RESULTS
Accuracy: 0.9499

Confusion Matrix:
[[280911    112]
 [ 14716    470]]

Classification Report:
              precision    recall  f1-score   support

           0       0.95      1.00      0.97    281023
           1       0.81      0.03      0.06     15186

    accuracy                           0.95    296209
   macro avg       0.88      0.52      0.52    296209
weighted avg       0.94      0.95      0.93    296209

Validation AUROC: 0.6382


In [58]:
y_pred_proba = dt.predict_proba(X_test)[:, 1]


In [57]:
test_ids = df_test['id']

In [59]:
submission = pd.DataFrame({'id': test_ids, 'target': y_pred_proba})
submission.to_csv('submission.csv', index=False)

In [60]:
# X_num: only numerical columns
X_num = X_train[num_cols]

# Apply ANOVA F-test
selector = SelectKBest(score_func=f_classif, k='all')  # 'all' gives scores for all features
selector.fit(X_num, y_train)

# Get scores and p-values
anova_scores = pd.DataFrame({
    'Feature': X_num.columns,
    'F_score': selector.scores_,
    'p_value': selector.pvalues_
}).sort_values(by='F_score', ascending=False)

significant_num_features = anova_scores[anova_scores['p_value'] <= 0.01]['Feature'].tolist()

print(f"Significant num features (p <= 0.01):")
print(significant_num_features)

Significant num features (p <= 0.01):
['ps_car_13', 'ps_car_12', 'ps_reg_02', 'ps_reg_03', 'feature4', 'feature2', 'ps_car_15', 'ps_reg_01', 'ps_ind_15', 'ps_ind_01', 'feature5', 'ps_car_14', 'feature7', 'ps_ind_03', 'ps_ind_14']


  f = msb / msw


In [61]:
# X_cat: select only categorical
X_cat = X_train[categorical_cols].copy()


# Apply Chi-Square test
chi_selector = SelectKBest(score_func=chi2, k='all')
chi_selector.fit(X_cat, y_train)

# Create a DataFrame with scores and p-values
chi_scores_cat = pd.DataFrame({
    'Feature': X_cat.columns,
    'Chi2_score': chi_selector.scores_,
    'p_value': chi_selector.pvalues_
}).sort_values(by='Chi2_score', ascending=False)

significant_cat_features = chi_scores_cat[chi_scores_cat['p_value'] <= 0.01]['Feature'].tolist()

print(f"Significant cat features (p <= 0.01):")
print(significant_cat_features)


Significant cat features (p <= 0.01):
['ps_car_04_cat', 'ps_ind_05_cat', 'ps_car_11_cat', 'ps_car_06_cat', 'ps_car_01_cat', 'ps_car_02_cat', 'ps_ind_04_cat', 'ps_car_08_cat', 'ps_car_05_cat', 'ps_car_09_cat']


In [41]:
print(len(significant_cat_features))

10


In [62]:
# X_bin: select only binary columns
X_bin = X_train[binary_cols].copy()


# Apply Chi-Square test
chi_selector = SelectKBest(score_func=chi2, k='all')
chi_selector.fit(X_bin, y_train)

# Create a DataFrame with scores and p-values
chi_scores_bin = pd.DataFrame({
    'Feature': X_bin.columns,
    'Chi2_score': chi_selector.scores_,
    'p_value': chi_selector.pvalues_
}).sort_values(by='Chi2_score', ascending=False)

significant_bin_features = chi_scores_bin[chi_scores_bin['p_value'] <= 0.01]['Feature'].tolist()

print(f"Significant binary features (p <= 0.01):")
print(significant_bin_features)


Significant binary features (p <= 0.01):
['ps_ind_17_bin', 'ps_ind_07_bin', 'ps_ind_06_bin', 'ps_ind_16_bin', 'ps_ind_08_bin', 'ps_ind_09_bin', 'ps_ind_12_bin', 'ps_ind_18_bin']


In [44]:
print(len(significant_bin_features))

7


In [16]:
top_n = 17  # choose top features
top_features_bin = chi_scores_bin.head(top_n)['Feature'].tolist()

print(f"Top {top_n} categorical/binary features based on Chi-Square test:")
print(top_features_bin)

Top 17 categorical/binary features based on Chi-Square test:
['ps_ind_17_bin', 'ps_ind_07_bin', 'ps_ind_06_bin', 'ps_ind_16_bin', 'ps_ind_08_bin', 'ps_ind_09_bin', 'ps_ind_12_bin', 'ps_ind_18_bin', 'ps_calc_18_bin', 'ps_calc_20_bin', 'ps_ind_10_bin', 'ps_calc_19_bin', 'ps_calc_16_bin', 'ps_ind_11_bin', 'ps_ind_13_bin', 'ps_calc_15_bin', 'ps_calc_17_bin']


In [63]:
# Combine numerical, categorical, and binary top features
selected_features = significant_num_features + significant_cat_features + significant_bin_features
print(f"Total selected features: {len(selected_features)}")


Total selected features: 33


In [64]:
dt.fit(X_train[selected_features], y_train)
y_pred_proba_chi_anova = dt.predict_proba(X_test[selected_features])[:, 1]


submission = pd.DataFrame({'id': test_ids, 'target': y_pred_proba_chi_anova})
submission.to_csv('submission_chi_anova.csv', index=False)

In [65]:
param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 5, 10, 15, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 5],
    'class_weight': [None, 'balanced']  
}
grid_search = GridSearchCV(
    estimator=dt,
    param_grid=param_grid,
    scoring='roc_auc',
    cv=5,          
    n_jobs=-1,
    verbose=2
)

grid_search.fit(X_train[selected_features], y_train)
print("Best Parameters:", grid_search.best_params_)
print("Best AUROC (CV):", grid_search.best_score_)




Fitting 5 folds for each of 180 candidates, totalling 900 fits
Best Parameters: {'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': 5, 'min_samples_leaf': 1, 'min_samples_split': 2}
Best AUROC (CV): 0.6019831587120853


In [67]:
# Evaluate on test
best_dt = grid_search.best_estimator_
y_pred_proba_best_chi_anova = best_dt.predict_proba(X_test[selected_features])[:, 1]
submission = pd.DataFrame({'id': test_ids, 'target': y_pred_proba_best_chi_anova})
submission.to_csv('submission_best_chi_anova.csv', index=False)

In [None]:
#wrapper Backward
dt = DecisionTreeClassifier(
    class_weight=None, 
    criterion='gini', 
    max_depth=5, 
    min_samples_leaf=5, 
    min_samples_split=2,
    random_state=29173
)
dt.fit(X_train[['ps_ind_05_cat', 'ps_car_04_cat', 'ps_ind_17_bin', 'ps_reg_03', 'ps_car_14', 'ps_car_15']], y_train)

y_pred_proba_backward_wrapper = dt.predict_proba(X_test[['ps_ind_05_cat', 'ps_car_04_cat', 'ps_ind_17_bin', 'ps_reg_03', 'ps_car_14', 'ps_car_15']])[:, 1]
submission = pd.DataFrame({'id': test_ids, 'target': y_pred_proba_backward_wrapper})
submission.to_csv('submission_backward_wrapper.csv', index=False)



In [72]:
forward_cols = ['ps_car_13', 'ps_ind_17_bin', 'feature4', 'ps_ind_15', 'ps_car_04_cat','ps_reg_02', 'ps_calc_20_bin']
dt.fit(X_train[forward_cols], y_train)

y_pred_proba_forward_wrapper = dt.predict_proba(X_test[forward_cols])[:, 1]
submission = pd.DataFrame({'id': test_ids, 'target': y_pred_proba_forward_wrapper})
submission.to_csv('submission_forward_wrapper.csv', index=False)


In [80]:
X_train_encoded = pd.get_dummies(X_train, columns=categorical_cols, drop_first=False)
print("Encoded DataFrame shape:", X_train_encoded.shape)

Encoded DataFrame shape: (296209, 226)


In [81]:
X_test_encoded = pd.get_dummies(X_test, columns=categorical_cols, drop_first=False)
print("Encoded DataFrame shape:", X_test_encoded.shape)

Encoded DataFrame shape: (126948, 226)


In [86]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_encoded)
X_test_scaled = scaler.transform(X_test_encoded)
pca = PCA(n_components=.8)

X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)



X_train_pca_df = pd.DataFrame(X_train_pca, columns=[f'PC{i+1}' for i in range(pca.n_components_)])
X_test_pca_df = pd.DataFrame(X_test_pca, columns=[f'PC{i+1}' for i in range(pca.n_components_)])


In [87]:

dtPCA = DecisionTreeClassifier(class_weight= None, criterion= 'gini', max_depth= 5, min_samples_leaf= 5, min_samples_split= 2)
dtPCA.fit(X_train_pca_df, y_train)
y_pred_proba_pca = dtPCA.predict_proba(X_test_pca_df)[:, 1]

# saving the results
submission = pd.DataFrame({'id': test_ids, 'target': y_pred_proba_pca})
submission.to_csv('submission_pca.csv', index=False)


In [None]:
""" param_grid = { 
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 5, 10, 15, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 5],
    'class_weight': [None, 'balanced']
}
grid_search = GridSearchCV(
    estimator=dt,
    param_grid=param_grid,
    scoring='roc_auc',
    cv=5,               
    n_jobs=-1,
    verbose=2
)

grid_search.fit(X_train_pca_df, y_train)
print("Best Parameters:", grid_search.best_params_)
print("Best AUROC (CV):", grid_search.best_score_)  """



SyntaxError: closing parenthesis ')' does not match opening parenthesis '{' (3119908351.py, line 1)

In [None]:
""" param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 5, 10, 15, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 5],
    'class_weight': [None, 'balanced']
}
grid_search = RandomizedSearchCV(
    estimator=dt,
    param_distributions=param_grid,
    scoring='roc_auc',
    cv=5,
    n_jobs=-1,
    verbose=2,
    n_iter=20, 
    random_state=29173
)

grid_search.fit(X_train_pca_df, y_train)
print("Best Parameters:", grid_search.best_params_)
print("Best AUROC (CV):", grid_search.best_score_)
"""

Fitting 5 folds for each of 20 candidates, totalling 100 fits
Best Parameters: {'min_samples_split': 10, 'min_samples_leaf': 1, 'max_depth': 5, 'criterion': 'entropy', 'class_weight': None}
Best AUROC (CV): 0.5916067936536942


Validation AUROC: 0.5924
