In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

In [None]:
train_data = pd.read_csv('data/train_final.csv')
test_data = pd.read_csv('data/test_final.csv')
X = train_data.drop(columns=['income>50K']) 
y = train_data['income>50K']

# Convert the target variable to 0/1 format
y = y.map({1: 1, 0: 0})

FileNotFoundError: [Errno 2] No such file or directory: 'data/train_final.csv'

In [None]:
train_data.isna().sum()

age               0
workclass         0
fnlwgt            0
education         0
education.num     0
marital.status    0
occupation        0
relationship      0
race              0
sex               0
capital.gain      0
capital.loss      0
hours.per.week    0
native.country    0
income>50K        0
dtype: int64

In [None]:
categorical_features = ['workclass', 'education', 'marital.status', 'occupation', 'relationship', 'race', 'sex', 'native.country']
continuous_features = ['age', 'fnlwgt', 'education.num', 'capital.gain', 'capital.loss', 'hours.per.week']

# Data preprocessing: imputing missing values ​​and standardizing/encoding features
preprocessor = ColumnTransformer(
    transformers=[
        ('num', Pipeline([
            ('imputer', SimpleImputer(strategy='median')), 
            ('scaler', StandardScaler()) 
        ]), continuous_features),
        
        ('cat', Pipeline([
            ('imputer', SimpleImputer(strategy='most_frequent')), 
            ('encoder', OneHotEncoder(handle_unknown='ignore'))
        ]), categorical_features)
    ])

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
model_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(n_estimators=100, random_state=42))
])
param_grid = {
    'classifier__n_estimators': [50, 100, 200],         
    'classifier__max_depth': [10, 20, 30, None],           
    'classifier__min_samples_split': [2, 5, 10],        
    'classifier__min_samples_leaf': [1, 2, 4],          
}
grid_search = GridSearchCV(
    estimator=model_pipeline,         
    param_grid=param_grid,            
    cv=5,                              
    scoring='roc_auc',              
    verbose=1,                        
    n_jobs=-1                          
)
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 108 candidates, totalling 540 fits


In [None]:
print("Best hyperparameters:", grid_search.best_params_)

print("Best AUC score:", grid_search.best_score_)

Best hyperparameters: {'classifier__max_depth': 20, 'classifier__min_samples_leaf': 2, 'classifier__min_samples_split': 2, 'classifier__n_estimators': 200}
Best AUC score: 0.9168334854538862


In [None]:
best_model = grid_search.best_estimator_
y_pred_proba = best_model.predict_proba(X_test)[:, 1]

auc_score = roc_auc_score(y_test, y_pred_proba)
print(f'Test AUC Score: {auc_score}')

Test AUC Score: 0.9137392661982826


In [None]:
X_test_submission = test_data[categorical_features + continuous_features]

test_predictions_proba = best_model.predict_proba(X_test_submission)[:, 1]

submission = pd.DataFrame({
    'ID': test_data['ID'], 
    'Prediction': test_predictions_proba
})

submission.to_csv('_submission.csv', index=False)


SVC

In [None]:
from sklearn.svm import SVC

model_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', SVC(probability=True, random_state=42))
])

param_grid = {
    'classifier__C': [0.1, 1, 10],
    'classifier__kernel': ['linear', 'rbf']
}

grid_search = GridSearchCV(
    estimator=model_pipeline,
    param_grid=param_grid,
    cv=5,
    scoring='roc_auc',
    verbose=1,
    n_jobs=-1
)

grid_search.fit(X_train, y_train)

print("Best hyperparameters:", grid_search.best_params_)
print("Best AUC score:", grid_search.best_score_)

best_model = grid_search.best_estimator_
y_pred_proba = best_model.predict_proba(X_test)[:, 1]
auc_score = roc_auc_score(y_test, y_pred_proba)
print(f'Test AUC Score: {auc_score}')

Fitting 5 folds for each of 6 candidates, totalling 30 fits
Best hyperparameters: {'classifier__C': 0.1, 'classifier__kernel': 'linear'}
Best AUC score: 0.9044543994139049
Test AUC Score: 0.9029101638649588


In [None]:
from sklearn.neighbors import KNeighborsClassifier

model_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', KNeighborsClassifier())
])

param_grid = {
    'classifier__n_neighbors': [3, 5, 7],
    'classifier__weights': ['uniform', 'distance']
}

grid_search = GridSearchCV(
    estimator=model_pipeline,
    param_grid=param_grid,
    cv=5,
    scoring='roc_auc',
    verbose=1,
    n_jobs=-1
)

grid_search.fit(X_train, y_train)

print("Best hyperparameters:", grid_search.best_params_)
print("Best AUC score:", grid_search.best_score_)

best_model = grid_search.best_estimator_
y_pred_proba = best_model.predict_proba(X_test)[:, 1]
auc_score = roc_auc_score(y_test, y_pred_proba)
print(f'Test AUC Score: {auc_score}')

Fitting 5 folds for each of 6 candidates, totalling 30 fits
Best hyperparameters: {'classifier__n_neighbors': 7, 'classifier__weights': 'uniform'}
Best AUC score: 0.8678527675284607
Test AUC Score: 0.8734748645168238


In [None]:
from sklearn.tree import DecisionTreeClassifier

model_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', DecisionTreeClassifier(random_state=42))
])

param_grid = {
    'classifier__max_depth': [5, 10, 20],
    'classifier__min_samples_split': [2, 5, 10],
    'classifier__min_samples_leaf': [1, 2, 4]
}


grid_search = GridSearchCV(
    estimator=model_pipeline,
    param_grid=param_grid,
    cv=5,
    scoring='roc_auc',
    verbose=1,
    n_jobs=-1
)

grid_search.fit(X_train, y_train)

print("Best hyperparameters:", grid_search.best_params_)
print("Best AUC score:", grid_search.best_score_)

best_model = grid_search.best_estimator_
y_pred_proba = best_model.predict_proba(X_test)[:, 1]
auc_score = roc_auc_score(y_test, y_pred_proba)
print(f'Test AUC Score: {auc_score}')

Fitting 5 folds for each of 27 candidates, totalling 135 fits
Best hyperparameters: {'classifier__max_depth': 10, 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 10}
Best AUC score: 0.8927724299766725
Test AUC Score: 0.8964964758309824
