In [3]:
import logging
import matplotlib.pyplot as plt
import numpy as np
import optuna
import os
import pandas as pd
import sys
import time

from joblib import dump, load
from lightgbm import LGBMClassifier, early_stopping
from optuna.samplers import TPESampler
from sklearn.metrics import matthews_corrcoef
from sklearn.model_selection import StratifiedKFold
from typing import Dict, List, Tuple, Union

In [4]:
# Configure the root logger so INFO-level records from this notebook are shown.
logging.basicConfig(level=logging.INFO)

# Module logger plus the wall-clock reference used by the "time elapsed" messages.
logger, start_time = logging.getLogger(__name__), time.time()

In [5]:
# Name of the label column in train.csv.
target = 'class'

# Read both splits from the working directory, train first and test second.
train_df, test_df = [pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv')]

logger.info(f"Train data load completed. Time elapsed: {time.time() - start_time:.2f} seconds")

# Preview the first two training rows.
train_df.head(2)

INFO:__main__:Train data load completed. Time elapsed: 4.18 seconds


Unnamed: 0,id,class,cap-diameter,cap-shape,cap-surface,cap-color,does-bruise-or-bleed,gill-attachment,gill-spacing,gill-color,...,stem-root,stem-surface,stem-color,veil-type,veil-color,has-ring,ring-type,spore-print-color,habitat,season
0,0,e,8.8,f,s,u,f,a,c,w,...,,,w,,,f,f,,d,a
1,1,p,4.51,x,h,o,f,a,c,n,...,,y,o,,,t,z,,d,w


In [6]:
# Preview the first two rows of the test frame.
test_df.head(2)

Unnamed: 0,id,cap-diameter,cap-shape,cap-surface,cap-color,does-bruise-or-bleed,gill-attachment,gill-spacing,gill-color,stem-height,...,stem-root,stem-surface,stem-color,veil-type,veil-color,has-ring,ring-type,spore-print-color,habitat,season
0,3116945,8.64,x,,n,t,,,w,11.13,...,b,,w,u,w,t,g,,d,a
1,3116946,6.9,o,t,o,f,,c,y,1.27,...,,,n,,,f,f,,d,a


In [7]:
def convert_data_types(train, test, target_column):
    """Cast text-like feature columns of ``train`` and ``test`` to shared categoricals.

    The two frames are stacked so that every categorical column ends up with
    the same category set in both outputs. Missing values in text/categorical
    columns are replaced by the literal label ``'unk'``; numeric columns are
    left untouched. The target column is set aside before processing and
    re-attached to the returned training frame unchanged.

    Args:
        train: Training frame; must contain ``target_column``.
        test: Test frame with the same feature columns as ``train``.
        target_column: Name of the label column in ``train``.

    Returns:
        ``(new_train, new_test)``: converted copies. The inputs are not modified.
    """
    missing_label = 'unk'

    # Keep the labels aside; they are re-attached by index at the end.
    target = train[target_column].copy()

    # ``drop`` and ``concat`` both return new objects, so the caller's frames
    # are never mutated and no extra defensive copies are needed.
    features = train.drop(columns=[target_column])
    combined = pd.concat([features, test], keys=['train', 'test'])

    def process_column(series):
        """Return ``series`` with NAs filled by 'unk' and a categorical dtype where applicable."""
        if isinstance(series.dtype, pd.CategoricalDtype):
            if series.isnull().any():
                # add_categories raises ValueError if the label already exists,
                # so only register it when it is genuinely new.
                if missing_label not in series.cat.categories:
                    series = series.cat.add_categories(missing_label)
                series = series.fillna(missing_label)
            return series
        # Object columns, plus pandas' dedicated string dtypes.
        if pd.api.types.is_object_dtype(series) or pd.api.types.is_string_dtype(series):
            return series.fillna(missing_label).astype('category')
        return series

    for column in combined.columns:
        original = combined[column]
        converted = process_column(original)
        if converted is not original:
            combined[column] = converted

    # Split back into train and test using the outer concat key.
    new_train = combined.loc['train'].copy()
    new_test = combined.loc['test'].copy()

    # Add the target column back to train (aligned on the original index).
    new_train[target_column] = target

    return new_train, new_test

# Rebind both frames to their converted versions; the target column is carried over as-is.
train_df, test_df = convert_data_types(train=train_df, test=test_df, target_column=target)

In [8]:
# List column dtypes to confirm the categorical conversion above took effect.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3116945 entries, 0 to 3116944
Data columns (total 22 columns):
 #   Column                Dtype   
---  ------                -----   
 0   id                    int64   
 1   cap-diameter          float64 
 2   cap-shape             category
 3   cap-surface           category
 4   cap-color             category
 5   does-bruise-or-bleed  category
 6   gill-attachment       category
 7   gill-spacing          category
 8   gill-color            category
 9   stem-height           float64 
 10  stem-width            float64 
 11  stem-root             category
 12  stem-surface          category
 13  stem-color            category
 14  veil-type             category
 15  veil-color            category
 16  has-ring              category
 17  ring-type             category
 18  spore-print-color     category
 19  habitat               category
 20  season                category
 21  class                 object  
dtypes: category(17), float6

In [9]:
from sklearn.preprocessing import LabelEncoder

# Create the encoder used to map the class labels to integers
le = LabelEncoder()

# Learn the label set from the target column; the fitted encoder is kept in
# `le_fitted` so the mapping remains available after this cell
le_fitted = le.fit(train_df[target])

# Overwrite the target column in place with its integer codes.
# NOTE(review): this cell is not safe to re-run on its own — a second run
# would refit the encoder on the already-encoded integers; re-run from the
# data-loading cell instead.
train_df[target] = le_fitted.transform(train_df[target])

# Display the transformed target column
train_df[target]

0          0
1          1
2          0
3          0
4          0
          ..
3116940    0
3116941    0
3116942    1
3116943    0
3116944    1
Name: class, Length: 3116945, dtype: int32

In [10]:
# Collect the names of all columns still typed object/category; the target was
# integer-encoded above, so it is not picked up here.
categorical_features = list(train_df.select_dtypes(include=['object', 'category']).columns)
categorical_features

categorical_features

['cap-shape',
 'cap-surface',
 'cap-color',
 'does-bruise-or-bleed',
 'gill-attachment',
 'gill-spacing',
 'gill-color',
 'stem-root',
 'stem-surface',
 'stem-color',
 'veil-type',
 'veil-color',
 'has-ring',
 'ring-type',
 'spore-print-color',
 'habitat',
 'season']

In [12]:
import json
import os
from functools import partial
from typing import Dict, List, Tuple, Union, Any, Optional

import numpy as np
import optuna
import pandas as pd
# NOTE(review): torch is not referenced anywhere in this cell; the import fails
# with ModuleNotFoundError when torch is not installed in the kernel.
import torch
from catboost import CatBoostClassifier
from joblib import dump, load
from optuna.samplers import TPESampler
from sklearn.metrics import matthews_corrcoef
from sklearn.model_selection import StratifiedKFold
from xgboost import XGBClassifier

# Define the objective function for CatBoost optimization
def catboost_objective(trial: optuna.Trial, train: pd.DataFrame, test: pd.DataFrame, target: str, categorical_feats: List[str], base_params: Dict[str, Any]) -> float:
    """Optuna objective: sample CatBoost hyperparameters and score them with ``fit_catboost``.

    Args:
        trial: Optuna trial used to sample every tunable value.
        train: Training frame, including the ``target`` column.
        test: Test frame, forwarded to ``fit_catboost``.
        target: Name of the label column in ``train``.
        categorical_feats: Categorical feature names forwarded to ``fit_catboost``.
        base_params: Fixed CatBoost parameters. They are merged last, so they
            override any sampled value that uses the same key.

    Returns:
        The mean of the score list returned by ``fit_catboost``.

    Raises:
        optuna.exceptions.TrialPruned: For the boosting-type / score-function /
            grow-policy combinations rejected below.
    """
    boosting_type = trial.suggest_categorical('boosting_type', ['Ordered', 'Plain'])
    # The grow policy is fixed, but it is sampled from a single-choice list so
    # that it is recorded in ``trial.params`` (and therefore in
    # ``study.best_params``) alongside the parameters that depend on it.
    grow_policy = trial.suggest_categorical('grow_policy', ['Depthwise'])
    all_score_functions = ['Cosine', 'L2']
    score_function = trial.suggest_categorical('score_function', all_score_functions)

    # Skip combinations this search treats as invalid before spending any
    # training time on them.
    if boosting_type == 'Ordered' and score_function in ['LOOL2', 'SolarL2', 'L2', 'NewtonL2']:
        raise optuna.exceptions.TrialPruned()

    # NOTE(review): with the policy fixed to 'Depthwise', this prunes every
    # 'Ordered' trial; confirm that is intended.
    if boosting_type == 'Ordered' and grow_policy in ['Lossguide', 'Depthwise']:
        raise optuna.exceptions.TrialPruned()

    params = {
        'iterations': trial.suggest_int('iterations', 50, 1500),
        'learning_rate': trial.suggest_float('learning_rate', 1e-3, 0.3, log=True),
        'depth': trial.suggest_int('depth', 2, 10),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1, 10, log=True),
        'boosting_type': boosting_type,
        # Must be passed to the model explicitly; it is also read back below.
        'grow_policy': grow_policy,
        'border_count': trial.suggest_int('border_count', 32, 255),
        'min_child_samples': trial.suggest_int('min_child_samples', 1, 100),
        'score_function': score_function,
        'feature_border_type': trial.suggest_categorical('feature_border_type', ['GreedyLogSum', 'MinEntropy', 'Median', 'UniformAndQuantiles']),
        'leaf_estimation_method': trial.suggest_categorical('leaf_estimation_method', ['Gradient', 'Newton']),
        'leaf_estimation_iterations': trial.suggest_int('leaf_estimation_iterations', 1, 10),
        'one_hot_max_size': trial.suggest_int('one_hot_max_size', 0, 25),
        'bootstrap_type': trial.suggest_categorical('bootstrap_type', ['Bayesian', 'Bernoulli', 'No', 'MVS']),
        **base_params
    }

    # Conditional parameters are only sampled when the effective setting
    # (after the ``base_params`` override) makes them applicable.
    if params['grow_policy'] == 'Lossguide':
        params['max_leaves'] = trial.suggest_int('max_leaves', 2, 64)

    if params['bootstrap_type'] in ['Bernoulli', 'Poisson']:
        params['subsample'] = trial.suggest_float('subsample', 0.1, 1.0)

    if params['bootstrap_type'] == 'Bayesian':
        params['bagging_temperature'] = trial.suggest_float('bagging_temperature', 0, 10)

    scores, _, _ = fit_catboost(params, train, test, target, categorical_feats)
    return np.mean(scores)

# Function to fit CatBoost model
def fit_catboost(params: Dict[str, Union[int, float, str]], train: pd.DataFrame, test: pd.DataFrame, target: str, categorical_feats: List[str]) -> Tuple[List[float], List[np.ndarray], np.ndarray]:
    """Cross-validate a CatBoost classifier with 5-fold stratified splits.

    Args:
        params: Keyword arguments for ``CatBoostClassifier``.
        train: Training frame; every column except ``target`` is used as a feature.
        test: Test frame containing the same feature columns.
        target: Name of the label column in ``train``.
        categorical_feats: Names of categorical feature columns. When empty (or
            ``None``), the object/category-typed feature columns of ``train``
            are used instead, since such columns have to be declared to
            CatBoost as categorical.

    Returns:
        ``(scores, test_predict_list, oof_valid_preds)`` where ``scores`` holds
        the five per-fold Matthews correlation coefficients followed by the
        overall out-of-fold coefficient, ``test_predict_list`` holds one array
        of positive-class probabilities for ``test`` per fold, and
        ``oof_valid_preds`` is an ``(n_rows, 1)`` array of out-of-fold class
        predictions.
    """
    target_columns = [target]
    train_cols = [col for col in train.columns if col not in target_columns]
    scores = []

    # Fall back to dtype-based detection so callers that pass an empty list do
    # not leave pandas categorical columns undeclared.
    cat_features = list(categorical_feats) if categorical_feats else []
    if not cat_features:
        cat_features = train[train_cols].select_dtypes(include=['object', 'category']).columns.tolist()

    mskf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    oof_valid_preds = np.zeros((train[train_cols].shape[0], 1))
    test_predict_list = []

    for fold, (train_idx, valid_idx) in enumerate(mskf.split(train[train_cols], train[target_columns])):
        X_train, y_train = train[train_cols].iloc[train_idx], train[target_columns].iloc[train_idx]
        X_valid, y_valid = train[train_cols].iloc[valid_idx], train[target_columns].iloc[valid_idx]

        model = CatBoostClassifier(**params)
        model.fit(X_train, y_train, eval_set=(X_valid, y_valid), early_stopping_rounds=250, verbose=100, cat_features=cat_features)

        # Hard class predictions feed the MCC; probabilities are kept for the test set.
        valid_preds = model.predict(X_valid)
        oof_valid_preds[valid_idx] = valid_preds.reshape(-1, 1)
        test_predict = model.predict_proba(test[train_cols])[:, 1]
        test_predict_list.append(test_predict)

        mcc = matthews_corrcoef(y_valid, valid_preds)
        scores.append(mcc)

    # The overall out-of-fold score is appended as a sixth entry, so the mean
    # printed below covers the five fold scores plus this one.
    oof_score = matthews_corrcoef(train[target_columns], oof_valid_preds)
    scores.append(oof_score)
    print(f'The average Matthews Correlation Coefficient is {np.mean(scores)}')

    return scores, test_predict_list, oof_valid_preds

# Optimize CatBoost hyperparameters
def optimize_catboost(train: pd.DataFrame, test: pd.DataFrame, target: str, categorical_feats: List[str], base_params: Dict[str, Any], n_trials: int = 100) -> Dict[str, Union[int, float, str]]:
    """Search CatBoost hyperparameters with a seeded TPE sampler and report the winner.

    Args:
        train: Training frame, including the ``target`` column.
        test: Test frame, forwarded to the objective.
        target: Name of the label column.
        categorical_feats: Categorical feature names forwarded to the objective.
        base_params: Fixed CatBoost parameters forwarded to the objective.
        n_trials: Number of Optuna trials to run.

    Returns:
        ``study.best_params`` of the maximising study.
    """
    sampler = TPESampler(seed=42)
    study = optuna.create_study(direction="maximize", sampler=sampler)

    # Bind everything except ``trial`` so Optuna can call the objective with one argument.
    objective = partial(
        catboost_objective,
        train=train,
        test=test,
        target=target,
        categorical_feats=categorical_feats,
        base_params=base_params,
    )
    study.optimize(objective, n_trials=n_trials, show_progress_bar=True)

    # Summarise the best trial on stdout.
    print("Best trial:")
    winner = study.best_trial
    print("  Value:", winner.value)
    print("  Params:")
    for key, value in winner.params.items():
        print(f"    {key}: {value}")

    return study.best_params

# Define the objective function for XGBoost optimization
def xgboost_objective(trial: optuna.Trial, train: pd.DataFrame, test: pd.DataFrame, target: str, base_params: Dict[str, Any]) -> float:
    """Optuna objective: sample XGBoost hyperparameters and score them with ``fit_xgboost``.

    Args:
        trial: Optuna trial used to sample every tunable value.
        train: Training frame, including the ``target`` column.
        test: Test frame, forwarded to ``fit_xgboost``.
        target: Name of the label column in ``train``.
        base_params: Fixed XGBoost parameters; merged last, so they override
            any sampled value with the same key.

    Returns:
        The mean of the score list returned by ``fit_xgboost``.
    """
    # Values are drawn strictly in this order; keep it stable so seeded
    # studies stay reproducible.
    sampled = {}
    sampled["max_depth"] = trial.suggest_int("max_depth", 2, 10)
    sampled["learning_rate"] = trial.suggest_float("learning_rate", 1e-3, 0.3, log=True)
    sampled["n_estimators"] = trial.suggest_int("n_estimators", 50, 1500)

    # Row / column sampling fractions all share the [0.5, 1.0] range.
    for fraction_name in ("subsample", "colsample_bytree", "colsample_bylevel", "colsample_bynode"):
        sampled[fraction_name] = trial.suggest_float(fraction_name, 0.5, 1.0)

    # Regularisation terms all share the [0, 1] range.
    for penalty_name in ("reg_alpha", "reg_lambda", "gamma"):
        sampled[penalty_name] = trial.suggest_float(penalty_name, 0, 1)

    sampled["min_child_weight"] = trial.suggest_int("min_child_weight", 1, 20)
    sampled["max_delta_step"] = trial.suggest_int("max_delta_step", 0, 10)
    sampled["grow_policy"] = trial.suggest_categorical("grow_policy", ["depthwise", "lossguide"])

    # Categorical handling: always enabled; the threshold range is a single value (32).
    sampled['enable_categorical'] = True
    sampled['max_cat_to_onehot'] = trial.suggest_int("max_cat_to_onehot", 4, 32)
    sampled['max_cat_threshold'] = trial.suggest_int("max_cat_threshold", 32, 32)

    params = {**sampled, **base_params}

    fold_scores, _, _ = fit_xgboost(params, train, test, target)
    return np.mean(fold_scores)

# Function to fit XGBoost model
def fit_xgboost(params: Dict[str, Union[int, float, str, bool]], train: pd.DataFrame, test: pd.DataFrame, target: str) -> Tuple[List[float], List[np.ndarray], np.ndarray]:
    """Cross-validate an XGBoost classifier with 5-fold stratified splits.

    Args:
        params: Keyword arguments for ``XGBClassifier``.
        train: Training frame; every column except ``target`` is used as a feature.
        test: Test frame containing the same feature columns.
        target: Name of the label column in ``train``.

    Returns:
        ``(scores, test_predict_list, oof_valid_preds)`` where ``scores`` holds
        the five per-fold Matthews correlation coefficients followed by the
        overall out-of-fold coefficient, ``test_predict_list`` holds one array
        of positive-class probabilities for ``test`` per fold, and
        ``oof_valid_preds`` is an ``(n_rows, 1)`` array of out-of-fold class
        predictions.
    """
    label_columns = [target]
    feature_names = [name for name in train.columns if name not in label_columns]
    features = train[feature_names]
    labels = train[label_columns]

    splitter = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    oof_valid_preds = np.zeros((features.shape[0], 1))
    test_predict_list = []
    scores = []

    for fit_rows, holdout_rows in splitter.split(features, labels):
        X_fit, y_fit = features.iloc[fit_rows], labels.iloc[fit_rows]
        X_holdout, y_holdout = features.iloc[holdout_rows], labels.iloc[holdout_rows]

        booster = XGBClassifier(**params)
        booster.fit(X_fit, y_fit, eval_set=[(X_holdout, y_holdout)], verbose=False)

        # Hard class predictions feed the MCC and the out-of-fold array.
        holdout_preds = booster.predict(X_holdout)
        oof_valid_preds[holdout_rows] = holdout_preds.reshape(-1, 1)

        # Positive-class probability for the test rows, one array per fold.
        test_predict_list.append(booster.predict_proba(test[feature_names])[:, 1])

        scores.append(matthews_corrcoef(y_holdout, holdout_preds))

    # The overall out-of-fold score becomes the last entry of ``scores``.
    scores.append(matthews_corrcoef(labels, oof_valid_preds))
    print(f'The average Matthews Correlation Coefficient is {np.mean(scores)}')

    return scores, test_predict_list, oof_valid_preds

# Optimize XGBoost hyperparameters
def optimize_xgboost(train: pd.DataFrame, test: pd.DataFrame, target: str, base_params: Dict[str, Any], n_trials: int = 100) -> Dict[str, Union[int, float, str]]:
    """Search XGBoost hyperparameters with a seeded TPE sampler and report the winner.

    Args:
        train: Training frame, including the ``target`` column.
        test: Test frame, forwarded to the objective.
        target: Name of the label column.
        base_params: Fixed XGBoost parameters forwarded to the objective.
        n_trials: Number of Optuna trials to run.

    Returns:
        ``study.best_params`` of the maximising study.
    """
    sampler = TPESampler(seed=42)
    study = optuna.create_study(direction="maximize", sampler=sampler)

    # Bind everything except ``trial`` so Optuna can call the objective with one argument.
    objective = partial(
        xgboost_objective,
        train=train,
        test=test,
        target=target,
        base_params=base_params,
    )
    study.optimize(objective, n_trials=n_trials, show_progress_bar=True)

    # Summarise the best trial on stdout.
    print("Best trial:")
    winner = study.best_trial
    print("  Value:", winner.value)
    print("  Params:")
    for key, value in winner.params.items():
        print(f"    {key}: {value}")

    return study.best_params

# Function to get model configurations
def get_model_configs(use_gpu: bool = False) -> List[Tuple[object, Dict[str, Any], List[str], str]]:
    """Describe the models to tune as ``(class, base_params, categorical_feats, name)`` tuples.

    Args:
        use_gpu: Select the GPU device parameters instead of the CPU ones.

    Returns:
        Two entries, CatBoost first and XGBoost second. Each carries the
        estimator class, its device-related base parameters, an empty
        categorical-feature list, and the model's short name.
    """
    # NOTE(review): the XGBoost device keys below are version-sensitive;
    # confirm they match the installed XGBoost release.
    xgb_device_params = (
        {"tree_method": "gpu_hist", "predictor": "gpu_predictor"}
        if use_gpu
        else {"tree_method": "hist", "predictor": "cpu_predictor"}
    )
    catboost_device_params = (
        {"task_type": "GPU", "devices": "0"} if use_gpu else {"task_type": "CPU"}
    )

    return [
        (CatBoostClassifier, catboost_device_params, [], 'catboost'),
        (XGBClassifier, xgb_device_params, [], 'xgboost'),
    ]

# Function to perform model optimization
def model_optimization(train_df: pd.DataFrame, test_df: pd.DataFrame, target: str, use_gpu: bool = False, n_trials: int = 100) -> None:
    """Tune each configured model, refit it on all training rows, and save its outputs.

    For every entry returned by ``get_model_configs`` this runs the matching
    Optuna search, trains a final model with the best sampled parameters plus
    the configuration's base parameters, and writes two files to the working
    directory: ``<model_name>_test_predictions.csv`` (positive-class
    probabilities for ``test_df``) and ``<model_name>_best_params.json``.

    Args:
        train_df: Training frame, including the ``target`` column.
        test_df: Test frame with the same feature columns.
        target: Name of the label column in ``train_df``.
        use_gpu: Forwarded to ``get_model_configs`` to select device parameters.
        n_trials: Number of Optuna trials per model.

    Raises:
        ValueError: If a configuration names a model this function cannot tune.
    """
    feature_cols = [col for col in train_df.columns if col != target]
    # The configurations carry an empty categorical list, so derive the
    # categorical columns from the frame's dtypes when none are given.
    inferred_cat_feats = train_df[feature_cols].select_dtypes(include=['object', 'category']).columns.tolist()

    model_configs = get_model_configs(use_gpu)
    for model_class, base_params, categorical_feats, model_name in model_configs:
        if model_name == 'catboost':
            cat_feats = list(categorical_feats) or inferred_cat_feats
            best_params = optimize_catboost(train_df, test_df, target, cat_feats, base_params, n_trials)
            # ``best_params`` holds only the sampled values; the device/base
            # settings used during the search must be re-applied.
            final_params = {**best_params, **base_params}
            fit_kwargs = {'cat_features': cat_feats}
        elif model_name == 'xgboost':
            best_params = optimize_xgboost(train_df, test_df, target, base_params, n_trials)
            # The objective hard-codes enable_categorical=True, so it is not in
            # ``best_params`` and has to be restored for the final fit.
            final_params = {'enable_categorical': True, **best_params, **base_params}
            fit_kwargs = {}
        else:
            raise ValueError(f"Unsupported model name: {model_name!r}")

        # Train final model with best parameters
        model = model_class(**final_params)
        model.fit(train_df[feature_cols], train_df[target], **fit_kwargs)
        # Select the feature columns explicitly so train and test are aligned.
        test_predictions = model.predict_proba(test_df[feature_cols])[:, 1]

        # Save predictions and parameters
        np.savetxt(f"{model_name}_test_predictions.csv", test_predictions, delimiter=",")
        with open(f"{model_name}_best_params.json", "w") as f:
            json.dump(best_params, f, indent=4)

# Run the full tuning pipeline on the frames prepared above.
# To start from files instead, load them first, e.g.:
#   train_df = pd.read_csv('path/to/train.csv')
#   test_df = pd.read_csv('path/to/test.csv')
target = 'class'
model_optimization(train_df=train_df, test_df=test_df, target=target, use_gpu=True, n_trials=100)


ModuleNotFoundError: No module named 'torch'