## Setup

In [None]:
!pip install optuna lightgbm requests rdata scikit-learn ipywidgets matplotlib

In [8]:
import io
from pathlib import Path

import optuna
import pandas as pd
import rdata
import requests
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split

## Download data

In [2]:
url = "https://github.com/dutangc/CASdatasets/raw/refs/heads/master/data/pg15training.rda"
output_directory = "../data/"
file_name = "pg15training.csv"

# Download the .rda archive. The timeout keeps a stalled connection from
# hanging the notebook, and raise_for_status() stops here on an HTTP error
# instead of handing an error page to the R-data parser below.
response = requests.get(url, timeout=60)
response.raise_for_status()

# Parse the archive straight from memory and pull out the `pg15training` object
f = io.BytesIO(response.content)
r_data = rdata.read_rda(f)['pg15training']

# Make sure the output directory exists (fresh checkouts may not have it),
# then save the table as CSV without the index column
Path(output_directory).mkdir(parents=True, exist_ok=True)
r_data.to_csv(output_directory + file_name, index=False)

## Load data

In [None]:
# Read the CSV written by the download step into a DataFrame
data = pd.read_csv(filepath_or_buffer="../data/pg15training.csv")

# Preview the first five rows
data.head(n=5)

## Prep data

In [4]:
# NOTE: this cell rewrites `data` in place. Re-running it without first
# re-running the load cell fails, because `Numtppd` has already been dropped.

# Create the binary target: 1 when `Numtppd` is non-zero, 0 otherwise.
# A vectorised comparison replaces the per-row Python lambda.
data['target'] = data['Numtppd'].ne(0).astype('int64')

# Drop the column the target was derived from, together with the other
# listed columns, so none of them is used as a feature
data = data.drop(columns=["Numtppd", "Numtpbi", "Indtppd", "Indtpbi"])

# One-hot encode the categorical columns
categorical_columns = ['CalYear', 'Gender', 'Type', 'Category', 'Occupation', 'SubGroup2', 'Group2', 'Group1']
data = pd.get_dummies(data, columns=categorical_columns)

## Split data

In [5]:
# The label is the `target` column; the features are every other column
y = data["target"]
X = data.drop(columns=["target"])

# Hold out 20% of the rows as the test set (fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

## Train model

In [None]:
# Settings shared by every trial AND by the final model, so the model that is
# refit at the end is configured exactly like the ones Optuna scored.
FIXED_PARAMS = {
    'objective': 'binary',
    'n_jobs': -1,
    # LightGBM ignores `subsample` unless bagging is switched on with a
    # non-zero `subsample_freq`; without this the `subsample` search is a no-op.
    'subsample_freq': 1,
    # Fixed seed so row/column subsampling is repeatable across runs
    'random_state': 42,
    'verbosity': -1,
}


def objective(trial, X_train, y_train):
    """Score one Optuna trial: fit a LightGBM classifier and return validation accuracy.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyperparameters.
    X_train, y_train
        Training features and labels. 20% of the rows are held out as a
        validation set; the split uses a fixed seed, so every trial is scored
        on the same validation rows.

    Returns
    -------
    float
        Accuracy of the fitted model on the held-out validation rows.
    """
    # Split the data into training and validation sets
    X_train_sub, X_val, y_train_sub, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

    # Define the hyperparameter search space
    # Optuna will choose values in this search space such that the search is the most efficient
    param = {
        **FIXED_PARAMS,
        'n_estimators': trial.suggest_int('n_estimators', 10, 200),
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.3, log=True),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'num_leaves': trial.suggest_int('num_leaves', 20, 150),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
    }

    # Initialize the model with the chosen set of hyperparameters and train it
    model = LGBMClassifier(**param)
    model.fit(X_train_sub, y_train_sub)

    # Score the predictions on the validation set
    y_pred = model.predict(X_val)
    return accuracy_score(y_val, y_pred)

# Set the logging level to WARNING so that Optuna's INFO logs are suppressed.
optuna.logging.set_verbosity(optuna.logging.WARNING)

# Create a study object and optimize the objective function.
# The sampler is seeded so that a fresh "Restart & Run All" explores the same trials.
study = optuna.create_study(direction='maximize', sampler=optuna.samplers.TPESampler(seed=42))
study.optimize(lambda trial: objective(trial, X_train, y_train), n_trials=20, show_progress_bar=True)

# Retrieve the best hyperparameters
best_params = study.best_params
print("Best hyperparameters: ", best_params)

# Train the final model on the full training set with the best hyperparameters,
# reusing the fixed settings that were in effect during tuning
best_model = LGBMClassifier(**FIXED_PARAMS, **best_params)
best_model.fit(X_train, y_train)

## Evaluate model

In [None]:
# Predict labels for the held-out test set
y_pred = best_model.predict(X_test)

# Compute each classification metric against the true test labels
accuracy, precision, recall, f1 = (
    metric(y_test, y_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

# Draw the confusion matrix for the same predictions
cm = ConfusionMatrixDisplay.from_predictions(y_test, y_pred)

# Report the metrics, one per line
report_lines = [
    f"Accuracy: {accuracy}",
    f"Precision: {precision}",
    f"Recall: {recall}",
    f"F1 Score: {f1}",
]
print(*report_lines, sep="\n")