# Import libraries

In [None]:
#!pip install lightgbm --upgrade 

In [None]:
import multiprocessing

import time

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from matplotlib.ticker import MaxNLocator
import plotly.express as px
import seaborn as sns
import plotly.graph_objects as go

import optuna
from optuna.samplers import TPESampler
import lightgbm as lgb
import optuna.integration.lightgbm as lgb
from lightgbm import early_stopping
from lightgbm import log_evaluation
from lightgbm import LGBMClassifier

from sklearn.preprocessing import LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split, StratifiedKFold
import sklearn.datasets
import sklearn.metrics

from sklearn.metrics import log_loss
from sklearn.utils.class_weight import compute_sample_weight

# Set notebook environment

In [None]:
# Configure the notebook: IPython output mode, pandas display limits and ANSI text styles

# https://ipython.readthedocs.io/en/stable/api/generated/IPython.core.interactiveshell.html
# https://stackoverflow.com/questions/36786722/how-to-display-full-output-in-jupyter-not-only-last-result
# Render the value of every top-level expression in a cell, not only the last one.
# Later cells rely on this to show several DataFrames from a single cell.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

# https://www.kaggle.com/questions-and-answers/118932
# Show the current pandas row/column display limits (before they are changed below)
pd.options.display.max_rows,pd.options.display.max_columns

# https://thispointer.com/python-pandas-how-to-display-full-dataframe-i-e-print-all-rows-columns-without-truncation/
# Print all the contents of a pandas dataframe
pd.set_option('display.max_rows', None) # None = no limit on the number of printed rows
pd.set_option('display.max_columns', None) # None = never truncate the list of columns
pd.set_option('display.width', None) # None = auto-detect the display width
pd.set_option('display.max_colwidth', None) # None = print full cell contents without truncation

# https://stackoverflow.com/questions/8924173/how-to-print-bold-text-in-python
# ANSI escape sequences used by the print() calls throughout the notebook
start = "\033[1m" # Switch to bold text
end = "\033[0;0m" # Reset text attributes

# Garbage-collector interface (used by the next cell)
import gc

In [None]:
# Run a garbage-collection pass to free memory that is no longer referenced.
# As a bare top-level expression, the call's return value is shown in the cell output.
gc.collect()

# Load datasets

In [None]:
# Read the four competition CSV files from the Kaggle input directory into DataFrames
train_df=pd.read_csv("/kaggle/input/icr-identify-age-related-conditions/train.csv")
greeks_df=pd.read_csv("/kaggle/input/icr-identify-age-related-conditions/greeks.csv")
test_df=pd.read_csv("/kaggle/input/icr-identify-age-related-conditions/test.csv")
sample_submission_df=pd.read_csv("/kaggle/input/icr-identify-age-related-conditions/sample_submission.csv")

# Data glance

In [None]:
# Preview the first rows of each table under a bold heading.
# Each `.head()` is a bare top-level expression; all four are rendered because
# InteractiveShell.ast_node_interactivity is set to 'all' in the environment cell.
print(start+"train.csv:"+end)
train_df.head()
print(start+"\ngreeks.csv:"+end)
greeks_df.head()
print(start+"\ntest.csv:"+end)
test_df.head()
print(start+"\nsample_submission_df.csv:"+end)
sample_submission_df.head()

# Check column names

In [None]:
# List the column labels of each table under a bold heading.
# The `.columns` expressions must stay at the top level of the cell to be displayed.
print(start+"train.csv:"+end)
train_df.columns
print(start+"\ngreeks.csv:"+end)
greeks_df.columns
print(start+"\ntest.csv:"+end)
test_df.columns
print(start+"\nsample_submission_df.csv:"+end)
sample_submission_df.columns

# Assess data
- Attribute
- Summary statistics

In [None]:
# Report the (rows, columns) shape of every loaded table in bold text
_shape_targets = (
    ("train:", train_df),
    ("greeks:", greeks_df),
    ("test:", test_df),
    ("sample submission:", sample_submission_df),
)
for _heading, _frame in _shape_targets:
    print(start+_heading, _frame.shape, end)

In [None]:
# Print the dtype / non-null summary of every table, each under a bold heading.
# DataFrame.info() writes its report itself, so it can be called from inside a loop.
_info_targets = (
    ("train.csv:", train_df),
    ("\ngreeks_df:", greeks_df),
    ("\ntest_df:", test_df),
    ("\nsample_submission_df:", sample_submission_df),
)
for _heading, _frame in _info_targets:
    print(start+_heading+end)
    _frame.info()

In [None]:
# Summary statistics of each table under a bold heading.
# The `.describe()` results are displayed only because they are bare top-level
# expressions (ast_node_interactivity = 'all'), so they are not wrapped in a loop.
print(start+"train_df:"+end)
train_df.describe()
print(start+"\ngreeks_df:"+end)
greeks_df.describe()
print(start+"\ntest_df:"+end)
test_df.describe()
print(start+"\nsample_submission_df:"+end)
sample_submission_df.describe()

# Check missing data

In [None]:
# Count the missing values per column in every table

print(start+'Check missing values:'+end)

_missing_targets = (
    ("\ngreeks_df:", greeks_df),
    ("\ntrain_df:", train_df),
    ("\ntest_df:", test_df),
    ("\nsample_submission_df:", sample_submission_df),
)
for _heading, _frame in _missing_targets:
    print(start+_heading+end)
    print(_frame.isnull().sum())

In [None]:
# Show every training row that has a missing value in at least one column
train_df[train_df.isna().any(axis=1)]

# Check duplication

In [None]:
# Report how many fully duplicated rows each table contains

_duplicate_targets = (
    ('Train set: There are a total of', train_df),
    ('Greeks: There are a total of', greeks_df),
    ('Test set: There are a total of', test_df),
    ('Submission: There are a total of', sample_submission_df),
)
for _lead, _frame in _duplicate_targets:
    _dup_count = str(_frame.duplicated().sum())
    print(start+_lead, start+_dup_count+end, start+'duplicate rows.\n'+end)

In [None]:
# Check duplication by Id.
# keep=False flags every occurrence of a repeated Id, so all copies are listed together
# (sorted by Id); an empty result means the Ids are unique.
train_df[train_df.Id.duplicated(keep=False)].sort_values("Id")
greeks_df[greeks_df.Id.duplicated(keep=False)].sort_values("Id")

# Class imbalance

In [None]:
# Count the rows of each target class once; reused for the printed report and the chart
piefreq = train_df.Class.value_counts()

print('Class imbalance (counts): Class 1, Class 0')
print(piefreq,'\n')

# Relative class frequencies
print('Normalized class: Class 1, Class 0')
print(train_df.Class.value_counts(normalize=True))

# value_counts() orders its result by frequency, not by class label. Look the counts
# up by label so the slice sizes always match the hard-coded "Class 0" / "Class 1"
# labels, whichever class happens to be the majority.
class_counts = [int(piefreq.get(label, 0)) for label in (0, 1)]

# Pie chart of class imbalance
fig = go.Figure(data=[go.Pie(labels=['Class 0    (n=' + str(class_counts[0]) + ')',
                                     'Class 1    (n=' + str(class_counts[1]) + ')'],
                             values=class_counts)])
# update_layout returns the figure; as a bare top-level expression it is what gets displayed
fig.update_layout(title=dict(text="<b>Pie chart of 'Class'</b>",
                             y=0.85, x=0.4,
                             xanchor='center',
                             yanchor='top',
                             font=dict(size=14)
                            ),
                  width=600,
                  height=600)

# Drop the temporaries to release memory
del piefreq, class_counts, fig

# Merge datasets by Id
- Train and Greeks

In [None]:
# Join the training rows with their greeks metadata on the shared "Id" column
# (default inner join: only Ids present in both tables are kept)
merged_df = train_df.merge(greeks_df, on="Id")

In [None]:
# Preview the first rows of the merged train + greeks table
merged_df.head()

In [None]:
# Frequency of each distinct value of the "EJ" column (treated as categorical)
merged_df.EJ.value_counts()

In [None]:
# Value frequencies of the greeks columns.
# For each column, `~...str.isnumeric()` keeps only the rows whose value is NOT made up
# purely of numeric characters, then the remaining values are counted.
# The counts are bare top-level expressions and rely on ast_node_interactivity = 'all'.
print(start+"Alpha:"+end)
greeks_df[~greeks_df["Alpha"].str.isnumeric()]["Alpha"].value_counts()
print("\n"+start+"Beta:"+end)
greeks_df[~greeks_df["Beta"].str.isnumeric()]["Beta"].value_counts()
print("\n"+start+"Gamma:"+end)
greeks_df[~greeks_df["Gamma"].str.isnumeric()]["Gamma"].value_counts()
print("\n"+start+"Delta:"+end)
greeks_df[~greeks_df["Delta"].str.isnumeric()]["Delta"].value_counts()
print("\n"+start+"Epsilon:"+end)
greeks_df[~greeks_df["Epsilon"].str.isnumeric()]["Epsilon"].value_counts()

In [None]:
# Select the greeks rows whose Epsilon is the literal string 'Unknown',
# then count the distinct rows of that selection
_is_unknown = greeks_df["Epsilon"] == "Unknown"
unknown_df = greeks_df[_is_unknown]
unknown_df.value_counts()

# Pearson Correlation  

In [None]:
# Heatmap of the pairwise Pearson correlations between the numeric columns

plt.figure(figsize=(30, 10))

# Compute correlation matrix
corr_matrix = merged_df.select_dtypes(include=np.number).corr()

# Hide the upper triangle and the diagonal so that every pair appears only once
mask = np.triu(np.ones_like(corr_matrix, dtype=bool))
corr_matrix = corr_matrix.mask(mask)

# Create heatmap
ax = sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', annot_kws={"size": 8}, fmt=".2f")

# Label every row and column explicitly.
# seaborn draws cell i between i and i + 1, so a centred tick belongs at i + 0.5.
# Setting the tick positions before the labels also guarantees that the number of
# labels matches the number of ticks, even if seaborn thinned its automatic ticks.
labels = corr_matrix.columns
tick_positions = np.arange(len(labels)) + 0.5
ax.set_xticks(tick_positions)
ax.set_xticklabels(labels, rotation=290, ha='center', fontsize=8)
ax.set_yticks(tick_positions)
ax.set_yticklabels(labels, rotation=0, va='center', fontsize=8)

ax.xaxis.set_ticks_position('bottom')
ax.yaxis.set_ticks_position('left')
ax.xaxis.set_label_coords(0.5, 1.05)
ax.yaxis.set_label_coords(-0.05, 0.5)

plt.title("Pearson Correlation Matrix", fontsize=12)
plt.tight_layout()
plt.show();

# Drop the temporaries to release memory
del mask, ax, labels, tick_positions, corr_matrix

In [None]:
# Pearson correlation of every numeric column with the target column 'Class'
corr_matrix = merged_df.select_dtypes(include=np.number).corr()

# Strongest positive correlation first
corr_sorted = corr_matrix['Class'].sort_values(ascending=False)

# Draw the single-column heatmap
plt.figure(figsize=(5,10))
sns.heatmap(corr_sorted.to_frame(), annot=True, cmap='coolwarm', annot_kws={"size": 10}, fmt=".2f")

# Relabel the only x tick of the axes the heatmap was drawn on
ax = plt.gca()
ax.set_xticklabels(['Class 0 / Class 1'], ha='center', fontsize=10)

plt.title('Pearson Correlation Matrix with "Class"', fontsize=12)

# Render the figure
plt.tight_layout()
plt.show();

del corr_matrix, ax

# Bar Charts 
- Categorical features by Class

In [None]:
# Categorical columns to break down by target class
categorical_lst = ['EJ','Alpha','Beta', 'Gamma', 'Delta']

for var in categorical_lst:
    # Rows per (category value, Class) pair; combinations that never occur become 0
    var_class = merged_df.groupby([var, 'Class']).size().unstack().fillna(0)

    # Share of each class within every category value
    class_total = var_class[0] + var_class[1]
    var_class['Class 0'] = var_class[0] / class_total
    var_class['Class 1'] = var_class[1] / class_total
    print(var_class)

    # Stacked bar chart of the two shares
    ax = var_class[['Class 0', 'Class 1']].plot(kind='bar', stacked=True, figsize=(15, 3))

    # Axis labels and title
    ax.set_xlabel(var)
    ax.set_ylabel('Proportion')
    ax.set_title('Class 0 / Class 1 by '+var)

    # rotation=360 is a full turn, i.e. the tick labels are drawn horizontally
    ax.set_xticklabels(ax.get_xticklabels(), rotation=360, ha='right')

    # Write the percentage in the middle of every bar segment that has a positive height
    for segment in ax.patches:
        seg_height = segment.get_height()
        if not seg_height > 0:
            continue
        seg_x, seg_y = segment.get_xy()
        centre = (seg_x + segment.get_width() / 2, seg_y + seg_height / 2)
        ax.annotate(f'{seg_height:.2%}', centre, ha='center', va='center', fontsize=10)

    # Anchor the legend at the top-right corner of the axes
    ax.legend(['Class 0', 'Class 1'], loc='best', bbox_to_anchor=(1, 1.02))

    # Leave room on the right-hand side of the figure
    plt.subplots_adjust(right=0.8)

    # Render the chart
    plt.show();


In [None]:
# Quick check
# #len(merged_df[(merged_df['Alpha'] == 'A') & (merged_df['Class'] == 1)])

# Box-whisker plot
- Numerical features by 'Class'

In [None]:
# Box-and-whisker plot of every numeric column, split by target class

# Define the list of numeric features
numeric_lst = merged_df.select_dtypes(include=np.number).columns.tolist()
y_axis = numeric_lst

# Grid geometry: three plots per row and just enough rows to hold them all
# (ceiling division, so no fully empty row is created when the count divides evenly)
ncols = 3
nrows = max(1, -(-len(numeric_lst) // ncols))

# squeeze=False keeps `ax` two-dimensional even when the grid has a single row
fig, ax = plt.subplots(nrows=nrows, ncols=ncols, figsize=(20, 200), squeeze=False)

is_class_0 = merged_df['Class'] == 0
is_class_1 = merged_df['Class'] == 1
colors = ['lightblue', 'lightgreen']

for idx, feature in enumerate(numeric_lst):
    # Drop missing values first: a single NaN makes the box statistics NaN,
    # which leaves the box for that class empty.
    lst0 = merged_df.loc[is_class_0, feature].dropna().tolist()
    lst1 = merged_df.loc[is_class_1, feature].dropna().tolist()
    columns = [lst0, lst1]

    # Position of this feature in the grid
    row_idx, col_idx = divmod(idx, ncols)
    cell = ax[row_idx, col_idx]

    # Create the box plot with mean markers
    box = cell.boxplot(columns, notch=True, patch_artist=True, showmeans=True,
                       meanprops={"marker": "s", "markerfacecolor": "white", "markeredgecolor": "Cyan"})

    cell.yaxis.set_major_locator(MaxNLocator(integer=True))
    cell.set_xticklabels(["Class 0", "Class 1"], size=10)
    cell.set_ylabel(y_axis[idx], size=10)

    # One fill colour per class
    for patch, color in zip(box['boxes'], colors):
        patch.set_facecolor(color)

    # Add legend for median and mean markers
    cell.legend([box["medians"][0], box["means"][0]], ['Median', 'Mean'], loc='upper right')

# Hide the grid cells of the last row that did not receive a plot
for unused in ax.flat[len(numeric_lst):]:
    unused.set_visible(False)

plt.show();

# Modeling

In [None]:
# Feature table: everything except the identifier, the greeks metadata and the target
_non_feature_cols = ["Id", "Alpha", "Beta", "Gamma", "Delta", "Epsilon", "Class"]
X_df = merged_df.drop(columns=_non_feature_cols)

# Target column
y_df = merged_df['Class']

In [None]:
# NumPy copies of the feature table and the target, addressed by position from here on
X, y = X_df.to_numpy(), y_df.to_numpy()

In [None]:
# Column transformer that imputes and encodes the feature matrix.
# Columns are addressed by integer position rather than by name.
# https://scikit-learn.org/stable/modules/generated/sklearn.compose.ColumnTransformer.html

# Position of the single categorical column (presumably 'EJ' - confirm against X_df.columns)
_categorical_idx = [39]

# Every other position in 0..55 is handled as a numeric column
_numeric_idx = [pos for pos in range(56) if pos not in _categorical_idx]

# Categorical branch: fill gaps with the most frequent value, then one-hot encode
# (categories unseen during fit are ignored; dense output)
_categorical_steps = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
])

# Numeric branch: fill gaps with the column median
_numeric_imputer = SimpleImputer(strategy='median')

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', _categorical_steps, _categorical_idx),
        ('num', _numeric_imputer, _numeric_idx),
    ],
    remainder='passthrough',
    sparse_threshold=0,
)

## LightGBMTunerCV

In [None]:
def objective(trial):
    """Optuna objective: run a cross-validated LightGBM tuner and return its best score.

    The trial samples a starting configuration, which is handed to
    ``LightGBMTunerCV`` together with the module-level ``dtrain`` dataset
    (it must exist before the study is optimised). ``lgb`` refers to
    ``optuna.integration.lightgbm`` here, because that import rebinds the name
    after ``import lightgbm as lgb``.

    NOTE(review): LightGBMTunerCV runs its own search over several of the
    parameters sampled below, so the values in ``study.best_params`` are only the
    starting point of the winning trial. The configuration the tuner actually
    settled on is stored on the trial as the user attribute ``"tuned_params"``.

    Args:
        trial: The ``optuna.trial.Trial`` used to sample the starting parameters.

    Returns:
        The best cross-validated ``binary_logloss`` found by the tuner.
    """
    param = {"objective": "binary",
             "metric": "binary_logloss",
             "verbosity": -1,
             'boosting_type': trial.suggest_categorical('boosting_type', ['gbdt', 'rf', 'dart']),
             "lambda_l1": trial.suggest_float("lambda_l1", 1e-8, 10.0, log=True),
             "lambda_l2": trial.suggest_float("lambda_l2", 1e-8, 10.0, log=True),
             "num_leaves": trial.suggest_int("num_leaves", 2, 256),
             "feature_fraction": trial.suggest_float("feature_fraction", 0.4, 1.0),
             "bagging_fraction": trial.suggest_float("bagging_fraction", 0.4, 1.0),
             "bagging_freq": trial.suggest_int("bagging_freq", 1, 7),
             "pos_bagging_fraction": trial.suggest_float("pos_bagging_fraction", 0.4, 1.0),
             "neg_bagging_fraction": trial.suggest_float("neg_bagging_fraction", 0.4, 1.0),
             "min_child_samples": trial.suggest_int("min_child_samples", 5, 100),
             "seed": 42,
             "learning_rate": trial.suggest_float("learning_rate", 1e-5, 1e-1, log=True),
             "is_unbalance": True,
             "num_threads": multiprocessing.cpu_count()
            }

    # An explicit `folds` splitter takes priority over `nfold` / `stratified`, so
    # only the splitter is passed (the omitted values equalled the defaults anyway).
    tuner = lgb.LightGBMTunerCV(param,
                                dtrain,
                                folds=StratifiedKFold(n_splits=5, random_state=42, shuffle=True),
                                callbacks=[early_stopping(100), log_evaluation(100)],
                                seed=42,
                                optuna_seed=42
                               )

    tuner.run()

    # Keep the parameters behind `best_score` retrievable from the finished trial
    trial.set_user_attr("tuned_params", tuner.best_params)

    return tuner.best_score

if __name__ == "__main__":
    # Tune the model with Optuna and report the duration and the best trial
    start_time = time.time()

    # Fit the preprocessor and transform the training matrix.
    # The result goes into its own name so the raw `X` stays intact: overwriting `X`
    # would make a second run of this cell refit the positional column selectors on
    # already-transformed data.
    X_train = preprocessor.fit_transform(X)

    # Read by objective() as a module-level name
    dtrain = lgb.Dataset(X_train, label=y)

    # https://optuna.readthedocs.io/en/stable/faq.html#how-can-i-obtain-reproducible-optimization-results
    # Make the sampler behave in a deterministic way
    sampler = TPESampler(seed=42)
    study = optuna.create_study(direction="minimize", sampler=sampler)

    study.optimize(objective,
                   n_trials=25)

    end_time = time.time()
    total_training_time = end_time - start_time

    print(f"Total training time: {total_training_time:.4f} seconds")

    print("Best score:", study.best_value)
    best_params = study.best_params
    print("Best params:", best_params)
    print("Params: ")
    for key, value in best_params.items():
        print(f" {key}: {value}")

### Submission

In [None]:
# Retrieve the best hyperparameters sampled by the study
best_params = study.best_params

# Fit the preprocessor on the entire training data and transform it
X_transformed = preprocessor.fit_transform(X_df)

# The preprocessor addresses columns by position, so the test table must have exactly
# the training feature columns in the same order. test_df comes straight from test.csv
# and was never reduced to the features (presumably it still carries "Id" - confirm),
# which would shift every positional selector.
test_features = test_df[X_df.columns]

# Preprocess the test data using the preprocessor fit on the training data
test_df_transformed = preprocessor.transform(test_features)

# Settings that were fixed while tuning and are therefore absent from best_params;
# repeat them so the final model is trained under the configuration that was tuned.
fixed_params = {"objective": "binary",
                "is_unbalance": True,
                "random_state": 42}

# Fixed settings first, tuned values on top, plus force_col_wise
params = {**fixed_params,
          **best_params,
          'force_col_wise': True}

# Train the final classifier on all training rows
gbm = LGBMClassifier(**params)
gbm = gbm.fit(X_transformed, y_df)

In [None]:
# Train a LightGBM model using the best hyperparameters
#gbm = lgb.train(params, dtrain, valid_sets=[dtrain, dtest])

In [None]:
# Predict on the unseen data; column 1 of predict_proba is the probability of class 1
preds = gbm.predict_proba(test_df_transformed)[:, 1]

# Keep the probabilities strictly inside (0, 1)
_eps = 1e-15
prob_class_1 = np.clip(preds, _eps, 1 - _eps)

# The class 0 probability is the complement
prob_class_0 = 1 - prob_class_1

# https://www.kaggle.com/competitions/icr-identify-age-related-conditions/discussion/412946
# https://www.kaggle.com/competitions/icr-identify-age-related-conditions/discussion/409801
# Start from the sample submission so the file keeps its expected layout,
# then overwrite the two probability columns
submission = pd.read_csv('/kaggle/input/icr-identify-age-related-conditions/sample_submission.csv')
submission = submission.assign(class_1=prob_class_1, class_0=prob_class_0)
submission.to_csv('submission.csv', index = False)

# Read the written file back and show it
display(pd.read_csv('submission.csv'))

# References/Resources
- https://www.kaggle.com/competitions/icr-identify-age-related-conditions
- https://optuna.readthedocs.io/en/stable/index.html
- https://lightgbm.readthedocs.io/en/latest/index.html
- https://scikit-learn.org/stable/auto_examples/compose/plot_column_transformer_mixed_types.html
- https://github.com/optuna/optuna-examples/blob/main/lightgbm/lightgbm_simple.py
- https://github.com/optuna/optuna-examples/blob/main/lightgbm/lightgbm_tuner_simple.py
- https://github.com/optuna/optuna-examples/blob/main/lightgbm/lightgbm_tuner_cv.py
- https://github.com/optuna/optuna-examples/blob/main/lightgbm/lightgbm_integration.py
- Balanced Log Loss Explained:
<br>https://www.kaggle.com/competitions/icr-identify-age-related-conditions/discussion/422442
- Code to validate submission / help solve submission errors:
<br>https://www.kaggle.com/competitions/icr-identify-age-related-conditions/discussion/422194
- Keep getting scoring error even though my csv is identical to sample submission:
<br>https://www.kaggle.com/competitions/icr-identify-age-related-conditions/discussion/412946
- Submission error counts as 1 submission ?:
<br>https://www.kaggle.com/competitions/icr-identify-age-related-conditions/discussion/409801
- https://chat.openai.com/
- https://bard.google.com/
- https://www.bing.com/search?q=Bing+AI&showconv=1&FORM=hpcodx&sydconv=1