In [1]:
!pip -q install /kaggle/input/pytorchtabnet/pytorch_tabnet-4.1.0-py3-none-any.whl

In [2]:
import os
import gc
import re
import copy
import torch
import optuna
import random
import pickle
import warnings
import numpy as np
import pandas as pd
import polars as pl
import torch.nn as nn
import lightgbm as lgb
import plotly.express as px
import torch.optim as optim
import polars.selectors as cs
import matplotlib.pyplot as plt

from tqdm import tqdm
from itertools import product
from sklearn.base import clone
from colorama import Fore, Style
from scipy.optimize import minimize
from IPython.display import clear_output
from pytorch_tabnet.callbacks import Callback
from sklearn.metrics import cohen_kappa_score
from sklearn.preprocessing import StandardScaler
from concurrent.futures import ThreadPoolExecutor
from sklearn.model_selection import StratifiedKFold
from pytorch_tabnet.tab_model import TabNetRegressor

from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.ensemble import VotingRegressor, RandomForestRegressor, GradientBoostingRegressor

warnings.filterwarnings('ignore')
pd.options.display.max_columns = None

SEED = 2024
n_splits = 10
num_trials = 50

def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True
seed_everything(SEED)

In [3]:
def threshold_Rounder(oof_non_rounded, thresholds):
    return np.where(oof_non_rounded < thresholds[0], 0,
                    np.where(oof_non_rounded < thresholds[1], 1,
                             np.where(oof_non_rounded < thresholds[2], 2, 3)))

# Define the quadratic weighted kappa evaluation
def quadratic_weighted_kappa(y_true, y_pred):
    return cohen_kappa_score(y_true, y_pred, weights='quadratic')

# Function to apply thresholds and convert oof to discrete values
def apply_thresholds(oof, thresholds):
    return np.digitize(oof, bins=thresholds)

# Function to evaluate predictions based on thresholds
def evaluate_predictions(thresholds, y_true, oof):
    y_pred = apply_thresholds(oof, thresholds)
    return -quadratic_weighted_kappa(y_true, y_pred)  # Negative for minimization


In [4]:
def process_actigraphy(base_dir):
    # Initialize results lists
    all_nights_results = []
    all_days_results = []
    
    # Iterate through each ID folder
    for id_folder in tqdm(sorted(os.listdir(base_dir))):
        id_path = os.path.join(base_dir, id_folder)
        parquet_file = os.path.join(id_path, "part-0.parquet")
    
        if not os.path.exists(parquet_file):
            continue
    
        # Read the Parquet file
        df = pd.read_parquet(parquet_file)
    
        # Basic Preprocessing
        df['time_of_day'] = (df['time_of_day'] / 1e9).astype(int) # Convert to seconds
        start_quarter = df.iloc[0]['quarter'] # Get the start quarter
        start_day = int(df['relative_date_PCIAT'].min())
        end_day = int(df['relative_date_PCIAT'].max())
    
        # All-nights analysis
        night_results = []
        for day in range(start_day, end_day + 1):
            # For each day, extract the data between 7 PM and 10 AM the next day, and validate the data(>90% valid data)
            sec_1am = 1 * 3600
            sec_5am = 5 * 3600
            sec_10am = 10 * 3600
            sec_7pm = 19 * 3600
            sec_start = sec_7pm
            sec_end = sec_10am
            
            condition = (
                ((df['relative_date_PCIAT'] == day) & (df['time_of_day'] >= sec_start)) |
                ((df['relative_date_PCIAT'] == day + 1) & (df['time_of_day'] < sec_end))
            )
    
            day_data = df[condition].copy()
    
            total_expected_points = ((24 * 3600 - sec_start) + sec_end) // 5
            valid_points = day_data[day_data['non-wear_flag'] != 1].shape[0]
            percentage_valid_data = (valid_points / total_expected_points) * 100 if total_expected_points > 0 else 0
    
            if percentage_valid_data < 90:
                continue
                
            # Calculate the mean ENMO for each 5-minute interval
            day_data['time_5min'] = ((day_data['time_of_day'] // 300) * 300)
            mean_enmo = day_data.groupby(['relative_date_PCIAT', 'time_5min'])['enmo'].mean().reset_index()
    
            # Identify the must-be-sleeping data points from mean ENMO
            condition_night = (
                ((mean_enmo['relative_date_PCIAT'] == day + 1) & (mean_enmo['time_5min'] >= sec_1am)) &
                (mean_enmo['time_5min'] <= sec_5am)
            )
            max_5min_enmo = mean_enmo[condition_night]['enmo'].max() # Get the max mean ENMO during the must-be-sleeping period
    
            # Identify the sleep and wake candidates
            #   - Sleep candidates: ENMO > 1.2 * max_5min_enmo between 7 PM and 1 AM
            condition_sleep = (
                ((mean_enmo['relative_date_PCIAT'] == day + 1) & ((mean_enmo['time_5min'] < sec_1am))) |
                ((mean_enmo['relative_date_PCIAT'] == day) & (mean_enmo['time_5min'] >= sec_7pm))
            )
            sleep_candidates = mean_enmo[condition_sleep]
            sleep_candidates = sleep_candidates[sleep_candidates['enmo'] > 1.2 * max_5min_enmo]
            
            sleep_time = None
            if not sleep_candidates.empty:
                for i in range(len(sleep_candidates) - 1):
                    if sleep_candidates.iloc[i + 1]['time_5min'] - sleep_candidates.iloc[i]['time_5min'] == 300:
                        sleep_time = sleep_candidates.iloc[i + 1]['time_5min']
    
            if sleep_time is not None and sleep_time < sec_7pm:
                sleep_time += 24 * 3600
    
            #   - Wake candidates: ENMO > 1.2 * max_5min_enmo between 5 AM and 10 AM
            condition_wake = ((mean_enmo['relative_date_PCIAT'] == day + 1) & (mean_enmo['time_5min'] > sec_5am))
            wake_candidates = mean_enmo[condition_wake]
            wake_candidates = wake_candidates[wake_candidates['enmo'] > 1.2 * max_5min_enmo]
    
            wake_time = None
            if not wake_candidates.empty:
                for i in range(len(wake_candidates) - 1):
                    if wake_candidates.iloc[i + 1]['time_5min'] - wake_candidates.iloc[i]['time_5min'] == 300:
                        wake_time = wake_candidates.iloc[i]['time_5min']
                        break
    
            if sleep_time is not None and wake_time is not None:
                night_results.append({
                    'day': day,
                    'percentage_valid_data': percentage_valid_data,
                    'sleep_time_sec': sleep_time,
                    'wake_time_sec': wake_time,
                    'wake_minus_sleep_sec': (wake_time - sleep_time) % (24 * 3600),
                    'max_5min_enmo': max_5min_enmo,
                })
            else:
                night_results.append({
                    'day': day,
                    'percentage_valid_data': percentage_valid_data,
                    'sleep_time_sec': np.nan,
                    'wake_time_sec': np.nan,
                    'wake_minus_sleep_sec': np.nan,
                    'max_5min_enmo': max_5min_enmo,
                })
    
        # Now, we have the sleep and wake times for each night, calculate the statistics
        if night_results:
            valid_nights = pd.DataFrame(night_results).dropna(subset=['sleep_time_sec', 'wake_time_sec'])
    
            sleep_stats = valid_nights['sleep_time_sec'].agg(['min', 'max', 'mean', 'std']).to_dict()
            wake_stats = valid_nights['wake_time_sec'].agg(['min', 'max', 'mean', 'std']).to_dict()
            wake_minus_sleep_stats = valid_nights['wake_minus_sleep_sec'].agg(['min', 'max', 'mean', 'std']).to_dict()
            enmo_stats = valid_nights['max_5min_enmo'].agg(['min', 'max', 'mean', 'std']).to_dict()
    
            all_nights_results.append({
                'id': id_folder.replace('id=', ''),
                'num_valid_nights': len(valid_nights),
                'num_total_days': end_day - start_day + 1,
                'percentage_valid_nights': len(valid_nights) / (end_day - start_day + 1) * 100,
                **{f'sleep_{key}': value for key, value in sleep_stats.items()},
                **{f'wake_{key}': value for key, value in wake_stats.items()},
                **{f'wake_minus_sleep_{key}': value for key, value in wake_minus_sleep_stats.items()},
                **{f'enmo_{key}': value for key, value in enmo_stats.items()}
            })
        
        else:
            empty_features = {
                'sleep_min': None, 'sleep_max': None, 'sleep_mean': None, 'sleep_std': None,
                'wake_min': None, 'wake_max': None, 'wake_mean': None, 'wake_std': None,
                'wake_minus_sleep_min': None, 'wake_minus_sleep_max': None, 'wake_minus_sleep_mean': None, 'wake_minus_sleep_std': None,
                'enmo_min': None, 'enmo_max': None, 'enmo_mean': None, 'enmo_std': None
            }
    
            # Append the result with features set to None
            all_nights_results.append({
                'id': id_folder.replace('id=', ''),
                'num_valid_nights': 0,
                'num_total_days': end_day - start_day + 1,
                'percentage_valid_nights': 0,
                **empty_features
            })
    
        # All-day analysis
        day_results = []
    
        for day in range(start_day, end_day + 1):
            day_data = df[df['relative_date_PCIAT'] == day]
            total_valid_points = day_data[day_data['non-wear_flag'] != 1].shape[0] 
            total_expected_points = 86400 // 5
            percentage_valid_data = (total_valid_points / total_expected_points) * 100
    
            if percentage_valid_data < 90:
                continue
    
            day_mean_enmo = day_data['enmo'].mean()
            day_max_enmo = day_data['enmo'].max()
            day_std_enmo = day_data['enmo'].std()
            day_mean_light = day_data['light'].mean()
            day_max_light = day_data['light'].max()
            day_std_light = day_data['light'].std()
    
            day_results.append({
                'day': day,
                'mean_enmo': day_mean_enmo,
                'max_enmo': day_max_enmo,
                'std_enmo': day_std_enmo,
                'mean_light': day_mean_light,
                'max_light': day_max_light,
                'std_light': day_std_light
            })
    
        if day_results:
            valid_days = pd.DataFrame(day_results)
            all_days_results.append({
                'id': id_folder.replace('id=', ''),
                'num_valid_days': len(valid_days),
                'quarter': start_quarter,
                'percentage_valid_days': len(valid_days) / (end_day - start_day + 1) * 100,
                **{f'{stat}_{metric}': valid_days[stat].agg(metric) for stat in ['mean_enmo', 'max_enmo', 'std_enmo', 'mean_light', 'max_light', 'std_light'] for metric in ['mean', 'min', 'max', 'std']}
                })
    
        else:
            empty_features = {f'{base}_{metric}': np.nan for base  in ['mean_enmo', 'max_enmo', 'std_enmo', 'mean_light', 'max_light', 'std_light'] for metric in ['mean', 'min', 'max', 'std']}
            all_days_results.append({
                'id': id_folder.replace('id=', ''),
                'num_valid_days': 0,
                'quarter': start_quarter,
                'percentage_valid_days': 0,
                **empty_features
            })
    
    # Combine results into a single DataFrame
    all_night_df = pd.DataFrame(all_nights_results)
    all_day_df = pd.DataFrame(all_days_results)
    
    # for all col in all_day_df with 'std' in the last 4 characters, fill NaN with 0 if num_valid_days is not 0
    for col in all_day_df.columns:
        if 'std' in col[-4:]:
            all_day_df[col] = all_day_df[col][all_day_df['num_valid_days'] != 0].fillna(0)
    
    # for all col in all_night_df with 'std' in the last 4 characters, fill NaN with 0 if num_valid_nights is not 0
    for col in all_night_df.columns:
        if 'std' in col[-4:]:
            all_night_df[col] = all_night_df[col][all_night_df['num_valid_nights'] != 0].fillna(0)
    
    df_act = pd.merge(all_night_df, all_day_df, on='id', how='outer')
    return df_act

In [5]:
def feature_engineering(df):
    # season_cols = [col for col in df.columns if 'Season' in col]
    # df = df.drop(season_cols, axis=1) 
    df['BMI_Age'] = df['Physical-BMI'] * df['Basic_Demos-Age']
    df['Internet_Hours_Age'] = df['PreInt_EduHx-computerinternet_hoursday'] * df['Basic_Demos-Age']
    df['BMI_Internet_Hours'] = df['Physical-BMI'] * df['PreInt_EduHx-computerinternet_hoursday']
    df['BFP_BMI'] = df['BIA-BIA_Fat'] / df['BIA-BIA_BMI']
    df['FFMI_BFP'] = df['BIA-BIA_FFMI'] / df['BIA-BIA_Fat']
    df['FMI_BFP'] = df['BIA-BIA_FMI'] / df['BIA-BIA_Fat']
    df['LST_TBW'] = df['BIA-BIA_LST'] / df['BIA-BIA_TBW']
    df['BFP_BMR'] = df['BIA-BIA_Fat'] * df['BIA-BIA_BMR']
    df['BFP_DEE'] = df['BIA-BIA_Fat'] * df['BIA-BIA_DEE']
    df['BMR_Weight'] = df['BIA-BIA_BMR'] / df['Physical-Weight']
    df['DEE_Weight'] = df['BIA-BIA_DEE'] / df['Physical-Weight']
    df['SMM_Height'] = df['BIA-BIA_SMM'] / df['Physical-Height']
    df['Muscle_to_Fat'] = df['BIA-BIA_SMM'] / df['BIA-BIA_FMI']
    df['Hydration_Status'] = df['BIA-BIA_TBW'] / df['Physical-Weight']
    df['ICW_TBW'] = df['BIA-BIA_ICW'] / df['BIA-BIA_TBW']
    df['BMI_PHR'] = df['Physical-BMI'] * df['Physical-HeartRate']
    return df

base = '/kaggle/input/child-mind-institute-problematic-internet-use/'
intermediate_base = '/kaggle/working'

# base = 'D:\Kaggle\CMI\child-mind-institute-problematic-internet-use'
# intermediate_base = './'

dirpath_train_ts = os.path.join(base, 'series_train.parquet')
dirpath_test_ts = os.path.join(base, 'series_test.parquet')

df_train = pd.read_csv(os.path.join(base, 'train.csv'))
df_test = pd.read_csv(os.path.join(base, 'test.csv'))
sample = pd.read_csv(os.path.join(base, 'sample_submission.csv'))
df_train = df_train.dropna(subset=['sii']).reset_index(drop=True)

df_train.loc[df_train['BIA-BIA_Fat'] < 0, 'BIA-BIA_Fat'] = np.nan
df_train.loc[df_train['BIA-BIA_FMI'] < 0, 'BIA-BIA_FMI'] = np.nan

load_actigraphy = True
save_actigraphy = True

# Compute time-series features
if os.path.exists(os.path.join(intermediate_base, 'df_train_actigraphy.csv')) and os.path.exists(os.path.join(intermediate_base, 'df_test_actigraphy.csv')) and load_actigraphy:
    df_train_actigraphy = pd.read_csv(os.path.join(intermediate_base, 'df_train_actigraphy.csv'))
    df_test_actigraphy = pd.read_csv(os.path.join(intermediate_base, 'df_test_actigraphy.csv'))
    
else:
    # results_train = process_actigraphy(id_paths_train)
    # results_test = process_actigraphy(id_paths_test)
    # df_train_actigraphy = pd.DataFrame(results_train, columns=['id'] + ['actigraphy_' + str(i) for i in range(len(results_train[0])-1)])
    # df_test_actigraphy = pd.DataFrame(results_test, columns=['id'] + ['actigraphy_' + str(i) for i in range(len(results_test[0])-1)])

    df_train_actigraphy = process_actigraphy(dirpath_train_ts)
    df_test_actigraphy = process_actigraphy(dirpath_test_ts)

    if save_actigraphy:
        df_train_actigraphy.to_csv(os.path.join(intermediate_base, 'df_train_actigraphy.csv'), index=False)
        df_test_actigraphy.to_csv(os.path.join(intermediate_base, 'df_test_actigraphy.csv'), index=False)
    

df_train_actigraphy_encoded = df_train_actigraphy
df_test_actigraphy_encoded = df_test_actigraphy

# df_train_actigraphy_encoded = encode(df_train_actigraphy, encoding_dim=60, epochs=100, batch_size=32)
# df_test_actigraphy_encoded = encode(df_test_actigraphy, encoding_dim=60, epochs=100, batch_size=32)
# df_train_actigraphy_encoded['id'] = df_train_actigraphy['id']
# df_test_actigraphy_encoded['id'] = df_test_actigraphy['id']

imputer = KNNImputer(n_neighbors=5)
numeric_cols_train = [col for col in df_test.select_dtypes(include=['float64', 'int64']).columns 
                      if 'sii' not in col.lower() and 'pciat' not in col.lower()]
numeric_cols_test = [col for col in df_test.select_dtypes(include=['float64', 'int64']).columns 
                      if 'sii' not in col.lower() and 'pciat' not in col.lower()]
df_all = pd.concat([df_train[numeric_cols_train], df_test[numeric_cols_test]], axis=0)
imputer.fit(df_all)

df_train_imputed = pd.DataFrame(imputer.transform(df_train[numeric_cols_train]), columns=numeric_cols_train)
df_test_imputed = pd.DataFrame(imputer.transform(df_test[numeric_cols_test]), columns=numeric_cols_test)

for col in df_train.columns:
    if col not in numeric_cols_train:
        df_train_imputed[col] = df_train[col]

for col in df_test.columns:
    if col not in numeric_cols_test:
        df_test_imputed[col] = df_test[col]

df_train = df_train_imputed
df_train = feature_engineering(df_train)

df_test = df_test_imputed
df_test = feature_engineering(df_test)

df_train = pd.merge(df_train, df_train_actigraphy_encoded, how='left', on='id')
df_test = pd.merge(df_test, df_test_actigraphy_encoded, how='left', on='id')

100%|██████████| 996/996 [05:11<00:00,  3.20it/s]
100%|██████████| 2/2 [00:00<00:00,  5.19it/s]


# Submission 1 - Tabular + Customized Actigraphy | KNN Imputer

In [6]:
featuresCols = ['Basic_Demos-Age', 'Basic_Demos-Sex',
                'CGAS-CGAS_Score', 'Physical-BMI',
                'Physical-Height', 'Physical-Weight', 'Physical-Waist_Circumference',
                'Physical-Diastolic_BP', 'Physical-HeartRate', 'Physical-Systolic_BP',
                'Fitness_Endurance-Max_Stage',
                'Fitness_Endurance-Time_Mins', 'Fitness_Endurance-Time_Sec',
                'FGC-FGC_CU', 'FGC-FGC_CU_Zone', 'FGC-FGC_GSND',
                'FGC-FGC_GSND_Zone', 'FGC-FGC_GSD', 'FGC-FGC_GSD_Zone', 'FGC-FGC_PU',
                'FGC-FGC_PU_Zone', 'FGC-FGC_SRL', 'FGC-FGC_SRL_Zone', 'FGC-FGC_SRR',
                'FGC-FGC_SRR_Zone', 'FGC-FGC_TL', 'FGC-FGC_TL_Zone',
                'BIA-BIA_Activity_Level_num', 'BIA-BIA_BMC', 'BIA-BIA_BMI',
                'BIA-BIA_BMR', 'BIA-BIA_DEE', 'BIA-BIA_ECW', 'BIA-BIA_FFM',
                'BIA-BIA_FFMI', 'BIA-BIA_FMI', 'BIA-BIA_Fat', 'BIA-BIA_Frame_num',
                'BIA-BIA_ICW', 'BIA-BIA_LDM', 'BIA-BIA_LST', 'BIA-BIA_SMM',
                'BIA-BIA_TBW', 'PAQ_A-PAQ_A_Total',
                'PAQ_C-PAQ_C_Total', 'SDS-SDS_Total_Raw',
                'SDS-SDS_Total_T',
                'PreInt_EduHx-computerinternet_hoursday']

actigraphy_features = [col for col in df_test_actigraphy_encoded.columns if 'id' not in col]

target = 'sii'

additional_features = []
df_train = df_train[featuresCols + actigraphy_features + [target]]
df_test = df_test[featuresCols + actigraphy_features]

if np.any(np.isinf(df_train)):
    df_train = df_train.replace([np.inf, -np.inf], np.nan)
if np.any(np.isinf(df_test)):
    df_test = df_test.replace([np.inf, -np.inf], np.nan)

In [7]:
# Optuna objective function
# open a .txt

def train(params, trial_number):
    X = df_train.drop(['sii'], axis=1)
    y = df_train['sii']

    SKF = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=SEED)
        
    list_train_kappas = []
    list_valid_kappas = []
    oof_non_rounded = np.zeros(len(y), dtype=float) 
    oof_rounded = np.zeros(len(y), dtype=int) 
    y_test = np.zeros((len(df_test), n_splits))

    for fold, (train_idx, test_idx) in enumerate(tqdm(SKF.split(X, y), desc="Training Folds", total=n_splits)):
        X_train, X_val = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[test_idx]

        model = LGBMRegressor(**params)
        model.fit(
            X_train, y_train,
            eval_set=[(X_val, y_val)],
            callbacks=[
            lgb.early_stopping(10000),           # Early stopping after 50 rounds
            lgb.log_evaluation(period=10000)],    # Log evaluation metrics every 10 rounds
        )

        y_train_pred = model.predict(X_train)
        y_val_pred = model.predict(X_val)

        oof_non_rounded[test_idx] = y_val_pred
        y_val_pred_rounded = y_val_pred.round(0).astype(int)
        oof_rounded[test_idx] = y_val_pred_rounded

        train_kappa = quadratic_weighted_kappa(y_train, y_train_pred.round(0).astype(int))
        valid_kappa = quadratic_weighted_kappa(y_val, y_val_pred_rounded)
        list_train_kappas.append(train_kappa)
        list_valid_kappas.append(valid_kappa)
    
        y_test[:, fold] = model.predict(df_test)
        clear_output(wait=True)
            
    print(f"Mean Train QWK --> {np.mean(list_train_kappas):.4f}")
    print(f"Mean Valid QWK --> {np.mean(list_valid_kappas):.4f}")

    # Grid sweep setup
    threshold_ranges = [
        np.linspace(0.25, 0.75, 101, endpoint=True),   # First threshold range
        np.linspace(0.75, 1.25, 101, endpoint=True), # Second threshold range
        [3],
    ]

    # Prepare the grid search
    best_score = float('-inf')
    best_thresholds = None

    for thresholds in tqdm(product(*threshold_ranges)):
        thresholds = sorted(thresholds)  # Ensure thresholds are in order
        score = -evaluate_predictions(thresholds, y, oof_non_rounded)  # Flip back to positive
        if score > best_score:
            # print("Score: ", score, "thresholds: ", thresholds)
            best_score = score
            best_thresholds = thresholds

    threshold_ranges = [
        [best_thresholds[0]],
        [best_thresholds[1]],
        np.linspace(best_thresholds[1], 3.5, 1001, endpoint=True)    # Third threshold range
    ]

    for thresholds in tqdm(product(*threshold_ranges)):
        thresholds = sorted(thresholds)  # Ensure thresholds are in order
        score = -evaluate_predictions(thresholds, y, oof_non_rounded)  # Flip back to positive
        if score > best_score:
            best_score = score
            best_thresholds = thresholds
    
    oof_tuned = threshold_Rounder(oof_non_rounded, best_thresholds)
    oof_kappa = quadratic_weighted_kappa(y, oof_tuned)

    y_test = y_test.mean(axis=1)
    y_test = threshold_Rounder(y_test, best_thresholds)
    
    submission = pd.DataFrame({
        'id': sample['id'],
            'sii': y_test
        })

    # write to the .txt
    f.write(f"Trial: {trial_number}\n")
    f.write(f"Params: {params}\n")
    f.write(f"Score: {best_score}\n")
    f.write(f"Thresholds: {best_thresholds}\n")
    f.write(f"Train Kappa: {np.mean(list_train_kappas):.4f}\n")
    f.write(f"Validation Kappa: {np.mean(list_valid_kappas):.4f}\n")
    f.write(f"Optimized OOF QWK: {oof_kappa:.3f}\n")
    f.write(f"Best Score: {best_score}\n\n")
    f.write("-" * 100 + "\n")
    f.flush()

    return -oof_kappa, submission, oof_tuned

with open('LGBM_Optimization.txt', 'w') as f:
    f.write('Submission 1 - Optimization started\n')

    def objective(trial):
        params = {
            'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01),  # Replaced suggest_loguniform
            'n_estimators': trial.suggest_int('n_estimators', 100, 300, step=50),
            'max_depth': trial.suggest_int('max_depth', 3, 7, step=1),
            'num_leaves': trial.suggest_int('num_leaves', 16, 512, step=2),
            'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 16, 128),
            'feature_fraction': trial.suggest_float('feature_fraction', 0.6, 1.0, step=0.01),  # Replaced suggest_uniform
            'bagging_fraction': trial.suggest_float('bagging_fraction', 0.6, 1.0, step=0.01),  # Replaced suggest_uniform
            'bagging_freq': trial.suggest_int('bagging_freq', 1, 10, step=1),
            "lambda_l1": trial.suggest_int("lambda_l1", 0, 10, step=1),
            "lambda_l2": trial.suggest_int("lambda_l2", 0, 10, step=1),
            'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0, step=0.1),  # Replaced suggest_uniform
            'colsample_bynode': trial.suggest_float('colsample_bynode', 0.5, 1.0, step=0.1),  # Replaced suggest_uniform
            'verbose': -1, # Disable LightGBM verbose logging
            'random_state': SEED
        }

        score, submission, oof_tuned = train(params, trial.number)
        return score

    study = optuna.create_study(direction="minimize", study_name="LGBM Optimization")
    study.optimize(objective, n_trials=num_trials)
    best_params = study.best_params
    best_score = study.best_value
    print(f"Best Score: {best_score}")
    print(f"Best Params: {best_params}")

    best_params['random_state'] = SEED
    score, submission1, oof_tuned1 = train(best_params, study.best_trial.number)
    print("Reproduced Score: ", score)

submission1

Training Folds: 100%|██████████| 10/10 [00:04<00:00,  2.37it/s]


Mean Train QWK --> 0.4962
Mean Valid QWK --> 0.3916


10201it [00:30, 330.07it/s]
1001it [00:03, 328.74it/s]

Reproduced Score:  -0.4928209308509479





Unnamed: 0,id,sii
0,00008ff9,0
1,000fd460,0
2,00105258,0
3,00115b9f,0
4,0016bb22,2
5,001f3379,0
6,0038ba98,1
7,0068a485,0
8,0069fbed,2
9,0083e397,2


# Submission 2 - Tabular Only | KNN Imputer

In [8]:
featuresCols = ['Basic_Demos-Age', 'Basic_Demos-Sex',
                'CGAS-CGAS_Score', 'Physical-BMI',
                'Physical-Height', 'Physical-Weight', 'Physical-Waist_Circumference',
                'Physical-Diastolic_BP', 'Physical-HeartRate', 'Physical-Systolic_BP',
                'Fitness_Endurance-Max_Stage',
                'Fitness_Endurance-Time_Mins', 'Fitness_Endurance-Time_Sec',
                'FGC-FGC_CU', 'FGC-FGC_CU_Zone', 'FGC-FGC_GSND',
                'FGC-FGC_GSND_Zone', 'FGC-FGC_GSD', 'FGC-FGC_GSD_Zone', 'FGC-FGC_PU',
                'FGC-FGC_PU_Zone', 'FGC-FGC_SRL', 'FGC-FGC_SRL_Zone', 'FGC-FGC_SRR',
                'FGC-FGC_SRR_Zone', 'FGC-FGC_TL', 'FGC-FGC_TL_Zone',
                'BIA-BIA_Activity_Level_num', 'BIA-BIA_BMC', 'BIA-BIA_BMI',
                'BIA-BIA_BMR', 'BIA-BIA_DEE', 'BIA-BIA_ECW', 'BIA-BIA_FFM',
                'BIA-BIA_FFMI', 'BIA-BIA_FMI', 'BIA-BIA_Fat', 'BIA-BIA_Frame_num',
                'BIA-BIA_ICW', 'BIA-BIA_LDM', 'BIA-BIA_LST', 'BIA-BIA_SMM',
                'BIA-BIA_TBW', 'PAQ_A-PAQ_A_Total',
                'PAQ_C-PAQ_C_Total', 'SDS-SDS_Total_Raw',
                'SDS-SDS_Total_T',
                'PreInt_EduHx-computerinternet_hoursday']

target = 'sii'
df_train = df_train[featuresCols  + [target]]
df_test = df_test[featuresCols]

if np.any(np.isinf(df_train)):
    df_train = df_train.replace([np.inf, -np.inf], np.nan)
if np.any(np.isinf(df_test)):
    df_test = df_test.replace([np.inf, -np.inf], np.nan)

In [9]:
# Optuna objective function
# open a .txt

def train(params, trial_number):
    X = df_train.drop(['sii'], axis=1)
    y = df_train['sii']

    SKF = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=SEED)
        
    list_train_kappas = []
    list_valid_kappas = []
    oof_non_rounded = np.zeros(len(y), dtype=float) 
    oof_rounded = np.zeros(len(y), dtype=int) 
    y_test = np.zeros((len(df_test), n_splits))

    for fold, (train_idx, test_idx) in enumerate(tqdm(SKF.split(X, y), desc="Training Folds", total=n_splits)):
        X_train, X_val = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[test_idx]

        model = LGBMRegressor(**params)
        model.fit(
            X_train, y_train,
            eval_set=[(X_val, y_val)],
            callbacks=[
            lgb.early_stopping(10000),           # Early stopping after 50 rounds
            lgb.log_evaluation(period=10000)],    # Log evaluation metrics every 10 rounds
        )

        y_train_pred = model.predict(X_train)
        y_val_pred = model.predict(X_val)

        oof_non_rounded[test_idx] = y_val_pred
        y_val_pred_rounded = y_val_pred.round(0).astype(int)
        oof_rounded[test_idx] = y_val_pred_rounded

        train_kappa = quadratic_weighted_kappa(y_train, y_train_pred.round(0).astype(int))
        valid_kappa = quadratic_weighted_kappa(y_val, y_val_pred_rounded)
        list_train_kappas.append(train_kappa)
        list_valid_kappas.append(valid_kappa)
    
        y_test[:, fold] = model.predict(df_test)
        clear_output(wait=True)
            
    print(f"Mean Train QWK --> {np.mean(list_train_kappas):.4f}")
    print(f"Mean Valid QWK --> {np.mean(list_valid_kappas):.4f}")

    # Grid sweep setup
    threshold_ranges = [
        np.linspace(0.25, 0.75, 101, endpoint=True),   # First threshold range
        np.linspace(0.75, 1.25, 101, endpoint=True), # Second threshold range
        [3],
    ]

    # Prepare the grid search
    best_score = float('-inf')
    best_thresholds = None

    for thresholds in tqdm(product(*threshold_ranges)):
        thresholds = sorted(thresholds)  # Ensure thresholds are in order
        score = -evaluate_predictions(thresholds, y, oof_non_rounded)  # Flip back to positive
        if score > best_score:
            # print("Score: ", score, "thresholds: ", thresholds)
            best_score = score
            best_thresholds = thresholds

    threshold_ranges = [
        [best_thresholds[0]],
        [best_thresholds[1]],
        np.linspace(best_thresholds[1], 3.5, 1001, endpoint=True)    # Third threshold range
    ]

    for thresholds in tqdm(product(*threshold_ranges)):
        thresholds = sorted(thresholds)  # Ensure thresholds are in order
        score = -evaluate_predictions(thresholds, y, oof_non_rounded)  # Flip back to positive
        if score > best_score:
            best_score = score
            best_thresholds = thresholds
    
    oof_tuned = threshold_Rounder(oof_non_rounded, best_thresholds)
    oof_kappa = quadratic_weighted_kappa(y, oof_tuned)

    y_test = y_test.mean(axis=1)
    y_test = threshold_Rounder(y_test, best_thresholds)
    
    submission = pd.DataFrame({
        'id': sample['id'],
            'sii': y_test
        })

    # write to the .txt
    f.write(f"Trial: {trial_number}\n")
    f.write(f"Params: {params}\n")
    f.write(f"Score: {best_score}\n")
    f.write(f"Thresholds: {best_thresholds}\n")
    f.write(f"Train Kappa: {np.mean(list_train_kappas):.4f}\n")
    f.write(f"Validation Kappa: {np.mean(list_valid_kappas):.4f}\n")
    f.write(f"Optimized OOF QWK: {oof_kappa:.3f}\n")
    f.write(f"Best Score: {best_score}\n\n")
    f.write("-" * 100 + "\n")
    f.flush()

    return -oof_kappa, submission, oof_tuned


with open('LGBM_Optimization.txt', 'a') as f:
    f.write('Submission 2 - Optimization started\n')

    def objective(trial):
        params = {
            'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01),  # Replaced suggest_loguniform
            'n_estimators': trial.suggest_int('n_estimators', 100, 300, step=50),
            'max_depth': trial.suggest_int('max_depth', 3, 7, step=1),
            'num_leaves': trial.suggest_int('num_leaves', 16, 512, step=2),
            'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 16, 128),
            'feature_fraction': trial.suggest_float('feature_fraction', 0.6, 1.0, step=0.01),  # Replaced suggest_uniform
            'bagging_fraction': trial.suggest_float('bagging_fraction', 0.6, 1.0, step=0.01),  # Replaced suggest_uniform
            'bagging_freq': trial.suggest_int('bagging_freq', 1, 10, step=1),
            "lambda_l1": trial.suggest_int("lambda_l1", 0, 10, step=1),
            "lambda_l2": trial.suggest_int("lambda_l2", 0, 10, step=1),
            'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0, step=0.1),  # Replaced suggest_uniform
            'colsample_bynode': trial.suggest_float('colsample_bynode', 0.5, 1.0, step=0.1),  # Replaced suggest_uniform
            'verbose': -1, # Disable LightGBM verbose logging
            'random_state': SEED
        }

        score, submission, oof_tuned = train(params, trial.number)
        return score

    study = optuna.create_study(direction="minimize", study_name="LGBM Optimization")
    study.optimize(objective, n_trials=num_trials)
    best_params = study.best_params
    best_score = study.best_value
    print(f"Best Score: {best_score}")
    print(f"Best Params: {best_params}")
    
    best_params['random_state'] = SEED
    score, submission2, oof_tuned2 = train(best_params, study.best_trial.number)
    print("Reproduced Score: ", score)

submission2

Training Folds: 100%|██████████| 10/10 [00:02<00:00,  4.53it/s]


Mean Train QWK --> 0.4546
Mean Valid QWK --> 0.3853


10201it [00:28, 355.21it/s]
1001it [00:02, 364.72it/s]


Reproduced Score:  -0.4954341763489599


Unnamed: 0,id,sii
0,00008ff9,1
1,000fd460,0
2,00105258,1
3,00115b9f,0
4,0016bb22,2
5,001f3379,1
6,0038ba98,1
7,0068a485,0
8,0069fbed,2
9,0083e397,1


# Submission 3 - Tabular + Customized Actigraphy | Median Imputer

In [10]:
df_train = pd.read_csv(os.path.join(base, 'train.csv'))
df_test = pd.read_csv(os.path.join(base, 'test.csv'))
df_train = pd.merge(df_train, df_train_actigraphy, how='left', on='id')
df_test = pd.merge(df_test, df_test_actigraphy, how='left', on='id')

imputer = SimpleImputer(strategy='median')
numeric_cols_train = [col for col in df_train.select_dtypes(include=['float64', 'int64']).columns if 'sii' not in col.lower() and 'pciat' not in col.lower()]
numeric_cols_test = [col for col in df_test.select_dtypes(include=['float64', 'int64']).columns if 'sii' not in col.lower() and 'pciat' not in col.lower()]
imputer.fit(df_train[numeric_cols_train])

df_train_imputed = pd.DataFrame(imputer.transform(df_train[numeric_cols_train]), columns=numeric_cols_train)
df_test_imputed = pd.DataFrame(imputer.transform(df_test[numeric_cols_test]), columns=numeric_cols_test)

for col in df_train.columns:
    if col not in numeric_cols_train:
        df_train_imputed[col] = df_train[col]

for col in df_test.columns:
    if col not in numeric_cols_test:
        df_test_imputed[col] = df_test[col]

df_train = df_train_imputed
df_test = df_test_imputed

In [11]:

season_cols = ['Basic_Demos-Enroll_Season', 'CGAS-Season', 'Physical-Season', 'Fitness_Endurance-Season', 'FGC-Season', 'BIA-Season', 'PAQ_A-Season', 'PAQ_C-Season', 'SDS-Season', 'PreInt_EduHx-Season']

feature_cols = ['Basic_Demos-Enroll_Season', 'Basic_Demos-Age', 'Basic_Demos-Sex',
                'CGAS-Season', 'CGAS-CGAS_Score', 'Physical-Season', 'Physical-BMI',
                'Physical-Height', 'Physical-Weight', 'Physical-Waist_Circumference',
                'Physical-Diastolic_BP', 'Physical-HeartRate', 'Physical-Systolic_BP',
                'Fitness_Endurance-Season', 'Fitness_Endurance-Max_Stage',
                'Fitness_Endurance-Time_Mins', 'Fitness_Endurance-Time_Sec',
                'FGC-Season', 'FGC-FGC_CU', 'FGC-FGC_CU_Zone', 'FGC-FGC_GSND',
                'FGC-FGC_GSND_Zone', 'FGC-FGC_GSD', 'FGC-FGC_GSD_Zone', 'FGC-FGC_PU',
                'FGC-FGC_PU_Zone', 'FGC-FGC_SRL', 'FGC-FGC_SRL_Zone', 'FGC-FGC_SRR',
                'FGC-FGC_SRR_Zone', 'FGC-FGC_TL', 'FGC-FGC_TL_Zone', 'BIA-Season',
                'BIA-BIA_Activity_Level_num', 'BIA-BIA_BMC', 'BIA-BIA_BMI',
                'BIA-BIA_BMR', 'BIA-BIA_DEE', 'BIA-BIA_ECW', 'BIA-BIA_FFM',
                'BIA-BIA_FFMI', 'BIA-BIA_FMI', 'BIA-BIA_Fat', 'BIA-BIA_Frame_num',
                'BIA-BIA_ICW', 'BIA-BIA_LDM', 'BIA-BIA_LST', 'BIA-BIA_SMM',
                'BIA-BIA_TBW', 'PAQ_A-Season', 'PAQ_A-PAQ_A_Total', 'PAQ_C-Season',
                'PAQ_C-PAQ_C_Total', 'SDS-Season', 'SDS-SDS_Total_Raw',
                'SDS-SDS_Total_T', 'PreInt_EduHx-Season',
                'PreInt_EduHx-computerinternet_hoursday']

actigraphy_features = [col for col in df_train_actigraphy.columns if 'id' not in col]
target = 'sii'

df_train = df_train[feature_cols + actigraphy_features + [target]]
df_test = df_test[feature_cols + actigraphy_features]
df_train = df_train.dropna(subset='sii')

for col in season_cols: 
    df_train[col] = df_train[col].fillna('Missing')
    df_test[col] = df_test[col].fillna('Missing')
    df_train[col] = df_train[col].astype('category')
    df_test[col] = df_test[col].astype('category')

    unique_values_train = df_train[col].unique()
    unique_values_test = df_test[col].unique()
    unique_values = np.unique(np.concatenate([unique_values_train, unique_values_test]))
    catToNumberTrain = {value: idx for idx, value in enumerate(unique_values_train)}
    catToNumberTest = {value: idx for idx, value in enumerate(unique_values_test)}
    catToNumber = {value: idx for idx, value in enumerate(unique_values)}
    
    df_train[col] = df_train[col].replace(catToNumber) 
    df_test[col] = df_test[col].replace(catToNumber)

# check how many cols has missing values
missing_cols = df_train.columns[df_train.isnull().any()]
missing_cols

# if np.any(np.isinf(df_train)):
#     df_train = df_train.replace([np.inf, -np.inf], np.nan)
# if np.any(np.isinf(df_test)):
#     df_test = df_test.replace([np.inf, -np.inf], np.nan)

Index([], dtype='object')

In [12]:
# Optuna objective function
# open a .txt

def train(params, trial_number):
    X = df_train.drop(['sii'], axis=1)
    y = df_train['sii']

    SKF = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=SEED)
        
    list_train_kappas = []
    list_valid_kappas = []
    oof_non_rounded = np.zeros(len(y), dtype=float) 
    oof_rounded = np.zeros(len(y), dtype=int) 
    y_test = np.zeros((len(df_test), n_splits))

    for fold, (train_idx, test_idx) in enumerate(tqdm(SKF.split(X, y), desc="Training Folds", total=n_splits)):
        X_train, X_val = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[test_idx]

        model = LGBMRegressor(**params)
        model.fit(
            X_train, y_train,
            eval_set=[(X_val, y_val)],
            callbacks=[
            lgb.early_stopping(10000),           # Early stopping after 50 rounds
            lgb.log_evaluation(period=10000)],    # Log evaluation metrics every 10 rounds
        )

        y_train_pred = model.predict(X_train)
        y_val_pred = model.predict(X_val)

        oof_non_rounded[test_idx] = y_val_pred
        y_val_pred_rounded = y_val_pred.round(0).astype(int)
        oof_rounded[test_idx] = y_val_pred_rounded

        train_kappa = quadratic_weighted_kappa(y_train, y_train_pred.round(0).astype(int))
        valid_kappa = quadratic_weighted_kappa(y_val, y_val_pred_rounded)
        list_train_kappas.append(train_kappa)
        list_valid_kappas.append(valid_kappa)
    
        y_test[:, fold] = model.predict(df_test)
        clear_output(wait=True)
            
    print(f"Mean Train QWK --> {np.mean(list_train_kappas):.4f}")
    print(f"Mean Valid QWK --> {np.mean(list_valid_kappas):.4f}")

    # Grid sweep setup
    threshold_ranges = [
        np.linspace(0.25, 0.75, 101, endpoint=True),   # First threshold range
        np.linspace(0.75, 1.25, 101, endpoint=True), # Second threshold range
        [3],
    ]

    # Prepare the grid search
    best_score = float('-inf')
    best_thresholds = None

    for thresholds in tqdm(product(*threshold_ranges)):
        thresholds = sorted(thresholds)  # Ensure thresholds are in order
        score = -evaluate_predictions(thresholds, y, oof_non_rounded)  # Flip back to positive
        if score > best_score:
            # print("Score: ", score, "thresholds: ", thresholds)
            best_score = score
            best_thresholds = thresholds

    threshold_ranges = [
        [best_thresholds[0]],
        [best_thresholds[1]],
        np.linspace(best_thresholds[1], 3.5, 1001, endpoint=True)    # Third threshold range
    ]

    for thresholds in tqdm(product(*threshold_ranges)):
        thresholds = sorted(thresholds)  # Ensure thresholds are in order
        score = -evaluate_predictions(thresholds, y, oof_non_rounded)  # Flip back to positive
        if score > best_score:
            best_score = score
            best_thresholds = thresholds
    
    oof_tuned = threshold_Rounder(oof_non_rounded, best_thresholds)
    oof_kappa = quadratic_weighted_kappa(y, oof_tuned)

    y_test = y_test.mean(axis=1)
    y_test = threshold_Rounder(y_test, best_thresholds)
    
    submission = pd.DataFrame({
        'id': sample['id'],
            'sii': y_test
        })

    # write to the .txt
    f.write(f"Trial: {trial_number}\n")
    f.write(f"Params: {params}\n")
    f.write(f"Score: {best_score}\n")
    f.write(f"Thresholds: {best_thresholds}\n")
    f.write(f"Train Kappa: {np.mean(list_train_kappas):.4f}\n")
    f.write(f"Validation Kappa: {np.mean(list_valid_kappas):.4f}\n")
    f.write(f"Optimized OOF QWK: {oof_kappa:.3f}\n")
    f.write(f"Best Score: {best_score}\n\n")
    f.write("-" * 100 + "\n")
    f.flush()

    return -oof_kappa, submission, oof_tuned


with open('LGBM_Optimization.txt', 'a') as f:
    f.write('Submission 3 - Optimization started\n')

    def objective(trial):
        params = {
            'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01),  # Replaced suggest_loguniform
            'n_estimators': trial.suggest_int('n_estimators', 100, 300, step=50),
            'max_depth': trial.suggest_int('max_depth', 3, 7, step=1),
            'num_leaves': trial.suggest_int('num_leaves', 16, 512, step=2),
            'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 16, 128),
            'feature_fraction': trial.suggest_float('feature_fraction', 0.6, 1.0, step=0.01),  # Replaced suggest_uniform
            'bagging_fraction': trial.suggest_float('bagging_fraction', 0.6, 1.0, step=0.01),  # Replaced suggest_uniform
            'bagging_freq': trial.suggest_int('bagging_freq', 1, 10, step=1),
            "lambda_l1": trial.suggest_int("lambda_l1", 0, 10, step=1),
            "lambda_l2": trial.suggest_int("lambda_l2", 0, 10, step=1),
            'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0, step=0.1),  # Replaced suggest_uniform
            'colsample_bynode': trial.suggest_float('colsample_bynode', 0.5, 1.0, step=0.1),  # Replaced suggest_uniform
            'verbose': -1, # Disable LightGBM verbose logging
            'random_state': SEED
        }

        score, submission, oof_tuned = train(params, trial.number)
        return score

    study = optuna.create_study(direction="minimize", study_name="LGBM Optimization")
    study.optimize(objective, n_trials=num_trials)
    best_params = study.best_params
    best_score = study.best_value
    print(f"Best Score: {best_score}")
    print(f"Best Params: {best_params}")
    
    best_params['random_state'] = SEED
    score, submission3, oof_tuned3 = train(best_params, study.best_trial.number)
    print("Reproduced Score: ", score)

submission3

Training Folds: 100%|██████████| 10/10 [00:02<00:00,  4.19it/s]


Mean Train QWK --> 0.4456
Mean Valid QWK --> 0.3747


10201it [00:30, 339.46it/s]
1001it [00:02, 334.94it/s]


Reproduced Score:  -0.49599542536030583


Unnamed: 0,id,sii
0,00008ff9,0
1,000fd460,0
2,00105258,0
3,00115b9f,0
4,0016bb22,1
5,001f3379,0
6,0038ba98,1
7,0068a485,0
8,0069fbed,1
9,0083e397,1


In [13]:
sub1 = submission1
sub2 = submission2
sub3 = submission3

sub1 = sub1.sort_values(by='id').reset_index(drop=True)
sub2 = sub2.sort_values(by='id').reset_index(drop=True)
sub3 = sub3.sort_values(by='id').reset_index(drop=True)

combined = pd.DataFrame({
    'id': sub1['id'],
    'sii_1': sub1['sii'],
    'sii_2': sub2['sii'],
    'sii_3': sub3['sii'],
})

def majority_vote(row):
    mode_values = row.mode()
    return int(mode_values.median())
    
combined['sii'] = combined[['sii_1', 'sii_2', 'sii_3']].apply(majority_vote, axis=1)
final_submission = combined[['id', 'sii']]
final_submission.to_csv('submission.csv', index=False)
print("Majority voting completed and saved to 'Final_Submission.csv'")
final_submission

Majority voting completed and saved to 'Final_Submission.csv'


Unnamed: 0,id,sii
0,00008ff9,0
1,000fd460,0
2,00105258,0
3,00115b9f,0
4,0016bb22,2
5,001f3379,0
6,0038ba98,1
7,0068a485,0
8,0069fbed,2
9,0083e397,1


In [14]:
df_oof1 = pd.DataFrame(oof_tuned1, columns=['oof1'])
df_oof2 = pd.DataFrame(oof_tuned2, columns=['oof2'])
df_oof3 = pd.DataFrame(oof_tuned3, columns=['oof3'])

df_oof_combined = pd.concat([df_oof1, df_oof2, df_oof3], axis=1).apply(majority_vote, axis=1)

# compute the kappa score
print('Kappa Score: ', quadratic_weighted_kappa(df_train['sii'], df_oof_combined))

# print the percentage of each label in the oof_tuned and y
print(df_oof_combined.value_counts(normalize=True))
print(df_train['sii'].value_counts(normalize=True))

Kappa Score:  0.5037964888523481
0    0.561404
1    0.272295
2    0.159357
3    0.006944
Name: proportion, dtype: float64
sii
0.0    0.582602
1.0    0.266813
2.0    0.138158
3.0    0.012427
Name: proportion, dtype: float64


In [15]:
print(df_train['sii'].value_counts(normalize=True), '\n')
print(submission1['sii'].value_counts(normalize=True), '\n')
print(submission2['sii'].value_counts(normalize=True), '\n')
print(submission3['sii'].value_counts(normalize=True), '\n')
print(final_submission['sii'].value_counts(normalize=True))


sii
0.0    0.582602
1.0    0.266813
2.0    0.138158
3.0    0.012427
Name: proportion, dtype: float64 

sii
0    0.45
1    0.30
2    0.25
Name: proportion, dtype: float64 

sii
1    0.55
0    0.30
2    0.15
Name: proportion, dtype: float64 

sii
0    0.6
1    0.4
Name: proportion, dtype: float64 

sii
0    0.45
1    0.40
2    0.15
Name: proportion, dtype: float64
