In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from sklearn.metrics import cohen_kappa_score
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer

In [8]:
# Load the training table, the test table and the data dictionary
# (paths are relative to the notebook's working directory).
df_train, df_test, df_dict = (
    pd.read_csv(csv_name)
    for csv_name in ('train.csv', 'test.csv', 'data_dictionary.csv')
)

In [3]:
# Reserve one NaN-filled column in df_train per time-series summary feature.
# Rows keep NaN unless a later step writes a value into them.
for feature_name in ('enmo_mean', 'enmo_std', 'light_mean', 'light_std'):
    df_train[feature_name] = np.nan

In [4]:
# Summarise each participant's time series into mean/std features on df_train.
# Participants without a series partition on disk are skipped and keep NaN.
for participant_id in df_train['id'].values:
    # Build the partition path with os.path.join so it resolves on every OS
    # (a hard-coded backslash only works as a separator on Windows).
    series_path = os.path.join('series_train.parquet', f'id={participant_id}')

    # first check if the partition exists
    if not os.path.exists(series_path):
        continue

    # read only the two columns that are aggregated below
    df_series = pd.read_parquet(series_path, engine='pyarrow', columns=['enmo', 'light'])

    # compute the row selector once and reuse it for all four assignments
    row_mask = df_train['id'] == participant_id
    df_train.loc[row_mask, 'enmo_mean'] = df_series['enmo'].mean()
    df_train.loc[row_mask, 'enmo_std'] = df_series['enmo'].std()
    df_train.loc[row_mask, 'light_mean'] = df_series['light'].mean()
    df_train.loc[row_mask, 'light_std'] = df_series['light'].std()

In [20]:
# Design matrix: keep only the columns that df_train shares with df_test.
# Target vector: the 'sii' column of df_train.
# NOTE(review): any column added to df_train only (e.g. the enmo_*/light_*
# summary features) is not in df_test and therefore appears to be dropped
# here as well - confirm whether that is intended.
columns_not_in_test = list(set(df_train.columns) - set(df_test.columns))
X = df_train.drop(columns=columns_not_in_test)
y = df_train['sii']

In [21]:
# Group the predictors by the 'Type' that the data dictionary declares for each
# 'Field', keeping only the fields that are actually columns of X.
numerical_features = [
    field
    for field in df_dict.loc[df_dict['Type'].isin(['float', 'int']), 'Field'].values
    if field in X.columns
]

categorical_features = [
    field
    for field in df_dict.loc[df_dict['Type'].isin(['str', 'categorical int']), 'Field'].values
    if field in X.columns
]

Preprocessing required by each candidate model:

- **Logistic regression**: standardization + imputation of missing values + one-hot encoding
- **Random forest**: imputation of missing values + one-hot encoding
- **Histogram-based Gradient Boosting**: no preprocessing needed
- **XGBoost**: no preprocessing needed

In [None]:
# preprocessing pipelines

# Numerical columns for Logistic Regression: mean-impute, then standardize.
numerical_transformer_lr = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

# Numerical columns for Random Forest: mean-impute only (no scaling step).
numerical_transformer_rf = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean'))
])

# Categorical columns: fill gaps with the most frequent value, then one-hot
# encode; categories unseen during fit are ignored at transform time.
# `fill_value` is intentionally not passed: SimpleImputer only uses it when
# strategy='constant', so it had no effect alongside 'most_frequent'.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

In [None]:
# Column-wise preprocessing for Logistic Regression:
# scaled numerical pipeline + shared categorical pipeline.
preprocessor_lr = ColumnTransformer([
    ('num', numerical_transformer_lr, numerical_features),
    ('cat', categorical_transformer, categorical_features),
])

# Column-wise preprocessing for Random Forest:
# impute-only numerical pipeline + shared categorical pipeline.
preprocessor_rf = ColumnTransformer([
    ('num', numerical_transformer_rf, numerical_features),
    ('cat', categorical_transformer, categorical_features),
])

<div class="alert alert-block alert-danger">
We need to find a better way to handle missing values!
</div>

In [None]:
# evaluation function
def evaluate_model(model, X, y, weights=None):
    """Evaluate a fitted model on the given data.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``predict(X)`` and ``score(X, y)``.
    X : array-like
        Feature matrix passed to ``predict`` and ``score``.
    y : array-like
        True labels compared against the predictions.
    weights : {None, 'linear', 'quadratic'}, optional
        Forwarded to ``cohen_kappa_score``. The default ``None`` gives the
        unweighted kappa, which is the original behaviour of this function.

    Returns
    -------
    tuple
        ``(kappa, mean_accuracy)`` where ``kappa`` is Cohen's kappa between
        ``y`` and the predictions and ``mean_accuracy`` is ``model.score(X, y)``.
    """
    y_pred = model.predict(X)
    kappa = cohen_kappa_score(y, y_pred, weights=weights)
    # score() is the estimator's own default metric; for scikit-learn
    # classifiers that is the mean accuracy.
    mean_accuracy = model.score(X, y)

    return kappa, mean_accuracy

<div class="alert alert-block alert-info">
    <h4>Plan for the future:</h4>
    <ul>
        <li>Find a way to balance the dataset</li>
        <li>Better handle missing values</li>
        <li>Extract new features from time series data</li>
        <li>Estimate feature importance and do feature selection</li>
        <li>Also take the unlabelled data into account
            <ul>
                <li>First apply a semi-supervised method (e.g., Label Propagation or Label Spreading), then train a supervised model with the labelled data</li>
            </ul>
        </li>
    </ul>
</div>