In [1]:
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from utils import load_tabular_data, evaluate_model


In [2]:
df_train, df_test, data_dict = load_tabular_data("baseline_train.csv", "baseline_test.csv", "data_dictionary.csv")
len(df_train), len(df_test)

(2188, 548)

In [3]:
y_train = df_train["sii"]
X_train = df_train.drop(columns="sii")

y_test = df_test["sii"]
X_test = df_test.drop(columns="sii")

print(y_train.shape, X_train.shape, y_test.shape, X_test.shape)

(2188,) (2188, 64) (548,) (548, 64)


In [4]:
X_train.dtypes

Unnamed: 0                                  int64
id                                         object
Basic_Demos-Enroll_Season                  object
Basic_Demos-Age                             int64
Basic_Demos-Sex                             int64
                                           ...   
PreInt_EduHx-computerinternet_hoursday    float64
enmo_mean                                 float64
enmo_std                                  float64
light_mean                                float64
light_std                                 float64
Length: 64, dtype: object

In [5]:
# get categorical and numerical columns
numerical_features = data_dict[(data_dict['Type'] == 'float') | (data_dict['Type'] == 'int')]['Field'].values
numerical_features = [feature for feature in numerical_features if feature in X_train.columns]

categorical_features = data_dict[(data_dict['Type'] == 'str') | (data_dict['Type'] == 'categorical int')]['Field'].values
categorical_features = [feature for feature in categorical_features if feature in X_train.columns]

In [6]:
# preprocessing pipelines
numerical_transformer_rf = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean'))
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

In [7]:
# preprocessor for Random Forest
preprocessor_rf = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer_rf, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ])

In [None]:
# add Random Forest to pipeline and train

clf_rf = RandomForestClassifier(random_state=111) # default parameters

clf_pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor_rf),
    ("classifier", clf_rf)
])

clf_pipeline.fit(X_train, y_train)

In [10]:
# predict and evaluate model
kappa, mean_accuracy = evaluate_model(clf_pipeline, X_test, y_test)
print(f"Quadratic weighted kappa: {kappa}, Accuracy: {mean_accuracy}")

Quadratic weighted kappa: 0.15217736643989366, Accuracy: 0.583941605839416
