In [1]:
import sys
import os
# Make the sibling ``src`` directory importable (presumably where the local
# ``utils`` module lives -- verify against the repo layout). The path is
# resolved relative to the current working directory, so this assumes the
# kernel was started from the notebook's own folder.
# The membership check keeps the cell idempotent: re-running it does not
# pile up duplicate entries in ``sys.path``.
_src_dir = os.path.abspath(os.path.join('..', 'src'))
if _src_dir not in sys.path:
    sys.path.append(_src_dir)

In [2]:
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from utils import load_tabular_data, evaluate_model


In [3]:
# Load the train split, the test split and the data dictionary through the
# project helper; the three paths are passed positionally in that order.
df_train, df_test, data_dict = load_tabular_data(
    "../data/baseline_train.csv",
    "../data/baseline_test.csv",
    "../data/data_dictionary.csv",
)
# Cell output: number of rows in each split, as a (train, test) tuple.
tuple(len(split) for split in (df_train, df_test))

(2188, 548)

In [5]:
# Separate the target column ("sii") from the remaining feature columns,
# once per split. ``drop`` returns a new frame, so df_train / df_test are
# left untouched.
X_train, y_train = df_train.drop(columns="sii"), df_train["sii"]
X_test, y_test = df_test.drop(columns="sii"), df_test["sii"]

In [6]:
# Build the numerical / categorical feature lists from the data dictionary:
# select the 'Field' names whose declared 'Type' matches, and keep only the
# ones that actually exist as columns in X_train.
numerical_features = [
    field
    for field in data_dict.loc[data_dict['Type'].isin(['float', 'int']), 'Field'].values
    if field in X_train.columns
]

categorical_features = [
    field
    for field in data_dict.loc[data_dict['Type'].isin(['str', 'categorical int']), 'Field'].values
    if field in X_train.columns
]

In [8]:
# Preprocessing sub-pipelines, one per feature kind.
# NOTE(review): the ``_rf`` suffix looks like a leftover from another
# notebook; the name is kept because later cells reference it.

# Numerical features: replace missing values with the column mean, then
# standardize.
numerical_transformer_rf = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Categorical features: replace missing values with the most frequent value,
# then one-hot encode. No ``fill_value`` is passed: SimpleImputer only honors
# it with strategy='constant', so combining it with 'most_frequent' has no
# effect and merely suggests a "missing" category that is never created.
# handle_unknown='ignore' keeps transform from raising on categories that
# were not seen while fitting.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

In [9]:
# Column-wise preprocessor feeding the Logistic Regression model: the
# numerical pipeline is applied to `numerical_features`, the categorical
# pipeline to `categorical_features`.
# NOTE(review): no `remainder` is given, so columns in neither list fall
# under ColumnTransformer's default handling -- confirm that is intended.
preprocessor_rf = ColumnTransformer(transformers=[
    ('num', numerical_transformer_rf, numerical_features),
    ('cat', categorical_transformer, categorical_features),
])


In [10]:
# Fit the preprocessor on the training features only and transform them in
# the same call; the test features are then transformed with that already
# fitted preprocessor, so nothing is learned from the test split.
# Order matters: the fit on the first line must run before the transform below.
X_train_preprocessed = preprocessor_rf.fit_transform(X_train)
X_test_preprocessed = preprocessor_rf.transform(X_test)

In [15]:
# Baseline classifier: logistic regression, with the iteration cap raised to
# 1000, trained on the preprocessed training features.
model = LogisticRegression(max_iter=1000)
# Left as a bare expression so the fitted estimator is shown as cell output.
model.fit(X=X_train_preprocessed, y=y_train)

In [16]:
# Score the fitted model on the held-out test split via the project helper.
# NOTE(review): `evaluate_model` is defined in `utils`, outside this
# notebook; judging by the names it is unpacked into, it presumably returns
# (quadratic weighted kappa, accuracy) -- confirm against its definition.
kappa, mean_accuracy = evaluate_model(
    model,
    X_test_preprocessed,
    y_test,
)
print(f"Quadratic weighted kappa: {kappa}, Accuracy: {mean_accuracy}")


Quadratic weighted kappa: 0.3169906586034327, Accuracy: 0.6003649635036497
