In [1]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from utils import load_tabular_data, add_series_features, evaluate_model

In [2]:
# Load the train table, the test table and the data dictionary in one call;
# the helper's three return values are unpacked in that order.
df_train, df_test, data_dict = load_tabular_data(
    'train.csv',
    'test.csv',
    'data_dictionary.csv',
)

In [3]:
# Augment both tables with features derived from their matching series files.
# NOTE(review): re-running this cell feeds the already-augmented frames back
# into add_series_features — confirm the helper tolerates that.
df_train, df_test = (
    add_series_features(df_train, 'series_train.parquet'),
    add_series_features(df_test, 'series_test.parquet'),
)

In [20]:
# Build the feature matrix X from the train columns that also exist in the
# test table, and take the `sii` column of the train table as the target y.
columns_not_in_test = list(set(df_train.columns) - set(df_test.columns))
X = df_train.drop(columns=columns_not_in_test)
y = df_train['sii']

In [None]:
# Split the columns of X into numerical and categorical groups according to
# the 'Type' that the data dictionary declares for each 'Field'.
# Fields listed in the dictionary but absent from X are skipped.
numerical_features = [
    field
    for field in data_dict.loc[data_dict['Type'].isin(['float', 'int']), 'Field']
    if field in X.columns
]

categorical_features = [
    field
    for field in data_dict.loc[data_dict['Type'].isin(['str', 'categorical int']), 'Field']
    if field in X.columns
]

Preprocessing applied for each model:

- **Logistic regression**: imputation of missing values + standardization + one-hot encoding
- **Random forest**: imputation of missing values + one-hot encoding
- **Histogram-based Gradient Boosting**: no preprocessing needed
- **XGBoost**: no preprocessing needed

In [None]:
# Numerical preprocessing pipelines: both variants replace missing values with
# the column mean; only the logistic-regression variant also standardizes.
numerical_transformer_lr = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])
numerical_transformer_rf = Pipeline([('imputer', SimpleImputer(strategy='mean'))])

# Categorical preprocessing pipeline: replace missing entries with each
# column's most frequent value, then one-hot encode.
# No `fill_value` is passed: SimpleImputer only uses it with
# strategy='constant', so with 'most_frequent' it would have no effect.
# handle_unknown='ignore' keeps transform from failing on categories that
# were not seen during fit.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

In [None]:
# Column-wise preprocessors. Both share the categorical branch and differ only
# in the numerical branch.
# NOTE(review): only columns listed in numerical_features / categorical_features
# are routed to a transformer — confirm how the remaining columns of X should
# be treated (see ColumnTransformer's `remainder` option).

# preprocessor for Logistic Regression
preprocessor_lr = ColumnTransformer([
    ('num', numerical_transformer_lr, numerical_features),
    ('cat', categorical_transformer, categorical_features),
])

# preprocessor for Random Forest
preprocessor_rf = ColumnTransformer([
    ('num', numerical_transformer_rf, numerical_features),
    ('cat', categorical_transformer, categorical_features),
])

<div class="alert alert-block alert-danger">
The current mean / most-frequent imputation is only a stopgap; we need to find a better way to handle missing values!
</div>

<div class="alert alert-block alert-info">
    <h4>Plan for the future:</h4>
    <ul>
        <li>Balance the dataset</li>
        <li>Better handle missing values</li>
        <li>Extract new features from time series data</li>
        <li>Estimate feature importance and do feature selection</li>
        <li>Also take the unlabelled data into account
            <ul>
                <li>First apply a semi-supervised method (e.g., Label Propagation or Label Spreading), then train a supervised model with the labelled data</li>
            </ul>
        </li>
    </ul>
</div>