In [1]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from src.utils import *
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.semi_supervised import LabelPropagation
from sklearn.preprocessing import OneHotEncoder


In [2]:
# Read the training table, the Kaggle test table and the data dictionary
# through the project loader (one path per returned object).
df_train, df_test_kaggle, data_dict = load_tabular_data(
    'data/train.csv',
    'data/test.csv',
    'data/data_dictionary.csv',
)

### Add features extracted from time series data
(for more information, see the notebook ...)

In [3]:
# Extend both tables with the features computed from their time-series
# parquet files; the training table is processed first, then the Kaggle test table.
df_train = add_series_features(
    df_train,
    'data/series_train.parquet',
)
df_test_kaggle = add_series_features(
    df_test_kaggle,
    'data/series_test.parquet',
)

### Split the data into a training set and a private (held-out) test set

In [4]:
# Target vector. The full frame (still including the 'sii' column) is passed
# as the feature table, so X_train / X_test keep the target column as well.
y = df_train['sii']

# 80/20 split with a fixed seed so the partition is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df_train,
    y,
    test_size=0.2,
    random_state=42,
)

### Outlier detection
(for more information, see the notebook ...)

In [5]:
# Outlier cleaning for the training split: implausible measurements are set to
# NaN so they are handled as missing values by the imputation cell that follows.
# Every rule below targets X_train; the split has already happened, so changes
# made to df_train at this point would not reach the training data.

# Work on an explicit copy so the assignments below cannot write into (or warn
# about) a frame derived from the split result.
X_train = X_train.copy()

# CGAS score should be between 0 and 100
cgas_score = X_train['CGAS-CGAS_Score']
X_train.loc[(cgas_score < 0) | (cgas_score > 100), 'CGAS-CGAS_Score'] = np.nan

# All columns whose name contains 'Physical'
physical_features = X_train.columns[X_train.columns.str.contains('Physical')]
# Replace 0 values for physical features with NaN
X_train[physical_features] = X_train[physical_features].replace(0, np.nan)

# Replace values where systolic blood pressure is less than diastolic blood pressure with NaN.
# Both readings are discarded because it is unknown which of the two is wrong.
condition = X_train['Physical-Systolic_BP'] < X_train['Physical-Diastolic_BP']
X_train.loc[condition, ['Physical-Systolic_BP', 'Physical-Diastolic_BP']] = np.nan

### Data imputation
- For numerical features, missing values are filled with a KNN imputer.
- For categorical features, missing values are filled with the most frequent value in the column.

In [6]:
# Fill missing values in the training split with the project helper, which is
# given the path to the data dictionary. Per the notes above this cell:
# KNN imputation for numerical fields, most frequent value for categorical fields.
# NOTE(review): only X_train is imputed here; X_test is not passed through this
# step anywhere in the visible cells — confirm that is intended.
X_train = impute_tabdata(X_train, 'data/data_dictionary.csv')

### One-hot encoding for categorical features

In [7]:
# Rows of the data dictionary whose type marks the field as categorical.
is_categorical = data_dict['Type'].isin(['str', 'categorical int'])

# Columns present in the training table but absent from the Kaggle test table.
# Kept as a list because a later cell edits it.
columns_not_in_test = list(set(df_train.columns) - set(df_test_kaggle.columns))

# Categorical fields that are also available at test time.
categorical_features = [
    field
    for field in data_dict.loc[is_categorical, 'Field']
    if field not in columns_not_in_test
]

In [8]:
# Fit the one-hot encoder on the training split's categorical columns.
# Dense output is requested so the result can be wrapped in a DataFrame;
# categories unseen during fit are ignored at transform time.
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
encoded_features = encoder.fit_transform(X_train[categorical_features])
encoded_df = pd.DataFrame(
    encoded_features,
    columns=encoder.get_feature_names_out(categorical_features),
)

# Swap the raw categorical columns for their encoded counterparts. Both parts
# get a fresh positional index so the column-wise concat lines rows up by position.
X_train = pd.concat(
    [
        X_train.drop(columns=categorical_features).reset_index(drop=True),
        encoded_df.reset_index(drop=True),
    ],
    axis=1,
)


### Label propagation

In [None]:
# The target 'sii' exists only in the training table, but it must stay in
# X_train for the label-propagation helper, so it is taken out of the drop list.
# The list is rebuilt instead of mutated with .remove(), which would raise
# ValueError when this cell is executed a second time.
columns_not_in_test = [column for column in columns_not_in_test if column != 'sii']

# Drop the remaining train-only columns. Only columns still present are
# dropped, so re-running the cell does not raise KeyError.
X_train = X_train.drop(
    columns=[column for column in columns_not_in_test if column in X_train.columns]
)

# NOTE(review): presumably fills in missing 'sii' labels using the given
# LabelPropagation estimator — confirm against src.utils.label_propagation.
X_train = label_propagation(X_train, LabelPropagation())

### Oversampling

In [None]:
# Replace the training frame with the output of the project's oversampling helper.
# NOTE(review): presumably balances the 'sii' classes by adding rows — confirm
# against src.utils.oversample_tabdata. Only X_train is rebound here; y_train
# is left as produced by the earlier split.
X_train = oversample_tabdata(X_train)

In [None]:
# Build the feature table and target vector from the full training table.
# Train-only columns listed in columns_not_in_test are removed from the features.
X = df_train.drop(columns=columns_not_in_test)
y = df_train['sii']

In [None]:
# Split the columns of X into numerical and categorical groups according to
# the 'Type' recorded for each 'Field' in the data dictionary.
field_types = data_dict['Type']

numerical_features = [
    field
    for field in data_dict.loc[field_types.isin(['float', 'int']), 'Field']
    if field in X.columns
]

categorical_features = [
    field
    for field in data_dict.loc[field_types.isin(['str', 'categorical int']), 'Field']
    if field in X.columns
]

### Model

In [None]:
# Four-class gradient-boosted classifier: many shallow trees (depth 2) with a
# small learning rate, row/column subsampling and a light L1 penalty.
xgb_model = xgb.XGBClassifier(objective='multi:softmax', 
                              eval_metric='mlogloss', 
                              num_class=4, 
                              learning_rate=0.01, 
                              max_depth=2, 
                              min_child_weight=0, 
                              gamma=0, 
                              subsample=0.7, 
                              colsample_bytree=0.55,
                              reg_alpha=1e-5,
                              n_estimators=5000)

# The processed training frame deliberately kept the target column 'sii'
# through label propagation and oversampling. It must not be fed to the model
# as a feature (target leakage), and it is the label vector that matches the
# processed rows; y_train from the original split predates those steps.
TARGET_COLUMN = 'sii'
if TARGET_COLUMN in X_train.columns:
    train_target = X_train[TARGET_COLUMN]
    train_features = X_train.drop(columns=[TARGET_COLUMN])
else:
    # The helpers already separated the target: fall back to the split labels.
    train_features, train_target = X_train, y_train

xgb_model.fit(train_features, train_target)

### Evaluation

In [None]:
# Score the fitted model on the held-out split. The result is indexed
# positionally: element 0 is reported as kappa, element 1 as accuracy.
# It is stored under a name that does not shadow the built-in `eval`.
# NOTE(review): X_test has not gone through the outlier cleaning, imputation,
# one-hot encoding or column drops applied to X_train in the cells above —
# confirm that evaluate_model prepares it, otherwise the feature sets differ.
eval_scores = evaluate_model(xgb_model, X_test, y_test)
print(f'XGB model accuracy: {eval_scores[1]}, kappa: {eval_scores[0]}')