In [1]:
# env setting
import sys
sys.path.append("../src")
sys.path.append("../models")

import numpy as np
import pandas as pd

from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
# from sklearn.pipeline import Pipeline
from sklearn.impute import KNNImputer

# model import
from sklearn.ensemble import GradientBoostingClassifier, HistGradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

#custom function
import config
import helpers
from custom_pipeline import Custom_Pipeline

In [2]:
(train, test, origin, submission) = helpers.data_loader()
train.head(5)

train = train.replace('None', np.NaN)
test = test.replace('None', np.NaN)
origin = origin.replace('None', np.NaN)

categorical_features = config.CATEGORICAL_FEATURES
# categorical_features.remove('lesion_2')
# categorical_features.remove('lesion_3')
target = 'outcome'

numerical_features = list(set(train.columns) - set(categorical_features) - set(config.USELESS_FEATURES) - set([target]))

train = pd.concat(
    [train, origin], ignore_index=True
)
train = train.drop_duplicates()

print(train.shape)
print(test.shape)

(1534, 29)
(824, 28)


In [3]:
X_tr = train.copy()
X_test = test.copy()

train['outcome'] = train['outcome'].map({
    'died' : 0,
    'euthanized' : 1,
    'lived' : 2
})

y = train.outcome

USECOLS = categorical_features
DROPCOLS = ['lesion_2', 'lesion_3', 'id']
ALPHA = 0.5

pipe = Custom_Pipeline(X_tr, y)
X_tr = pipe.fit_transform(USECOLS, ALPHA, DROPCOLS)
print(X_tr.shape)

(1534, 27)


In [4]:
X_test = pipe.transform(X_test, USECOLS, ALPHA, DROPCOLS)
print(X_test)

     rectal_temp  pulse  respiratory_rate  nasogastric_reflux_ph  \
0           38.6   40.0              20.0                    7.0   
1           38.2  112.0              48.0                    3.5   
2           37.7   66.0              12.0                    3.0   
3           37.1   88.0              20.0                    2.0   
4           38.3   50.0              12.0                    3.0   
..           ...    ...               ...                    ...   
819         40.3  114.0              36.0                    7.0   
820         37.2  100.0              20.0                    2.0   
821         39.2  132.0              12.0                    6.5   
822         38.3   54.0              66.0                    7.0   
823         38.1   66.0              12.0                    2.0   

     packed_cell_volume  total_protein  abdomo_protein  lesion_1  \
0                  42.0            7.5             2.3       0.0   
1                  44.0            6.0         

In [5]:
categorical_features.append('treated_more_than_once')
categorical_features.append('number_of_treatements')

### Feature Importances

In [4]:
from sklearn.feature_selection import mutual_info_classif

info_model = mutual_info_classif(X_tr, y,
                                 n_neighbors=11, 
                                 random_state=42)

(pd.DataFrame({'name' : X_tr.columns, 
              'values' : info_model})
 .sort_values('values', ascending=False))

Unnamed: 0,name,values
10,hospital_number,0.267157
7,lesion_1,0.205531
1,pulse,0.171348
5,total_protein,0.162192
6,abdomo_protein,0.156134
15,pain,0.153754
4,packed_cell_volume,0.14429
3,nasogastric_reflux_ph,0.132677
22,abdomo_appearance,0.091182
8,number_of_treatements,0.083776


In [5]:
chi_test_df = X_tr.copy()
chi_test_df['outcome'] = y

for i in categorical_features:
    helpers.chi_squared_test(chi_test_df, i, 'outcome')

[32mhospital_number has a significant relationship with the target variable.[0m
[32mtemp_of_extremities has a significant relationship with the target variable.[0m
[32mperipheral_pulse has a significant relationship with the target variable.[0m
[32mmucous_membrane has a significant relationship with the target variable.[0m
[32mcapillary_refill_time has a significant relationship with the target variable.[0m
[32mpain has a significant relationship with the target variable.[0m
[32mperistalsis has a significant relationship with the target variable.[0m
[32mabdominal_distention has a significant relationship with the target variable.[0m
[32mnasogastric_tube has a significant relationship with the target variable.[0m
[32mnasogastric_reflux has a significant relationship with the target variable.[0m
[32mrectal_exam_feces has a significant relationship with the target variable.[0m
[32mabdomen has a significant relationship with the target variable.[0m
[32mabdomo_appea

In [9]:
from lightgbm import LGBMClassifier

X_train, X_val, y_train, y_val = train_test_split(X_tr, y, test_size=0.2, random_state=42)


lgbm = LGBMClassifier()
folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
lgbm.fit(X_train, y_train)

scores = cross_val_score(lgbm, X_train, y_train,
                         cv=folds, scoring='f1_micro',
                         n_jobs=-1)

y_pred = lgbm.predict(X_val)
val_score = f1_score(y_true=y_val, y_pred=y_pred, average='micro')

print("==== TRAIN CV SCORE ====")
print("mean score: ", np.mean(scores))
print(scores)

print("==== Valid Score ==== ")
print("score: ", val_score)

# print("==== ReTrain CV Score ====")
# lgbm.fit(X_tr, y)
# scores = cross_val_score(lgbm, X_tr, y,
#                          cv=folds, scoring='f1_micro',
#                          n_jobs=-1)
# print("mean score: ", np.mean(scores))
# print(scores)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000759 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 968
[LightGBM] [Info] Number of data points in the train set: 1227, number of used features: 27
[LightGBM] [Info] Start training from score -1.146181
[LightGBM] [Info] Start training from score -1.631689
[LightGBM] [Info] Start training from score -0.720410
==== TRAIN CV SCORE ====
mean score:  0.7506122448979591
[0.78455285 0.71544715 0.71836735 0.75510204 0.77959184]
==== Valid Score ==== 
score:  0.7491856677524429
==== ReTrain CV Score ====
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000803 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1059
[LightGBM] [Info] Number of data points in the train set: 1534, number of used features: 27
[LightGBM] [Info] Start training from score -1.147370
[LightGBM] [Info] 

In [12]:
feature_imp = pd.DataFrame({'feature': lgbm.feature_name_, 'Importance': lgbm.feature_importances_}) 
feature_imp = feature_imp.sort_values('Importance', ascending=False)
feature_imp

Unnamed: 0,feature,Importance
7,lesion_1,868
10,hospital_number,799
5,total_protein,665
1,pulse,587
4,packed_cell_volume,539
2,respiratory_rate,501
6,abdomo_protein,463
9,deviation_from_normal_temp,416
0,rectal_temp,396
8,number_of_treatements,395


### Optimize Model

In [6]:
import warnings
warnings.filterwarnings('ignore')

import json, joblib, optuna

In [None]:
def objective(trial, data=X_tr, target=y):
    x_train, x_val, y_train, y_val = train_test_split(data, target, test_size=0.18,random_state=42)
    
    param = {
    'eval_metric': ['merror', 'mlogloss'], 
    'n_estimators': 1000,
    'objective':'multi:softmax',
    'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
    'reg_alpha': trial.suggest_float('reg_alpha', 1e-3, 10.0, log=True),
    'reg_lambda': trial.suggest_float('reg_lambda', 1e-3, 10.0, log=True),
    'gamma': trial.suggest_int('gamma', 1, 20),
    'colsample_bytree': trial.suggest_categorical('colsample_bytree', [0.3,0.4,0.5,0.6,0.7,0.8,0.9, 1.0]),
    'colsample_bynode': trial.suggest_categorical('colsample_bynode', [0.3,0.4,0.5,0.6,0.7,0.8,0.9, 1.0]),
    'colsample_bylevel': trial.suggest_categorical('colsample_bylevel', [0.3,0.4,0.5,0.6,0.7,0.8,0.9, 1.0]),
    'subsample': trial.suggest_discrete_uniform('subsample', 0.5, 1.0, 0.05),
    'learning_rate': trial.suggest_categorical('learning_rate', [0.002, 0.004, 0.006,0.008,0.01,0.014,0.017,0.02]),
    'max_depth': trial.suggest_int('max_depth',10,60 ),
    'max_leaves' : trial.suggest_int('max_leaves', 1, 1000)
    #'min_child_samples': trial.suggest_int('min_child_samples', 1, 300)
 
}

    model = XGBClassifier(**param)  
    
    model.fit(x_train,y_train,eval_set=[(x_val, y_val)],verbose=0)   
    preds = model.predict(x_val)   
    score = f1_score(y_val, preds, average='micro')
    
    return score

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=200)
print('Number of finished trials:', len(study.trials))
print('Best trial:', study.best_trial.params)

In [None]:
study.best_trial.params

{'min_child_weight': 4,
 'reg_alpha': 0.11188324348736123,
 'reg_lambda': 0.0907372061074145,
 'gamma': 1,
 'colsample_bytree': 0.3,
 'colsample_bynode': 0.4,
 'colsample_bylevel': 1.0,
 'subsample': 0.95,
 'learning_rate': 0.017,
 'max_depth': 27,
 'max_leaves': 902}

In [18]:
# kaggle ver
xgb_params = {'min_child_weight': 5,
 'reg_alpha': 0.014425096788083052,
 'reg_lambda': 0.012345176750382126,
 'gamma': 1,
 'colsample_bytree': 0.5,
 'colsample_bynode': 0.7,
 'colsample_bylevel': 0.7,
 'subsample': 0.95,
 'learning_rate': 0.017,
 'max_depth': 15,
 'max_leaves': 366}

model = XGBClassifier(**xgb_params) 
folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

scores = cross_val_score(model, X_tr, y,
                         cv=folds, scoring='f1_micro',
                         n_jobs=-1)

print(scores)
print(np.mean(scores))

[0.77850163 0.752443   0.76547231 0.76872964 0.74509804]
0.7620489238040493


In [None]:
def objective(trial, data=X_tr, target=y):
    x_train, x_val, y_train, y_val = train_test_split(data, target, test_size=0.18,random_state=42)
    
    params = {
        'eval_metric': 'multi_logloss', 
        'n_estimators': 1000,
        'objective':'multiclass',
        'boosting_type' : 'gbdt',  
        "lambda_l1": trial.suggest_float("lambda_l1", 1e-8, 10.0, log=True),
        "lambda_l2": trial.suggest_float("lambda_l2", 1e-8, 10.0, log=True),
        "num_leaves": trial.suggest_int("num_leaves", 2, 256),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.4, 1.0),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.4, 1.0),
        "bagging_freq": trial.suggest_int("bagging_freq", 1, 7),
        "min_child_samples": trial.suggest_int("min_child_samples", 5, 100),
        'subsample': trial.suggest_discrete_uniform('subsample', 0.5, 1.0, 0.05),
        'learning_rate': trial.suggest_categorical('learning_rate', [0.002, 0.004, 0.006,0.008,0.01,0.014,0.017,0.02]),
        'max_depth': trial.suggest_int('max_depth',10,60 ),
        'max_leaves' : trial.suggest_int('max_leaves', 1, 1000)
    }
    
    model = LGBMClassifier(**params)
    
    model.fit(x_train,y_train,eval_set=[(x_val, y_val)])   
    preds = model.predict(x_val)  
    score = f1_score(y_val, preds, average='micro')
    
    return score

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)
print('Number of finished trials:', len(study.trials))
print('Best trial:', study.best_trial.params)

In [8]:
study.best_trial.params

{'lambda_l1': 0.010514016310020503,
 'lambda_l2': 0.0049288944161536225,
 'num_leaves': 215,
 'feature_fraction': 0.7036721838116332,
 'bagging_fraction': 0.7718262291600653,
 'bagging_freq': 5,
 'min_child_samples': 62,
 'subsample': 0.8500000000000001,
 'learning_rate': 0.008,
 'max_depth': 29,
 'max_leaves': 278}

In [29]:
x_train, x_val, y_train, y_val = train_test_split(X_tr, y, test_size=0.18,random_state=42)

model = HistGradientBoostingClassifier(
    max_depth=4,          
    max_iter=80,         
    learning_rate=0.1,     
    random_state=42,   
    scoring='f1_micro',          
    max_leaf_nodes = 21,
    l2_regularization = 0.1,
)

model.fit(x_train,y_train)   
preds = model.predict(x_val)  
score = f1_score(y_val, preds, average='micro')
print(score)

0.7725631768953068


In [12]:
model.fit(X_tr, y)
prediction = model.predict(X_test)
decode_map = {
    0 : 'died',
    1 : 'euthanized',
    2 : 'lived'
}

sample_submission = pd.read_csv(config.SUBMISSION_FILE)
sample_submission['outcome'] = prediction
sample_submission['outcome'] = sample_submission['outcome'].map(decode_map)
sample_submission.to_csv("../output/sample_submission_V4.csv", index=False)
sample_submission

Unnamed: 0,id,outcome
0,1235,lived
1,1236,died
2,1237,lived
3,1238,euthanized
4,1239,lived
...,...,...
819,2054,died
820,2055,euthanized
821,2056,died
822,2057,lived


### Ensemble

In [6]:
from sklearn.ensemble import VotingClassifier
#create a dictionary of our models
estimators = [
    ('xgb', XGBClassifier(**config.XGB_PARAMS)),
    ('lgbm' , LGBMClassifier(**config.LGBM_PARAMS)),
    ('hgb', HistGradientBoostingClassifier(**config.HGB_PARAMS))
]

#create our voting classifier, inputting our models
ensemble = VotingClassifier(estimators, voting='soft', weights = [1.15176, 0.37643, 1.20592])

x_train, x_val, y_train, y_val = train_test_split(X_tr, y, test_size=0.1, stratify=y, random_state=42)
folds = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)

scores = cross_val_score(ensemble, x_train, y_train,
                         cv=folds,
                         scoring='f1_micro',
                         n_jobs=-1)

print(scores)
print(np.mean(scores))

#fit model to training data
ensemble.fit(x_train, y_train)
prediction = ensemble.predict(x_val)
#test our model on the test data
print(f1_score(y_val, prediction, average='micro'))

[0.7826087  0.75724638 0.79347826 0.78623188 0.72826087]
0.7695652173913043
0.7272727272727273


In [12]:

for i, (train_index, test_index) in enumerate(folds.split(X_tr, y)):
    print(i)
    if i == 4:
        print(f"Fold {i}:")
        print(f"  Train: index={len(train_index)}")
        print(f"  Test:  index={len(test_index)}")
        break

0
1
2
3
4
Fold 4:
  Train: index=1228
  Test:  index=[   3    9   11   14   18   19   27   35   37   41   51   61   64   69
   74   84   86   88   94   98  100  101  103  109  112  113  116  122
  123  132  133  136  148  152  162  166  169  170  180  187  189  192
  195  197  203  214  218  220  223  231  233  240  251  256  258  265
  270  282  283  287  288  301  306  307  309  310  312  313  319  323
  326  346  359  364  365  372  373  378  379  380  390  394  404  411
  416  418  429  434  441  443  447  448  454  455  467  470  475  479
  485  491  495  500  503  513  516  519  525  529  537  545  555  562
  564  565  566  571  572  573  577  579  596  604  608  629  633  635
  639  640  646  653  655  667  669  686  696  697  704  708  715  720
  731  732  734  735  743  746  753  765  767  776  778  780  791  805
  808  809  812  823  828  834  838  839  841  864  876  882  883  884
  885  891  896  903  918  923  928  929  931  934  937  941  951  961
  962  967  969  979  98

In [9]:
X_tr.iloc[train_index, :]

Unnamed: 0,rectal_temp,pulse,respiratory_rate,nasogastric_reflux_ph,packed_cell_volume,total_protein,abdomo_protein,lesion_1,number_of_treatements,deviation_from_normal_temp,...,abdominal_distention,nasogastric_tube,nasogastric_reflux,rectal_exam_feces,abdomen,abdomo_appearance,surgery,age,surgical_lesion,cp_data
0,38.100000,132.0,24.0,6.500000,57.0,8.500000,3.400000,2209.0,16.0,0.300000,...,1.364901,1.214189,1.150167,1.302797,1.070510,0.829657,1.075008,1.209047,1.020604,1.169493
1,37.500000,88.0,12.0,2.000000,33.0,64.000000,2.000000,2208.0,10.0,0.300000,...,0.952374,1.127182,1.094893,0.931897,1.070510,0.829657,1.075008,1.209047,1.020604,1.169493
2,38.300000,120.0,28.0,3.500000,37.0,6.400000,3.400000,5124.0,2.0,0.500000,...,0.952374,1.214189,1.235535,1.219908,1.059533,0.829657,1.075008,1.209047,1.020604,1.169493
4,38.000000,52.0,48.0,7.000000,47.0,7.300000,2.600000,0.0,7.0,0.200000,...,1.543433,1.214189,1.150167,1.417162,1.355710,1.300451,1.396073,1.209047,1.582999,1.176009
5,38.100000,56.0,32.0,5.000000,49.0,8.000000,2.800000,0.0,20.0,0.300000,...,1.543433,1.127182,1.094893,1.302797,1.788035,1.300451,1.396073,1.209047,1.582999,1.176009
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1527,37.763636,78.0,24.0,1.954545,43.0,62.000000,2.000000,2209.0,4.0,0.527273,...,1.064882,1.214189,1.235535,1.009730,1.070510,1.012251,1.075008,1.209047,1.582999,1.169493
1528,38.500000,40.0,16.0,1.954545,37.0,67.000000,1.390909,0.0,5.0,0.700000,...,1.543433,1.153532,1.176632,1.302797,1.845934,1.474432,1.396073,1.209047,1.582999,1.169493
1530,37.200000,72.0,24.0,3.681818,44.0,24.318182,3.300000,2208.0,6.0,0.600000,...,0.952374,1.068999,1.235535,0.931897,1.070510,0.829657,1.396073,1.209047,1.020604,1.176009
1531,37.500000,72.0,30.0,5.090909,60.0,6.800000,3.281818,3205.0,7.0,0.300000,...,0.952374,1.214189,1.235535,1.302797,1.059533,1.055050,1.075008,1.209047,1.020604,1.169493


In [11]:
y[test_index].value_counts()

2    150
0     97
1     59
Name: outcome, dtype: int64

In [22]:
ensemble.fit(X_tr, y)
prediction = ensemble.predict(X_test)

sample_submission = pd.read_csv(config.SUBMISSION_FILE)
decode_map = {
    0 : 'died',
    1 : 'euthanized',
    2 : 'lived'
}

sample_submission['outcome'] = prediction
sample_submission['outcome'] = sample_submission['outcome'].map(decode_map)
sample_submission.to_csv('../output/sample_submission_V4(ensemble_sklearn).csv',index=False)
sample_submission

Unnamed: 0,id,outcome
0,1235,lived
1,1236,died
2,1237,lived
3,1238,euthanized
4,1239,lived
...,...,...
819,2054,died
820,2055,euthanized
821,2056,died
822,2057,lived


In [26]:
from sklearn.metrics.pairwise import cosine_similarity

ensm = pd.read_csv('../output/sample_submission_V4(ensemble).csv')['outcome'].map({
    'died' : 0,
    'euthanized' : 1,
    'lived' : 2
})

cosine_similarity([ensm], [prediction])

array([[0.96113866]])

In [27]:
bst = pd.read_csv('../output/sample_submission_V1.csv')['outcome'].map({
    'died' : 0,
    'euthanized' : 1,
    'lived' : 2
})

cosine_similarity([bst], [prediction])

array([[0.88239297]])

In [28]:
cosine_similarity([ensm], [bst])

array([[0.87436624]])

In [6]:
from functools import partial
from copy import deepcopy
from sklearn.metrics import log_loss
import gc
import optuna

class Splitter:
    def __init__(self, n_splits=5, test_size=0.2):
        self.n_splits = n_splits
        self.test_size = test_size

    def split_data(self, X, y, random_state_list):
        for random_state in random_state_list:
                kf = StratifiedKFold(n_splits=self.n_splits, random_state=random_state, shuffle=True)
                for train_index, val_index in kf.split(X, y):
                    X_train, X_val = X.iloc[train_index], X.iloc[val_index]
                    y_train, y_val = y.iloc[train_index], y.iloc[val_index]
                    yield X_train, X_val, y_train, y_val
                    
class Classifier:
    def __init__(self, n_estimators=1000, device="cpu", random_state=0):
        self.n_estimators = n_estimators
        self.device = device
        self.random_state = random_state
        self.models = self._define_model()
        self.models_name = list(self._define_model().keys())
        self.len_models = len(self.models)
        
    def _define_model(self):
        
        xgb_params = config.XGB_PARAMS
        lgbm_params = config.LGBM_PARAMS
        hgb_params = config.HGB_PARAMS

        
        models = {
            'xgb': XGBClassifier(**xgb_params),
            'lgbm': LGBMClassifier(**lgbm_params),
            'hgb': HistGradientBoostingClassifier(**hgb_params),
        }

        return models
    
class OptunaWeights:
    def __init__(self, random_state, n_trials=1000):
        self.study = None
        self.weights = None
        self.random_state = random_state
        self.n_trials = n_trials

    def _objective(self, trial, y_true, y_preds):
        # Define the weights for the predictions from each model
        weights = [trial.suggest_float(f"weight{n}", 1e-12, 2) for n in range(len(y_preds))]

        # Calculate the weighted prediction
        weighted_pred = np.average(np.array(y_preds), axis=0, weights=weights)
        
        weighted_pred_labels = np.argmax(weighted_pred, axis=1)
        f1_micro_score = f1_score(y_true, weighted_pred_labels, average='micro')
        return f1_micro_score

    def fit(self, y_true, y_preds):
        optuna.logging.set_verbosity(optuna.logging.ERROR)
        sampler = optuna.samplers.CmaEsSampler(seed=self.random_state)
        pruner = optuna.pruners.HyperbandPruner()
        self.study = optuna.create_study(sampler=sampler, pruner=pruner, study_name="OptunaWeights", direction='maximize')
        objective_partial = partial(self._objective, y_true=y_true, y_preds=y_preds)
        self.study.optimize(objective_partial, n_trials=self.n_trials)
        self.weights = [self.study.best_params[f"weight{n}"] for n in range(len(y_preds))]

    def predict(self, y_preds):
        assert self.weights is not None, 'OptunaWeights error, must be fitted before predict'
        weighted_pred = np.average(np.array(y_preds), axis=0, weights=self.weights)
        return weighted_pred

    def fit_predict(self, y_true, y_preds):
        self.fit(y_true, y_preds)
        return self.predict(y_preds)
    
    def weights(self):
        return self.weights

In [None]:
n_splits = 5
random_state = 42
random_state_list = [42] 
n_estimators = 1000 
early_stopping_rounds = 333
verbose = False
device = 'cpu'
splitter = Splitter(n_splits=n_splits)

# Initialize an array for storing test predictions
test_predss = np.zeros((X_test.shape[0], 3))
ensemble_f1_score = []
weights = []
trained_models = {'xgb':[], 'lgb':[], 'cat':[]}
    
for i, (X_train_, X_val, y_train_, y_val) in enumerate(splitter.split_data(X_tr, y, random_state_list=random_state_list)):
    n = i % n_splits
    m = i // n_splits
            
    # Get a set of Regressor models
    classifier = Classifier(n_estimators, device, random_state)
    models = classifier.models
    
    # Initialize lists to store oof and test predictions for each base model
    oof_preds = []
    test_preds = []
    
    # Loop over each base model and fit it to the training data, evaluate on validation data, and store predictions
    for name, model in models.items():
        if ('xgb' in name) or ('lgbm' in name):
            model.fit(X_train_, y_train_, eval_set=[(X_val, y_val)])
        else:
            model.fit(X_train_, y_train_)
            
        if name in trained_models.keys():
            trained_models[f'{name}'].append(deepcopy(model))
        
        test_pred = model.predict_proba(X_test)
        y_val_pred = model.predict_proba(X_val)

        y_val_pred_labels = np.argmax(y_val_pred, axis=1)
        f1_micro_score = f1_score(y_val, y_val_pred_labels, average='micro')
        
        print(f'{name} [FOLD-{n} SEED-{random_state_list[m]}] F1 Micro Score: {f1_micro_score:.5f}')
        
        oof_preds.append(y_val_pred)
        test_preds.append(test_pred)
    
    # Use Optuna to find the best ensemble weights
    optweights = OptunaWeights(random_state=random_state)
    y_val_pred = optweights.fit_predict(y_val, oof_preds)
    
    score = log_loss(y_val, y_val_pred)
    y_val_pred_labels = np.argmax(y_val_pred, axis=1)
    f1_micro_score = f1_score(y_val, y_val_pred_labels, average='micro')
    
    print(f'Ensemble [FOLD-{n} SEED-{random_state_list[m]}] ---------------> F1 Micro Score: {f1_micro_score:.5f}')
    
    ensemble_f1_score.append(f1_micro_score)
    weights.append(optweights.weights)
    
    # Predict to X_test by the best ensemble weights
    _test_preds = optweights.predict(test_preds)
    test_predss += _test_preds / (n_splits * len(random_state_list))
    
    gc.collect()

In [8]:
mean_score = np.mean(ensemble_f1_score)
std_score = np.std(ensemble_f1_score)
print(f'Ensemble F1 score {mean_score:.5f} ± {std_score:.5f}')

# Print the mean and standard deviation of the ensemble weights for each model
print('--- Model Weights ---')
mean_weights = np.mean(weights, axis=0)
std_weights = np.std(weights, axis=0)
for name, mean_weight, std_weight in zip(models.keys(), mean_weights, std_weights):
    print(f'{name}: {mean_weight:.5f} ± {std_weight:.5f}')

Ensemble F1 score 0.78421 ± 0.02745
--- Model Weights ---
xgb: 1.15176 ± 0.37855
lgbm: 0.37643 ± 0.21655
hgb: 1.20592 ± 0.52147


In [9]:
for pred in test_predss:
    if (pred[1] < pred[2]) and ((pred[2] + pred[1]) > pred[0]): 
        pred[0] = 0
        pred[1] = 0
        pred[2] = 1
        
    if (pred[0] > pred[2]) and (pred[0] > pred[1]) and (pred[0] - pred[1] < 0.3): 
        pred[0] = 0
        pred[1] = 1
        pred[2] = 0
        
    if pred[2] > 0.42:
        pred[0] = 0
        pred[1] = 0
        pred[2] = 1
        
test_predss

array([[0.        , 0.        , 1.        ],
       [0.57092738, 0.09703689, 0.33203573],
       [0.        , 0.        , 1.        ],
       ...,
       [0.71258075, 0.1253713 , 0.16204794],
       [0.        , 0.        , 1.        ],
       [0.        , 0.        , 1.        ]])

In [11]:
sample_submission = pd.read_csv(config.SUBMISSION_FILE)
submission = pd.DataFrame({'id': sample_submission['id'], 'outcome': np.argmax(test_predss, axis=1)})
decode_map = {
    0 : 'died',
    1 : 'euthanized',
    2 : 'lived'
}

submission['outcome'] = submission['outcome'].map(decode_map)
submission.to_csv('../output/sample_submission_V4(ensemble).csv',index=False)
submission

Unnamed: 0,id,outcome
0,1235,lived
1,1236,died
2,1237,lived
3,1238,euthanized
4,1239,lived
...,...,...
819,2054,died
820,2055,euthanized
821,2056,died
822,2057,lived


### Cosine Similarity

In [None]:
concatenated_df = pd.concat([sub1["outcome"], sub2["outcome"], sub3["outcome"]], axis=1)
same_categories = concatenated_df.apply(lambda row: row.nunique() == 1, axis=1)
df_same_categories = concatenated_df[same_categories]
df_same_categories

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

X_train_cop = X_train.copy()
X_test_cop = X_test.copy()

def closest_index(X_test_cop, X_train_cop, index_test, num_closest=500):
    res = [] 
    for i in index_test:
        test_point = np.array(X_test_cop.loc[i]).reshape(1, -1)  
        similarities = cosine_similarity(X_train_cop, test_point)  
        closest_indices = np.argsort(similarities[:, 0])[-num_closest:][::-1] 
        res.append(closest_indices)  
    return np.array(res)

res = closest_index(X_test_cop, X_train_cop, df_all_different_categories.index.tolist())
res=res.reshape(1,-1)[0]
print("Closest indexes in X_train:", res)