In [83]:
import platform 

import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
import seaborn as sns
from tqdm import tqdm

In [84]:
import matplotlib.font_manager

In [96]:
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder

In [86]:
from sklearn.feature_selection import chi2, RFECV

In [87]:
import optuna

In [88]:
import xgboost as xgb
import lightgbm as lgb
import catboost as ctb

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB



# Choose the SVC / KNeighborsClassifier implementations for this machine.
# Machines whose platform.processor() reports 'arm' use scikit-learn; every
# other machine prefers cuML and only falls back to scikit-learn when cuML
# cannot be imported, instead of aborting the notebook with an ImportError.
if platform.processor() == 'arm':
    from sklearn.svm import SVC
    from sklearn.neighbors import KNeighborsClassifier
else:
    try:
        from cuml.svm import SVC
        from cuml.neighbors import KNeighborsClassifier
    except ImportError:
        import warnings

        # Make the silent change of backend visible to whoever runs the notebook.
        warnings.warn(
            "cuml could not be imported; falling back to scikit-learn "
            "SVC and KNeighborsClassifier"
        )
        from sklearn.svm import SVC
        from sklearn.neighbors import KNeighborsClassifier

In [89]:
import mlflow

In [90]:
# Notebook-wide matplotlib styling, assembled one rcParams namespace at a time.
plot_params = {}

# Typography.
plot_params.update({
    'font.family': 'serif',
    'font.weight': 'light',
})

# Figure-level defaults.
plot_params.update({
    'figure.figsize': (5, 5),
    'figure.frameon': False,
    'figure.titlesize': 'xx-large',
    'figure.titleweight': 'normal',
})

# Axes titles and axis labels.
plot_params.update({
    'axes.titlesize': 'large',
    'axes.titlecolor': 'black',
    'axes.titleweight': 'normal',
    'axes.titlelocation': 'center',
    'axes.labelsize': 'x-large',
})

# Grid, legend and tick labels.
plot_params.update({
    'grid.alpha': 0.25,
    'legend.frameon': False,
    'xtick.labelsize': 'x-large',
    'ytick.labelsize': 'x-large',
})

# Apply the settings globally and switch seaborn to the 'mako' palette.
pylab.rcParams.update(plot_params)
sns.set_palette('mako')

In [91]:
# Single fixed seed for the notebook.
# NOTE(review): presumably passed as `random_state` to the CV splitter / models
# defined in later cells — confirm against the cells that use it.
RANDOM_STATE = 7

In [92]:
# Folder and file names of the input CSVs.
data_folder = 'data'
train_data_fname, test_data_fname = 'train.csv', 'test.csv'
external_data_fname = 'external_data.csv'
sample_submission_fname = 'sample_submission.csv'

# Resolve every file name against the data folder (same order as the names above).
train_data_path, test_data_path, external_data_path, sample_data_path = (
    os.path.join(data_folder, fname)
    for fname in (
        train_data_fname,
        test_data_fname,
        external_data_fname,
        sample_submission_fname,
    )
)

# Read the four CSVs into DataFrames, one per path.
train_data, test_data, external_data, sample_data = map(
    pd.read_csv,
    (train_data_path, test_data_path, external_data_path, sample_data_path),
)

In [93]:
# set(external_data.columns.tolist()).difference(train_data.columns.tolist())

In [94]:
# Drop the `id` column from the training frame. errors='ignore' is deliberately
# NOT used: the KeyError raised on an accidental re-run of this cell prevents the
# external rows from being appended a second time.
train_data = train_data.drop(columns=['id'])

# Flag where each row came from so the origin stays identifiable after merging.
train_data['is_external_data'] = 0
external_data['is_external_data'] = 1
# Replace spaces in the external labels with underscores
# (presumably to match the label spelling used in train.csv — verify).
external_data.prognosis = external_data.prognosis.str.replace(' ', '_')
print(train_data.shape)
print(external_data.shape)

# ignore_index=True gives the merged frame a fresh 0..n-1 index. Without it both
# sources keep their own labels starting at 0, so label-based access (.loc,
# index-aligned assignment) would silently hit duplicated rows.
train_data = pd.concat([train_data, external_data], ignore_index=True)
print(f"New train shape: \t{train_data.shape}")
print(f"Unique targets: \t{train_data.prognosis.nunique()}")

# Add is_external_data column to the test set so it has the same feature as train.
test_data['is_external_data'] = 0


(707, 66)
(252, 66)
New train shape: 	(959, 66)
Unique targets: 	11


In [109]:
# Features: every column except the target.
X = train_data.drop(columns=['prognosis'])

# Target: kept as a one-column frame so the encoder receives 2-D input,
# then replaced by its ordinal codes.
target_encoder = OrdinalEncoder()
Y = target_encoder.fit_transform(train_data[['prognosis']])

In [110]:
# Sanity check: the encoded target should be 2-D with a single column and one row
# per training sample.
Y.shape

(959, 1)

In [None]:
# 

def generate_predictions(y_proba, encoder):
    """Return the labels of the three most probable classes for every sample.

    Parameters
    ----------
    y_proba : array-like of shape (n_samples, n_classes)
        Per-class scores. Column ``j`` is treated as encoded class ``j``,
        because the column indices are what gets passed to the encoder.
    encoder : fitted encoder
        Object whose ``inverse_transform`` accepts an ``(n, 1)`` column of
        class codes and returns the matching ``(n, 1)`` labels (for example
        the encoder fitted on the target column).

    Returns
    -------
    numpy.ndarray of shape (n_samples, min(3, n_classes))
        Decoded labels, most probable first.
    """
    y_proba = np.asarray(y_proba)

    # Negating turns the ascending argsort into a descending-probability ranking;
    # the stable sort makes ties resolve deterministically to the lower class index.
    ranked_classes = np.argsort(-y_proba, axis=1, kind="stable")
    top_3_predictions = ranked_classes[:, :3]

    # The encoder works on a single column, so flatten to (n * 3, 1), decode,
    # and restore the (n, 3) layout afterwards.
    original_shape = top_3_predictions.shape
    top_3_predictions_labels = encoder.inverse_transform(top_3_predictions.reshape(-1, 1))
    top_3_predictions_labels = np.asarray(top_3_predictions_labels).reshape(original_shape)

    return top_3_predictions_labels


def ap_k(y_true, y_pred):
    """"""
    