In [None]:
import gc, random, os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
sns.set_theme()

from matplotlib.ticker import FixedLocator, FixedFormatter

from sklearn.cluster import DBSCAN, KMeans, MiniBatchKMeans
from sklearn.metrics import silhouette_score, silhouette_samples
from sklearn.model_selection import train_test_split, cross_val_score, KFold, StratifiedKFold
from sklearn.preprocessing import MinMaxScaler, StandardScaler, PowerTransformer, QuantileTransformer
from sklearn.mixture import BayesianGaussianMixture, GaussianMixture
from sklearn.decomposition import PCA
from sklearn import metrics

from scipy import stats
from scipy.stats import norm

import lightgbm as lgb
import xgboost as xgb
import catboost as cb

import tensorflow as tf
from tensorflow.keras import models
from tensorflow.keras import layers
from tensorflow.keras import optimizers
from tensorflow.keras import callbacks

import optuna

## Seed everything

In [None]:
# Single seed shared by every stochastic step in the notebook.
seed = 42

# NOTE(review): PYTHONHASHSEED is normally read at interpreter start-up, so
# setting it here presumably only affects child processes — confirm.
os.environ['PYTHONHASHSEED'] = str(seed)
# The stdlib RNG is imported above but was never seeded.
random.seed(seed)
np.random.seed(seed)

## Load Data

In [None]:
# Load the competition data set; the trailing expression displays its shape.
df = pd.read_csv('../../../Data/Tabular Playground Series/tabular-playground-series-jul-2022/data.csv')
df.shape

In [None]:
# Show the first two rows of the raw frame.
df.head(2)

In [None]:
# Remove the 'id' column; everything that remains is fed to the transformer
# and the mixture model below.
X_train = df.drop('id', axis=1)

In [None]:
# Show the first two rows after dropping 'id'.
X_train.head(2)

In [None]:
# Summary statistics, transposed so that each original column becomes a row.
X_train.describe().T

In [None]:
# Column dtypes and non-null counts.
X_train.info()

In [None]:
# One box per column, to compare the column ranges side by side.
X_train.boxplot(figsize=(24,4))

## Probably a good idea to scale the data

In [None]:
# Alternative that was tried and left disabled: plain standardisation.
# standard_scaler = StandardScaler()
# X_train = pd.DataFrame(standard_scaler.fit_transform(X_train), columns=X_train.columns)

# Apply a PowerTransformer with default settings to every column and wrap the
# result back into a DataFrame with the original column names.
# NOTE(review): scikit-learn's default is presumably Yeo-Johnson followed by
# standardisation — confirm for the installed version.
# NOTE: X_train is overwritten here, so re-running this cell transforms the
# already-transformed data again.
power_transformer = PowerTransformer()
X_train = pd.DataFrame(power_transformer.fit_transform(X_train), columns=X_train.columns)

X_train.head(2)

In [None]:
# Same box plot as before, now on the transformed columns.
X_train.boxplot(figsize=(24,4))

In [None]:
# for col in X_train.columns:    
#     plt.figure(figsize=(5,5))
#     res = stats.probplot(X_train[col], plot=plt)

## Let's remove outliers from all columns 

In [None]:
# Keep an untouched copy of the transformed data; it is used later to predict
# a label for every row, including the rows filtered out below.
X_train_orig = X_train.copy(deep=True)

print(f'X_train.shape={X_train.shape}')

# Sequential 3-sigma filter: each column's mean/std are computed on the rows
# that survived the previous columns, then rows outside the open interval
# (mean - 3*std, mean + 3*std) are dropped.
for col in X_train.columns:
    column = X_train[col]
    center, spread = column.mean(), column.std()
    within_3_sigma = (column > center - 3*spread) & (column < center + 3*spread)
    X_train = X_train[within_3_sigma]

print(f'X_train.shape={X_train.shape}')

## Let's look at the quantile plots again

In [None]:
# for col in X_train.columns:    
#     plt.figure(figsize=(5,5))
#     res = stats.probplot(X_train[col], plot=plt)

In [None]:
# X_train.boxplot(figsize=(16,4))

# How much of the variance does each number of components explain?

* We see that, on average, each new component adds about 4% of explained variance (with the exception of the first 2 components, whose contributions are 6.21% and 4.93%).

In [None]:
# For every component count n, project the data with PCA, reconstruct it, and
# plot the per-column standard deviation of the reconstruction. The title
# carries the cumulative explained-variance ratio (in percent) for that n.
for n in range(1,30):
    pca = PCA(n_components=n, random_state=seed)

    p = pca.fit_transform(X_train)

    inv_transform = pca.inverse_transform(p)

    # Explicit figure/axes so each figure can be shown and closed inside the
    # loop; leaving 29 figures open exceeds matplotlib's open-figure warning
    # threshold and keeps them all in memory until the cell finishes.
    fig, ax = plt.subplots(figsize=(12,2))
    pd.DataFrame(inv_transform, columns=X_train.columns).std().plot.bar(
        ax=ax,
        title=f'n_components={n} | variance explained={np.round(np.sum(pca.explained_variance_ratio_)*100,2)}',
    )
    plt.show()
    plt.close(fig)

## Let's use BGM on 7 clusters to build a supervised learning problem

In [None]:
# Bayesian Gaussian mixture with 7 components and full covariance matrices,
# seeded with the notebook-wide seed and run with 15 initialisations.
bgm = BayesianGaussianMixture(
    n_components=7, 
    covariance_type='full', 
    max_iter=300, 
    random_state=seed, 
    n_init=15
)

# Fit on the outlier-filtered data; the fitted estimator is the cell output.
bgm.fit(X_train)

In [None]:
# Convergence flag reported by the fitted mixture.
bgm.converged_

In [None]:
# Mixture weight of each component, in component order.
plt.plot(bgm.weights_)

In [None]:
# Hard cluster assignment and per-component membership probabilities for the
# outlier-filtered rows.
y_train = bgm.predict(X_train)
proba = bgm.predict_proba(X_train)

In [None]:
# First five cluster labels.
y_train[:5]

In [None]:
# Membership probabilities of the first ten rows, rounded to two decimals.
np.round(proba[:10],2)

In [None]:
# Per row, the probability of its most likely component.
max_proba = np.max(proba,axis=1)

In [None]:
# First ten top-component probabilities.
max_proba[:10]

In [None]:
# Histogram of the top-component probability; assigning to `_` keeps the
# return value of plt.hist out of the cell output.
_ = plt.hist(max_proba, bins=50)

In [None]:
# Box plot of the same top-component probabilities.
sns.boxplot(data=max_proba)

In [None]:
# Boolean mask of rows whose top-component probability is at least 0.7.
sure_idx = max_proba >= 0.7

In [None]:
# Keep only the confidently assigned rows and their cluster labels; these are
# the features and targets used by the cross-validated classifier below.
sure_samples = X_train[sure_idx]
sure_y = y_train[sure_idx]
print(f'sure_samples.shape={sure_samples.shape}')
print(f'sure_y.shape={sure_y.shape}')

In [None]:
# First two rows of the confidently assigned subset.
sure_samples.head(2)

In [None]:
def objective(trial, n_splits=10, shuffle=True, get_info=False):
    """Cross-validate a CatBoost classifier on the confidently clustered rows.

    Hyperparameters are drawn from ``trial``. The classifier is trained on the
    notebook-level ``sure_samples`` / ``sure_y`` with stratified K-fold CV and
    scored per fold with the adjusted Rand score against the cluster labels.

    Parameters
    ----------
    trial : optuna trial (or ``optuna.trial.FixedTrial``)
        Source of the hyperparameter values.
    n_splits : int, default 10
        Number of stratified folds.
    shuffle : bool, default True
        Whether to shuffle before splitting; the notebook-level ``seed`` is
        used as the split seed only when shuffling.
    get_info : bool, default False
        If True, return a dict with keys ``'mean_score'``, ``'score_std'``,
        ``'scores'`` and ``'models'`` instead of the mean score.

    Returns
    -------
    float or dict
        Mean fold score, or the info dict described above.
    """
    param = {
        'verbose': 0,
        'random_state': seed,
        'loss_function': 'MultiClass',
        'task_type': 'GPU',
        # `step` is passed by keyword: newer Optuna releases deprecate passing
        # it positionally.
        'iterations': trial.suggest_int('iterations', 10, 1000, step=10),
        'depth': trial.suggest_int('depth', 1, 8),
        # suggest_float(..., log=True) replaces the deprecated
        # suggest_loguniform; plain suggest_float replaces suggest_uniform.
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 1, log=True),
        'random_strength': trial.suggest_float('random_strength', 1e-9, 10),
        'bagging_temperature': trial.suggest_float('bagging_temperature', 0, 1),
        'border_count': trial.suggest_int('border_count', 1, 255),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1e-3, 30, log=True),
    }

    # scikit-learn rejects a random_state when shuffle=False, so only pass the
    # seed when it is actually used.
    kf = StratifiedKFold(
        n_splits=n_splits,
        shuffle=shuffle,
        random_state=seed if shuffle else None,
    )

    scores = []
    fold_models = []

    for train_index, test_index in kf.split(sure_samples, sure_y):

        # A fresh classifier per fold: reusing one instance would make every
        # entry of the returned list the same (last-fitted) model, so the
        # downstream "bag" would contain a single model repeated n_splits times.
        model = cb.CatBoostClassifier(**param)

        train_dataset = cb.Pool(
            data=sure_samples.iloc[train_index,:],
            label=sure_y[train_index],
        )

        eval_dataset = cb.Pool(
            data=sure_samples.iloc[test_index,:],
            label=sure_y[test_index],
        )

        # NOTE(review): no eval_set is passed, so early_stopping_rounds
        # presumably never triggers — confirm against the CatBoost docs.
        model.fit(
            train_dataset,
            early_stopping_rounds=200,
            verbose=0,
        )

        # Flatten the prediction array to one label per row.
        y_test_pred = model.predict(eval_dataset).reshape(-1)

        score = metrics.adjusted_rand_score(
            labels_true=sure_y[test_index],
            labels_pred=y_test_pred
        )

        scores.append(score)
        fold_models.append(model)

    scores = np.array(scores)

    if get_info:

        info = {
            'mean_score': np.mean(scores),
            'score_std' : np.std(scores),
            'scores': scores,
            'models': fold_models,
        }

        return info
    else:
        return np.mean(scores)

In [None]:
# Maximise the mean cross-validated score returned by `objective`.
# The sampler is seeded explicitly so that the sequence of suggested
# hyperparameters is repeatable across kernel restarts; an unseeded study
# proposes different trials on every run.
# NOTE(review): GPU training may still introduce run-to-run variation in the
# scores themselves — confirm.
study = optuna.create_study(
    direction='maximize',
    study_name='tps_clustering',
    sampler=optuna.samplers.TPESampler(seed=seed),
)
study.optimize(objective, n_trials=10, show_progress_bar=True)

In [None]:
# Best mean cross-validated score found by the study.
study.best_value

In [None]:
# Hyperparameters of the best trial.
study.best_params

In [None]:
# Re-run the objective with the best hyperparameters fixed, this time asking
# for the full info dict (fold scores and fitted models).
info = objective(optuna.trial.FixedTrial(study.best_params), get_info=True)

In [None]:
# The models collected during that re-run.
best_models = info['models']

## A bag of CatBoost models for predictions

In [None]:
# Sum the class-probability matrices of all bagged models over the full
# transformed data set (outlier rows included). sum() starts from the int 0,
# which is also the result when the bag is empty.
predict_proba = sum(model.predict_proba(X_train_orig) for model in best_models)

In [None]:
# Per row, the column index with the largest summed probability.
# NOTE(review): this assumes the probability columns are ordered like the
# cluster labels 0..6 — confirm against the models' classes_ attribute.
predictions = np.argmax(predict_proba, axis=1)
predictions[:5]

In [None]:
# Fill the sample submission's 'Predicted' column with the bagged predictions
# and write it to submission.csv without the index column.
submission = pd.read_csv('../../../Data/Tabular Playground Series/tabular-playground-series-jul-2022/sample_submission.csv')
submission['Predicted'] = predictions
submission.to_csv("submission.csv", index=False)

## ___

### Leaderboard Score

<img src="">

# Ideas to try

* blending and stacking models together
* different threshold for sure_samples
* sklearn.metrics.balanced_accuracy_score as the metric instead of adjusted_rand_score
* roc_auc_score(labels, probas, average="weighted", multi_class="ovo") for metrics
* ExtraTreesClassifier
* KNN
* Quadratic Discriminant Analysis
* SVC
* XGBoost
* LightGBM