In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found beneath the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

import gc
import random

from IPython import display as ipd

from pandas_profiling import ProfileReport as profile

import pkg_resources as pkg
print( f"pandas_profiling version: {pkg.get_distribution('pandas_profiling').version}")

from tqdm import tqdm
import lightgbm as lgb

from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold, KFold

from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.metrics import roc_curve, auc, cohen_kappa_score
from sklearn.metrics import mean_absolute_error, mean_squared_error, f1_score, confusion_matrix
from sklearn.ensemble import RandomForestRegressor


import matplotlib.pyplot as plt
import seaborn as sns

import imblearn
print(imblearn.__version__)

In [None]:
# Seed handed to seeding() so that random draws are repeatable between runs.
RANDOM_SEED = 42
# When True, only the first 100000 shuffled training rows are kept (faster iteration).
DEBUG = True
# When True, a profiling report of the training frame is built and displayed.
PROFILE = False

def seeding(SEED, use_tf=False):
    """Seed the Python, NumPy and (optionally) TensorFlow random generators.

    Args:
        SEED: Integer seed applied to every generator.
        use_tf: When True, TensorFlow is imported and seeded as well.

    Side effects:
        Sets the ``PYTHONHASHSEED`` and ``TF_CUDNN_DETERMINISTIC`` environment
        variables and prints a confirmation message.
    """
    np.random.seed(SEED)
    random.seed(SEED)
    # Only affects interpreters started after this point (e.g. worker processes);
    # the hash seed of the already-running kernel cannot be changed.
    os.environ['PYTHONHASHSEED'] = str(SEED)
    # This variable is an on/off switch, not a seed, so it is set to '1'.
    os.environ['TF_CUDNN_DETERMINISTIC'] = '1'
    if use_tf:
        # Imported lazily: the notebook never imports TensorFlow at the top level,
        # so referencing a global ``tf`` here would raise NameError.
        import tensorflow as tf
        tf.random.set_seed(SEED)
    print('seeding done!!!')

seeding(RANDOM_SEED)

train = pd.read_csv('/kaggle/input/tabular-playground-series-dec-2021/train.csv')
test = pd.read_csv('/kaggle/input/tabular-playground-series-dec-2021/test.csv')
submission = pd.read_csv('/kaggle/input/tabular-playground-series-dec-2021/sample_submission.csv')

# Shuffle with an explicit seed so the row order does not depend on how much of the
# global NumPy random state has been consumed before this line runs.
train = train.sample(frac=1, random_state=RANDOM_SEED).reset_index(drop=True)

if DEBUG:
    # .copy() detaches the subset so later operations never act on a view of the full frame.
    train = train.iloc[:100_000].copy()

# Separate the label column from the features; 'Id' carries no signal and is dropped.
# Non-in-place drops keep every assignment explicit.
target = train['Cover_Type']
train = train.drop(columns=['Id', 'Cover_Type'])
test = test.drop(columns=['Id'])

In [None]:
# Histogram of the Cover_Type labels as loaded, i.e. before any resampling.
target.hist()

In [None]:
from imblearn.over_sampling import RandomOverSampler

# NOTE(review): resampling happens before the cross-validation split, so copies of the
# same row can end up in both the training and the validation part of a fold, which
# makes validation scores optimistic. Consider resampling inside each fold instead.
# random_state pins which rows get duplicated so the resampled set is repeatable.
oversample = RandomOverSampler(sampling_strategy='auto', random_state=RANDOM_SEED)
X_over, y_over = oversample.fit_resample(train, target)

In [None]:
# Map the Cover_Type labels onto the contiguous 0-based class ids LightGBM expects.
# Label 5 is intentionally absent from the mapping: it is treated as non-existent, and
# any rows carrying it are removed so they cannot collide with label 7 (which maps to 5).
LABEL_TO_CLASS = {1: 0, 2: 1, 3: 2, 4: 3, 6: 4, 7: 5}
DROPPED_LABEL = 5

unexpected = set(y_over.unique()) - set(LABEL_TO_CLASS) - {DROPPED_LABEL}
if unexpected:
    # Also fires when this cell is executed twice, because 0 is not a raw label;
    # failing loudly is preferable to silently remapping already-remapped ids.
    raise ValueError(
        f"Unexpected Cover_Type labels {sorted(unexpected)}; was this cell already run?"
    )

keep = (y_over != DROPPED_LABEL).to_numpy()
X_over = X_over.loc[keep].reset_index(drop=True)
# A single dictionary lookup replaces the chain of in-place assignments, so the
# result does not depend on the order in which labels are rewritten.
y_over = y_over.loc[keep].map(LABEL_TO_CLASS).astype(int).reset_index(drop=True)
y_over.hist()

In [None]:
#Soil_Type1 has constant value "0"	Constant
#Soil_Type3 has constant value "0"	Constant
#Soil_Type4 has constant value "0"	Constant
#Soil_Type6 has constant value "0"	Constant
#Soil_Type7 has constant value "0"	Constant
#Soil_Type15 has constant value "0"	Constant
#Soil_Type17 has constant value "0"	Constant

#Wilderness_Area2 is highly skewed (γ1 = 30.66681781)	Skewed
#Wilderness_Area4 is highly skewed (γ1 = 23.24892545)	Skewed
#Soil_Type2 is highly skewed (γ1 = 223.6067977)	Skewed
#Soil_Type5 is highly skewed (γ1 = 25.42422024)	Skewed
#Soil_Type10 is highly skewed (γ1 = 23.24892545)	Skewed
#Soil_Type13 is highly skewed (γ1 = 29.06044526)	Skewed
#Soil_Type14 is highly skewed (γ1 = 24.63319199)	Skewed

In [None]:
%%time

# Optional exploratory report; skipped unless PROFILE is switched on.
if PROFILE:
    # `profile` is the ProfileReport class imported from pandas_profiling.
    train_profile = profile(train, title="Train Data", minimal=True)
    display(train_profile)

In [None]:
%%time

def run_train(X, y, run_params, splits, num_boost_round, verbose_eval, early_stopping_rounds,
              shuffle=True, random_state=42):
    """Train one LightGBM booster per cross-validation fold and score each fold.

    Args:
        X: Feature DataFrame (rows are selected positionally with ``.iloc``).
        y: Label Series aligned with ``X`` holding integer class ids.
        run_params: Parameter dict forwarded to ``lgb.train``; expected to describe a
            multiclass objective, because predictions are reduced with ``argmax``.
        splits: Number of cross-validation folds.
        num_boost_round: Upper bound on boosting iterations per fold.
        verbose_eval: Log the evaluation metric every this many iterations.
        early_stopping_rounds: Stop a fold once the validation metric has not improved
            for this many iterations.
        shuffle: Shuffle rows before splitting. Defaults to True because resampled data
            is typically ordered (appended duplicates), which contiguous folds would
            turn into badly unbalanced splits.
        random_state: Seed for the shuffle; ignored when ``shuffle`` is False.

    Returns:
        Tuple ``(scores, models, evals_results)``: validation accuracy per fold, the
        fitted boosters, and the recorded metric history. The same dict is handed to
        every fold, so it ends up describing the most recently trained fold.
    """
    scores = []
    models = []
    evals_results = {}  # to record eval results for plotting
    # Stratification keeps the class proportions comparable across folds.
    folds = StratifiedKFold(
        n_splits=splits,
        shuffle=shuffle,
        random_state=random_state if shuffle else None,  # sklearn rejects a seed without shuffle
    )
    for fold_n, (train_index, valid_index) in enumerate(folds.split(X, y)):
        print(f'Fold {fold_n+1} started')
        X_train, X_valid = X.iloc[train_index], X.iloc[valid_index]
        y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]

        train_set = lgb.Dataset(X_train, y_train)
        valid_set = lgb.Dataset(X_valid, y_valid, reference=train_set)
        model = lgb.train(
            run_params,
            train_set=train_set,
            num_boost_round=num_boost_round,
            # Two datasets for two names, so each curve carries the right label.
            # LightGBM excludes the training set itself from early stopping.
            valid_sets=[train_set, valid_set],
            valid_names=["train", "valid"],
            # Callbacks replace the verbose_eval / evals_result / early_stopping_rounds
            # keyword arguments, which LightGBM 4 no longer accepts.
            callbacks=[
                lgb.early_stopping(early_stopping_rounds),
                lgb.log_evaluation(verbose_eval),
                lgb.record_evaluation(evals_results),
            ],
        )

        # predict() yields one probability column per class; argmax picks the class id.
        y_predicted = np.argmax(model.predict(X_valid), axis=1)
        score = accuracy_score(y_valid, y_predicted)
        print(f'accuracy_score: {score}')

        models.append(model)
        scores.append(score)
    return scores, models, evals_results


# Boosting hyper-parameters and cross-validation settings.
LEARNING_RATE = 0.01
MAX_DEPTH = -1  # LightGBM treats values <= 0 as "no depth limit"
NUM_LEAVES = 31
TOTAL_SPLITS = 4
NUM_BOOST_ROUND = 100
EARLY_STOPPING_ROUNDS = 20
VERBOSE_EVAL = 10
NUM_CLASSES = 6  # class ids 0..5 produced by the label remapping

run_params = {
    'verbose': -1,
    'boosting_type': 'gbdt',
    'objective': 'multiclass',
    'metric': ['multi_logloss'],
    'learning_rate': LEARNING_RATE,
    'num_leaves': NUM_LEAVES,
    #'scale_pos_weight':scale_pos_weight,
    #'feature_fraction': 0.5,
    #'bagging_fraction': 0.5,
    #'bagging_freq': 4,
    'max_depth': MAX_DEPTH,
    'num_class': NUM_CLASSES,
}

scores, models, evals_results = run_train(X_over, y_over, run_params, TOTAL_SPLITS, NUM_BOOST_ROUND, VERBOSE_EVAL, EARLY_STOPPING_ROUNDS)
if len(scores) > 0:
    print('CV mean score: {0:.4f}, std: {1:.4f}.'.format(np.mean(scores), np.std(scores)))
else:
    # np.mean([]) would emit a RuntimeWarning and print "nan"; say so explicitly instead.
    print('CV score unavailable: run_train returned no fold scores.')

In [None]:
# Plot the multi_logloss history that run_train recorded in evals_results.
ax = lgb.plot_metric(evals_results, metric='multi_logloss')
plt.show()

In [None]:
# In-sample check: every fold model is scored on the full resampled training set,
# which contains rows it was trained on, so these figures are optimistic.
for idx, model in enumerate(models):
    # `workers` is not a LightGBM prediction option, so it is no longer passed.
    yhat = np.argmax(model.predict(X_over), axis=1)
    # scikit-learn's signature is f1_score(y_true, y_pred).
    print(f"Model:{idx}, F1 Macro Score: ", f1_score(y_over, yhat, average="macro"))

In [None]:
# https://machinelearningmastery.com/stacking-ensemble-for-deep-learning-neural-networks/

# create stacked model input dataset as outputs from the ensemble
def stacked_dataset(models, inputX):
    """Build the meta-model feature matrix from the ensemble members' predictions.

    Args:
        models: Non-empty sequence of fitted models whose ``predict(inputX)`` returns a
            2-D array of per-class scores, one row per sample.
        inputX: Feature matrix handed unchanged to every model.

    Returns:
        Integer array of shape ``(n_samples, n_models)``; column ``j`` holds the
        highest-scoring class id according to ``models[j]``.

    Raises:
        ValueError: If ``models`` is empty.
    """
    if len(models) == 0:
        raise ValueError("stacked_dataset needs at least one model")
    # One column per member. column_stack also handles a single model, where the
    # previous dstack/reshape approach failed because the array stayed 1-D.
    member_labels = [
        np.argmax(model.predict(inputX), axis=1).reshape(-1)
        for model in models
    ]
    return np.column_stack(member_labels)

# fit a model based on the outputs from the ensemble members
def fit_stacked_model(models, inputX, inputy, regressor):
    """Fit the meta-model on the class ids predicted by the ensemble members.

    Args:
        models: Fitted ensemble members, passed on to ``stacked_dataset``.
        inputX: Feature matrix the members predict on.
        inputy: Targets, either a 2-D one-hot/score matrix (reduced with ``argmax``
            along axis 1, as before) or a 1-D array of class ids (used as is).
        regressor: Estimator exposing ``fit(X, y)``.

    Returns:
        The fitted ``regressor``.
    """
    stackedX = stacked_dataset(models, inputX)
    labels = np.asarray(inputy)
    if labels.ndim > 1:
        labels = np.argmax(labels, axis=1)
    # 1-D labels previously crashed in argmax(axis=1); they are now accepted directly.
    labels = labels.reshape(-1)
    print("fitting regressor ...")
    regressor.fit(stackedX, labels)
    return regressor

# fit a model based on the outputs from the ensemble members
def fit_and_predict_stacked_model(models, inputX, inputy, regressor):
    """Fit the meta-model on the members' predictions and predict the same rows.

    Args:
        models: Fitted ensemble members, passed on to ``stacked_dataset``.
        inputX: Feature matrix the members predict on.
        inputy: Targets the meta-model is fitted against.
        regressor: Estimator exposing ``fit(X, y)`` and ``predict(X)``.

    Returns:
        Tuple ``(regressor, predictions)`` where ``predictions`` is the regressor output
        for ``inputX`` rounded to the nearest integer. These are in-sample predictions.
    """
    stackedX = stacked_dataset(models, inputX)
    print("fitting regressor ...")
    regressor.fit(stackedX, inputy)
    print("regressor predict ...")
    yhat = regressor.predict(stackedX)
    # np.int was removed in NumPy 1.24; the builtin int gives the same dtype.
    return regressor, np.rint(yhat).astype(int)

# make a prediction with the stacked model
def stacked_prediction(models, regressor, inputX):
    """Predict with the stacked ensemble.

    Args:
        models: Fitted ensemble members, passed on to ``stacked_dataset``.
        regressor: Fitted meta-model exposing ``predict(X)``.
        inputX: Feature matrix to predict on.

    Returns:
        Integer array with the regressor output rounded to the nearest integer.
    """
    stackedX = stacked_dataset(models, inputX)
    yhat = regressor.predict(stackedX)
    # np.int was removed in NumPy 1.24; the builtin int gives the same dtype.
    return np.rint(yhat).astype(int)

In [None]:
# Seed the forest so the stacked model is repeatable; n_jobs only changes wall-clock time.
regressor, yhat = fit_and_predict_stacked_model(
    models, X_over, y_over, RandomForestRegressor(random_state=RANDOM_SEED, n_jobs=-1)
)
# In-sample score (the regressor was fitted on these same rows).
# scikit-learn's signature is f1_score(y_true, y_pred).
print("GBM Ensemble F1 Macro Score: ", f1_score(y_over, yhat, average="macro"))

In [None]:
# Member predictions for the test set, then the stacked regressor on top of them.
stackedX = stacked_dataset(models, test)
# np.int was removed in NumPy 1.24; the builtin int gives the same dtype.
y_pred = np.rint(regressor.predict(stackedX)).astype(int)

In [None]:
# Sanity check: list the distinct values present in the test predictions.
np.unique(y_pred)

In [None]:
# Map class ids 0..5 back to the original Cover_Type labels (label 5 is never produced).
# A single table lookup is used because chained in-place assignments cascade:
# 0 -> 1 is immediately rewritten to 2, then 3, 4 and 6 by the following lines.
# Run this cell once per prediction; the lookup is not meant to be applied twice.
CLASS_TO_LABEL = np.array([1, 2, 3, 4, 6, 7])
if y_pred.size and (y_pred.min() < 0 or y_pred.max() >= len(CLASS_TO_LABEL)):
    raise ValueError(
        f"Class ids must lie in 0..{len(CLASS_TO_LABEL) - 1}; got {np.unique(y_pred)}"
    )
y_pred = CLASS_TO_LABEL[y_pred]

In [None]:
# Distinct Cover_Type values currently held by the sample submission frame.
submission.Cover_Type.unique()

In [None]:
# Attach the predictions, write the submission file, and preview the first rows.
submission = submission.assign(Cover_Type=y_pred)
submission.to_csv(path_or_buf='submission.csv', float_format='%.6f', index=False)
submission.head(n=20)