# Tabular Playground Series - Feb 2021

## Imports

In [None]:
#!pip install -U -q pip==21.0.1
#!pip install -U -q setuptools
!pip install -U -q scikit-learn==0.23.2

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, KFold

## Read datasets

In [None]:
train = pd.read_csv('train.zip', index_col='id')
display(train.sample(2))

In [None]:
test = pd.read_csv('test.zip', index_col='id')
display(test.sample(2))

In [None]:
submission = pd.read_csv('sample_submission.csv', index_col='id')
display(submission.sample(2))

## EDA

## Feature Selection

In [None]:
!pip install -q lofo-importance
from lofo import LOFOImportance, Dataset, plot_importance
dataset = Dataset(df=train, target='target', features=[col for col in train.columns if col != 'target'])
cv = KFold(n_splits=5, shuffle=True, random_state=42)
lofo_imp = LOFOImportance(dataset, cv=cv, scoring="neg_root_mean_squared_error")
importance_df = lofo_imp.get_importance()
plot_importance(importance_df, figsize=(6,10))

In [None]:
# Columns to remove from both the training and the test table.
# Every candidate is currently commented out, so the list is empty and nothing is dropped;
# uncomment an entry to drop that column from both tables at once.
columns_to_drop = [
    # 'cat4',
    # 'cat5',
    # 'cat7',
    # 'cont2',
    # 'cont4',
    # 'cont7',
    # 'cont12',
]
train = train.drop(columns=columns_to_drop)
test = test.drop(columns=columns_to_drop)

## Split

In [None]:
# Keep a reproducible 25% random sample of the training rows (presumably to speed up the runs).
# NOTE(review): this rebinds `train`, so every later cell that reads `train` sees only the
# sample, and re-running this cell shrinks it again.
train = train.sample(frac=0.25, random_state=42)

# Separate the regression target from the predictor columns.
y = train['target']
X = train.drop(columns=['target'])

# Hold out 20% of the sampled rows for evaluation, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

## LazyPredict

In [None]:
!pip install -U -q pip==21.0.1
!pip install -U -q setuptools
!pip install -q lazypredict
from lazypredict.Supervised import LazyRegressor

In [None]:
# Run LazyPredict's regressor sweep on the hold-out split created in the "Split" section.
# `fit` takes both the training and the evaluation split and returns two objects,
# unpacked here as `models` and `predictions`.
reg = LazyRegressor(verbose=0, ignore_warnings=True)
models,predictions = reg.fit(X_train, X_test, y_train, y_test)

In [None]:
# Print the `models` object returned by the sweep.
print(models)

## EvalML

In [None]:
!pip install -U -q pip==21.0.1
!pip install -U -q setuptools
!pip install -q evalml

In [None]:
import evalml
from evalml.automl import AutoMLSearch

In [None]:
# Re-split X / y with EvalML's own helper; this overwrites the X_train / X_test /
# y_train / y_test produced earlier by scikit-learn's train_test_split.
X_train, X_test, y_train, y_test = evalml.preprocessing.split_data(X, y, problem_type='regression')

In [None]:
# Configure an EvalML regression search ranked by RMSE, also reporting R2 and MSE.
# The search is limited to two batches and has ensembling enabled.
automl = AutoMLSearch(
    X_train=X_train,
    y_train=y_train,
    problem_type='regression',
    objective='Root Mean Squared Error',
    additional_objectives=['R2', 'MSE'],
    # NOTE(review): threshold optimisation looks classification-specific; confirm it has
    # any effect for a regression problem.
    optimize_thresholds=True,
    max_batches=2,
    ensembling=True,
)

# Run the search.
automl.search()

In [None]:
# Show the rankings table produced by the search.
automl.rankings

In [None]:
# Describe the pipeline whose id is in the first row of the rankings table.
automl.describe_pipeline(automl.rankings.iloc[0]["id"])

In [None]:
# Fit the best pipeline on the training split, then score it on the hold-out split
# with the same three objectives used during the search.
automl.best_pipeline.fit(X_train, y_train)
automl.best_pipeline.score(X_test, y_test, objectives=["Root Mean Squared Error", "R2", "MSE"])

In [None]:
submission['target'] = automl.best_pipeline.predict(test)
submission.to_csv('submission_evalml .csv')

In [None]:
submission

## AutoGluon

In [None]:
!pip install -U -q pip==21.0.1
#!pip install -U -q setuptools
!pip install -U -q mxnet
!pip install -q autogluon
from autogluon.tabular import TabularPredictor

In [None]:
# Train an AutoGluon tabular regressor on `train` (target column included), evaluated by RMSE.
# No time limit is set; the commented-out `time_limit` can be restored to cap training time.
predictor = TabularPredictor(
    label='target',
    problem_type='regression',
    eval_metric='root_mean_squared_error',
).fit(
    train,
    # time_limit=1800,
    presets='high_quality_fast_inference_only_refit',
)

In [None]:
# Show the leaderboard of the models trained by AutoGluon.
predictor.leaderboard()

In [None]:
# Predict the competition test set and write the AutoGluon submission file.
submission['target'] = predictor.predict(test)
submission.to_csv('submission_autogluon.csv')

## AutoKeras

In [None]:
!pip install -U -q pip==21.0.1
#!pip install -U -q setuptools
!pip install -q autokeras
import tensorflow as tf
import autokeras as ak

In [None]:
# CSV files used to fit and to evaluate the AutoKeras model.
train_file_path = "train_new.csv"
test_file_path = "test_new.csv"

# The first 10% of the rows go to the fitting file and the remaining 90% to the
# evaluation file; the `id` index is not written to either file.
# NOTE(review): a 10/90 split is unusual - confirm 0.1 (rather than 0.9) is intended.
train_size = int(len(train) * 0.1)
train.iloc[:train_size].to_csv(train_file_path, index=False)
train.iloc[train_size:].to_csv(test_file_path, index=False)

In [None]:
# AutoKeras structured-data regressor: up to 5 trials, optimising mean squared error.
# `overwrite=True` is passed so the regressor does not reuse an earlier run's saved state
# (presumably - confirm against the AutoKeras docs).
reg = ak.StructuredDataRegressor(
    overwrite=True,
    max_trials=5,
    loss="mean_squared_error",
)

# Fit from the CSV file written above, with "target" as the label column, for 10 epochs.
reg.fit(train_file_path, "target", epochs=10)

In [None]:
# Evaluate the fitted regressor on the evaluation CSV and print the result.
print(reg.evaluate(test_file_path, "target"))

In [None]:
# Predict the competition test set (passed as the in-memory `test` table) and write
# the AutoKeras submission file.
submission['target'] = reg.predict(test)
submission.to_csv('submission_autokeras.csv')

## AutoML Alex

## PyCaret

In [None]:
#!pip install -U -q pip==21.0.1
#!pip install -U -q setuptools
!pip install -U -q pycaret[full]

from pycaret.regression import *

In [None]:
# Split `train` into a 90% modelling sample and the remaining 10% "unseen" rows.
data = train.sample(frac=0.9, random_state=42)
# The unseen rows are selected by index label, so this must run before the index is reset.
data_unseen = train.drop(data.index)

# Replace the `id` index of both parts with a fresh 0..n-1 index.
data = data.reset_index(drop=True)
data_unseen = data_unseen.reset_index(drop=True)

print(f'Data for Modeling: {data.shape}')
print(f'Unseen Data For Predictions {data_unseen.shape}')

In [None]:
# Initialise the PyCaret regression experiment.
# The return value of `setup` is bound to `reg`, rebinding the name that held the
# AutoKeras regressor in the previous section.
# NOTE(review): `train` is passed here, so the `data` / `data_unseen` split built in the
# previous cell is not used - confirm which table is intended.
reg = setup(
    data=train,
    target='target',
    html=True,
    silent=True,
    verbose=False,
    session_id=42,
    preprocess=True,
    train_size=0.9,

    # Optional scaling / transformation / PCA (disabled):
    # normalize=True, normalize_method='minmax',
    # transformation=True, transformation_method='quantile',
    # pca=True, pca_method='linear', pca_components=5,

    # Optional polynomial / trigonometric features (disabled):
    # polynomial_features=True, polynomial_degree=3,
    # trigonometry_features=True,
    # polynomial_threshold=0.1,

    # Optional feature interactions / ratios (disabled):
    # feature_interaction=True,
    # feature_ratio=True,
    # interaction_threshold=0.01,

    # Optional final feature selection (disabled):
    # feature_selection=True, feature_selection_threshold=0.1, feature_selection_method='boruta',

    # data_split_shuffle=True,
)

In [None]:
# List the models available in PyCaret's regression module.
# (`models` here is the function brought in by `from pycaret.regression import *`,
# which replaces the LazyPredict result of the same name.)
models()

In [None]:
# Compare the candidate models with 5-fold CV, sort by RMSE and keep the single best one.
best = compare_models(sort='RMSE', fold=5, n_select=1, turbo=True,)

In [None]:
# Train a CatBoost model with 5-fold CV.
reg1 = create_model('catboost', fold=5)

In [None]:
# Train a LightGBM model with 5-fold CV.
reg2 = create_model('lightgbm', fold=5)

In [None]:
# Predict the competition test set with the best model from `compare_models` and write
# the submission file.
# `reg` is the return value of `setup()`, not a trained estimator, so it cannot be
# passed to `predict_model`; `best` is the model selected above.
submission['target'] = predict_model(best, data=test).Label
submission.to_csv('submission_pycaret_reg.csv')

In [None]:
opt = tune_model(reg, optimize = 'RMSE', fold=5, search_library='tune-sklearn', search_algorithm='bayesian', choose_better=True)
display(opt)

In [None]:
submission['target'] = predict_model(opt, data=test).Label
submission.to_csv('submission_pycaret_opt.csv')

In [None]:
bag = ensemble_model(reg, method = 'Bagging', fold=5, optimize = 'RMSE', choose_better=True)
display(bag)

In [None]:
submission['target'] = predict_model(bag, data=test).Label
submission.to_csv('submission_pycaret_bag.csv')

In [None]:
bst = ensemble_model(reg, method = 'Boosting', fold=5, optimize = 'RMSE', choose_better=True)
display(bag)

In [None]:
submission['target'] = predict_model(bst, data=test).Label
submission.to_csv('submission_pycaret_bst.csv')