# Tabular Playground Series - Feb 2021

## Imports

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, KFold

## Read datasets

In [2]:
# Load the training data; the `id` column becomes the row index.
train = pd.read_csv(filepath_or_buffer='train.zip', index_col='id')
# Preview two randomly chosen rows.
display(train.sample(n=2))

Unnamed: 0_level_0,cat0,cat1,cat2,cat3,cat4,cat5,cat6,cat7,cat8,cat9,...,cont5,cont6,cont7,cont8,cont9,cont10,cont11,cont12,cont13,target
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
56956,A,B,A,A,B,B,A,E,E,L,...,0.209124,0.340411,0.679454,0.358233,0.332291,0.295032,0.360515,0.264896,0.700195,8.960819
401413,A,A,A,C,B,B,A,E,E,K,...,0.203811,0.353985,0.402373,0.267595,0.305343,0.396868,0.227666,0.249721,0.556236,6.817004


In [3]:
# Load the competition test data; the `id` column becomes the row index.
test = pd.read_csv(filepath_or_buffer='test.zip', index_col='id')
# Preview two randomly chosen rows.
display(test.sample(n=2))

Unnamed: 0_level_0,cat0,cat1,cat2,cat3,cat4,cat5,cat6,cat7,cat8,cat9,...,cont4,cont5,cont6,cont7,cont8,cont9,cont10,cont11,cont12,cont13
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
363264,A,A,A,C,B,B,A,E,G,F,...,0.277032,0.595995,0.985877,0.696328,0.691716,0.619671,0.652784,0.928903,0.324949,0.725156
264626,A,B,A,C,B,B,A,E,E,I,...,0.491876,0.249672,0.256255,0.443257,0.380537,0.354388,0.247124,0.448589,0.287455,0.220694


In [4]:
# Load the sample submission; its `target` column is overwritten by the
# prediction cells further down.
submission = pd.read_csv(filepath_or_buffer='sample_submission.csv', index_col='id')
# Preview two randomly chosen rows.
display(submission.sample(n=2))

Unnamed: 0_level_0,target
id,Unnamed: 1_level_1
146895,0.5
124637,0.5


## EDA

## Feature Selection

In [None]:
!pip install -q lofo-importance
from lofo import LOFOImportance, Dataset, plot_importance
dataset = Dataset(df=train, target='target', features=[col for col in train.columns if col != 'target'])
cv = KFold(n_splits=5, shuffle=True, random_state=42)
lofo_imp = LOFOImportance(dataset, cv=cv, scoring="neg_root_mean_squared_error")
importance_df = lofo_imp.get_importance()
plot_importance(importance_df, figsize=(6,10))

In [5]:
# Remove the same set of features from both frames so that train and test
# keep matching columns. The list is written once to keep the two calls in sync.
dropped_features = ['cat4', 'cat5', 'cat7', 'cont2', 'cont4', 'cont7', 'cont12']
train = train.drop(columns=dropped_features)
test = test.drop(columns=dropped_features)

## Split

In [None]:
# Separate the regression label from the feature columns.
y = train['target']
X = train.drop(columns='target')

# Hold out 20% of the rows; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

## LazyPredict

In [None]:
!pip install -U -q pip
!pip install -U -q setuptools
!pip install -q lazypredict
from lazypredict.Supervised import LazyRegressor

In [None]:
# Fit LazyPredict on the training split and evaluate on the hold-out split.
reg = LazyRegressor(
    verbose=0,
    ignore_warnings=True,
)
lazy_results = reg.fit(X_train, X_test, y_train, y_test)
models, predictions = lazy_results

In [None]:
# Show the LazyPredict results as the cell's rich output instead of printing
# them as plain text.
models

## EvalML

In [None]:
!pip install -U -q pip
!pip install -U -q setuptools
!pip install -q evalml

In [None]:
import evalml
from evalml.automl import AutoMLSearch

In [None]:
# Re-split X / y with EvalML's own helper; this rebinds the split variables
# created in the earlier "Split" section.
evalml_split = evalml.preprocessing.split_data(
    X,
    y,
    problem_type='regression',
)
X_train, X_test, y_train, y_test = evalml_split

In [None]:
# Search configuration: RMSE is the primary objective, with R2 and MSE
# passed as additional objectives.
automl_config = dict(
    X_train=X_train,
    y_train=y_train,
    problem_type='regression',
    objective='Root Mean Squared Error',
    additional_objectives=['R2', 'MSE'],
    optimize_thresholds=True,
    max_batches=2,
    ensembling=True,
)
automl = AutoMLSearch(**automl_config)
automl.search()

In [None]:
# Display the rankings table produced by the AutoML search.
automl.rankings

In [None]:
# Describe the pipeline listed in the first row of the rankings table.
top_ranked_row = automl.rankings.iloc[0]
automl.describe_pipeline(top_ranked_row["id"])

In [None]:
# Fit the best pipeline on the training split, then score it on the hold-out
# split for the three objectives of interest.
report_objectives = ["Root Mean Squared Error", "R2", "MSE"]
automl.best_pipeline.fit(X_train, y_train)
automl.best_pipeline.score(
    X_test,
    y_test,
    objectives=report_objectives,
)

In [None]:
# Predict the competition test set with the best EvalML pipeline and write
# the submission file.
submission['target'] = automl.best_pipeline.predict(test)
# The file name previously contained a stray space before ".csv".
submission.to_csv('submission_evalml.csv')

In [None]:
# Preview the first rows of the submission instead of rendering the whole frame.
submission.head()

## AutoGluon

In [7]:
!pip install -U -q pip
!pip install -U -q setuptools
!pip install -U -q mxnet
!pip install -q autogluon
from autogluon.tabular import TabularPredictor

In [8]:
# Configure an AutoGluon regressor on the `target` column, evaluated by RMSE.
predictor = TabularPredictor(
    label='target',
    problem_type='regression',
    eval_metric='root_mean_squared_error',
)
# Fit on the full training frame with a time limit of 1800 seconds; the name
# is rebound to whatever `fit` returns, as in the chained form.
predictor = predictor.fit(
    train,
    time_limit=1800,
    presets='high_quality_fast_inference_only_refit',
)

  and should_run_async(code)
No path specified. Models will be saved in: "AutogluonModels/ag-20210422_130837/"
Presets specified: ['high_quality_fast_inference_only_refit']
Beginning AutoGluon training ... Time limit = 1800s
AutoGluon will save models to "AutogluonModels/ag-20210422_130837/"
AutoGluon Version:  0.1.0
Train Data Rows:    300000
Train Data Columns: 17
Preprocessing data ...
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
NumExpr defaulting to 4 threads.
	Available Memory:                    17961.14 MB
	Train Data (Original)  Memory Usage: 145.8 MB (0.8% of available memory)
	Inferring data type of each feature based on column values. Set feature_metadata_in to manually specify special dtypes of the features.
	Stage 1 Generators:
		Fitting AsTypeFeatureGenerator...
	Stage 2 Generators:
		Fitting FillNaFeatureGenerator...
	Stage 3 Generators:
		Fitting IdentityFeatureGenerator...
		Fitting CategoryFeatureGenerator...
			Fittin

In [9]:
# Display the AutoGluon leaderboard of the trained models.
predictor.leaderboard()

                         model  score_val  pred_time_val    fit_time  pred_time_val_marginal  fit_time_marginal  stack_level  can_infer  fit_order
0          WeightedEnsemble_L3  -0.846289     660.687930  752.759294                0.006846           5.566365            3      False          9
1              LightGBM_BAG_L2  -0.846454     298.660806  702.663576                1.378855         204.044627            2      False          7
2              LightGBM_BAG_L1  -0.846537       5.450410  408.794699                5.450410         408.794699            1      False          3
3          WeightedEnsemble_L2  -0.846537       5.457523  413.979220                0.007113           5.184520            2      False          5
4            LightGBMXT_BAG_L2  -0.847844     298.797114  631.543463                1.515164         132.924514            2      False          8
5            LightGBMXT_BAG_L1  -0.875848       0.602385   31.577895                0.602385          31.577895       

Unnamed: 0,model,score_val,pred_time_val,fit_time,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,WeightedEnsemble_L3,-0.846289,660.68793,752.759294,0.006846,5.566365,3,False,9
1,LightGBM_BAG_L2,-0.846454,298.660806,702.663576,1.378855,204.044627,2,False,7
2,LightGBM_BAG_L1,-0.846537,5.45041,408.794699,5.45041,408.794699,1,False,3
3,WeightedEnsemble_L2,-0.846537,5.457523,413.97922,0.007113,5.18452,2,False,5
4,LightGBMXT_BAG_L2,-0.847844,298.797114,631.543463,1.515164,132.924514,2,False,8
5,LightGBMXT_BAG_L1,-0.875848,0.602385,31.577895,0.602385,31.577895,1,False,4
6,KNeighborsUnif_BAG_L2,-0.921975,659.302229,543.148301,362.020279,44.529352,2,False,6
7,KNeighborsUnif_BAG_L1,-0.952763,149.501869,32.103163,149.501869,32.103163,1,False,1
8,KNeighborsDist_BAG_L1,-0.953635,141.727287,26.143192,141.727287,26.143192,1,False,2
9,WeightedEnsemble_L3_FULL,,,97.543239,0.009884,0.543953,3,True,18


In [10]:
# Predict the competition test set with AutoGluon and write the submission file.
autogluon_preds = predictor.predict(test)
submission['target'] = autogluon_preds
submission.to_csv(path_or_buf='submission_autogluon.csv')

  and should_run_async(code)


## AutoKeras

In [None]:
!pip install -q autokeras
import tensorflow as tf
import autokeras as ak

In [None]:
# Split the labelled data into a fitting part and a hold-out part and write
# both to CSV, because the AutoKeras cells below are given file paths.
# The first 80% of the rows are used for fitting and the remaining 20% for
# evaluation, in line with the 80/20 split used in the "Split" section; the
# previous 0.2 factor left only 20% of the rows for fitting.
train_fraction = 0.8
train_size = int(train.shape[0] * train_fraction)
train_file_path = "train_new.csv"
test_file_path = "test_new.csv"
# Explicit positional slices; the `id` index is not written (index=False).
train.iloc[:train_size].to_csv(train_file_path, index=False)
train.iloc[train_size:].to_csv(test_file_path, index=False)

In [None]:
# Configure the AutoKeras structured-data regressor (at most 5 trials, MSE loss).
# Note: this rebinds `reg`, which the LazyPredict section also uses.
reg = ak.StructuredDataRegressor(
    overwrite=True,
    max_trials=5,
    loss="mean_squared_error",
)
# Fit from the training CSV, with "target" as the label column, for 10 epochs.
reg.fit(train_file_path, "target", epochs=10)

In [None]:
# Evaluate the fitted regressor on the hold-out CSV and print the result.
holdout_metrics = reg.evaluate(test_file_path, "target")
print(holdout_metrics)

In [None]:
# Predict the competition test set with AutoKeras and write the submission file.
autokeras_preds = reg.predict(test)
submission['target'] = autokeras_preds
submission.to_csv(path_or_buf='submission_autokeras.csv')

## PyCaret