In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn import linear_model
from sklearn.cluster import KMeans
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import StackingRegressor
from sklearn.feature_selection import SelectFromModel
from sklearn.impute import SimpleImputer
from sklearn.metrics import make_scorer
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PowerTransformer, MinMaxScaler


# Lift pandas' column-display limit so wide frames are shown with every column.
pd.set_option('display.max_columns', None)

In [2]:
try: 
    import optuna
except:
    !pip install optuna
    import optuna

In [3]:
# Import the project-local CustomPipeline helpers. If the module is not importable
# from the current path, fall back to the '/content/sample_data' directory.
# Only ImportError triggers the fallback; any other error raised while importing
# CustomPipeline is surfaced instead of being swallowed.
try:
    from CustomPipeline import *
except ImportError:
    import sys
    sys.path.insert(0, '/content/sample_data')
    from CustomPipeline import *
    print("ok")

In [4]:
# Load the ensemble prediction files (comma-separated).
# `delimiter` is an alias of `sep`; passing both (as before, with sep='.') is rejected by
# current pandas with "Specified a sep and a delimiter", so only the comma separator is kept.
test_ensemble = pd.read_csv("./en_test_ensemble.csv", sep=",")
# The unlabeled test set is indexed by its 'id' column.
test = pd.read_csv("./en_test_witout_target.csv", sep=",", index_col='id')
train_ensemble = pd.read_csv("./en_train_ensemble.csv", sep=",")

In [5]:
# Load the combined frame that carries the k-means cluster columns (comma-separated).
# Only one of `sep` / `delimiter` may be given to read_csv, so the conflicting sep='.' is dropped.
df = pd.read_csv("./ensemble_with_clusters_1.csv", sep=",")

# Feature engineering with k-means

In [6]:
# Display all rows for the columns from position 25 onwards
# (the two prediction columns, 'label', and the k_means_* cluster columns).
df.iloc[:, 25:]

Unnamed: 0,0,1,label,k_means_cont0cont1,k_means_cont0cont2,k_means_cont0cont3,k_means_cont0cont4,k_means_cont0cont5,k_means_cont0cont6,k_means_cont0cont7,k_means_cont0cont8,k_means_cont0cont9,k_means_cont0cont10,k_means_cont0cont11,k_means_cont0cont12,k_means_cont0cont13,k_means_cont00,k_means_cont01,k_means_cont1cont2,k_means_cont1cont3,k_means_cont1cont4,k_means_cont1cont5,k_means_cont1cont6,k_means_cont1cont7,k_means_cont1cont8,k_means_cont1cont9,k_means_cont1cont10,k_means_cont1cont11,k_means_cont1cont12,k_means_cont1cont13,k_means_cont10,k_means_cont11,k_means_cont2cont3,k_means_cont2cont4,k_means_cont2cont5,k_means_cont2cont6,k_means_cont2cont7,k_means_cont2cont8,k_means_cont2cont9,k_means_cont2cont10,k_means_cont2cont11,k_means_cont2cont12,k_means_cont2cont13,k_means_cont20,k_means_cont21,k_means_cont3cont4,k_means_cont3cont5,k_means_cont3cont6,k_means_cont3cont7,k_means_cont3cont8,k_means_cont3cont9,k_means_cont3cont10,k_means_cont3cont11,k_means_cont3cont12,k_means_cont3cont13,k_means_cont30,k_means_cont31,k_means_cont4cont5,k_means_cont4cont6,k_means_cont4cont7,k_means_cont4cont8,k_means_cont4cont9,k_means_cont4cont10,k_means_cont4cont11,k_means_cont4cont12,k_means_cont4cont13,k_means_cont40,k_means_cont41,k_means_cont5cont6,k_means_cont5cont7,k_means_cont5cont8,k_means_cont5cont9,k_means_cont5cont10,k_means_cont5cont11,k_means_cont5cont12,k_means_cont5cont13,k_means_cont50,k_means_cont51,k_means_cont6cont7,k_means_cont6cont8,k_means_cont6cont9,k_means_cont6cont10,k_means_cont6cont11,k_means_cont6cont12,k_means_cont6cont13,k_means_cont60,k_means_cont61,k_means_cont7cont8,k_means_cont7cont9,k_means_cont7cont10,k_means_cont7cont11,k_means_cont7cont12,k_means_cont7cont13,k_means_cont70,k_means_cont71,k_means_cont8cont9,k_means_cont8cont10,k_means_cont8cont11,k_means_cont8cont12,k_means_cont8cont13,k_means_cont80,k_means_cont81,k_means_cont9cont10,k_means_cont9cont11,k_means_cont9cont12,k_means_cont9cont13,k_means_cont90,k_means_cont91,k_means_cont10cont11,k_m
eans_cont10cont12,k_means_cont10cont13,k_means_cont100,k_means_cont101,k_means_cont11cont12,k_means_cont11cont13,k_means_cont110,k_means_cont111,k_means_cont12cont13,k_means_cont120,k_means_cont121,k_means_cont130,k_means_cont131,k_means_01,k_means_cont4_cont6_,k_means_cont5_cont8_,k_means_cont6_cont7_
0,811.794579,802.49600,test,6,5,3,13,11,2,11,13,12,8,12,15,13,11,8,12,6,2,2,9,5,2,5,3,4,1,7,14,15,5,1,1,14,11,5,9,14,8,0,1,0,1,7,6,8,1,8,2,9,8,0,2,6,1,4,11,14,7,9,2,14,7,14,6,13,9,7,1,12,0,11,1,3,9,4,11,12,15,8,5,13,7,9,10,13,9,5,10,9,6,15,11,14,10,6,13,9,1,6,5,8,7,6,0,10,13,14,10,5,11,6,13,1,13,1,9,13,6,12,8,0,0,0
1,841.286617,838.92790,test,1,14,9,1,4,3,7,7,10,7,10,13,12,15,14,14,11,1,9,3,2,15,2,13,8,12,15,11,11,2,14,10,12,3,5,9,4,14,12,11,1,10,9,4,15,5,13,13,10,5,6,12,7,5,7,7,13,10,4,12,9,14,8,8,4,2,8,11,11,6,8,2,10,8,0,4,9,11,10,14,7,8,12,12,7,10,10,12,0,10,11,8,14,15,15,11,14,12,10,15,11,11,7,12,14,4,10,6,7,12,11,7,12,0,2,15,6,7,10,6,0,0,0
2,837.758268,841.12854,test,12,6,13,4,7,7,10,5,4,7,15,14,2,6,15,10,3,3,1,0,2,6,8,13,2,10,8,0,1,8,4,3,10,3,0,0,12,2,0,2,8,5,3,4,2,13,4,14,5,3,4,13,4,5,10,9,4,8,15,15,10,6,10,15,15,15,0,11,2,15,2,11,11,15,0,3,4,1,2,10,13,0,4,7,7,3,10,1,5,2,6,1,3,15,1,0,13,12,1,1,2,1,11,1,14,8,2,12,4,12,0,8,0,15,8,15,1,4,1,6,0,0,0
3,853.769654,847.46990,test,3,7,4,10,2,8,2,14,5,12,5,5,1,10,1,4,11,3,1,3,4,8,6,4,9,10,8,3,12,2,6,14,2,15,1,10,0,10,14,0,9,13,8,4,10,0,12,5,14,11,2,15,5,8,3,4,11,12,0,7,10,6,10,1,0,11,2,15,2,8,2,0,11,2,12,2,1,0,9,4,0,5,15,0,3,13,9,6,14,13,8,15,9,0,0,4,3,10,14,6,7,1,11,3,1,15,11,9,14,1,0,10,2,15,8,14,14,12,8,0,0,0,0
4,819.820749,811.22650,test,13,3,6,1,13,0,8,7,1,1,0,0,11,8,6,9,15,5,13,9,11,2,15,14,14,2,7,1,0,11,0,0,0,7,2,10,8,10,14,10,13,9,2,6,8,12,8,12,6,0,13,2,15,14,2,1,12,10,13,5,0,0,6,11,1,9,1,1,0,4,0,1,3,3,14,1,12,1,11,10,13,7,8,1,13,14,12,9,1,6,13,6,13,8,12,7,9,5,8,4,2,1,3,6,9,10,8,0,0,6,0,4,7,6,1,3,5,0,9,7,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
283995,869.400623,867.26910,test_ensemble,7,12,11,9,3,5,5,0,2,14,1,9,5,4,4,0,8,1,1,3,1,15,11,7,8,12,1,10,4,12,8,10,6,10,3,7,5,14,12,8,3,12,6,8,12,3,9,11,1,2,3,11,13,6,0,7,12,10,2,5,9,14,15,5,8,11,9,11,1,6,8,2,5,6,5,15,9,9,3,14,8,9,6,4,15,5,7,8,7,3,4,4,5,14,15,11,7,4,15,2,13,2,10,15,7,4,5,13,11,7,11,11,8,11,6,6,3,5,4,4,0,0,0
283996,810.539736,808.34860,test_ensemble,6,5,5,0,11,2,13,13,12,3,9,11,6,11,6,8,10,10,15,4,8,10,0,9,15,4,13,14,10,15,8,1,1,10,9,9,10,4,2,12,0,9,9,15,11,0,6,13,8,10,2,8,6,14,2,10,0,1,4,6,12,5,4,6,1,7,11,5,12,5,6,15,4,9,14,5,6,5,12,15,10,6,9,1,2,11,4,3,15,1,15,6,7,1,14,13,10,1,8,3,15,13,4,0,6,3,4,8,5,6,4,5,1,6,12,9,5,6,9,11,0,0,0
283997,808.015706,809.41300,test_ensemble,8,14,1,1,13,0,7,7,10,0,0,4,11,1,6,14,8,10,15,0,2,1,2,12,9,3,15,5,10,4,14,8,4,0,5,11,2,0,8,10,6,9,6,13,5,2,9,15,7,9,3,6,14,14,2,1,13,10,4,8,0,8,8,14,1,9,10,1,12,0,0,4,13,1,14,3,12,15,8,10,11,12,3,1,7,10,15,1,2,10,1,6,14,10,12,15,14,15,8,5,7,15,7,0,6,1,0,10,15,6,5,14,13,6,5,1,5,14,9,11,0,0,0
283998,785.915281,782.50354,test_ensemble,5,14,12,5,6,7,15,1,4,0,15,2,3,3,3,13,9,7,4,5,12,0,10,15,7,0,2,7,7,10,2,8,4,14,5,11,2,0,7,10,7,3,5,6,8,12,8,10,9,15,0,2,2,3,8,12,1,14,6,10,1,9,11,2,3,9,11,1,10,0,0,7,3,10,6,14,12,1,8,10,6,7,14,2,15,4,6,15,15,0,3,13,13,10,12,13,2,2,3,8,3,10,3,5,13,1,14,10,2,13,8,14,3,12,10,13,4,2,7,12,0,0,0


In [7]:
# Cast the cluster-id columns (position 28 onwards) from int to object so that they are
# later picked up as categorical features by `select_dtypes(object)`.
# An `df.iloc[:, 28:] = ...` assignment is not used here: pandas 2.x writes such values
# in place and keeps the existing int64 dtype, which would silently undo the cast.
df = df.astype(dict.fromkeys(df.columns[28:], 'object'))

In [8]:
# Keep only the columns from position 25 onwards (as an independent copy).
# NOTE: this rebinds `df`, so running the cell a second time drops a further 25 columns;
# re-run from the cell that loads `df` instead of re-executing this cell on its own.
df = df.iloc[:, 25:].copy()

In [9]:
# Show the first two rows of the trimmed frame.
df.head(2)

Unnamed: 0,0,1,label,k_means_cont0cont1,k_means_cont0cont2,k_means_cont0cont3,k_means_cont0cont4,k_means_cont0cont5,k_means_cont0cont6,k_means_cont0cont7,k_means_cont0cont8,k_means_cont0cont9,k_means_cont0cont10,k_means_cont0cont11,k_means_cont0cont12,k_means_cont0cont13,k_means_cont00,k_means_cont01,k_means_cont1cont2,k_means_cont1cont3,k_means_cont1cont4,k_means_cont1cont5,k_means_cont1cont6,k_means_cont1cont7,k_means_cont1cont8,k_means_cont1cont9,k_means_cont1cont10,k_means_cont1cont11,k_means_cont1cont12,k_means_cont1cont13,k_means_cont10,k_means_cont11,k_means_cont2cont3,k_means_cont2cont4,k_means_cont2cont5,k_means_cont2cont6,k_means_cont2cont7,k_means_cont2cont8,k_means_cont2cont9,k_means_cont2cont10,k_means_cont2cont11,k_means_cont2cont12,k_means_cont2cont13,k_means_cont20,k_means_cont21,k_means_cont3cont4,k_means_cont3cont5,k_means_cont3cont6,k_means_cont3cont7,k_means_cont3cont8,k_means_cont3cont9,k_means_cont3cont10,k_means_cont3cont11,k_means_cont3cont12,k_means_cont3cont13,k_means_cont30,k_means_cont31,k_means_cont4cont5,k_means_cont4cont6,k_means_cont4cont7,k_means_cont4cont8,k_means_cont4cont9,k_means_cont4cont10,k_means_cont4cont11,k_means_cont4cont12,k_means_cont4cont13,k_means_cont40,k_means_cont41,k_means_cont5cont6,k_means_cont5cont7,k_means_cont5cont8,k_means_cont5cont9,k_means_cont5cont10,k_means_cont5cont11,k_means_cont5cont12,k_means_cont5cont13,k_means_cont50,k_means_cont51,k_means_cont6cont7,k_means_cont6cont8,k_means_cont6cont9,k_means_cont6cont10,k_means_cont6cont11,k_means_cont6cont12,k_means_cont6cont13,k_means_cont60,k_means_cont61,k_means_cont7cont8,k_means_cont7cont9,k_means_cont7cont10,k_means_cont7cont11,k_means_cont7cont12,k_means_cont7cont13,k_means_cont70,k_means_cont71,k_means_cont8cont9,k_means_cont8cont10,k_means_cont8cont11,k_means_cont8cont12,k_means_cont8cont13,k_means_cont80,k_means_cont81,k_means_cont9cont10,k_means_cont9cont11,k_means_cont9cont12,k_means_cont9cont13,k_means_cont90,k_means_cont91,k_means_cont10cont11,k_m
eans_cont10cont12,k_means_cont10cont13,k_means_cont100,k_means_cont101,k_means_cont11cont12,k_means_cont11cont13,k_means_cont110,k_means_cont111,k_means_cont12cont13,k_means_cont120,k_means_cont121,k_means_cont130,k_means_cont131,k_means_01,k_means_cont4_cont6_,k_means_cont5_cont8_,k_means_cont6_cont7_
0,811.794579,802.496,test,6,5,3,13,11,2,11,13,12,8,12,15,13,11,8,12,6,2,2,9,5,2,5,3,4,1,7,14,15,5,1,1,14,11,5,9,14,8,0,1,0,1,7,6,8,1,8,2,9,8,0,2,6,1,4,11,14,7,9,2,14,7,14,6,13,9,7,1,12,0,11,1,3,9,4,11,12,15,8,5,13,7,9,10,13,9,5,10,9,6,15,11,14,10,6,13,9,1,6,5,8,7,6,0,10,13,14,10,5,11,6,13,1,13,1,9,13,6,12,8,0,0,0
1,841.286617,838.9279,test,1,14,9,1,4,3,7,7,10,7,10,13,12,15,14,14,11,1,9,3,2,15,2,13,8,12,15,11,11,2,14,10,12,3,5,9,4,14,12,11,1,10,9,4,15,5,13,13,10,5,6,12,7,5,7,7,13,10,4,12,9,14,8,8,4,2,8,11,11,6,8,2,10,8,0,4,9,11,10,14,7,8,12,12,7,10,10,12,0,10,11,8,14,15,15,11,14,12,10,15,11,11,7,12,14,4,10,6,7,12,11,7,12,0,2,15,6,7,10,6,0,0,0


In [10]:
# Partition the combined frame by the value of its 'label' column and discard that
# column from each part, leaving only feature columns.
test_new = df.loc[df['label'].eq('test')].drop(columns='label')
train_ensemble_new = df.loc[df['label'].eq('train_ensemble')].drop(columns='label')
test_ensemble_new = df.loc[df['label'].eq('test_ensemble')].drop(columns='label')

In [11]:
# Training features are the 'train_ensemble' rows of `df`; the target is read from the
# separately loaded `train_ensemble` frame.
X = train_ensemble_new
y = train_ensemble['target']
# scikit-learn pairs X and y by row position, so a length mismatch means the two
# sources are out of sync; fail here with a clear message rather than deep inside CV.
assert len(X) == len(y), "train_ensemble_new and train_ensemble['target'] have different lengths"

# Split the feature columns by dtype: numeric ones are scaled, object ones are one-hot encoded.
num_train = X.select_dtypes([int, float])
cat_train = X.select_dtypes(object)

num = list(num_train)
cat = list(cat_train)
# Positional indices of the categorical columns within X.
cat_features = [X.columns.get_loc(i) for i in cat]


def _root_mean_squared_error(y_true, y_pred):
    """Return the root of the mean squared error between `y_true` and `y_pred`.

    Computed explicitly because the `squared=False` argument of
    `mean_squared_error` was deprecated and later removed from scikit-learn.
    For a single target column the value is the same as with `squared=False`.
    """
    return np.sqrt(mean_squared_error(y_true, y_pred))


# Scorer returning the (positive) RMSE; lower is better, so the Optuna study minimises it.
rmse = make_scorer(_root_mean_squared_error)

# Numeric columns: median imputation -> power transform -> rescale with MinMaxScaler.
pipeline_num = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('normal', PowerTransformer()), 
    ('scaling', MinMaxScaler()),  
#     ('bins', KBinsDiscretizer(n_bins = 50))
])
# Categorical columns: one-hot encoding; categories unseen during fit are ignored.
pipeline_cat = Pipeline(steps=[
#     ('encoding', OrdinalEncoder()),
    ('encoding', OneHotEncoder(handle_unknown='ignore')),
])
preprocessor = ColumnTransformer(n_jobs=-1,
    transformers=[
        ('num', pipeline_num, num),
        ('cat', pipeline_cat, cat),
        ], remainder="passthrough")


# The preprocessor is fitted on every row of `df` (train and both test splits), then
# applied to each split separately.
preprocessor.fit(df.drop('label', axis=1))
transform = preprocessor.transform(X)
X_test_ensemble = preprocessor.transform(test_ensemble_new)
X_test = preprocessor.transform(test_new)


In [14]:
def objective(trial):
    """Optuna objective: mean 5-fold CV score of a Lasso-selected Ridge model.

    Parameters
    ----------
    trial : optuna trial object
        Used to sample the Ridge hyper-parameters 'alpha' (float in [0.01, 1])
        and 'random_state' (one of 0, 1, 42).

    Returns
    -------
    float
        Mean of the five per-fold scores produced by the notebook-level `rmse`
        scorer on the preprocessed training matrix `transform` and target `y`.
    """
    # Sample in a fixed order: alpha first, then random_state.
    ridge_alpha = trial.suggest_float('alpha', 0.01, 1)
    ridge_seed = trial.suggest_categorical("random_state", [0, 1, 42])

    # Features are first filtered by a fixed Lasso, then fed to the tuned Ridge.
    selector = SelectFromModel(linear_model.Lasso(alpha=0.6, random_state=1))
    ridge = linear_model.Ridge(alpha=ridge_alpha, random_state=ridge_seed)
    pipeline_ridge = Pipeline(steps=[
        ('feature_selection', selector),
        ('model', ridge),
    ])

    fold_scores = cross_val_score(pipeline_ridge, transform, y, cv=5, scoring=rmse)
    # Print the individual fold scores for this trial.
    print(fold_scores)

    return fold_scores.mean()

In [None]:
# Run a 50-trial Optuna search that minimises the value returned by `objective`.
# NOTE(review): `objective` never reports intermediate values to the trial, so the
# MedianPruner presumably has nothing to act on — confirm whether pruning is intended.
# NOTE(review): no sampler seed is passed, so results may differ between runs.
study = optuna.create_study(direction="minimize", pruner=optuna.pruners.MedianPruner())
study.optimize(objective, n_trials=50)
# Values noted by the author (presumably best scores from earlier runs):
# 72.05929420329886.
# 72.05702112640415.

[32m[I 2021-08-31 22:22:00,726][0m A new study created in memory with name: no-name-058d2001-3821-450a-ab38-bdec066f0ccf[0m
[32m[I 2021-08-31 22:22:55,903][0m Trial 0 finished with value: 72.08986131277832 and parameters: {'alpha': 0.08375057755463385, 'random_state': 1}. Best is trial 0 with value: 72.08986131277832.[0m


[72.3917034  72.28949453 71.6785645  71.88758008 72.20196405]


  max_iter, tol, rng, random, positive)
  max_iter, tol, rng, random, positive)


In [None]:
# Parameters of the best trial ('alpha' and 'random_state', as sampled in `objective`).
best_params = study.best_trial.params
best_params 

# check bias clusters

In [None]:
# Fit a Lasso on the whole preprocessed training matrix using the tuned parameters.
# NOTE(review): `best_params` were tuned for the Ridge step of the pipeline in `objective`
# (after Lasso-based feature selection), not for a standalone Lasso — confirm this reuse
# is intentional.
model = linear_model.Lasso(**best_params)
model.fit(transform, y)

In [None]:
# Display the coefficients of the fitted Lasso.
model.coef_

In [None]:
# Show the first two rows of the untransformed training features.
X.head(2)

In [None]:
# In-sample predictions on the training rows, stored as 'pred_lasso' for the residual plot.
pred = model.predict(transform)
train_ensemble['pred_lasso'] = pred

In [None]:
# RMSE of the fitted model on the labeled hold-out ('test_ensemble') rows.
pred = model.predict(X_test_ensemble)
# The root is taken explicitly: the `squared=False` argument of mean_squared_error
# was deprecated and later removed from scikit-learn.
print(np.sqrt(mean_squared_error(test_ensemble["target"], pred)))

In [None]:
# Predict the unlabeled test rows and store the result as the submission target.
# NOTE(review): predictions are divided by 100 before being stored — presumably the
# training target was scaled by 100 upstream; confirm against the data preparation.
pred = model.predict(X_test)
test['target'] = pred / 100

# Residuals

In [None]:
# Residual of the prediction stored in column '0' against the true target.
train_ensemble['residuals'] = train_ensemble['target'].sub(train_ensemble['0'])
# Flag each row: 1 when the residual is above -180, otherwise 0.
train_ensemble['label'] = train_ensemble['residuals'].gt(-180).astype('int64')

In [None]:
# Residuals of the prediction in column '0' plotted against that prediction,
# coloured and sized by the residual flag in 'label'.
sns.set(rc={'figure.figsize':(6.7,4.27)})
residuals = train_ensemble['target'] - train_ensemble['0']
# x and y are passed by keyword: recent seaborn releases no longer accept them positionally.
sns.scatterplot(x=train_ensemble['0'], y=residuals, hue=train_ensemble['label'], size=train_ensemble['label'])

In [None]:
# Residuals of the Lasso prediction plotted against that prediction,
# coloured and sized by the residual flag in 'label'.
sns.set(rc={'figure.figsize':(6.7,4.27)})
residuals = train_ensemble['target'] - train_ensemble['pred_lasso']
# x and y are passed by keyword: recent seaborn releases no longer accept them positionally.
sns.scatterplot(x=train_ensemble['pred_lasso'], y=residuals, hue=train_ensemble['label'], size=train_ensemble['label'])

# Save predictions

In [None]:
# Preview the test frame, which now carries the predicted 'target' column.
test.head()

In [None]:
# Write only the 'target' column to the submission file, keeping the frame's index
# (the 'id' column chosen as index when `test` was loaded).
test[['target']].to_csv('./ensemble_for_submit_with_clusters.csv', index=True)