In [1]:
from sklearn import metrics, model_selection, ensemble
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.preprocessing import PowerTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import plot_precision_recall_curve
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import make_scorer
from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import cross_val_score

from sklearn.preprocessing import KBinsDiscretizer
import xgboost as xgb
from sklearn import linear_model
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import StackingRegressor
import xgboost


pd.set_option('display.max_columns', None)

  import pandas.util.testing as tm


In [2]:
try: 
    import optuna
except:
    !pip install optuna
    import optuna

In [3]:
try:
    from catboost import Pool, CatBoostRegressor, cv
except:
    !pip install catboost 
    from catboost import Pool, CatBoostRegressor, cv

In [4]:
try:
    from CustomPipeline import *
except:
    import sys
    sys.path.insert(0,'/content/sample_data')
    from CustomPipeline import *
    print("ok")

In [5]:
try:
    train = pd.read_csv("./train_anomaly.csv", delimiter=",", sep='.')
except:
    train = pd.read_csv("./sample_data/train_anomaly.csv", delimiter=",", sep='.')

In [6]:
try:
    from catboost import Pool, CatBoostRegressor, cv
except:
    !pip install catboost 
    from catboost import Pool, CatBoostRegressor, cv

In [7]:
RANDOM_STATE = 42

In [8]:
train.drop("id", axis=1, inplace=True)
train["target"] = 100 * train["target"]

In [68]:
# test run
# train, test = train_test_split(train, test_size=0.2, random_state=RANDOM_STATE)

In [69]:
train, test_ensemble = train_test_split(train, test_size=0.1, random_state=RANDOM_STATE)
train_estimators, train_ensemble = train_test_split(train, test_size=0.2, random_state=RANDOM_STATE)

In [70]:
X = train_estimators.drop(["target"], axis=1)
y = train_estimators["target"]

In [71]:
num = TypesOfColumns(X).get_num()
cat = TypesOfColumns(X).get_cat()
cat_idx = TypesOfColumns(X).get_cat_idx()

# linear ridge

In [72]:
params_linear = {'alpha': 0.09}
n_bins=63

X_lr = train_estimators.query("target > 600").drop(["target"], axis=1)
y_lr = train_estimators.query("target > 600")["target"]

model = linear_model.Ridge(**params_linear)
lr = LinearWrapper(model, bins_linear=n_bins, cat=cat, num=num)

# xgb

In [77]:
params_xgb = {
    'tree_method':'gpu_hist',
    'random_state': 1, 
    'n_jobs': 4,
    'booster': 'gbtree',
    'n_estimators': 10000,
    'learning_rate': 0.035,
    'reg_lambda': 1.22,
    'reg_alpha': 36.04,
    'subsample': 0.9,
    'colsample_bytree': 0.11,
    'max_depth': 3,
    'min_child_weight': 6
}
model = xgboost.XGBRegressor(**params_xgb)
xgb = XGBWrapper(model, cat=cat, num=num)

# catboost

In [80]:
params_cat = {'iterations': 326, 
    'depth': 4, 
    'loss_function':"RMSE",
    'random_strength': 34, 
    'bagging_temperature': 0.2, 
    'learning_rate': 0.466, 
    'l2_leaf_reg': 24.26
}


model = CatBoostRegressor(**params_cat)
catboost = CatBoostWrapper(model, cat_features=cat_idx)

# random forest

In [None]:
from sklearn.ensemble import RandomForestRegressor

params_rf = {
    'random_state':42, 
    'n_jobs':-1
}

model = RandomForestRegressor(**params_rf)
rf = XGBWrapper(model, cat=cat, num=num)

# knn

In [None]:
from sklearn.neighbors import KNeighborsRegressor

params_knn = {
    'n_jobs':-1
}

model = KNeighborsRegressor(**params_knn)
knn = XGBWrapper(model, cat=cat, num=num)

# StackingRegressor

In [None]:
# test = pd.read_csv("./sample_data/test_anomaly.csv", delimiter=",", sep='.')

In [None]:
models = [lr, xgb]
# models = [catboost]

df_ensemble = pd.DataFrame()
df_test = pd.DataFrame()
df_submit = pd.DataFrame()

for i, model in enumerate(models):
    if model == lr:
        model.train(X_lr, y_lr)
    else:
        model.train(X, y)

    pred = model.predict(train_ensemble.drop(['target'], axis=1))
#     print(pred)
    train_ensemble[str(i)] = pred
    pred = model.predict(test_ensemble.drop(['target'], axis=1))
#     print(pred)
    test_ensemble[str(i)] = pred
    pred = model.predict(test)
#     print(pred)
    test[str(i)] = pred

In [None]:
# train_ensemble.to_csv('./train_ensemble.csv', index=False)
# test_ensemble.to_csv('./test_ensemble.csv', index=False)
# test.to_csv('./test.csv', index=False)

In [9]:
test_ensemble = pd.read_csv("./en_test_ensemble.csv", delimiter=",", sep='.')
test = pd.read_csv("./en_test_witout_target.csv", delimiter=",", sep='.')
train_ensemble = pd.read_csv("./en_train_ensemble.csv", delimiter=",", sep='.')

In [10]:
train_ensemble

Unnamed: 0,cat0,cat1,cat2,cat3,cat4,cat5,cat6,cat7,cat8,cat9,cont0,cont1,cont2,cont3,cont4,cont5,cont6,cont7,cont8,cont9,cont10,cont11,cont12,cont13,target,anomaly_col,0,1
0,A,A,A,C,B,B,A,E,E,H,0.263894,0.422811,0.455877,0.549582,0.285487,0.565569,-0.141368,0.688337,0.691903,0.439211,0.418874,0.053918,0.198492,0.864552,719.973554,Norm,822.393348,840.87530
1,B,B,A,A,C,C,A,E,A,F,0.440252,0.415761,0.546531,0.704333,0.687984,0.158406,0.544799,0.632793,0.316917,0.245392,0.292086,0.583627,0.263786,0.248510,813.072326,Anomaly,821.181948,802.09760
2,B,A,A,C,B,B,C,E,C,G,0.556213,0.087407,0.509698,0.549791,0.286923,0.358378,0.465119,0.389759,0.282789,0.392988,0.262058,0.428709,0.353908,0.766063,953.767401,Anomaly,862.212604,858.83620
3,A,A,A,C,B,B,A,E,C,A,0.091434,0.317570,1.028451,0.281829,0.699003,0.495328,0.407554,0.203858,0.375169,0.153842,0.315238,0.200414,0.302480,0.299906,766.967046,Norm,830.719295,813.61000
4,A,B,A,C,B,D,A,E,A,I,0.621734,0.816455,0.297241,0.536833,0.524217,0.803557,0.150297,0.607898,0.259313,0.275478,0.507401,0.845441,0.783878,0.169852,816.300457,Norm,813.834500,805.53284
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
53995,A,B,A,C,B,B,A,E,C,F,0.713022,0.251317,0.612808,0.721725,0.342860,0.819535,0.881754,0.472204,0.412661,0.558916,0.454587,0.340566,0.397448,0.287042,874.369410,Norm,838.510350,844.04315
53996,B,B,A,C,B,B,A,B,A,G,0.437447,0.234785,0.505016,0.539729,0.286069,0.300176,0.553383,0.559673,0.839478,0.827247,0.276473,0.754376,0.284526,0.523540,826.677350,Anomaly,809.192132,807.62120
53997,A,B,A,C,B,B,A,E,C,I,0.217517,0.764528,0.323548,0.465224,0.531727,0.462099,0.383066,0.350403,0.375989,0.315024,0.080939,0.371964,0.591805,0.683887,705.166978,Norm,795.673766,803.96590
53998,B,B,A,C,B,B,A,E,E,I,0.382687,0.772405,0.676804,0.367690,0.569831,0.422857,0.426998,0.499143,0.325629,0.365058,0.050370,0.052908,0.173300,0.587090,818.386556,Norm,783.106832,750.08276


In [11]:
X = train_ensemble.drop(['target'], axis=1)
y = train_ensemble['target']

num_train = X.select_dtypes([int, float])
cat_train = X.select_dtypes(object)

num = list(num_train)
cat = list(cat_train)

rmse = make_scorer(mean_squared_error, squared=False)

pipeline_num = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaling', StandardScaler()),  
    ('normal', PowerTransformer())
])
pipeline_cat = Pipeline(steps=[
    # ('imputer', SimpleImputer(strategy='most_frequent')),
    # ('encoding', OrdinalEncoder()),
    ('encoding', OneHotEncoder(handle_unknown='ignore')),
])
preprocessor = ColumnTransformer(n_jobs=-1,
    transformers=[
        ('num', pipeline_num, num),
        ('cat', pipeline_cat, cat),
        ], remainder="passthrough")

transform = preprocessor.fit_transform(X)

def objective(trial):
    
    param_model = {
        'alpha': trial.suggest_loguniform('alpha', 1e-2, 1.0),
        'random_state':trial.suggest_categorical("random_state", [0, 42]),
    }
    
    pipeline_ridge = Pipeline(steps=[ 
                                     ('model', linear_model.Lasso(**param_model)),
                                     ])

    rmse_mean_cv = cross_val_score(pipeline_ridge, transform, y, cv=5, scoring=rmse)
    print(rmse_mean_cv)

    return rmse_mean_cv.mean()


# def objective(trial):
    
#     param_model = {
#         'tree_method':'approx',
#         'n_estimators': 5000,
#         'eval_metric':'rmse',
#         'colsample_bytree': trial.suggest_float('colsample_bytree', 0.3, 0.9),
#         'learning_rate': trial.suggest_float('learning_rate', 1e-2, 1),
#         'max_depth': trial.suggest_int('max_depth', 4, 6),
#         'objective': trial.suggest_categorical("objective", ['reg:squarederror']),
#         'reg_alpha': trial.suggest_float('reg_alpha', 1e-2, 1000),
#         'reg_lambda': trial.suggest_int('reg_lambda', 1, 100), 
#         'subsample': trial.suggest_float('subsample', 0.5, 0.9)
#     }
#     pipeline_ridge = Pipeline(steps=[ 
#                                      ('model', xgboost.XGBRegressor(**param_model)),
#                                      ])

#     rmse_mean_cv = cross_val_score(pipeline_ridge, X, y, cv=5, scoring=rmse, n_jobs=-1)
#     print(rmse_mean_cv)

#     return rmse_mean_cv.mean()

In [12]:
study = optuna.create_study(direction="minimize", pruner=optuna.pruners.MedianPruner())
study.optimize(objective, n_trials=10)

[32m[I 2021-08-27 15:57:30,662][0m A new study created in memory with name: no-name-e9abb1ad-129f-4447-a9dc-a03deb96962a[0m
[32m[I 2021-08-27 15:57:32,110][0m Trial 0 finished with value: 72.05947747262431 and parameters: {'alpha': 0.21835757368184883, 'random_state': 42}. Best is trial 0 with value: 72.05947747262431.[0m


[72.30284566 72.22828324 71.66470721 71.89434073 72.20721052]


[32m[I 2021-08-27 15:57:35,240][0m Trial 1 finished with value: 72.07575655156508 and parameters: {'alpha': 0.024118425475249422, 'random_state': 42}. Best is trial 0 with value: 72.05947747262431.[0m


[72.30991625 72.28181161 71.66621849 71.89704599 72.22379041]


[32m[I 2021-08-27 15:57:36,569][0m Trial 2 finished with value: 72.05928410657759 and parameters: {'alpha': 0.6106770790022757, 'random_state': 42}. Best is trial 2 with value: 72.05928410657759.[0m


[72.31650428 72.22814257 71.66136058 71.88262151 72.20779159]


[32m[I 2021-08-27 15:57:38,637][0m Trial 3 finished with value: 72.07148182703123 and parameters: {'alpha': 0.05571983134626278, 'random_state': 0}. Best is trial 2 with value: 72.05928410657759.[0m


[72.31228571 72.26430603 71.66478748 71.90074496 72.21528495]


[32m[I 2021-08-27 15:57:42,461][0m Trial 4 finished with value: 72.07713097938151 and parameters: {'alpha': 0.017030720831404004, 'random_state': 0}. Best is trial 2 with value: 72.05928410657759.[0m


[72.30802483 72.28725077 71.66841662 71.89360318 72.22835951]


[32m[I 2021-08-27 15:57:43,813][0m Trial 5 finished with value: 72.06055122936269 and parameters: {'alpha': 0.37628168966642295, 'random_state': 0}. Best is trial 2 with value: 72.05928410657759.[0m


[72.30980014 72.23004701 71.66432985 71.88989875 72.2086804 ]


[32m[I 2021-08-27 15:57:45,022][0m Trial 6 finished with value: 72.0604518297869 and parameters: {'alpha': 0.8599853430455336, 'random_state': 42}. Best is trial 2 with value: 72.05928410657759.[0m


[72.32335395 72.22993748 71.66007732 71.87995238 72.20893802]


[32m[I 2021-08-27 15:57:46,567][0m Trial 7 finished with value: 72.06620306969901 and parameters: {'alpha': 0.09023447691021283, 'random_state': 42}. Best is trial 2 with value: 72.05928410657759.[0m


[72.30808423 72.24770771 71.66387959 71.90086358 72.21048024]


[32m[I 2021-08-27 15:57:48,172][0m Trial 8 finished with value: 72.0626460889379 and parameters: {'alpha': 0.115632714796668, 'random_state': 42}. Best is trial 2 with value: 72.05928410657759.[0m


[72.30414653 72.23992338 71.66215651 71.89832585 72.20867818]


[32m[I 2021-08-27 15:57:51,294][0m Trial 9 finished with value: 72.07582095625051 and parameters: {'alpha': 0.023619795613825902, 'random_state': 42}. Best is trial 2 with value: 72.05928410657759.[0m


[72.30970892 72.2821461  71.66634131 71.89683119 72.22407726]


In [13]:
best_params = study.best_trial.params
best_params

{'alpha': 0.6106770790022757, 'random_state': 42}

In [14]:
rmse = make_scorer(mean_squared_error, squared=False)
X_test = preprocessor.fit_transform(test_ensemble.drop(["target"], axis=1))

In [15]:
transform.shape

(54000, 74)

In [16]:
X_test = preprocessor.fit_transform(test.drop('id', axis=1))
X_test.shape

(200000, 74)

In [23]:
preprocessor.named_transformers_  

{'num': Pipeline(steps=[('imputer', SimpleImputer(strategy='median')),
                 ('scaling', StandardScaler()), ('normal', PowerTransformer())]),
 'cat': Pipeline(steps=[('encoding', OneHotEncoder(handle_unknown='ignore'))])}

In [17]:
from sklearn.linear_model import Lasso
model = Lasso(**best_params)
model.fit(transform, y)
pred = model.predict(X_test)
# print(mean_squared_error(test_ensemble["target"], pred, squared=False))

# best_params['tree_method'] = 'gpu_hist',
# best_params['n_estimators']= 10000,
# best_params['eval_metric']='rmse',

# model = xgboost.XGBRegressor(**best_params)
# model.fit(df_ensemble, train_ensemble[['target']])
# pred = model.predict(df_test)
# print(mean_squared_error(test_ensemble["target"], pred, squared=False))

In [26]:
model.coef_

array([ 0.        , -0.04301254,  0.        ,  0.        ,  0.        ,
       -0.        , -0.        ,  0.        , -0.        , -0.        ,
        0.        , -0.        ,  0.        , -0.        ,  5.83758709,
       14.83521743,  0.        , -0.        , -0.        ,  0.        ,
        0.        , -0.        , -0.        , -0.        ,  0.        ,
        0.        ,  0.        ,  0.        , -0.        , -0.        ,
        0.        , -0.        , -0.        ,  0.        ,  0.        ,
       -0.        ,  0.        , -0.        ,  0.        , -0.        ,
        0.        ,  0.        , -0.        ,  0.        ,  0.        ,
        0.        , -0.        ,  0.        , -0.        , -0.        ,
        0.        ,  0.        ,  0.        ,  0.        , -0.        ,
       -0.        , -0.        , -0.        , -0.        , -0.        ,
        0.        ,  0.        ,  0.        , -0.        ,  0.        ,
        0.        ,  0.        , -0.        ,  0.        ,  0.  

In [92]:
pred

array([804.84934709, 839.50664185, 840.10554623, ..., 848.1891627 ,
       817.13214307, 802.32941003])

# save predict

In [None]:
# test = pd.read_csv("./sample_data/test_anomaly.csv", delimiter=",", sep='.')

In [None]:
# df_ensemble = pd.DataFrame()
# df_ensemble['lr'] = lr_pred
# df_ensemble['xgb_pred'] = xgb_pred
# df_ensemble['cat_pred'] = cat_pred

# df_test = pd.DataFrame()
# df_test['lr'] = lr_test
# df_test['xgb_pred'] = xgb_test
# df_test['cat_pred'] = cat_test

In [None]:
# # del
# lr_pred1 = lr.predict(test)
# xgb_pred1 = xgb.predict(test)
# cat_pred1 = catboost.predict(test)

# df_submit = pd.DataFrame()
# df_submit['lr'] = lr_pred1
# df_submit['xgb_pred'] = xgb_pred1
# df_submit['cat_pred'] = cat_pred1

In [24]:
# pred = pipeline_ridge.predict(test)



In [93]:
test['target'] = pred / 100

In [94]:
test.head()

Unnamed: 0,id,cat0,cat1,cat2,cat3,cat4,cat5,cat6,cat7,cat8,cat9,cont0,cont1,cont2,cont3,cont4,cont5,cont6,cont7,cont8,cont9,cont10,cont11,cont12,cont13,anomaly_col,0,1,target
0,0,B,B,B,C,B,B,A,E,E,I,0.296227,0.686757,0.587731,0.392753,0.476739,0.37635,0.337884,0.321832,0.445212,0.290258,0.244476,0.087914,0.301831,0.845702,Norm,811.794579,802.496,8.048493
1,5,A,B,A,C,B,C,A,E,C,H,0.543707,0.364761,0.452967,0.929645,0.285509,0.860046,0.798712,0.835961,0.391657,0.288276,0.549568,0.905097,0.850684,0.69394,Norm,841.286617,838.9279,8.395066
2,15,B,A,A,A,B,B,A,E,D,K,0.408961,0.296129,0.690999,0.740027,0.697272,0.6836,0.404089,0.879379,0.275549,0.427871,0.491667,0.384315,0.376689,0.508099,Norm,837.758268,841.12854,8.401055
3,16,B,B,A,C,B,D,A,E,A,N,1.031239,0.356062,0.303651,0.895591,0.719306,0.77789,0.730954,0.644315,1.024017,0.39109,0.98834,0.411828,0.393585,0.461372,Norm,853.769654,847.4699,8.49548
4,17,B,B,A,C,B,C,A,E,C,F,0.530447,0.729004,0.281723,0.444698,0.313032,0.431007,0.390992,0.408874,0.447887,0.390253,0.648932,0.385935,0.370401,0.900412,Norm,819.820749,811.2265,8.132553


In [96]:
test[['id', 'target']].to_csv('./ensemble.csv', index=False)