In [1]:
import warnings
warnings.filterwarnings("ignore")

import datetime
from sklearn import metrics, model_selection, ensemble
import pandas as pd
import numpy as np
import optuna
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn import linear_model

from imblearn.pipeline import make_pipeline
from imblearn import under_sampling
from sklearn.metrics import roc_auc_score

from ipynb.fs.full.CustomPipeline import CustomPipeline

In [2]:
# test = pd.read_csv("./test_anomaly.csv", delimiter=",", sep='.')
train = pd.read_csv("./train_anomaly.csv", delimiter=",", sep='.')

In [3]:
train['target_distribution'] = train.apply(lambda df: 'exponetial' if (df.target < 6) else 'normal', axis=1)
train['target_distribution'].value_counts()

normal        299628
exponetial       372
Name: target_distribution, dtype: int64

In [4]:
train.drop("target", axis=1, inplace=True)

In [5]:
num_train = train.select_dtypes([int, float])
cat_train = train.select_dtypes(object)

num = list(num_train)
cat = list(cat_train.drop(['target_distribution'],axis=1))

In [6]:
preprocessor = CustomPipeline(10, num, cat).get_preprocessor()

In [13]:
preprocessor

ColumnTransformer(n_jobs=None, remainder='passthrough', sparse_threshold=0.3,
                  transformer_weights=None,
                  transformers=[('num',
                                 Pipeline(memory=None,
                                          steps=[('imputer',
                                                  SimpleImputer(add_indicator=False,
                                                                copy=True,
                                                                fill_value=None,
                                                                missing_values=nan,
                                                                strategy='median',
                                                                verbose=0)),
                                                 ('scaling',
                                                  StandardScaler(copy=True,
                                                                 with_mean=True,
                   

In [7]:
X = train.drop(['target_distribution', 'id'], axis=1)
y = train.target_distribution

In [8]:
transformed = preprocessor.fit_transform(X)

In [9]:
train_x, valid_x, train_y, valid_y = train_test_split(transformed, y, test_size=0.3, random_state=42)

In [10]:
train_x.shape

(210000, 201)

In [11]:
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE 
from imblearn.over_sampling import KMeansSMOTE
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler 
from sklearn.decomposition import TruncatedSVD
from sklearn.neighbors import KNeighborsClassifier

In [15]:
models = [linear_model.LogisticRegression(), RandomForestClassifier(), KNeighborsClassifier()]
for model in models:
    pipeline_rf = make_pipeline(
                                RandomUnderSampler(random_state=42),
                                TruncatedSVD(n_components=40),
                                model
                                )

    pipeline_rf.fit(train_x, train_y)
    print(metrics.roc_auc_score(valid_y, pipeline_rf.predict_proba(valid_x)[:, 1]))

0.5699211246566815
0.5496785148251415
0.5169636131179872


# optuna

In [20]:
from sklearn.model_selection import cross_val_score

In [34]:
def objective(trial):
    
    param_model = {
        'penalty': trial.suggest_categorical('penalty', ['l1', 'l2']),
        'C': trial.suggest_uniform('C', 0.01, 100),
    }
    
    param_cvd = {
        'n_components': trial.suggest_int('n_components', 10, 90),
    }

    
    pipeline_rf = make_pipeline(
                                RandomUnderSampler(random_state=42),
                                TruncatedSVD(**param_cvd),
                                linear_model.LogisticRegression(random_state=42, **param_model)
                                )

    return cross_val_score(pipeline_rf, train_x, train_y, cv=5, scoring='roc_auc').mean()


In [35]:
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=50, timeout=600)

[32m[I 2021-08-21 15:54:13,843][0m A new study created in memory with name: no-name-02d612d2-0a99-425c-b4d7-387cb20fa39a[0m
[32m[I 2021-08-21 15:54:18,840][0m Trial 0 finished with value: 0.579223556385283 and parameters: {'penalty': 'l2', 'C': 61.69408842524872, 'n_components': 14}. Best is trial 0 with value: 0.579223556385283.[0m
[33m[W 2021-08-21 15:54:23,377][0m Trial 1 failed, because the objective function returned nan.[0m
[32m[I 2021-08-21 15:54:29,675][0m Trial 2 finished with value: 0.5994521022865971 and parameters: {'penalty': 'l2', 'C': 24.540472355636034, 'n_components': 23}. Best is trial 2 with value: 0.5994521022865971.[0m
[32m[I 2021-08-21 15:54:36,107][0m Trial 3 finished with value: 0.6210014771854558 and parameters: {'penalty': 'l2', 'C': 30.30907733492793, 'n_components': 85}. Best is trial 3 with value: 0.6210014771854558.[0m
[33m[W 2021-08-21 15:54:40,618][0m Trial 4 failed, because the objective function returned nan.[0m
[33m[W 2021-08-21 15: