In [55]:
import pandas as pd
from catboost import CatBoostClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.decomposition import PCA
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.metrics import balanced_accuracy_score, confusion_matrix
import optuna
from sklearn.base import BaseEstimator, TransformerMixin
import numpy as np
import pickle

In [9]:
class PCAWithNames(BaseEstimator, TransformerMixin):
    def __init__(self, n_components=2, prefix="PC"):
        self.n_components = n_components
        self.prefix = prefix
        self.pca_ = None
        self.feature_names_out_ = None

    def fit(self, X, y=None):
        # Поддерживаем как DataFrame, так и array
        if isinstance(X, pd.DataFrame):
            X = X.values
        self.pca_ = PCA(n_components=self.n_components).fit(X)
        self.feature_names_out_ = [f"{self.prefix}{i+1}" for i in range(self.pca_.n_components_)]
        return self

    def transform(self, X):
        if isinstance(X, pd.DataFrame):
            X = X.values
        Xt = self.pca_.transform(X)
        return pd.DataFrame(Xt, columns=self.feature_names_out_, index=getattr(X, 'index', None))

    def get_feature_names_out(self, input_features=None):
        return np.array(self.feature_names_out_)

In [5]:
df = pd.read_pickle('../data/clean_data.pickle')

df.head()

Unnamed: 0,SEQN,target,Age,Sex,timestamp_0,timestamp_1,timestamp_2,timestamp_3,timestamp_4,timestamp_5,...,timestamp_4190,timestamp_4191,timestamp_4192,timestamp_4193,timestamp_4194,timestamp_4195,timestamp_4196,timestamp_4197,timestamp_4198,timestamp_4199
278,62259.0,1,61.0,Male,48,44,48,44,48,44,...,0,0,0,0,0,0,0,0,0,0
279,62259.0,1,61.0,Male,48,44,48,44,48,44,...,0,0,0,0,0,0,0,0,0,0
280,62259.0,1,61.0,Male,48,44,48,44,48,44,...,0,0,0,0,0,0,0,0,0,0
934,62474.0,1,71.0,Female,48,44,48,44,48,44,...,0,0,0,0,0,0,0,0,0,0
935,62474.0,1,71.0,Female,48,44,48,44,48,44,...,0,0,0,0,0,0,0,0,0,0


In [10]:
cat_cols = ['Sex']
numeric_cols = ['Age']
pca_cols = [x for x in df.columns.tolist() if 'timestamp' in x]

In [11]:
len(cat_cols), len(numeric_cols), len(pca_cols)

(1, 1, 4200)

In [22]:
def make_pipeline(
        pca_n_components:int,
        cat_iterations:int,
        cat_lr:float,
        cat_depth:int,
        cat_cols_ = cat_cols,
        pca_cols_=pca_cols,
        numeric_cols_=numeric_cols
) -> Pipeline:
    
    pca_pipeline = Pipeline([
        ('scaler', MinMaxScaler()),
        ('pca', PCA(n_components=pca_n_components))  
    ])

    preprocessor = ColumnTransformer(
        transformers=[
            ('cat', OneHotEncoder(drop='first', handle_unknown='ignore', sparse_output=False), cat_cols_),
            ('pca', pca_pipeline, pca_cols_),
            ('num', 'passthrough', numeric_cols_)  
        ],
        remainder='drop',  
        verbose_feature_names_out=False 
    ).set_output(transform="pandas") 

    final_pipeline = Pipeline([
        ('preprocess', preprocessor),
        ('classifier', CatBoostClassifier(
            iterations=cat_iterations,
            learning_rate=cat_lr,
            depth=cat_depth,
            verbose=False,
            random_state=42,
            thread_count=1
            )
        )
    ])

    return final_pipeline

In [23]:
X = df[numeric_cols+cat_cols+pca_cols]
y = df[['target']]

In [24]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [38]:
def objective(trial, df=X_train, y=y_train):
    # Гиперпараметры PCA
    n_components = trial.suggest_int("pca_n_components", 1, 239)
    
    # Гиперпараметры CatBoost
    iterations = trial.suggest_int("iterations", 50, 500)
    learning_rate = trial.suggest_float("learning_rate", 1e-3, 0.3, log=True)
    depth = trial.suggest_int("depth", 2, 16)

    pipe = make_pipeline(
        n_components,
        iterations,
        learning_rate,
        depth
    )

    cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
    scores = cross_val_score(pipe, df, y, cv=cv, scoring='balanced_accuracy', n_jobs=-1)
    return scores.mean()

In [39]:
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=30)

[I 2025-12-01 17:00:54,023] A new study created in memory with name: no-name-0549908b-1e7a-410a-b700-f631b05566ad


[I 2025-12-01 17:01:00,971] Trial 0 finished with value: 0.5012025012025012 and parameters: {'pca_n_components': 138, 'iterations': 96, 'learning_rate': 0.12743292645166993, 'depth': 8}. Best is trial 0 with value: 0.5012025012025012.
[I 2025-12-01 17:41:06,936] Trial 1 finished with value: 0.5252353466639181 and parameters: {'pca_n_components': 33, 'iterations': 229, 'learning_rate': 0.007515092346636379, 'depth': 16}. Best is trial 1 with value: 0.5252353466639181.
[I 2025-12-01 18:18:29,063] Trial 2 finished with value: 0.5062530062530063 and parameters: {'pca_n_components': 170, 'iterations': 492, 'learning_rate': 0.04978341661365291, 'depth': 14}. Best is trial 1 with value: 0.5252353466639181.
[I 2025-12-01 18:22:09,753] Trial 3 finished with value: 0.501185322613894 and parameters: {'pca_n_components': 189, 'iterations': 94, 'learning_rate': 0.18896694112261214, 'depth': 13}. Best is trial 1 with value: 0.5252353466639181.
[I 2025-12-01 20:12:39,845] Trial 4 finished with value:

In [40]:
print("Лучшие параметры:", study.best_params)

Лучшие параметры: {'pca_n_components': 2, 'iterations': 368, 'learning_rate': 0.021080089914285394, 'depth': 14}


In [42]:
df_trials = study.trials_dataframe()

df_trials

Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_depth,params_iterations,params_learning_rate,params_pca_n_components,state
0,0,0.501203,2025-12-01 17:00:54.025760,2025-12-01 17:01:00.971186,0 days 00:00:06.945426,8,96,0.127433,138,COMPLETE
1,1,0.525235,2025-12-01 17:01:00.973290,2025-12-01 17:41:06.935626,0 days 00:40:05.962336,16,229,0.007515,33,COMPLETE
2,2,0.506253,2025-12-01 17:41:06.945232,2025-12-01 18:18:29.062994,0 days 00:37:22.117762,14,492,0.049783,170,COMPLETE
3,3,0.501185,2025-12-01 18:18:29.081464,2025-12-01 18:22:09.752617,0 days 00:03:40.671153,13,94,0.188967,189,COMPLETE
4,4,0.515581,2025-12-01 18:22:09.761639,2025-12-01 20:12:39.843547,0 days 01:50:30.081908,16,303,0.003762,32,COMPLETE
5,5,0.5,2025-12-01 20:12:39.874413,2025-12-01 20:14:57.766960,0 days 00:02:17.892547,11,380,0.002724,126,COMPLETE
6,6,0.499502,2025-12-01 20:14:57.775786,2025-12-01 20:15:05.101175,0 days 00:00:07.325389,11,217,0.00174,10,COMPLETE
7,7,0.5,2025-12-01 20:15:05.103481,2025-12-01 20:15:32.800425,0 days 00:00:27.696944,11,58,0.031477,163,COMPLETE
8,8,0.5,2025-12-01 20:15:32.802421,2025-12-01 20:15:52.777521,0 days 00:00:19.975100,8,384,0.002069,104,COMPLETE
9,9,0.491582,2025-12-01 20:15:52.780692,2025-12-01 20:16:27.155205,0 days 00:00:34.374513,11,85,0.072674,138,COMPLETE


In [43]:
df_trials.to_pickle('../data/optuna_study_catboost_pca.pickle')

In [44]:
best_params = study.best_params

In [45]:
best_params

{'pca_n_components': 2,
 'iterations': 368,
 'learning_rate': 0.021080089914285394,
 'depth': 14}

In [47]:
model = make_pipeline(
    best_params['pca_n_components'], 
    best_params['iterations'],
    best_params['learning_rate'],
    best_params['depth'],
)

In [48]:
model.fit(X_train, y_train)

0,1,2
,steps,"[('preprocess', ...), ('classifier', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('cat', ...), ('pca', ...), ...]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,False
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,'first'
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,feature_range,"(0, ...)"
,copy,True
,clip,False

0,1,2
,n_components,2
,copy,True
,whiten,False
,svd_solver,'auto'
,tol,0.0
,iterated_power,'auto'
,n_oversamples,10
,power_iteration_normalizer,'auto'
,random_state,


In [49]:
balanced_accuracy_score(y_train, model.predict(X_train))

0.9983108108108107

In [50]:
balanced_accuracy_score(y_test, model.predict(X_test))

0.6739864864864865

In [51]:
df_train = pd.concat([X_train, y_train], axis=1)

df_train.head()

Unnamed: 0,Age,Sex,timestamp_0,timestamp_1,timestamp_2,timestamp_3,timestamp_4,timestamp_5,timestamp_6,timestamp_7,...,timestamp_4191,timestamp_4192,timestamp_4193,timestamp_4194,timestamp_4195,timestamp_4196,timestamp_4197,timestamp_4198,timestamp_4199,target
1897,56.0,Female,48,44,48,44,48,44,48,44,...,0,0,0,0,0,0,0,0,0,1
1435,55.0,Male,48,44,48,44,48,44,48,44,...,0,0,0,0,0,0,0,0,0,1
5466,39.0,Female,48,44,48,44,48,44,48,44,...,0,0,0,0,0,0,0,0,0,1
9768,62.0,Female,48,44,48,44,48,44,48,44,...,0,0,0,0,0,0,0,0,0,1
16759,51.0,Female,48,44,48,44,48,44,48,44,...,0,0,0,0,0,0,0,0,0,1


In [52]:
df_train.to_pickle('../data/train_data.pickle')

In [53]:
df_test = pd.concat([X_test, y_test], axis=1)

df_test.head()

Unnamed: 0,Age,Sex,timestamp_0,timestamp_1,timestamp_2,timestamp_3,timestamp_4,timestamp_5,timestamp_6,timestamp_7,...,timestamp_4191,timestamp_4192,timestamp_4193,timestamp_4194,timestamp_4195,timestamp_4196,timestamp_4197,timestamp_4198,timestamp_4199,target
25846,61.0,Male,48,44,48,44,48,44,48,44,...,0,0,0,0,0,0,0,0,0,1
26894,35.0,Female,48,44,48,44,48,44,48,44,...,0,0,0,0,0,0,0,0,0,1
8566,24.0,Male,48,44,48,44,48,44,48,44,...,0,0,0,0,0,0,0,0,0,1
2636,59.0,Female,48,44,48,44,48,44,48,44,...,0,0,0,0,0,0,0,0,0,1
25854,61.0,Male,48,44,48,44,48,44,48,44,...,0,0,0,0,0,0,0,0,0,1


In [54]:
df_test.to_pickle('../data/test_data.pickle')

In [56]:
with open('../data/catboost_pca_baseline.pickle', 'wb') as f:
    pickle.dump(model, f)