In [None]:
# TODO: add Optuna hyperparameter tuning

In [None]:
!pip uninstall transformers -y
!pip install -U "transformers==4.56.0" 

In [None]:
pip install h3

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import os
from catboost import CatBoostRegressor
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import mean_absolute_error
from sklearn.decomposition import PCA

import re
import time
import torch

In [None]:
from h3.api import basic_str as h3
import warnings
warnings.filterwarnings('ignore')

tqdm.pandas()

train_data = pd.read_csv("/kaggle/input/cnewwwww/train.tsv", sep="\t")
test_data = pd.read_csv("/kaggle/input/cnewwwww/test.tsv", sep="\t")
reviews_data = pd.read_csv("/kaggle/input/cnewwwww/reviews.tsv", sep="\t")

In [None]:
train_data.isna().sum().sort_values()

In [None]:
# Keep only rows whose target is at least 1. The explicit .copy() detaches the
# filtered frame from the original, so the column assignments that follow
# write to a real frame instead of a slice of the unfiltered data.
train_data = train_data[train_data['target'] >= 1].copy()

def extract_coords(coord_str, coord_type):
    """Extract one coordinate from a delimited ``"<lng>, <lat>"`` string.

    The first and last characters of ``coord_str`` are treated as enclosing
    delimiters (e.g. brackets) and dropped; the text before the first comma
    is the longitude and the text after it is the latitude.

    Args:
        coord_str: String such as ``"[37.6, 55.7]"``. Whitespace after the
            comma is optional.
        coord_type: ``"lng"`` or ``"lat"``.

    Returns:
        The requested coordinate as a float.

    Raises:
        ValueError: If ``coord_type`` is not ``"lng"`` or ``"lat"``, or the
            selected substring is not a valid float.
    """
    if coord_type not in ("lng", "lat"):
        raise ValueError(f"coord_type must be 'lng' or 'lat', got {coord_type!r}")

    comma_pos = coord_str.find(",")
    if coord_type == "lng":
        return float(coord_str[1:comma_pos])
    # float() ignores surrounding whitespace, so start right after the comma
    # instead of assuming exactly one space follows it.
    return float(coord_str[comma_pos + 1:-1])

# Parse the raw "coordinates" strings into numeric longitude / latitude columns.
for df in (train_data, test_data):
    for axis in ("lng", "lat"):
        df[axis] = df['coordinates'].apply(extract_coords, args=(axis,))

# Derive trigonometric and simple arithmetic features from the coordinates.
# The dict preserves insertion order, so columns are appended in this order.
for df in (train_data, test_data):
    lat_deg, lng_deg = df["lat"], df["lng"]
    lat_rad, lng_rad = np.radians(lat_deg), np.radians(lng_deg)
    derived = {
        "lat_rad": lat_rad,
        "lng_rad": lng_rad,
        "sin_lat": np.sin(lat_rad),
        "cos_lat": np.cos(lat_rad),
        "sin_lng": np.sin(lng_rad),
        "cos_lng": np.cos(lng_rad),
        "lat_lng_ratio": lat_deg / lng_deg,
        "lng_lat_ratio": lng_deg / lat_deg,
        "coord_sum": lat_deg + lng_deg,
        "coord_product": lat_deg * lng_deg,
    }
    for column, values in derived.items():
        df[column] = values

def add_h3(df, levels=(7, 8, 9)):
    """Add one ``h3_<level>`` cell-id column per requested H3 resolution.

    The frame is modified in place (columns are assigned on ``df``) and is
    also returned for convenience. Cell ids are computed from the ``lat`` and
    ``lng`` columns with ``h3.latlng_to_cell``.
    """
    latitudes = df["lat"].to_numpy()
    longitudes = df["lng"].to_numpy()
    for resolution in levels:
        cells = []
        for lat_value, lng_value in zip(latitudes, longitudes):
            cells.append(h3.latlng_to_cell(lat_value, lng_value, resolution))
        df[f"h3_{resolution}"] = cells
    return df
train_data, test_data = add_h3(train_data), add_h3(test_data)

# Density = number of rows of the same frame that fall into the same H3 cell.
for df in (train_data, test_data):
    for col in [f"h3_{lvl}" for lvl in (7, 8, 9)]:
        rows_per_cell = df[col].value_counts()
        df[f"{col}_density"] = df[col].map(rows_per_cell)

import json
from sklearn.neighbors import BallTree
# Load the metro lines file and flatten it into one record per station.
with open('/kaggle/input/metrodata/metro.msk.json', 'r', encoding='utf-8') as f:
    metro_data = json.load(f)

metro_stations = [
    {'name': station['name'], 'lat': station['lat'], 'lng': station['lng']}
    for line in metro_data
    for station in line['stations']
]
metro_df = pd.DataFrame(metro_stations)

# The haversine metric works on (lat, lng) pairs expressed in radians.
metro_coords_rad = np.radians(metro_df[['lat', 'lng']].values)
metro_tree = BallTree(metro_coords_rad, metric='haversine')

def find_nearest_metro(lat, lng, k=3):
    """Return distances (km) and indices of the ``k`` nearest metro stations.

    ``lat`` / ``lng`` are given in degrees. The query goes to the module-level
    ``metro_tree``; its haversine distances are scaled by 6371 to obtain
    kilometres. Both returned arrays have length ``k``.
    """
    query_point = np.radians([[lat, lng]])
    dist_rad, neighbor_idx = metro_tree.query(query_point, k=k)
    return (dist_rad * 6371)[0], neighbor_idx[0]

train_index = train_data.index
test_index = test_data.index


def _metro_distance_features(df, k=3):
    """Build the ``metro_dist_1..k`` columns for every row of ``df``.

    All rows are sent to ``metro_tree`` in a single batched query instead of
    one query per row. Haversine distances are scaled by 6371 to kilometres.
    The result carries ``df``'s index so it can be concatenated column-wise.
    """
    coords_rad = np.radians(df[['lat', 'lng']].to_numpy())
    distances_rad, _ = metro_tree.query(coords_rad, k=k)
    return pd.DataFrame(
        distances_rad * 6371,
        columns=[f'metro_dist_{rank}' for rank in range(1, k + 1)],
        index=df.index,
    )


# Same feature construction for both frames, via one helper rather than two
# copy-pasted iterrows() loops.
train_metro_df = _metro_distance_features(train_data, k=3)
train_data = pd.concat([train_data, train_metro_df], axis=1)

test_metro_df = _metro_distance_features(test_data, k=3)
test_data = pd.concat([test_data, test_metro_df], axis=1)

In [None]:
import polars as pl

In [None]:
from sklearn.neighbors import BallTree

In [None]:
def create_spatial_features_train_test(train_df, test_df, category_col='category', lat_col='lat', lon_col='lng', target_col='target'):
    """Add neighbourhood target-mean features to the train and test frames.

    A haversine BallTree is built from the TRAIN coordinates. For every row
    (train or test) the targets of train rows that share the row's category
    and lie within 300 m / 1000 m are averaged. For train rows the row itself
    is excluded by position; for test rows every matching train row counts.
    Rows without any matching neighbour get NaN.

    Args:
        train_df, test_df: pandas or polars DataFrames; anything that is not
            a polars DataFrame is converted with ``pl.from_pandas``.
        category_col, lat_col, lon_col, target_col: column names. Latitude
            and longitude are converted with ``np.deg2rad``, i.e. treated as
            degrees.

    Returns:
        ``(train, test)`` as pandas DataFrames, each with the two extra
        columns ``mean_{target_col}_same_category_300m`` and
        ``mean_{target_col}_same_category_1000m``.

    NOTE(review): train-row features use the targets of all other train rows,
    independent of any later CV split — confirm this is acceptable.
    NOTE(review): the polars round-trip presumably drops the pandas index of
    the inputs — confirm callers do not rely on it.
    """

    if isinstance(train_df, pl.DataFrame):
        train_pl = train_df
    else:
        train_pl = pl.from_pandas(train_df)
        
    if isinstance(test_df, pl.DataFrame):
        test_pl = test_df
    else:
        test_pl = pl.from_pandas(test_df)
    
    # 1. Build the BallTree on the train data
    train_lat = train_pl[lat_col].to_numpy()
    train_lon = train_pl[lon_col].to_numpy()
    train_category = train_pl[category_col].to_numpy()
    train_target = train_pl[target_col].to_numpy()
    
    train_coords_radians = np.deg2rad(np.column_stack([train_lat, train_lon]))
    tree = BallTree(train_coords_radians, metric='haversine')
    
    # Search radii: metres divided by 6371000 (Earth radius in metres) gives
    # the angular distance expected by the haversine metric.
    radius_300m = 300 / 6371000
    radius_1000m = 1000 / 6371000
    
    def calculate_features(df_coords, df_categories, is_train=False):
        # df_coords: (lat, lon) pairs in degrees; df_categories: category per row.
        # is_train=True means row idx of df_coords is row idx of the tree data,
        # so that position is removed from its own neighbour list.
        mean_target_300m = []
        mean_target_1000m = []
        
        for idx in tqdm(range(len(df_coords))):
            current_category = df_categories[idx]
            current_coord = np.deg2rad([df_coords[idx]])
            
            # Positions (into the train arrays) of all train rows within each radius.
            indices_300m = tree.query_radius(current_coord, r=radius_300m)[0]
            indices_1000m = tree.query_radius(current_coord, r=radius_1000m)[0]
            
            if is_train:
                # Exclude the row itself so its own target does not leak in.
                same_category_300m = [
                    i for i in indices_300m 
                    if i != idx and train_category[i] == current_category
                ]
                same_category_1000m = [
                    i for i in indices_1000m 
                    if i != idx and train_category[i] == current_category
                ]
            else:
                same_category_300m = [
                    i for i in indices_300m 
                    if train_category[i] == current_category
                ]
                same_category_1000m = [
                    i for i in indices_1000m 
                    if train_category[i] == current_category
                ]
            
            # NaN marks "no same-category train neighbour in range".
            if len(same_category_300m) > 0:
                mean_300 = np.mean(train_target[same_category_300m])
            else:
                mean_300 = np.nan
                
            if len(same_category_1000m) > 0:
                mean_1000 = np.mean(train_target[same_category_1000m])
            else:
                mean_1000 = np.nan
            
            mean_target_300m.append(mean_300)
            mean_target_1000m.append(mean_1000)
        
        return mean_target_300m, mean_target_1000m
    
    # Train rows: neighbours come from train, self excluded.
    train_coords = np.column_stack([train_lat, train_lon])
    train_300m, train_1000m = calculate_features(train_coords, train_category, is_train=True)
    
    # Test rows: neighbours come from train, nothing excluded.
    test_lat = test_pl[lat_col].to_numpy()
    test_lon = test_pl[lon_col].to_numpy()
    test_category = test_pl[category_col].to_numpy()
    test_coords = np.column_stack([test_lat, test_lon])
    test_300m, test_1000m = calculate_features(test_coords, test_category, is_train=False)
    
    train_result = train_pl.with_columns([
        pl.Series(f'mean_{target_col}_same_category_300m', train_300m),
        pl.Series(f'mean_{target_col}_same_category_1000m', train_1000m)
    ])
    
    test_result = test_pl.with_columns([
        pl.Series(f'mean_{target_col}_same_category_300m', test_300m),
        pl.Series(f'mean_{target_col}_same_category_1000m', test_1000m)
    ])
    
    return train_result.to_pandas(), test_result.to_pandas()


In [None]:
# Attach the same-category neighbourhood target means to both frames.
spatial_columns = dict(
    category_col='category',
    lat_col='lat',
    lon_col='lng',
    target_col='target',
)
train_data, test_data = create_spatial_features_train_test(train_data, test_data, **spatial_columns)


In [None]:
# Length-based features of the name and (NaN-filled) address strings.
for df in (train_data, test_data):
    df['address'] = df['address'].fillna("")
    name_chars = df["name"].str.len()
    addr_chars = df["address"].str.len()
    df["name_len"] = name_chars
    df["name_words"] = df["name"].str.split().str.len()
    df["addr_len"] = addr_chars
    df["name_log_len"] = np.log1p(name_chars)
    df["addr_log_len"] = np.log1p(addr_chars)

# Frequency lookups computed over train and test together.
key_columns = ["id", "name", "address", "category"]
combined = pd.concat([train_data[key_columns], test_data[key_columns]])
addr_count = combined["address"].value_counts().to_dict()
name_count = combined["name"].value_counts().to_dict()
cat_count = combined["category"].value_counts().to_dict()
rev_count = reviews_data.groupby("id")["text"].size().to_dict()

# (new column, source column, lookup) in the order the columns are appended.
count_lookups = (
    ("addr_count", "address", addr_count),
    ("name_count", "name", name_count),
    ("rev_count", "id", rev_count),
    ("cat_count", "category", cat_count),
)
for df in (train_data, test_data):
    for feature, source, lookup in count_lookups:
        df[feature] = df[source].map(lookup)
    # 1 when the same name occurs more than once across train + test.
    df["seti"] = (df["name_count"] > 1).astype(int)

word_pat = re.compile(r"\w+", re.I)


def get_text_features(text):
    """Compute five simple statistics for one review text.

    Returns a ``pd.Series`` holding, in order: character length, word count,
    unique-word count, lexical diversity (unique / total words) and mean word
    length. Words are ``\\w+`` matches on the lower-cased text. Any non-string
    input yields five zeros.
    """
    if isinstance(text, str):
        tokens = word_pat.findall(text.lower())
        n_tokens = len(tokens)
        n_unique = len(set(tokens))
        mean_token_len = np.mean([len(tok) for tok in tokens]) if tokens else 0
        # max(..., 1) guards the ratio against texts without any word.
        diversity = n_unique / max(n_tokens, 1)
        return pd.Series([len(text), n_tokens, n_unique, diversity, mean_token_len])
    return pd.Series([0, 0, 0, 0, 0])

# One row of per-review statistics for every row of reviews_data.
text_feats = pd.DataFrame([get_text_features(t) for t in tqdm(reviews_data["text"])])
text_feats.columns = ["text_len", "word_cnt", "unique_cnt", "lex_div", "avg_word_len"]
# Assignment aligns on the index; NOTE(review): assumes reviews_data has a
# default 0..n-1 index matching the freshly built frame — confirm.
text_feats["id"] = reviews_data["id"]
text_feats["text_log_len"] = np.log1p(text_feats["text_len"])

# Per-id review length statistics (mean, std, log1p of the mean).
text_stats = reviews_data.groupby("id")["text"].agg(
    rev_len_mean=lambda x: np.mean(x.str.len()),
    rev_len_std=lambda x: np.std(x.str.len()),
    rev_len_log=lambda x: np.log1p(np.mean(x.str.len()))
).reset_index()

# Per-id aggregates of the per-review statistics computed above.
text_agg = text_feats.groupby("id").agg({
    "text_len": ["mean", "std"],
    "word_cnt": ["mean", "std"], 
    "unique_cnt": "mean",
    "lex_div": "mean",
    "avg_word_len": "mean",
    "text_log_len": "mean"
})

# Flatten the (column, statistic) MultiIndex into "column_statistic" names.
text_agg.columns = [f"{a}_{b}" for a, b in text_agg.columns]
text_agg = text_agg.reset_index()

In [None]:

def _attach_review_stats(frame):
    """Left-join ``text_stats`` on ``id``; ids without reviews get zeros."""
    merged = frame.merge(text_stats, on="id", how="left")
    for col in text_stats.columns:
        if col == "id":
            continue
        merged[col] = merged[col].fillna(0)
    return merged.reset_index(drop=True)


train_data = _attach_review_stats(train_data)
test_data = _attach_review_stats(test_data)

In [None]:
# Rows whose id has no reviews were mapped to NaN; treat them as zero reviews.
for df in (train_data, test_data):
    df['rev_count'] = df['rev_count'].fillna(0)

In [None]:
sentiment_model_name = "tabularisai/multilingual-sentiment-analysis"
batch_size = 32

# Run on the GPU when one is visible to torch, otherwise on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

tokenizer = AutoTokenizer.from_pretrained(sentiment_model_name)
model = AutoModelForSequenceClassification.from_pretrained(sentiment_model_name)
model = model.to(device)
# Inference mode: disables dropout and similar training-only behaviour.
model.eval()


In [None]:

texts = reviews_data["text"].astype(str).tolist()
review_ids = reviews_data["id"].values

predicted_classes, confidence_scores = [], []

# Batched inference without gradient tracking.
with torch.no_grad():
    for start in tqdm(range(0, len(texts), batch_size)):
        chunk = texts[start:start + batch_size]
        encoded = tokenizer(chunk, padding=True, truncation=True, max_length=512, return_tensors="pt").to(device)
        logits = model(**encoded).logits

        # Predicted class = arg-max logit; confidence = its softmax probability.
        predicted_classes.extend(torch.argmax(logits, dim=1).cpu().numpy())
        class_probs = torch.nn.functional.softmax(logits, dim=-1).cpu().numpy()
        confidence_scores.extend(class_probs.max(axis=1))

sentiment_df = pd.DataFrame({
    "id": review_ids,
    "sentiment_class": predicted_classes,
    "confidence": confidence_scores,
})

# Per-id counts of each predicted sentiment class, plus the number of reviews
# and the mean class index. The class-index meanings are the ones stated in
# the inline comments below.
aggregated_sentiment = (
    sentiment_df.groupby("id")["sentiment_class"]
    .agg([
        ("very_positive_count", lambda x: np.sum(x == 4)),  # Very Positive
        ("positive_count", lambda x: np.sum(x == 3)),       # Positive  
        ("neutral_count", lambda x: np.sum(x == 2)),        # Neutral
        ("negative_count", lambda x: np.sum(x == 1)),       # Negative
        ("very_negative_count", lambda x: np.sum(x == 0)),  # Very Negative
        ("total_count", "count"),
        ("mean_sentiment", "mean")  
    ])
    .reset_index()
)

# Shares of positive / negative / neutral reviews; clip(lower=1) keeps the
# denominator from ever being zero.
aggregated_sentiment["positive_ratio"] = (
    aggregated_sentiment["very_positive_count"] + aggregated_sentiment["positive_count"]
) / aggregated_sentiment["total_count"].clip(lower=1)

aggregated_sentiment["negative_ratio"] = (
    aggregated_sentiment["very_negative_count"] + aggregated_sentiment["negative_count"] 
) / aggregated_sentiment["total_count"].clip(lower=1)

aggregated_sentiment["neutral_ratio"] = (
    aggregated_sentiment["neutral_count"] 
) / aggregated_sentiment["total_count"].clip(lower=1)

# Left join: ids without any review end up with NaN in all sentiment columns.
train_data = train_data.merge(aggregated_sentiment, on="id", how="left")
test_data = test_data.merge(aggregated_sentiment, on="id", how="left")

# Names of the columns added by the merge above.
# NOTE(review): this list is not used in the visible part of the notebook and
# the NaNs from the left join are not filled here — confirm that is intended.
sentiment_columns = [
    "very_positive_count", "positive_count", "neutral_count", 
    "negative_count", "very_negative_count", "total_count",
    "mean_sentiment", "positive_ratio", "negative_ratio", 
    "neutral_ratio", 
]



In [None]:

embedding_model_name = "cointegrated/rubert-tiny2"


def _join_reviews(review_texts):
    """Join the first 20 reviews with spaces and cut the result to 2048 characters."""
    first_reviews = review_texts.tolist()[:20]
    return " ".join(first_reviews)[:2048]


# One combined text per id, used as input for the embedding model.
reviews_by_id = reviews_data.groupby("id")["text"]
grouped_reviews = reviews_by_id.apply(_join_reviews).reset_index()
grouped_reviews = grouped_reviews.rename(columns={"text": "combined_text"})

In [None]:
from sentence_transformers import SentenceTransformer

In [None]:
embedding_model = SentenceTransformer(embedding_model_name)

In [None]:
# Encode the combined review text of every id.
text_list = grouped_reviews["combined_text"].tolist()
encode_options = dict(
    batch_size=batch_size,
    show_progress_bar=True,
    convert_to_numpy=True,
    normalize_embeddings=False,
)
embeddings = embedding_model.encode(text_list, **encode_options)

# Compress the embeddings to 64 principal components.
pca = PCA(n_components=64, random_state=42)
reduced_embeddings = pca.fit_transform(embeddings)

embedding_features = [f"embedding_{i}" for i in range(64)]
embedding_df = pd.DataFrame(reduced_embeddings, columns=embedding_features).assign(
    id=grouped_reviews["id"].values
)

# Left join: ids without reviews get NaN embedding columns.
train_data, test_data = (
    frame.merge(embedding_df, on="id", how="left") for frame in (train_data, test_data)
)

In [None]:
train_data.isna().sum().sort_values()

In [None]:
# Ids without reviews have no embedding; use an all-zero vector for them.
for df in (train_data, test_data):
    df[embedding_features] = df[embedding_features].fillna(0)

In [None]:
categorical_features = ['name', 'category']
target_column = 'target'

# Columns kept out of the model: the target, identifiers, raw strings and the
# H3 cell ids together with their density counts.
excluded_columns = {
    target_column, 'id', 'coordinates', 'address',
    'h3_7', 'h3_8', 'h3_9',
    'h3_7_density', 'h3_8_density', 'h3_9_density',
}
feature_columns = [col for col in train_data.columns if col not in excluded_columns]

In [None]:
# Adversarial validation

In [None]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from catboost import CatBoostClassifier, CatBoostRegressor, Pool

NFOLD = 5  # number of CV folds used by the tuning helpers further down
SEED = 42  # random seed shared by the splitters and the models
TOP_N_IMP = 30  # how many features the adversarial importance plot shows
EARLY_STOPPING = 200  # passed to CatBoost as od_wait

In [None]:
cat_feats = categorical_features

In [None]:
# Independent copies: the categorical columns of these frames are re-cast in
# place afterwards, which must not write into (a slice of) train_data/test_data.
X_tr_df = train_data[feature_columns].copy()
X_te_df = test_data[feature_columns].copy()

In [None]:
# CatBoost expects categorical columns as object dtype; cast those present.
for frame in (X_tr_df, X_te_df):
    for c in (name for name in cat_feats if name in frame.columns):
        frame[c] = frame[c].astype('object')

In [None]:
import matplotlib.pyplot as plt

In [None]:
n_train = X_tr_df.shape[0]
n_test  = X_te_df.shape[0]

# ---------------- Adversarial Validation ----------------
# Label every row by origin (train=0, test=1) and train a classifier to tell
# them apart; its predicted probability is P(is_test).

X_av = pd.concat([X_tr_df, X_te_df], axis=0, ignore_index=True)
y_av = np.concatenate([np.zeros(n_train, dtype=int), np.ones(n_test, dtype=int)])

skf_av = StratifiedKFold(n_splits=3, shuffle=True, random_state=SEED)
# The AV splitter has its own fold count (not NFOLD); read it back so the
# progress log reports the real number of folds.
n_splits_av = skf_av.get_n_splits()

av_params = dict(
    loss_function='Logloss',
    eval_metric='AUC',
    learning_rate=0.05,
    depth=6,
    l2_leaf_reg=3.0,
    iterations=1000,
    od_type='Iter',
    od_wait=EARLY_STOPPING,
    random_seed=SEED,
    verbose=100,
    auto_class_weights='Balanced', task_type = 'GPU'
)

# Out-of-fold P(is_test) for every row of X_av (train rows first, then test).
oof_p_test = np.zeros(n_train + n_test, dtype=float)
fold_aucs_av = []
fold_imps = []

for fold, (tr_idx, va_idx) in enumerate(skf_av.split(X_av, y_av), 1):
    tr_pool = Pool(X_av.iloc[tr_idx], label=y_av[tr_idx], cat_features=cat_feats)
    va_pool = Pool(X_av.iloc[va_idx], label=y_av[va_idx], cat_features=cat_feats)

    model_av = CatBoostClassifier(**av_params)
    model_av.fit(tr_pool, eval_set=va_pool, use_best_model=True)

    oof_p_test[va_idx] = model_av.predict_proba(va_pool)[:, 1]
    fold_auc = roc_auc_score(y_av[va_idx], oof_p_test[va_idx])
    fold_aucs_av.append(fold_auc)

    fold_imps.append(model_av.get_feature_importance(type='FeatureImportance'))
    print(f"[AV] Fold {fold}/{n_splits_av} AUC={fold_auc:.6f}")

oof_auc_av = roc_auc_score(y_av, oof_p_test)
print(f"[AV] OOF AUC: {oof_auc_av:.6f}")

In [None]:
# ---------------- Plots ----------------
p_train = oof_p_test[:n_train]
p_test  = oof_p_test[n_train:]

# Histogram of the out-of-fold P(test), split by the true origin of the rows.
bins = np.linspace(0, 1, 50)
fig, ax = plt.subplots(figsize=(8, 5))
ax.hist(p_train, bins=bins, alpha=0.6, label='train (is_test=0)')
ax.hist(p_test, bins=bins, alpha=0.6, label='test (is_test=1)')
ax.axvline(0.5, color='k', linestyle='--', linewidth=1)
ax.set_title(f'Adversarial OOF P(test) | AUC={oof_auc_av:.4f}')
ax.set_xlabel('P(sample is from test)')
ax.set_ylabel('Count')
ax.legend()
fig.tight_layout()
plt.show()

# Fold-averaged importances of the adversarial model, largest bar on top.
avg_imp = np.mean(np.stack(fold_imps, axis=0), axis=0)
ord_idx = np.argsort(avg_imp)[::-1][:TOP_N_IMP]
top_names = np.array(feature_columns)[ord_idx][::-1]
top_scores = avg_imp[ord_idx][::-1]
fig, ax = plt.subplots(figsize=(9, max(4.5, 0.35*len(ord_idx))))
ax.barh(top_names, top_scores, color='#4e79a7')
ax.set_title(f'Adversarial feature importance (top {TOP_N_IMP})')
fig.tight_layout()
plt.show()

In [None]:
X_av

In [None]:
# Use each train row's out-of-fold P(is_test) as its sample weight. The
# assignment has to precede the inspection: with the inspection first, a
# fresh-kernel "Run All" stops with NameError because `weights` is undefined.
weights = p_train
weights.max()

In [None]:
# =========================================================
# CatBoost Hyperparameter Tuning (Classifier & Regressor)
# - IO: polars; modeling: pandas + CatBoost
# - Tuning: Optuna + (Stratified)KFold CV + early stopping
# - GPU/CPU-aware search space
# - Returns the best parameters, the best CV-ensemble models, OOF and (optionally) test predictions
# =========================================================
import os
import gc
import math
import warnings
warnings.filterwarnings("ignore")

import polars as pl
import numpy as np
import pandas as pd

import optuna
from optuna.samplers import TPESampler
from optuna.pruners import MedianPruner
from optuna.importance import get_param_importances

from catboost import CatBoostClassifier, CatBoostRegressor, Pool
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import roc_auc_score, log_loss, mean_squared_error

# --------------- Config (adjust to your setup) ---------------
# SEED, NFOLD and EARLY_STOPPING were already defined earlier in the notebook
# with the same values; they are re-assigned here.
SEED = 42
NFOLD = 5
EARLY_STOPPING = 200
N_TRIALS_CLS = 60
N_TRIALS_REG = 60
TIMEOUT_SEC = None  # e.g. 3600

USE_GPU = False  # True to force GPU usage
N_THREADS = 0    # 0 = all available

# Set these explicitly:
# NOTE(review): the values below look like template placeholders rather than
# columns of this notebook's data — confirm before relying on the example runs.
TARGET_CLASS = 'target_cls'  # classification target
TARGET_REG   = 'target_reg'  # regression target
CAT_COLS = ['cat_col_1', 'cat_col_2']  # categorical feature names
ID_COL = None  # optional

# If FEATURES is None, all columns except TARGET/ID are used
FEATURES_CLS = None
FEATURES_REG = None

# --------------- Utils ---------------
def set_seed(seed=42):
    """Seed Python's ``random`` module and NumPy's global RNG with ``seed``."""
    import random as _py_random

    for seeder in (_py_random.seed, np.random.seed):
        seeder(seed)
set_seed(SEED)

def detect_gpu():
    """Report whether CatBoost should run on the GPU.

    No hardware probing is performed: the result is True exactly when the
    module-level ``USE_GPU`` switch is truthy, otherwise False.
    """
    return True if USE_GPU else False

GPU_AVAILABLE = detect_gpu()

def to_pandas_and_prepare(df_pl: pl.DataFrame, features: list, cat_cols: list):
    """Select ``features`` from a polars frame and return it as pandas.

    Categorical columns that are part of ``features`` are cast to ``object``
    dtype for CatBoost.

    Returns:
        ``(frame, cat_positions)`` where ``cat_positions`` are the positions
        of the categorical columns within ``features``.
    """
    present_cats = [name for name in cat_cols if name in features]
    frame = df_pl.select(features).to_pandas()
    for name in present_cats:
        frame[name] = frame[name].astype('object')
    cat_positions = [pos for pos, name in enumerate(features) if name in present_cats]
    return frame, cat_positions

def get_features(df_pl: pl.DataFrame, target: str, id_col: str | None, explicit_features: list | None):
    """Return the feature column names to use.

    An explicit list is returned unchanged. Otherwise every column of
    ``df_pl`` is used except ``target`` and, when given and present in the
    frame, ``id_col``.
    """
    if explicit_features is not None:
        return explicit_features

    has_id = bool(id_col) and id_col in df_pl.columns
    excluded = {target, id_col} if has_id else {target}
    return [name for name in df_pl.columns if name not in excluded]

def kfold_generator_classification(y, n_splits=5, seed=42):
    """Yield shuffled, label-stratified ``(train_idx, valid_idx)`` splits for ``y``."""
    splitter = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)
    # Only the labels drive the stratification, so a zero placeholder stands in for X.
    placeholder_X = np.zeros_like(y)
    return splitter.split(placeholder_X, y)

def kfold_generator_regression(X, n_splits=5, seed=42):
    """Yield shuffled ``(train_idx, valid_idx)`` K-fold splits over the rows of ``X``."""
    splitter = KFold(n_splits=n_splits, shuffle=True, random_state=seed)
    return splitter.split(X)

# --------------- Hyperparameter spaces ---------------
def suggest_common_params(trial, for_gpu: bool):
    """Sample the CatBoost parameters shared by classifier and regressor.

    Tuned values are drawn from ``trial``; fixed values (iterations, early
    stopping, seed, threads, device, logging) are taken from the module-level
    configuration. The order of the ``trial.suggest_*`` calls is kept as is
    because it is part of the sampling sequence.

    Args:
        trial: Optuna trial used to draw the tuned values.
        for_gpu: When True the GPU search space is used (grow policy choice,
            narrower ``border_count`` range) and ``task_type`` is "GPU".

    Returns:
        Dict of keyword arguments for a CatBoost model constructor.

    NOTE(review): ``min_data_in_leaf`` is always sampled while the CPU branch
    fixes grow_policy to SymmetricTree, and ``rsm`` is also sampled for GPU —
    verify against the CatBoost parameter docs that these combinations are
    accepted.
    """
    params = {
        "depth": trial.suggest_int("depth", 4, 10),
        "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.3, log=True),
        "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 1e-3, 100.0, log=True),
        "random_strength": trial.suggest_float("random_strength", 0.0, 2.0),
        "rsm": trial.suggest_float("rsm", 0.5, 1.0),  # column sampling
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 1, 64),
        "leaf_estimation_method": trial.suggest_categorical("leaf_estimation_method", ["Newton", "Gradient"]),
        "bootstrap_type": trial.suggest_categorical("bootstrap_type", ["Bayesian", "Bernoulli"]),
        "iterations": 10000,
        "od_type": "Iter",
        "od_wait": EARLY_STOPPING,
        "random_seed": SEED,
        "thread_count": N_THREADS,
        "task_type": "GPU" if for_gpu else "CPU",
        "allow_writing_files": False,
        "verbose": 0,
    }
    # The extra sampling parameter depends on the chosen bootstrap type.
    if params["bootstrap_type"] == "Bayesian":
        params["bagging_temperature"] = trial.suggest_float("bagging_temperature", 0.0, 1.0)
    else:  # Bernoulli
        params["subsample"] = trial.suggest_float("subsample", 0.5, 1.0)

    # grow_policy depends on the device
    if for_gpu:
        # Lossguide is allowed on GPU
        grow = trial.suggest_categorical("grow_policy", ["SymmetricTree", "Lossguide"])
        params["grow_policy"] = grow
        if grow == "Lossguide":
            params["max_leaves"] = trial.suggest_int("max_leaves", 31, 512)
        # the number of bins can be limited if needed
        params["border_count"] = trial.suggest_int("border_count", 32, 128)
    else:
        params["grow_policy"] = "SymmetricTree"
        params["border_count"] = trial.suggest_int("border_count", 32, 255)

    return params

# --------------- CV evaluation helpers ---------------
def cv_score_classifier(params, X, y, cat_idx, n_splits=5, seed=42):
    """Cross-validate a CatBoost classifier with the given parameters.

    Returns:
        ``(mean_auc, mean_best_iteration)`` over the stratified folds; the
        iteration mean is truncated to an int.
    """
    fold_aucs, fold_best_iters = [], []
    for train_rows, valid_rows in kfold_generator_classification(y, n_splits, seed):
        train_pool = Pool(X.iloc[train_rows], label=y[train_rows], cat_features=cat_idx)
        valid_pool = Pool(X.iloc[valid_rows], label=y[valid_rows], cat_features=cat_idx)

        clf = CatBoostClassifier(loss_function="Logloss", eval_metric="AUC", **params)
        clf.fit(train_pool, eval_set=valid_pool, use_best_model=True)

        positive_proba = clf.predict_proba(valid_pool)[:, 1]
        fold_aucs.append(roc_auc_score(y[valid_rows], positive_proba))
        fold_best_iters.append(clf.get_best_iteration())

    mean_auc = float(np.mean(fold_aucs))
    mean_best_iter = int(np.mean(fold_best_iters))
    return mean_auc, mean_best_iter

def cv_score_regressor(params, X, y, cat_idx, n_splits=5, seed=42):
    """Cross-validate a CatBoost regressor with the given parameters.

    Args:
        params: Keyword arguments for ``CatBoostRegressor`` (loss and eval
            metric are fixed to RMSE here).
        X: pandas DataFrame of features (indexed positionally via ``iloc``).
        y: Target array indexed with the fold index arrays.
        cat_idx: Categorical feature indices passed to ``Pool``.
        n_splits, seed: K-fold configuration.

    Returns:
        ``(mean_rmse, mean_best_iteration)`` over the folds; the iteration
        mean is truncated to an int.
    """
    rmses = []
    best_iters = []
    for tr_idx, va_idx in kfold_generator_regression(X, n_splits, seed):
        tr_pool = Pool(X.iloc[tr_idx], label=y[tr_idx], cat_features=cat_idx)
        va_pool = Pool(X.iloc[va_idx], label=y[va_idx], cat_features=cat_idx)

        model = CatBoostRegressor(
            loss_function="RMSE",
            eval_metric="RMSE",
            **params
        )
        model.fit(tr_pool, eval_set=va_pool, use_best_model=True)
        pred = model.predict(va_pool)
        # RMSE as sqrt(MSE): the `squared=False` keyword of mean_squared_error
        # is gone from recent scikit-learn releases, this form works everywhere.
        rmse = float(np.sqrt(mean_squared_error(y[va_idx], pred)))
        rmses.append(rmse)
        best_iters.append(model.get_best_iteration())
    return float(np.mean(rmses)), int(np.mean(best_iters))

# --------------- Optuna objectives ---------------
def objective_classifier(trial, X, y, cat_idx):
    """Optuna objective: mean validation AUC of a CatBoost classifier.

    Parameters are sampled with ``suggest_common_params`` plus the
    classifier-only ``auto_class_weights``. After every fold the running mean
    AUC is reported to the trial so the pruner can stop weak trials early.

    Returns:
        Mean AUC over the ``NFOLD`` stratified folds (to be maximised).

    Raises:
        optuna.exceptions.TrialPruned: When the pruner stops the trial.
    """
    params = suggest_common_params(trial, GPU_AVAILABLE)
    # Extra parameter specific to classification
    params["auto_class_weights"] = trial.suggest_categorical("auto_class_weights", [None, "Balanced"])

    # CV evaluation with fold-level pruning (report + prune)
    scores = []
    for step, (tr_idx, va_idx) in enumerate(kfold_generator_classification(y, NFOLD, SEED)):
        tr_pool = Pool(X.iloc[tr_idx], label=y[tr_idx], cat_features=cat_idx)
        va_pool = Pool(X.iloc[va_idx], label=y[va_idx], cat_features=cat_idx)

        model = CatBoostClassifier(
            loss_function="Logloss",
            eval_metric="AUC",
            **params
        )
        model.fit(tr_pool, eval_set=va_pool, use_best_model=True)
        pred = model.predict_proba(va_pool)[:, 1]
        scores.append(roc_auc_score(y[va_idx], pred))

        # report the intermediate value and check whether to prune
        trial.report(np.mean(scores), step=step)
        if trial.should_prune():
            raise optuna.exceptions.TrialPruned()

    return float(np.mean(scores))

def objective_regressor(trial, X, y, cat_idx):
    """Optuna objective: mean validation RMSE of a CatBoost regressor.

    Parameters are sampled with ``suggest_common_params``. After every fold
    the running mean RMSE is reported to the trial so the pruner can stop
    weak trials early.

    Returns:
        Mean RMSE over the ``NFOLD`` folds (to be minimised).

    Raises:
        optuna.exceptions.TrialPruned: When the pruner stops the trial.
    """
    params = suggest_common_params(trial, GPU_AVAILABLE)
    # CV evaluation
    rmses = []
    for step, (tr_idx, va_idx) in enumerate(kfold_generator_regression(X, NFOLD, SEED)):
        tr_pool = Pool(X.iloc[tr_idx], label=y[tr_idx], cat_features=cat_idx)
        va_pool = Pool(X.iloc[va_idx], label=y[va_idx], cat_features=cat_idx)

        model = CatBoostRegressor(
            loss_function="RMSE",
            eval_metric="RMSE",
            **params
        )
        model.fit(tr_pool, eval_set=va_pool, use_best_model=True)
        pred = model.predict(va_pool)
        # RMSE as sqrt(MSE): the `squared=False` keyword of mean_squared_error
        # is gone from recent scikit-learn releases, this form works everywhere.
        rmse = float(np.sqrt(mean_squared_error(y[va_idx], pred)))
        rmses.append(rmse)

        trial.report(np.mean(rmses), step=step)
        if trial.should_prune():
            raise optuna.exceptions.TrialPruned()

    # minimize
    return float(np.mean(rmses))

# --------------- Tuning runners ---------------
def tune_classifier(X, y, cat_idx, n_trials=N_TRIALS_CLS, timeout=TIMEOUT_SEC):
    """Run the Optuna study that maximises the classifier objective and return it."""

    def _objective(trial):
        return objective_classifier(trial, X, y, cat_idx)

    study = optuna.create_study(
        study_name="catboost_classifier_tuning",
        direction="maximize",
        sampler=TPESampler(seed=SEED),
        pruner=MedianPruner(n_warmup_steps=2),
    )
    study.optimize(_objective, n_trials=n_trials, timeout=timeout, show_progress_bar=True)
    return study

def tune_regressor(X, y, cat_idx, n_trials=N_TRIALS_REG, timeout=TIMEOUT_SEC):
    """Run the Optuna study that minimises the regressor objective and return it."""

    def _objective(trial):
        return objective_regressor(trial, X, y, cat_idx)

    study = optuna.create_study(
        study_name="catboost_regressor_tuning",
        direction="minimize",
        sampler=TPESampler(seed=SEED),
        pruner=MedianPruner(n_warmup_steps=2),
    )
    study.optimize(_objective, n_trials=n_trials, timeout=timeout, show_progress_bar=True)
    return study

# --------------- Fit best models with CV and (optionally) predict test ---------------
def fit_cv_ensemble_classifier(X, y, cat_idx, best_params, n_splits=NFOLD, seed=SEED, X_test=None):
    """Train one CatBoost classifier per stratified fold.

    Args:
        X: pandas DataFrame of features (indexed positionally via ``iloc``).
        y: Label array indexed with the fold index arrays.
        cat_idx: Categorical feature indices passed to ``Pool``.
        best_params: Keyword arguments for ``CatBoostClassifier``.
        n_splits, seed: Fold configuration.
        X_test: Optional test features; when given, the fold models'
            positive-class probabilities are averaged over the folds.

    Returns:
        ``(models, oof, oof_auc, fold_aucs, test_pred)``; ``test_pred`` is
        None when ``X_test`` is None.
    """
    oof = np.zeros(len(X), dtype=float)
    models = []
    fold_scores = []

    # The test pool does not depend on the fold, so build it once up front.
    has_test = X_test is not None
    test_pred = np.zeros(len(X_test), dtype=float) if has_test else None
    te_pool = Pool(X_test, cat_features=cat_idx) if has_test else None

    for tr_idx, va_idx in kfold_generator_classification(y, n_splits, seed):
        tr_pool = Pool(X.iloc[tr_idx], label=y[tr_idx], cat_features=cat_idx)
        va_pool = Pool(X.iloc[va_idx], label=y[va_idx], cat_features=cat_idx)

        model = CatBoostClassifier(
            loss_function="Logloss",
            eval_metric="AUC",
            **best_params
        )
        model.fit(tr_pool, eval_set=va_pool, use_best_model=True)
        oof[va_idx] = model.predict_proba(va_pool)[:, 1]
        fold_scores.append(roc_auc_score(y[va_idx], oof[va_idx]))
        models.append(model)

        if te_pool is not None:
            test_pred += model.predict_proba(te_pool)[:, 1] / n_splits

    oof_metric = roc_auc_score(y, oof)
    return models, oof, oof_metric, fold_scores, test_pred

def fit_cv_ensemble_regressor(X, y, cat_idx, best_params, n_splits=NFOLD, seed=SEED, X_test=None):
    """Train one CatBoost regressor per fold.

    Args:
        X: pandas DataFrame of features (indexed positionally via ``iloc``).
        y: Target array indexed with the fold index arrays.
        cat_idx: Categorical feature indices passed to ``Pool``.
        best_params: Keyword arguments for ``CatBoostRegressor``.
        n_splits, seed: Fold configuration.
        X_test: Optional test features; when given, the fold models'
            predictions are averaged over the folds.

    Returns:
        ``(models, oof, oof_rmse, fold_rmses, test_pred)``; ``test_pred`` is
        None when ``X_test`` is None.
    """
    oof = np.zeros(len(X), dtype=float)
    models = []
    fold_metrics = []

    # The test pool does not depend on the fold, so build it once up front.
    has_test = X_test is not None
    test_pred = np.zeros(len(X_test), dtype=float) if has_test else None
    te_pool = Pool(X_test, cat_features=cat_idx) if has_test else None

    for tr_idx, va_idx in kfold_generator_regression(X, n_splits, seed):
        tr_pool = Pool(X.iloc[tr_idx], label=y[tr_idx], cat_features=cat_idx)
        va_pool = Pool(X.iloc[va_idx], label=y[va_idx], cat_features=cat_idx)

        model = CatBoostRegressor(
            loss_function="RMSE",
            eval_metric="RMSE",
            **best_params
        )
        model.fit(tr_pool, eval_set=va_pool, use_best_model=True)
        oof[va_idx] = model.predict(va_pool)
        # RMSE as sqrt(MSE): the `squared=False` keyword of mean_squared_error
        # is gone from recent scikit-learn releases, this form works everywhere.
        fold_metrics.append(float(np.sqrt(mean_squared_error(y[va_idx], oof[va_idx]))))
        models.append(model)

        if te_pool is not None:
            test_pred += model.predict(te_pool) / n_splits

    oof_metric = float(np.sqrt(mean_squared_error(y, oof)))
    return models, oof, oof_metric, fold_metrics, test_pred

# --------------- EXAMPLE USAGE ---------------
# Classification example
# 1) load
# train_cls.csv must contain TARGET_CLASS, the features and (optionally) ID_COL
# NOTE(review): train_cls.csv is not written anywhere in the visible notebook
# and TARGET_CLASS / CAT_COLS look like placeholders, so this section
# presumably ends in the except branch below — confirm whether it is meant to
# run on this competition's data.
try:
    train_cls_pl = pl.read_csv("train_cls.csv")
    assert TARGET_CLASS in train_cls_pl.columns
    feats_cls = get_features(train_cls_pl, TARGET_CLASS, ID_COL, FEATURES_CLS)
    # cast categorical columns to strings
    cast_cols = [pl.col(c).cast(pl.Utf8) for c in CAT_COLS if c in train_cls_pl.columns]
    if cast_cols:
        train_cls_pl = train_cls_pl.with_columns(cast_cols)

    X_cls, cat_idx_cls = to_pandas_and_prepare(train_cls_pl, feats_cls, CAT_COLS)
    y_cls = train_cls_pl[TARGET_CLASS].to_numpy()

    # 2) tuning
    study_cls = tune_classifier(X_cls, y_cls, cat_idx_cls, n_trials=N_TRIALS_CLS, timeout=TIMEOUT_SEC)
    best_params_cls = study_cls.best_params
    # pin the shared non-tuned parameters (early stopping, iterations, etc.) so runs are reproducible
    best_params_cls.update({
        "iterations": 10000,
        "od_type": "Iter",
        "od_wait": EARLY_STOPPING,
        "random_seed": SEED,
        "thread_count": N_THREADS,
        "task_type": "GPU" if GPU_AVAILABLE else "CPU",
        "allow_writing_files": False,
        "verbose": 0
    })
    print("Best (Classifier) score:", study_cls.best_value)
    print("Best (Classifier) params:", best_params_cls)

    # 3) final CV training and (optionally) test prediction
    X_test_cls_pl = pl.read_csv("test_cls.csv") if os.path.exists("test_cls.csv") else None
    X_test_cls_pd = None
    if X_test_cls_pl is not None:
        # align dtypes with the training frame
        if cast_cols:
            X_test_cls_pl = X_test_cls_pl.with_columns([pl.col(c).cast(pl.Utf8) for c in CAT_COLS if c in X_test_cls_pl.columns])
        X_test_cls_pd, _ = to_pandas_and_prepare(X_test_cls_pl, feats_cls, CAT_COLS)

    models_cls, oof_cls, oof_auc_cls, fold_aucs_cls, test_pred_cls = fit_cv_ensemble_classifier(
        X_cls, y_cls, cat_idx_cls, best_params_cls, n_splits=NFOLD, seed=SEED, X_test=X_test_cls_pd
    )
    print(f"[Classifier] OOF AUC: {oof_auc_cls:.6f} | per-fold: {', '.join(f'{x:.5f}' for x in fold_aucs_cls)}")
    if test_pred_cls is not None:
        print("Test preds (classifier) shape:", test_pred_cls.shape)

# Best-effort section: any failure (including a missing input file or a failed
# assert) is reported and the notebook continues.
except Exception as e:
    print("Classification part skipped or failed:", e)

gc.collect()

# Regression example
# NOTE(review): train_reg.csv is not written anywhere in the visible notebook
# and TARGET_REG / CAT_COLS look like placeholders, so this section presumably
# ends in the except branch below — confirm whether it is meant to run on this
# competition's data.
try:
    train_reg_pl = pl.read_csv("train_reg.csv")
    assert TARGET_REG in train_reg_pl.columns
    feats_reg = get_features(train_reg_pl, TARGET_REG, ID_COL, FEATURES_REG)
    # cast categorical columns to strings
    cast_cols = [pl.col(c).cast(pl.Utf8) for c in CAT_COLS if c in train_reg_pl.columns]
    if cast_cols:
        train_reg_pl = train_reg_pl.with_columns(cast_cols)

    X_reg, cat_idx_reg = to_pandas_and_prepare(train_reg_pl, feats_reg, CAT_COLS)
    y_reg = train_reg_pl[TARGET_REG].to_numpy()

    # Tune, then re-attach the fixed (non-tuned) parameters to the best trial's values.
    study_reg = tune_regressor(X_reg, y_reg, cat_idx_reg, n_trials=N_TRIALS_REG, timeout=TIMEOUT_SEC)
    best_params_reg = study_reg.best_params
    best_params_reg.update({
        "iterations": 10000,
        "od_type": "Iter",
        "od_wait": EARLY_STOPPING,
        "random_seed": SEED,
        "thread_count": N_THREADS,
        "task_type": "GPU" if GPU_AVAILABLE else "CPU",
        "allow_writing_files": False,
        "verbose": 0
    })
    print("Best (Regressor) score (RMSE):", study_reg.best_value)
    print("Best (Regressor) params:", best_params_reg)

    # Optional test set: only used when test_reg.csv exists.
    X_test_reg_pl = pl.read_csv("test_reg.csv") if os.path.exists("test_reg.csv") else None
    X_test_reg_pd = None
    if X_test_reg_pl is not None:
        if cast_cols:
            X_test_reg_pl = X_test_reg_pl.with_columns([pl.col(c).cast(pl.Utf8) for c in CAT_COLS if c in X_test_reg_pl.columns])
        X_test_reg_pd, _ = to_pandas_and_prepare(X_test_reg_pl, feats_reg, CAT_COLS)

    models_reg, oof_reg, oof_rmse_reg, fold_rmses_reg, test_pred_reg = fit_cv_ensemble_regressor(
        X_reg, y_reg, cat_idx_reg, best_params_reg, n_splits=NFOLD, seed=SEED, X_test=X_test_reg_pd
    )
    print(f"[Regressor] OOF RMSE: {oof_rmse_reg:.6f} | per-fold: {', '.join(f'{x:.5f}' for x in fold_rmses_reg)}")
    if test_pred_reg is not None:
        print("Test preds (regressor) shape:", test_pred_reg.shape)

# Best-effort section: any failure (including a missing input file or a failed
# assert) is reported and the notebook continues.
except Exception as e:
    print("Regression part skipped or failed:", e)

# --------------- (Optional) Optuna hyperparameter importance ---------------
def print_param_importance(study, title="Param importance"):
    """Print ``title`` followed by one ``- name: score`` line per hyperparameter.

    Importances come from ``get_param_importances(study)``. Any exception
    raised while computing or printing them is reported on stdout instead of
    being propagated.
    """
    try:
        importances = get_param_importances(study)
        print(title)
        for name, score in importances.items():
            print(f"- {name}: {score:.4f}")
    except Exception as e:
        print("Param importance unavailable:", e)

# print_param_importance already reports ordinary failures itself, so the only
# expected error here is a NameError when a study variable was never created
# (study_reg is assigned inside a try block above; study_cls presumably
# likewise — confirm). Catching just NameError keeps the best-effort behaviour
# without swallowing KeyboardInterrupt/SystemExit like a bare `except:` would.
try:
    print_param_importance(study_cls, "Classifier param importance")
except NameError as e:
    print("Classifier param importance skipped:", e)

try:
    print_param_importance(study_reg, "Regressor param importance")
except NameError as e:
    print("Regressor param importance skipped:", e)

In [None]:
# Reference (feature selection using Boruta in Python): https://www.predictiveresearchsolutions.com/post/data-science-tips-feature-selection-using-boruta-in-python

In [None]:
from sklearn.preprocessing import KBinsDiscretizer

In [None]:
# Cross-validated CatBoost training.
# The continuous target is quantile-binned so that StratifiedKFold can
# stratify the folds on the binned target.
X, y = train_data[feature_columns], train_data[target_column]

n_folds = 5
n_bins = 20
discretizer = KBinsDiscretizer(n_bins=n_bins, encode='ordinal', strategy='quantile', random_state=42)
y_binned = discretizer.fit_transform(y.values.reshape(-1, 1)).ravel().astype(int)

# The fold indices produced by split() are positional (they are used with
# .iloc below). Converting the weights to a plain array guarantees positional
# lookup as well; indexing a pandas Series that carries a non-contiguous index
# with these positions would select by label instead.
# NOTE(review): `weights` is defined outside this cell — assumed to hold one
# weight per row of train_data, in row order; confirm.
weights_array = np.asarray(weights)
assert len(weights_array) == len(X), "weights must have one entry per training row"

stratified_kfold = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=42)
catboost_scores = []  # unweighted validation MAE of each fold

for fold, (train_indices, valid_indices) in enumerate(stratified_kfold.split(X, y_binned), 1):
    X_train, X_valid = X.iloc[train_indices].copy(), X.iloc[valid_indices].copy()
    y_train, y_valid = y.iloc[train_indices], y.iloc[valid_indices]
    sample_weights_train = weights_array[train_indices]
    sample_weights_valid = weights_array[valid_indices]

    train_pool = Pool(
        data=X_train,
        label=y_train,
        cat_features=categorical_features,
        weight=sample_weights_train
    )

    valid_pool = Pool(
        data=X_valid,
        label=y_valid,
        cat_features=categorical_features,
        weight=sample_weights_valid
    )

    # A different seed per fold; training stops early once the validation
    # metric has not improved for 100 iterations.
    catboost_model = CatBoostRegressor(
        iterations=3000,
        depth=8,
        loss_function='MAE',
        eval_metric='MAE',
        random_seed=42 + fold,
        task_type='GPU',
        early_stopping_rounds=100
    )

    catboost_model.fit(
        train_pool,
        eval_set=valid_pool,
        use_best_model=True,
        verbose=200
    )

    # The fold score is the plain (unweighted) MAE on the validation rows.
    catboost_predictions = catboost_model.predict(X_valid)
    catboost_fold_mae = mean_absolute_error(y_valid, catboost_predictions)
    catboost_scores.append(catboost_fold_mae)

    # Persist each fold model so the inference cell can reload it by fold number.
    catboost_model.save_model(f"cb_f{fold}.cbm")



In [None]:
# Mean of the per-fold validation MAE values collected during cross-validation.
np.mean(catboost_scores)

In [None]:
# Mean MAE mapped to 1 / (1 + MAE), so a lower MAE gives a value closer to 1.
# NOTE(review): presumably the competition's leaderboard score — confirm.
1/ (1+np.mean(catboost_scores))

In [None]:
# NOTE(review): repeats the earlier mean-MAE cell verbatim; likely a leftover
# from re-running the experiment and a candidate for removal.
np.mean(catboost_scores)

In [None]:
# NOTE(review): repeats the earlier 1 / (1 + MAE) cell verbatim; likely a
# leftover from re-running the experiment and a candidate for removal.
1/ (1+np.mean(catboost_scores))

In [None]:
# Inference: reload the saved fold models, average their test predictions and
# write the submission file.
catboost_test_predictions=[]

# Select the feature columns once; the frame is the same for every fold model.
test_features = test_data[feature_columns]

# NOTE(review): only folds 1, 2 and 4 of the trained fold models are
# ensembled — presumably a deliberate selection; confirm.
for fold in [1,2,4]:
    catboost_model = CatBoostRegressor()
    # Load from the same relative path the training cell saved to, instead of
    # a hard-coded absolute working directory.
    catboost_model.load_model(f"cb_f{fold}.cbm")
    catboost_test_predictions.append(catboost_model.predict(test_features))

catboost_mean_predictions = np.mean(catboost_test_predictions, axis=0)
final_predictions = catboost_mean_predictions
# .copy() makes the submission an independent frame, so adding the target
# column is not a chained assignment on a slice of test_data.
submission = test_data[['id']].copy()
submission["target"] = final_predictions
submission.to_csv("submission.csv", index=False)