# Tabular Advanced Pipeline - Продвинутая работа с табличными данными

Пайплайны для:
- Feature Engineering
- CatBoost, XGBoost, LightGBM
- Обработка пропусков и выбросов
- Feature Selection
- Target Encoding

In [None]:
!pip install pandas numpy scikit-learn catboost xgboost lightgbm category_encoders optuna -q

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score, mean_squared_error
from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif
from sklearn.ensemble import RandomForestClassifier
import catboost as cb
import xgboost as xgb
import lightgbm as lgb
from category_encoders import TargetEncoder
import warnings
warnings.filterwarnings('ignore')

print("✓ Библиотеки загружены!")

## 1. Загрузка данных

In [None]:
# === YOUR DATA ===
# Both paths are resolved relative to the notebook's working directory.
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

# Column names referenced by the later cells; adjust them to the dataset.
TARGET_COL = 'target'
ID_COL = 'id'

print(f"Train shape: {train_df.shape}")
print(f"Test shape: {test_df.shape}")
print(f"\nИнформация о данных:")
# DataFrame.info() writes its report to stdout and returns None, so wrapping
# it in print() would append a stray "None" line after the report.
train_df.info()
print(f"\nПервые строки:\n{train_df.head()}")

## 2. EDA и визуализация

In [None]:
# Missing values: per-column count and share of the training rows
missing = train_df.isna().sum()
missing_percent = missing / len(train_df) * 100
missing_df = pd.DataFrame({'Missing_Count': missing, 'Percentage': missing_percent})
# Keep only the columns that actually have gaps, worst offenders first
missing_df = (
    missing_df.loc[missing_df['Missing_Count'] > 0]
    .sort_values('Percentage', ascending=False)
)

if missing_df.empty:
    print("✓ Пропущенных значений нет!")
else:
    print("Пропущенные значения:")
    print(missing_df)

    # Bar chart of the missing share, one bar per affected column
    plt.figure(figsize=(10, 6))
    sns.barplot(x=missing_df.index, y=missing_df['Percentage'])
    plt.xticks(rotation=90)
    plt.title('Процент пропусков по колонкам')
    plt.tight_layout()
    plt.show()

In [None]:
# Target distribution: absolute counts, class shares and a bar chart.
# Skipped entirely when the training frame has no target column.
if TARGET_COL in train_df.columns:
    target_series = train_df[TARGET_COL]

    print(f"\nРаспределение целевой переменной '{TARGET_COL}':")
    print(target_series.value_counts())
    print(f"\nПропорции:")
    print(target_series.value_counts(normalize=True))

    plt.figure(figsize=(8, 5))
    target_series.value_counts().plot.bar()
    plt.title(f'Распределение {TARGET_COL}')
    plt.xlabel(TARGET_COL)
    plt.ylabel('Количество')
    plt.show()

In [None]:
# Correlation matrix over the numeric columns (the ID column is left out)
numeric_cols = train_df.select_dtypes(include=[np.number]).columns.tolist()
if ID_COL in numeric_cols:
    numeric_cols.remove(ID_COL)

if numeric_cols:
    plt.figure(figsize=(12, 10))
    correlation = train_df[numeric_cols].corr()
    sns.heatmap(correlation, annot=False, cmap='coolwarm', center=0)
    plt.title('Корреляционная матрица')
    plt.tight_layout()
    plt.show()

    # Correlation with the target (only available when the target is numeric)
    if TARGET_COL in correlation.columns:
        # Drop the target's own entry (always 1.0) so the listing holds ten
        # real features, and rank by absolute value so that strong negative
        # correlations are not pushed to the bottom of the list.
        target_corr = correlation[TARGET_COL].drop(TARGET_COL)
        ranking = target_corr.abs().sort_values(ascending=False).index
        target_corr = target_corr.reindex(ranking)
        print(f"\nТоп-10 признаков по корреляции с {TARGET_COL}:")
        print(target_corr.head(10))

## 3. Feature Engineering

In [None]:
def advanced_feature_engineering(df, is_train=True):
    """Return a copy of ``df`` extended with engineered features.

    Features are added in this order:
      1. product, sum, difference and ratio for every pair among the first
         three numeric columns;
      2. row-wise mean, std, max, min, median and range over all numeric
         base columns;
      3. square, cube, sqrt(|x|) and log1p(|x|) of the first three numeric
         columns;
      4. a string concatenation of the first two ``object`` columns.

    The columns named by the module-level ``ID_COL`` and ``TARGET_COL`` are
    never used as inputs.

    Args:
        df: Source frame; it is copied, not modified.
        is_train: Kept for backward compatibility. The target column is
            excluded regardless of this flag, so a frame that still carries
            the target (e.g. a hold-out split) yields the same feature set
            as one without it and never leaks the target into features.

    Returns:
        A new DataFrame holding the original columns plus the engineered ones.
    """
    from itertools import combinations

    df = df.copy()

    # The identifier and the target must not feed any engineered feature.
    reserved = (ID_COL, TARGET_COL)
    numeric_features = [
        col for col in df.select_dtypes(include=[np.number]).columns
        if col not in reserved
    ]
    # Only the first three numeric columns get interaction/polynomial terms
    # (an example choice; replace it with the truly important columns).
    leading = numeric_features[:3]

    # 1. Pairwise interactions of numeric features
    for col1, col2 in combinations(leading, 2):
        df[f'{col1}_{col2}_mul'] = df[col1] * df[col2]
        df[f'{col1}_{col2}_add'] = df[col1] + df[col2]
        df[f'{col1}_{col2}_diff'] = df[col1] - df[col2]
        # The small offset avoids an exact division by zero.
        df[f'{col1}_{col2}_div'] = df[col1] / (df[col2] + 1e-5)

    # 2. Row-wise statistics over the numeric base columns
    if numeric_features:
        base = df[numeric_features]
        df['numeric_mean'] = base.mean(axis=1)
        df['numeric_std'] = base.std(axis=1)
        df['numeric_max'] = base.max(axis=1)
        df['numeric_min'] = base.min(axis=1)
        df['numeric_median'] = base.median(axis=1)
        df['numeric_range'] = df['numeric_max'] - df['numeric_min']

    # 3. Polynomial and monotone transforms of the leading columns
    for col in leading:
        df[f'{col}_squared'] = df[col] ** 2
        df[f'{col}_cubed'] = df[col] ** 3
        # abs() keeps sqrt/log defined for negative inputs (the sign is lost).
        df[f'{col}_sqrt'] = np.sqrt(np.abs(df[col]))
        df[f'{col}_log'] = np.log1p(np.abs(df[col]))

    # 4. Combination of the first two categorical columns
    cat_features = [
        col for col in df.select_dtypes(include=['object']).columns
        if col not in reserved
    ]
    for first, second in combinations(cat_features[:2], 2):
        df[f'{first}_{second}_combo'] = (
            df[first].astype(str) + '_' + df[second].astype(str)
        )

    return df

# Run the same transformation on both splits
train_fe = advanced_feature_engineering(train_df, is_train=True)
test_fe = advanced_feature_engineering(test_df, is_train=False)

# Report how many columns the step added to the training frame
n_cols_before = train_df.shape[1]
n_cols_after = train_fe.shape[1]
print(f"\nПризнаков до FE: {n_cols_before}")
print(f"Признаков после FE: {n_cols_after}")
print(f"Создано новых признаков: {n_cols_after - n_cols_before}")

## 4. Обработка пропусков

In [None]:
# NOTE: leftover scratch expression removed. `X` is first assigned in the
# Feature Selection section, so evaluating `X.fillna` here raised NameError on
# a top-to-bottom run, and the bare attribute access filled nothing anyway.

In [None]:
def handle_missing_values(train, test, target_col, id_col):
    """Impute missing values in both splits using the training split only.

    Numeric columns are filled with the training median. When the training
    column has gaps, a ``<col>_is_missing`` 0/1 indicator is added to both
    frames; it is computed before the fill so it records the real gaps.
    ``object`` columns are filled with the literal ``'missing'``.

    A column is also filled when only the test split has gaps, so test rows
    never keep NaN in a column that was clean in training. No indicator is
    added in that case, since it would be constant on the training data.

    Args:
        train: Training frame; copied, not modified.
        test: Test frame; copied, not modified. Columns absent from it are
            skipped on the test side.
        target_col: Name of the target column, never imputed.
        id_col: Name of the identifier column, never imputed.

    Returns:
        Tuple ``(train, test)`` of imputed copies.
    """
    train = train.copy()
    test = test.copy()

    reserved = (id_col, target_col)

    # Numeric features: training median
    numeric_cols = [
        col for col in train.select_dtypes(include=[np.number]).columns
        if col not in reserved
    ]
    for col in numeric_cols:
        in_test = col in test.columns
        train_gaps = train[col].isnull()
        test_gaps = test[col].isnull() if in_test else None
        train_has_gaps = bool(train_gaps.any())
        test_has_gaps = in_test and bool(test_gaps.any())
        if not (train_has_gaps or test_has_gaps):
            continue

        if train_has_gaps:
            # Record the gaps BEFORE filling; afterwards isnull() is all False.
            train[f'{col}_is_missing'] = train_gaps.astype(int)
            if in_test:
                test[f'{col}_is_missing'] = test_gaps.astype(int)

        median_value = train[col].median()
        # Assign the result back instead of fillna(inplace=True) on a column
        # selection, which does not update the frame under Copy-on-Write.
        train[col] = train[col].fillna(median_value)
        if in_test:
            test[col] = test[col].fillna(median_value)

    # Categorical features: explicit 'missing' category
    cat_cols = [
        col for col in train.select_dtypes(include=['object']).columns
        if col not in reserved
    ]
    for col in cat_cols:
        in_test = col in test.columns
        if train[col].isnull().any():
            train[col] = train[col].fillna('missing')
        if in_test and test[col].isnull().any():
            test[col] = test[col].fillna('missing')

    return train, test

# Fill the gaps in both splits; train_fe / test_fe are rebound to the
# frames returned by handle_missing_values.
train_fe, test_fe = handle_missing_values(train_fe, test_fe, TARGET_COL, ID_COL)
print("✓ Пропуски обработаны!")

## 5. Encoding категориальных признаков

In [None]:
# Categorical (object-typed) feature columns. The target and the ID are left
# out: they are not features, and the target does not exist in the test frame.
cat_features = [
    col for col in train_fe.select_dtypes(include=['object']).columns
    if col not in (TARGET_COL, ID_COL)
]

# Label Encoding for low-cardinality features
low_card_features = [col for col in cat_features if train_fe[col].nunique() < 10]

label_encoders = {}
for col in low_card_features:
    le = LabelEncoder()
    train_fe[col] = le.fit_transform(train_fe[col].astype(str))
    # LabelEncoder.transform raises ValueError on labels absent from the
    # training data, so map test values through the fitted classes and send
    # categories unseen in training to -1.
    code_of = {label: code for code, label in enumerate(le.classes_)}
    test_fe[col] = test_fe[col].astype(str).map(code_of).fillna(-1).astype(int)
    label_encoders[col] = le

print(f"Label Encoding применен к {len(low_card_features)} признакам")

# Target Encoding for high-cardinality features
high_card_features = [col for col in cat_features if col not in low_card_features]

if len(high_card_features) > 0 and TARGET_COL in train_fe.columns:
    # NOTE(review): the encoder is fitted on all training rows, including the
    # ones later held out for validation, so validation scores may be
    # optimistic — consider fitting it inside the CV folds.
    target_encoder = TargetEncoder(cols=high_card_features)
    train_fe[high_card_features] = target_encoder.fit_transform(
        train_fe[high_card_features],
        train_fe[TARGET_COL]
    )
    test_fe[high_card_features] = target_encoder.transform(test_fe[high_card_features])
    print(f"Target Encoding применен к {len(high_card_features)} признакам")

print("✓ Encoding завершен!")

## 6. Feature Selection

In [None]:
# Feature matrix and target used to estimate feature importance
feature_cols = [
    col for col in train_fe.columns
    if col != TARGET_COL and col != ID_COL
]
X, y = train_fe[feature_cols], train_fe[TARGET_COL]

# Importance scores come from a Random Forest fitted on the training frame
rf = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1).fit(X, y)

feature_importance = pd.DataFrame({
    'feature': feature_cols,
    'importance': rf.feature_importances_
})
feature_importance = feature_importance.sort_values('importance', ascending=False)

print("\nТоп-20 важных признаков:")
print(feature_importance.head(20))

# Horizontal bar chart of the same twenty features
plt.figure(figsize=(10, 8))
sns.barplot(data=feature_importance.head(20), x='importance', y='feature')
plt.title('Топ-20 важных признаков')
plt.tight_layout()
plt.show()

# Keep the N highest-ranked features
TOP_N_FEATURES = 50
selected_features = feature_importance['feature'].head(TOP_N_FEATURES).tolist()
print(f"\nВыбрано признаков: {len(selected_features)}")

## 7. Обучение моделей

In [None]:
# Train either on the selected subset or on every feature.
# Set USE_SELECTED to True to use the features picked by feature selection.
USE_SELECTED = False
if USE_SELECTED:
    final_features = selected_features
else:
    final_features = feature_cols

X_train, y_train = train_fe[final_features], train_fe[TARGET_COL]
X_test = test_fe[final_features]

# Hold out 20% for validation; stratification keeps the class balance
# (appropriate for classification targets).
X_tr, X_val, y_tr, y_val = train_test_split(
    X_train,
    y_train,
    stratify=y_train,
    random_state=42,
    test_size=0.2,
)

print(f"Train: {X_tr.shape}, Val: {X_val.shape}, Test: {X_test.shape}")

### 7.1 CatBoost

In [None]:
# CatBoost settings; switch loss_function to 'RMSE' for regression
catboost_params = dict(
    iterations=1000,
    learning_rate=0.05,
    depth=6,
    loss_function='Logloss',
    eval_metric='AUC',
    random_seed=42,
    verbose=100,
    early_stopping_rounds=50,
)

cat_model = cb.CatBoostClassifier(**catboost_params)
# The hold-out split drives early stopping and best-iteration selection
cat_model.fit(X_tr, y_tr, eval_set=(X_val, y_val), use_best_model=True, verbose=100)

# Positive-class probabilities for the validation and test rows
cat_val_pred, cat_test_pred = (
    cat_model.predict_proba(part)[:, 1] for part in (X_val, X_test)
)

cat_score = roc_auc_score(y_val, cat_val_pred)
print(f"\nCatBoost Validation AUC: {cat_score:.6f}")

### 7.2 XGBoost

In [None]:
xgb_params = {
    'max_depth': 6,
    'learning_rate': 0.05,
    'n_estimators': 1000,
    'objective': 'binary:logistic',  # or 'reg:squarederror' for regression
    'eval_metric': 'auc',
    'tree_method': 'hist',
    'random_state': 42,
    'n_jobs': -1,
    # Early stopping is an estimator setting: the `early_stopping_rounds`
    # keyword of fit() was deprecated in XGBoost 1.6 and removed in 2.0,
    # where passing it raises TypeError.
    'early_stopping_rounds': 50
}

xgb_model = xgb.XGBClassifier(**xgb_params)
# The evaluation set is monitored for early stopping; progress is logged
# every 100 rounds.
xgb_model.fit(
    X_tr, y_tr,
    eval_set=[(X_val, y_val)],
    verbose=100
)

# Positive-class probabilities
xgb_val_pred = xgb_model.predict_proba(X_val)[:, 1]
xgb_test_pred = xgb_model.predict_proba(X_test)[:, 1]

xgb_score = roc_auc_score(y_val, xgb_val_pred)
print(f"\nXGBoost Validation AUC: {xgb_score:.6f}")

### 7.3 LightGBM

In [None]:
# LightGBM settings for binary classification scored by AUC
lgb_params = dict(
    objective='binary',
    metric='auc',
    boosting_type='gbdt',
    num_leaves=31,
    learning_rate=0.05,
    feature_fraction=0.8,
    bagging_fraction=0.8,
    bagging_freq=5,
    verbose=-1,
    random_state=42,
    n_jobs=-1,
)

lgb_train = lgb.Dataset(X_tr, label=y_tr)
# The validation set is built with the training set as its reference
lgb_val = lgb.Dataset(X_val, label=y_val, reference=lgb_train)

# Stop after 50 rounds without improvement; log every 100 rounds
lgb_callbacks = [lgb.early_stopping(50), lgb.log_evaluation(100)]
lgb_model = lgb.train(
    lgb_params,
    lgb_train,
    num_boost_round=1000,
    valid_sets=[lgb_train, lgb_val],
    callbacks=lgb_callbacks,
)

# Predict with the best iteration found by early stopping
best_round = lgb_model.best_iteration
lgb_val_pred = lgb_model.predict(X_val, num_iteration=best_round)
lgb_test_pred = lgb_model.predict(X_test, num_iteration=best_round)

lgb_score = roc_auc_score(y_val, lgb_val_pred)
print(f"\nLightGBM Validation AUC: {lgb_score:.6f}")

## 8. Ансамблирование (Averaging)

In [None]:
# Unweighted mean of the three models' predictions
ensemble_val_pred = (cat_val_pred + xgb_val_pred + lgb_val_pred) / 3
ensemble_test_pred = (cat_test_pred + xgb_test_pred + lgb_test_pred) / 3

ensemble_score = roc_auc_score(y_val, ensemble_val_pred)

# Side-by-side comparison of the validation scores
results = pd.DataFrame(
    [
        ('CatBoost', cat_score),
        ('XGBoost', xgb_score),
        ('LightGBM', lgb_score),
        ('Ensemble', ensemble_score),
    ],
    columns=['Model', 'Validation AUC'],
)

separator = "="*50
print("\n" + separator)
print("СРАВНЕНИЕ МОДЕЛЕЙ")
print(separator)
print(results.to_string(index=False))
print(separator)

## 9. Submission

In [None]:
# Submit the ensemble by default; any single model's test predictions
# (cat_test_pred, xgb_test_pred, ...) can be substituted here.
final_predictions = ensemble_test_pred

# IDs come from the untouched test frame, predictions are attached alongside
submission = test_df[[ID_COL]].assign(prediction=final_predictions)

submission.to_csv('tabular_submission.csv', index=False)
print("\n✓ Submission сохранен!")
print(submission.head())