In [6]:
# Install CatBoost into the environment of the running kernel.
# The %pip magic targets the kernel's own interpreter, whereas a bare shell
# `!pip` may resolve to a different Python installation.
%pip install catboost



In [None]:
import pandas as pd
import numpy as np
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PowerTransformer
from catboost import CatBoostClassifier
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Load the training and test sets from CSV files in the working directory.
train_df, test_df = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))

def detect_outliers(df, features, whisker=1.5):
    """Drop rows whose value in any of ``features`` lies outside the IQR fences.

    Features are processed in the given order, and each one is filtered on the
    rows that survived the previous features, so the quartiles of later
    features are computed on the already-reduced frame.

    Args:
        df: DataFrame to filter. It is not modified in place.
        features: Iterable of column names to screen.
        whisker: Multiple of the IQR placed beyond Q1 and Q3 to form the
            lower and upper fences. Defaults to 1.5.

    Returns:
        DataFrame with only the rows that fall inside the (inclusive) fences
        for every screened feature. Rows holding NaN in a screened column are
        dropped as well, because NaN fails the range check.
    """
    for feature in features:
        Q1 = df[feature].quantile(0.25)
        Q3 = df[feature].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - whisker * IQR
        upper_bound = Q3 + whisker * IQR
        # Remove outliers: keep rows inside [lower_bound, upper_bound].
        df = df[df[feature].between(lower_bound, upper_bound)]
    return df

# Screen every column except the target for outliers. The target is excluded
# by name (not by position) so this does not depend on the column order of
# train.csv and matches how 'target' is addressed in the rest of the notebook.
features = train_df.columns.drop('target')
train_df = detect_outliers(train_df, features)

# Log transform: apply log1p to the selected columns of both frames.
log_columns = ('area', 'perimeter', 'major_axis', 'minor_axis', 'convex_area')
for col in log_columns:
    train_df[col] = np.log1p(train_df[col])
    test_df[col] = np.log1p(test_df[col])

# Skewness handling: Yeo-Johnson power transform for strongly skewed features.
# Skew is measured on the feature columns only; the target is left out so the
# labels can never be selected and power-transformed.
skew_columns = train_df.drop(columns=['target']).skew().sort_values(ascending=False)
high_skew = skew_columns[abs(skew_columns) > 0.8].index.tolist()
pt = PowerTransformer(method='yeo-johnson')
if high_skew:  # nothing to fit when no feature exceeds the skew threshold
    train_df[high_skew] = pt.fit_transform(train_df[high_skew])
    # Apply the parameters learned on the training data; re-fitting on the
    # test set would give train and test different transformations.
    test_df[high_skew] = pt.transform(test_df[high_skew])

# Correlation check: correlation of every column with the target.
correlation_matrix = train_df.corr()
target_corr = correlation_matrix['target'].sort_values(ascending=False)
low_corr_features = [name for name, corr in target_corr.items() if abs(corr) < 0.15]

# Drop the weakly correlated features; names missing from a frame are skipped.
train_df = train_df.drop(columns=low_corr_features, errors='ignore')
test_df = test_df.drop(columns=low_corr_features, errors='ignore')

# Rank the remaining features with a random forest fitted on the training frame.
y = train_df['target']
X = train_df.drop(columns=['target'])

rf = RandomForestClassifier(random_state=42)
rf.fit(X, y)
importances = pd.Series(rf.feature_importances_, index=X.columns).sort_values(ascending=False)
low_importance = [name for name, score in importances.items() if score < 0.01]

# Drop the least important features from both frames.
train_df = train_df.drop(columns=low_importance)
test_df = test_df.drop(columns=low_importance)

# Engineered features, built the same way on the training and test frames.
# Product of eccentricity and roundness.
train_df['eccentricity_roundness'] = train_df['eccentricity'].mul(train_df['roundness'])
test_df['eccentricity_roundness'] = test_df['eccentricity'].mul(test_df['roundness'])

# Ratio of area to convex area.
# NOTE(review): both columns were log1p-transformed earlier in this cell, so
# this is a ratio of transformed values — confirm that this is intended.
train_df['solidity_ratio'] = train_df['area'].div(train_df['convex_area'])
test_df['solidity_ratio'] = test_df['area'].div(test_df['convex_area'])

# Product of shape factors 1 and 3.
train_df['shapefactor_1_3'] = train_df['shapefactor_1'].mul(train_df['shapefactor_3'])
test_df['shapefactor_1_3'] = test_df['shapefactor_1'].mul(test_df['shapefactor_3'])

# Data split: labels and feature matrix from the training frame.
y = train_df['target']
X = train_df.drop(columns=['target'])

# The whole training frame is used for fitting; no hold-out split is made here.
X_train, y_train = X, y

# Standardize: statistics are fitted on the training features and then
# applied unchanged to the test frame.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(test_df)

# Oversample the scaled training data with SMOTE (fixed seed).
smote = SMOTE(random_state=42)
X_train, y_train = smote.fit_resample(X_train, y_train)

In [9]:
# Train CatBoost on the resampled training data (fixed seed, silent logging).
cat_model = CatBoostClassifier(verbose=0, random_state=42)
cat_model.fit(X_train, y_train)

# Take the second predict_proba column
# (presumably the positive class — confirm against cat_model.classes_).
y_pred_proba = cat_model.predict_proba(X_test)[:, 1]

# Turn probabilities into 0/1 labels using a fixed cut-off of 0.53.
best_threshold = 0.53
y_pred_binary = (y_pred_proba >= best_threshold).astype(int)

# Write the predictions as a single column without header or index.
answers_df = pd.Series(y_pred_binary, name='target').to_frame()
answers_df.to_csv('answers.csv', header=False, index=False)
print("Submission file 'answers.csv' created successfully.")

Submission file 'answers.csv' created successfully.
