This is a notebook for a kaggle competition: https://www.kaggle.com/competitions/spaceship-titanic/

In [None]:
from catboost import CatBoostClassifier
import pandas as pd
from sklearn.model_selection import GridSearchCV,cross_val_score
train_df = pd.read_csv("../datasets/classification/spaceship_train.csv",usecols=['HomePlanet','CryoSleep','Cabin','Destination','Age','VIP','RoomService','FoodCourt','ShoppingMall','Spa','VRDeck','Transported'])
y_train=train_df['Transported']
x=train_df.drop(columns=['Transported'])
x_train = x.copy()
cat_cols = ['HomePlanet', 'Cabin', 'Destination']
bool_cols = ['CryoSleep', 'VIP']
num_cols = [col for col in x_train.columns if col not in cat_cols + bool_cols]
x_train[cat_cols] = x_train[cat_cols].fillna('missing')
x_train[num_cols] = x_train[num_cols].fillna(x_train[num_cols].mean())
x_train[bool_cols] = x_train[bool_cols].fillna(False).astype(int)
x_sub=pd.read_csv("../datasets/classification/spaceship_test.csv",usecols=['HomePlanet','CryoSleep','Cabin','Destination','Age','VIP','RoomService','FoodCourt','ShoppingMall','Spa','VRDeck'])
x_sub[cat_cols] = x_sub[cat_cols].fillna('missing')
x_sub[num_cols] = x_sub[num_cols].fillna(x_train[num_cols].mean())
x_sub[bool_cols] = x_sub[bool_cols].fillna(False).astype(int)
id_df=pd.read_csv("../datasets/classification/spaceship_test.csv",usecols=['PassengerId'])
params={
    'depth': [6,7,8],
    'iterations':[100,125,150],
    'learning_rate':[0.05,0.03],
}
grid = GridSearchCV(CatBoostClassifier(cat_features=cat_cols,verbose=0),params,cv=5)
grid.fit(x_train,y_train)
model = grid.best_estimator_
print(f"Лучшие параметры:{grid.best_params_}")
predictions = model.predict(x_sub)
submission=pd.DataFrame({
    'PassengerId':id_df['PassengerId'],
    'Transported': predictions})
submission.to_csv('submissions/spaceship_catboost_submission.csv', index=False, encoding='utf-8')
scores = cross_val_score(model, x_train, y_train, cv=5)
print("Оценки на каждом фолде:", scores)
print("Среднее качество:", scores.mean())

In [14]:
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
from sklearn.model_selection import GridSearchCV,cross_val_score
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
train_df = pd.read_csv("../datasets/classification/spaceship_train.csv",usecols=['HomePlanet','CryoSleep','Cabin','Destination','Age','VIP','RoomService','FoodCourt','ShoppingMall','Spa','VRDeck','Transported'])
x_sub=pd.read_csv("../datasets/classification/spaceship_test.csv",usecols=['HomePlanet','CryoSleep','Cabin','Destination','Age','VIP','RoomService','FoodCourt','ShoppingMall','Spa','VRDeck'])
y_train=train_df['Transported']
x_train=train_df.drop(columns=['Transported'])
nom_cols = ['HomePlanet', 'Cabin', 'Destination']
ord_cols = ['CryoSleep', 'VIP']
ordinal_cats=[[False,True],[False,True]]
num_cols = [col for col in x_train.columns if col not in nom_cols + ord_cols]
id_df=pd.read_csv("../datasets/classification/spaceship_test.csv",usecols=['PassengerId'])
params = {
    'clf__n_estimators':[75,90,100,110,125],
    'clf__max_depth':range(3,13)
}
nom_pipeline = Pipeline([
    ('imputer',SimpleImputer(strategy='constant',fill_value='Missing')),
    ('encoder',OneHotEncoder(handle_unknown='ignore'))
])
num_pipeline = Pipeline([
    ('imputer',SimpleImputer(strategy='median')),
    ('scaler',StandardScaler())
])
ord_pipeline = Pipeline([
    ('imputer',SimpleImputer(strategy='constant',fill_value=False)),
    ('encoder',OrdinalEncoder(categories=ordinal_cats))
])
preprocessor = ColumnTransformer(transformers=[
    ('num',num_pipeline,num_cols),
    ('nom',nom_pipeline,nom_cols),
    ('ord',ord_pipeline,ord_cols)
])
clf = RandomForestClassifier()
pipeline = Pipeline([
    ('prep',preprocessor),
    ('clf',clf)
])
grid = GridSearchCV(pipeline,params,cv=5,scoring='accuracy')
grid.fit(x_train,y_train)
print(f"Параметры: {grid.best_params_} Показатели: {grid.best_score_}")
model = grid.best_estimator_
predictions = model.predict(x_sub)
submission=pd.DataFrame({
    'PassengerId':id_df['PassengerId'],
    'Transported': predictions})
submission.to_csv('submissions/spaceship_rforest_submission.csv', index=False, encoding='utf-8')
scores = cross_val_score(model, x_train, y_train, cv=5)
print("Оценки на каждом фолде:", scores)
print("Среднее качество:", scores.mean())

Параметры: {'clf__max_depth': 11, 'clf__n_estimators': 100} Показатели: 0.7407137152087327
Оценки на каждом фолде: [0.72742956 0.73375503 0.73433007 0.75201381 0.75143843]
Среднее качество: 0.7397933815116685
