In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.impute import SimpleImputer

In [None]:
# Загрузка данных
data = pd.read_csv('train.csv')

# Удаление ненужных столбцов
data = data.drop(['PassengerId', 'Name', 'Ticket', 'Cabin'], axis=1)

# Заполнение пропусков
data['Age'].fillna(data['Age'].median(), inplace=True)
data['Embarked'].fillna(data['Embarked'].mode()[0], inplace=True)

# Кодирование категориальных признаков
label_encoder = LabelEncoder()
data['Sex'] = label_encoder.fit_transform(data['Sex'])  # Male -> 1, Female -> 0
data['Embarked'] = label_encoder.fit_transform(data['Embarked'])  # S -> 0, C -> 1, Q -> 2

# Разделение на признаки и целевую переменную
X = data.drop('Survived', axis=1)  # Все признаки, кроме Survived
y = data['Survived']  # Целевая переменная (Survived)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['Age'].fillna(data['Age'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['Embarked'].fillna(data['Embarked'].mode()[0], inplace=True)


In [3]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report

# Создание модели KNN с K=5
knn = KNeighborsClassifier(n_neighbors=5)

# Обучение модели на обучающей выборке
knn.fit(X_train, y_train)

# Предсказание на тестовой выборке
y_pred = knn.predict(X_test)

# Оценка качества модели
accuracy = accuracy_score(y_test, y_pred)  # Точность (доля правильных ответов)
print(f'Accuracy: {accuracy}')
print(classification_report(y_test, y_pred))  # Подробный отчёт о качестве

Accuracy: 0.7206703910614525
              precision    recall  f1-score   support

           0       0.72      0.85      0.78       105
           1       0.71      0.54      0.62        74

    accuracy                           0.72       179
   macro avg       0.72      0.69      0.70       179
weighted avg       0.72      0.72      0.71       179



In [5]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, StratifiedKFold

# Определение сетки параметров
param_grid = {'n_neighbors': range(1, 21)}

# Стратегии кросс-валидации
cv_strategy = StratifiedKFold(n_splits=5)

# GridSearchCV
grid_search = GridSearchCV(KNeighborsClassifier(), param_grid, cv=cv_strategy, scoring='accuracy')
grid_search.fit(X_train, y_train)

# RandomizedSearchCV
random_search = RandomizedSearchCV(KNeighborsClassifier(), param_grid, cv=cv_strategy, scoring='accuracy', n_iter=10)
random_search.fit(X_train, y_train)

# Лучшие параметры
print(f'Best parameters (GridSearchCV): {grid_search.best_params_}')
print(f'Best parameters (RandomizedSearchCV): {random_search.best_params_}')

# Оценка качества оптимальной модели
best_knn = grid_search.best_estimator_
y_pred_best = best_knn.predict(X_test)
accuracy_best = accuracy_score(y_test, y_pred_best)
print(f'Best model accuracy: {accuracy_best}')
print(classification_report(y_test, y_pred_best))

Best parameters (GridSearchCV): {'n_neighbors': 5}
Best parameters (RandomizedSearchCV): {'n_neighbors': 5}
Best model accuracy: 0.7206703910614525
              precision    recall  f1-score   support

           0       0.72      0.85      0.78       105
           1       0.71      0.54      0.62        74

    accuracy                           0.72       179
   macro avg       0.72      0.69      0.70       179
weighted avg       0.72      0.72      0.71       179



In [6]:
print(f'Original model accuracy: {accuracy}')
print(f'Optimized model accuracy: {accuracy_best}')

Original model accuracy: 0.7206703910614525
Optimized model accuracy: 0.7206703910614525
