# ЛР1 (KNN)

In [36]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.metrics import accuracy_score, f1_score, mean_absolute_error, r2_score, make_scorer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from collections import Counter
import warnings
warnings.filterwarnings("ignore")

# Предобработка данных для задачи классификации

In [37]:
# Считывание данных
data_classification = pd.read_csv(r'C:\Users\xwxsz\Desktop\Video_games_esrb_rating.csv')
data_classification.head()

Unnamed: 0,title,console,alcohol_reference,animated_blood,blood,blood_and_gore,cartoon_violence,crude_humor,drug_reference,fantasy_violence,...,sexual_content,sexual_themes,simulated_gambling,strong_janguage,strong_sexual_content,suggestive_themes,use_of_alcohol,use_of_drugs_and_alcohol,violence,esrb_rating
0,Monster Jam Steel Titans 2,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,E
1,Subnautica: Below Zero,1,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,ET
2,NIER REPLICANT VER.1.22474487139…,1,0,0,1,0,0,0,0,0,...,0,0,0,1,0,1,0,0,0,M
3,Jamestown+,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,ET
4,Neptunia Virtual Stars,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,1,0,0,0,T


In [38]:
# Преобразование целевой переменной
label_encoder = LabelEncoder()
data_classification['title'] = label_encoder.fit_transform(data_classification['title'])
data_classification['esrb_rating'] = label_encoder.fit_transform(data_classification['esrb_rating'])

# Разделение данных
X = data_classification.drop('esrb_rating', axis=1) 
y = data_classification['esrb_rating'] 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Масштабирование данных
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Классификация (бейзлайн)

In [39]:
# Обучение модели KNN
knn_class = KNeighborsClassifier(n_neighbors=5)
knn_class.fit(X_train, y_train)

In [40]:
# Предикт
y_pred_class_knn = knn_class.predict(X_test)

# Оценка 
accuracy_knn = accuracy_score(y_test, y_pred_class_knn)
f1_knn = f1_score(y_test, y_pred_class_knn, average='weighted')

In [41]:
print(f'Accuracy: {accuracy_knn}')
print(f'F1: {f1_knn}')

Accuracy: 0.8047493403693932
F1: 0.8025380280601467


# Классификация (улучшенный бейзлайн)

In [42]:
# Создание пайплайна
pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),  # Обработка пропущенных значений
    ('scaler', StandardScaler()),  # Масштабирование данных
    ('knn', KNeighborsClassifier())  # Модель KNN
])

In [43]:
# Определение гиперпараметров для подбора
param_grid_classification = {
    'knn__n_neighbors': [3, 5, 7, 9, 11],  # Проверим разные значения для n_neighbors
    'knn__weights': ['uniform', 'distance'],  # Типы взвешивания
}

In [44]:
# Настройка GridSearch для подбора гиперпараметров по метрике F1
grid_search_classification = GridSearchCV(pipeline, param_grid_classification, cv=5, n_jobs=-1, scoring='f1_weighted')

In [45]:
# Обучение модели с подбором гиперпараметров
grid_search_classification.fit(X_train, y_train)

In [46]:
# Оптимальные гиперпараметры
print(f'{grid_search_classification.best_params_}')

{'knn__n_neighbors': 7, 'knn__weights': 'distance'}


In [47]:
# Предикт
y_pred = grid_search_classification.predict(X_test)

# Оценка
accuracy_knn = accuracy_score(y_test, y_pred)
f1_knn = f1_score(y_test, y_pred, average='weighted')

In [48]:
print(f'Accuracy: {accuracy_knn}')
print(f'F1: {f1_knn}')

Accuracy: 0.8284960422163589
F1: 0.8271962395000577


# Классификация (самостоятельная имплементация)

In [49]:
# Реализация алгоритма KNN
class KNN:
    def __init__(self, k=3, ):
        self.k = k
    
    def fit(self, X_train, y_train):
        self.X_train = X_train
        self.y_train = y_train
    
    def predict(self, X_test):
        y_pred = [self._predict(x) for x in X_test]
        return np.array(y_pred)
    
    def _predict(self, x):
        distances = [self._euclidean_distance(x, x_train) for x_train in self.X_train]
        k_indices = np.argsort(distances)[:self.k]
        k_nearest_labels = [self.y_train.iloc[i] for i in k_indices]  # Используем .iloc для индексации
        most_common = Counter(k_nearest_labels).most_common(1)
        return most_common[0][0]
    
    def _euclidean_distance(self, x1, x2):
        return np.sqrt(np.sum((x1 - x2) ** 2))

In [50]:
# Обучение модели

knn = KNN(k=5)
knn.fit(X_train, y_train)

In [51]:
# Предикт
y_pred = knn.predict(X_test)

# Оценка
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')

In [52]:
print(f"Accuracy (до улучшения): {accuracy}")
print(f"F1 (до улучшения): {f1}")

Accuracy (до улучшения): 0.8073878627968337
F1 (до улучшения): 0.8052817500847229


In [53]:
# Улучшенный KNN-классификатор
class ImprovedKNNClassifier:
    def __init__(self, k=5, weighted=True):
        self.k = k
        self.weighted = weighted
        self.scaler = StandardScaler()

    def fit(self, X, y):
        self.X_train = self.scaler.fit_transform(X)
        self.y_train = np.array(y)

    def predict(self, X):
        X_test = self.scaler.transform(X)
        predictions = []
        for x in X_test:
            distances = np.linalg.norm(self.X_train - x, axis=1)
            k_indices = np.argsort(distances)[:self.k]
            k_nearest_labels = self.y_train[k_indices]

            if self.weighted:
                weights = 1 / (distances[k_indices] + 1e-5)  # Для предотвращения деления на ноль
                predicted_label = np.bincount(k_nearest_labels, weights=weights).argmax()
            else:
                predicted_label = np.bincount(k_nearest_labels).argmax()

            predictions.append(predicted_label)

        return np.array(predictions)

    def tune_k(self, X, y, k_values):
        X_scaled = self.scaler.fit_transform(X)
        best_k = None
        best_score = 0
        for k in k_values:
            self.k = k
            preds = self.predict(X_scaled)
            score = accuracy_score(y, preds)
            if score > best_score:
                best_score = score
                best_k = k
        self.k = best_k
        return best_k

    def evaluate(self, X, y):
        preds = self.predict(X)
        accuracy = accuracy_score(y, preds)
        f1 = f1_score(y, preds, average='weighted')
        return accuracy, f1

In [54]:
# Разделение данных 
X = data_classification.drop(columns=['esrb_rating'])  
y = data_classification['esrb_rating']  
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [55]:
# Создание и обучение модели
knn = ImprovedKNNClassifier(k=5, weighted=True)
knn.fit(X_train, y_train)

In [56]:
# Подбор оптимального значения k
best_k = knn.tune_k(X_train, y_train, k_values=range(3, 20))
print(f"Лучшее значение k: {best_k}")

Лучшее значение k: 4


In [57]:
# Предикт
predictions = knn.predict(X_test)

# Оценка
accuracy, f1 = knn.evaluate(X_test, y_test)

In [58]:
print(f"Accuracy: {accuracy}")
print(f"F1: {f1}")

Accuracy: 0.8126649076517151
F1: 0.8121154748781471


# Предобработка данных для задачи регрессии

In [111]:
data_regression = pd.read_excel(r'C:\Users\xwxsz\Desktop\Real estate valuation data set.xlsx')
data_regression.head()

Unnamed: 0,No,X1 transaction date,X2 house age,X3 distance to the nearest MRT station,X4 number of convenience stores,X5 latitude,X6 longitude,Y house price of unit area
0,1,2012.916667,32.0,84.87882,10,24.98298,121.54024,37.9
1,2,2012.916667,19.5,306.5947,9,24.98034,121.53951,42.2
2,3,2013.583333,13.3,561.9845,5,24.98746,121.54391,47.3
3,4,2013.5,13.3,561.9845,5,24.98746,121.54391,54.8
4,5,2012.833333,5.0,390.5684,5,24.97937,121.54245,43.1


In [112]:
X = data_regression.drop(columns=['Y house price of unit area'])
y = data_regression['Y house price of unit area']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Регрессия (бейзлайн)

In [113]:
model_knn = KNeighborsRegressor(n_neighbors=5)
model_knn.fit(X_train, y_train)

In [114]:
y_pred_knn = model_knn.predict(X_test)

mae_knn = mean_absolute_error(y_test, y_pred_knn)
r2_knn = r2_score(y_test, y_pred_knn)

In [115]:
print(f'MAE: {mae_knn}')
print(f'R^2: {r2_knn}')

MAE: 6.102650602409638
R^2: 0.6114713906643064


# Регрессия (улучшенный бейзлайн)

In [116]:
# Создание пайплайна
pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),  # Обработка пропущенных значений
    ('scaler', StandardScaler()),  # Масштабирование данных
    ('knn', KNeighborsRegressor())  # Модель KNN для регрессии
])

In [117]:
# Определение гиперпараметров для подбора
param_grid_regression = {
    'knn__n_neighbors': [3, 5, 7, 9, 11],  # Проверим разные значения для n_neighbors
    'knn__weights': ['uniform', 'distance'],  # Типы взвешивания
}

In [118]:
# Настройка GridSearch для подбора гиперпараметров по метрике R2
grid_search_regression = GridSearchCV(pipeline, param_grid_regression, cv=5, n_jobs=-1, scoring='r2')

In [119]:
# Обучение модели с подбором гиперпараметров
grid_search_regression.fit(X_train, y_train)

# Оптимальные гиперпараметры
print(f'{grid_search_regression.best_params_}')

{'knn__n_neighbors': 11, 'knn__weights': 'distance'}


In [120]:
# Предикт
y_pred = grid_search_regression.predict(X_test)

# Оценка
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

In [121]:
print(f'MAE: {mae}')
print(f'R2: {r2}')

MAE: 5.076973969858367
R2: 0.7112023499804518


# Регрессия (самостоятельная имплементация)

In [122]:
# Масштабирование данных
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [123]:
# Реализация алгоритма KNN для регрессии
class KNNRegressor:
    def __init__(self, k=5, weighted=True):
        self.k = k
        self.weighted = weighted
    
    def fit(self, X_train, y_train):
        self.X_train = X_train
        self.y_train = y_train
    
    def predict(self, X_test):
        y_pred = [self._predict(x) for x in X_test]
        return np.array(y_pred)
    
    def _predict(self, x):
        distances = [self._euclidean_distance(x, x_train) for x_train in self.X_train]
        k_indices = np.argsort(distances)[:self.k]
        k_nearest_labels = self.y_train.iloc[k_indices]

        if self.weighted:
            weights = 1 / (np.array(distances)[k_indices] + 1e-5)  # Для предотвращения деления на ноль
            predicted_label = np.average(k_nearest_labels, weights=weights)
        else:
            predicted_label = np.mean(k_nearest_labels)

        return predicted_label
    
    def _euclidean_distance(self, x1, x2):
        return np.sqrt(np.sum((x1 - x2) ** 2))

In [124]:
# Создание и обучение модели
knn_regressor = KNNRegressor(k=5, weighted=True)
knn_regressor.fit(X_train_scaled, y_train)

# Предикт
y_pred = knn_regressor.predict(X_test_scaled)

# Оценка
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"MAE (до улучшения): {mae}")
print(f"R^2 (до улучшения): {r2}")

MAE (до улучшения): 5.019676031852231
R^2 (до улучшения): 0.7108229148476644


In [125]:
# Реализация алгоритма KNN для регрессии
class KNNRegressor:
    def __init__(self, k=5, weighted=True):
        self.k = k
        self.weighted = weighted
    
    def fit(self, X_train, y_train):
        self.X_train = X_train
        self.y_train = y_train
    
    def predict(self, X_test):
        y_pred = [self._predict(x) for x in X_test]
        return np.array(y_pred)
    
    def _predict(self, x):
        distances = [self._euclidean_distance(x, x_train) for x_train in self.X_train]
        k_indices = np.argsort(distances)[:self.k]
        k_nearest_labels = self.y_train.iloc[k_indices]

        if self.weighted:
            weights = 1 / (np.array(distances)[k_indices] + 1e-5)  # Для предотвращения деления на ноль
            predicted_label = np.average(k_nearest_labels, weights=weights)
        else:
            predicted_label = np.mean(k_nearest_labels)

        return predicted_label
    
    def _euclidean_distance(self, x1, x2):
        return np.sqrt(np.sum((x1 - x2) ** 2))

    def get_params(self, deep=True):
        # Возвращаем параметры модели для GridSearchCV
        return {'k': self.k, 'weighted': self.weighted}

    def set_params(self, **params):
        # Устанавливаем параметры модели
        for param, value in params.items():
            setattr(self, param, value)
        return self

In [126]:
# Создание KNN
knn_regressor = KNNRegressor()

In [127]:
# Определение параметров для подбора с gridsearchcv
param_grid = {
    'k': range(1, 15),  # Ограничиваем k до 15
    'weighted': [True, False]  # Попробуем с и без взвешивания
}

In [128]:
# GridSearchCV с кросс-валидацией
grid_search = GridSearchCV(estimator=knn_regressor, param_grid=param_grid,
                           scoring='r2', cv=10)

In [129]:
# Обучаем модель
grid_search.fit(X_train_scaled, y_train)

# Оптимальные параметры
print(grid_search.best_estimator_)

<__main__.KNNRegressor object at 0x000001EC87C008F0>


In [130]:
# Предикт
y_pred = best_knn_regressor.predict(X_test_scaled)

# Оценка
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

In [131]:
print(f"MAE (после улучшения): {mae}")
print(f"R^2 (после улучшения): {r2}")

MAE (после улучшения): 4.980292598967298
R^2 (после улучшения): 0.7118144823386139
