# Лабораторная работа № 4


In [20]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings

warnings.filterwarnings("ignore")

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn import metrics

from sklearn.linear_model import LogisticRegression, Ridge
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, GradientBoostingRegressor, \
    GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.neural_network import MLPClassifier, MLPRegressor

random_state = 42
pd.set_option('display.float_format', '{:.4f}'.format)

## 1. Загрузка и первичный анализ данных

In [21]:
df = pd.read_csv("../data/students_performance.csv")

print("Первые строки данных")
display(df.head())

print("\nИнформация о данных")
df.info()

print("\nПропуски")
print(df.isna().sum().sum())

Первые строки данных


Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75



Информация о данных
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   gender                       1000 non-null   object
 1   race/ethnicity               1000 non-null   object
 2   parental level of education  1000 non-null   object
 3   lunch                        1000 non-null   object
 4   test preparation course      1000 non-null   object
 5   math score                   1000 non-null   int64 
 6   reading score                1000 non-null   int64 
 7   writing score                1000 non-null   int64 
dtypes: int64(3), object(5)
memory usage: 62.6+ KB

Пропуски
0


## 2. Создание целевых переменных

In [22]:
# Средний балл
df['average_score'] = df[['math score', 'reading score', 'writing score']].mean(axis=1)

# Бинарный признак высокой успеваемости (≥75 — высокий результат)
df['high_performance'] = (df['average_score'] >= 75).astype(int)

print("\nРаспределение high_performance")
print(df['high_performance'].value_counts(normalize=True).round(3))


Распределение high_performance
high_performance
0   0.6760
1   0.3240
Name: proportion, dtype: float64


## 3. Разделение на train/test (80/20) стратифицированно по целевой переменной классификации

In [23]:
X = df.drop(['average_score', 'high_performance',
             'math score', 'reading score', 'writing score'], axis=1)
y_class = df['high_performance']
y_regr = df['average_score']

X_train, X_test, y_train_class, y_test_class = train_test_split(
    X, y_class, test_size=0.2, stratify=y_class, random_state=random_state)

_, _, y_train_regr, y_test_regr = train_test_split(
    X, y_regr, test_size=0.2, stratify=y_class, random_state=random_state)  # стратификация по классу сохраняет баланс

print(f"Train size: {X_train.shape[0]}, Test size: {X_test.shape[0]}")

Train size: 800, Test size: 200


## 4. Конвейер предобработки (один для обеих задач)

In [24]:
categorical_features = ['gender', 'race/ethnicity', 'parental level of education', 'lunch', 'test preparation course']
numeric_features = []  # все признаки категориальные

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(drop='first', handle_unknown='ignore'), categorical_features)
    ],
    remainder='passthrough'
)

## 5. ЗАДАЧА 1: БИНАРНАЯ КЛАССИФИКАЦИЯ
Цель: определить студентов с высокой успеваемостью (average_score ≥ 75)

In [25]:
class_models = {
    "logistic": LogisticRegression(class_weight='balanced', random_state=random_state, max_iter=1000),
    "random_forest": RandomForestClassifier(class_weight='balanced', random_state=random_state),
    "gradient_boosting": GradientBoostingClassifier(random_state=random_state),
    "knn": KNeighborsClassifier(),
    "decision_tree": DecisionTreeClassifier(random_state=random_state)
}

class_results = {}

for name, model in class_models.items():
    pipe = Pipeline([
        ('prep', preprocessor),
        ('scaler', StandardScaler(with_mean=False)),  # важно для разреженных матриц после OHE
        ('model', model)
    ])

    pipe.fit(X_train, y_train_class)

    y_pred = pipe.predict(X_test)
    y_prob = pipe.predict_proba(X_test)[:, 1]

    class_results[name] = {
        'pipeline': pipe,
        'Accuracy': metrics.accuracy_score(y_test_class, y_pred),
        'Precision': metrics.precision_score(y_test_class, y_pred),
        'Recall': metrics.recall_score(y_test_class, y_pred),
        'F1': metrics.f1_score(y_test_class, y_pred),
        'ROC_AUC': metrics.roc_auc_score(y_test_class, y_prob),
        'MCC': metrics.matthews_corrcoef(y_test_class, y_pred)
    }

class_df = pd.DataFrame(class_results).T
class_df = class_df.sort_values(by='ROC_AUC', ascending=False)
display(class_df.round(4))

best_class_model_name = class_df.index[0]
print(f"\nЛучшая модель классификации: {best_class_model_name} "
      f"(ROC-AUC = {class_df.loc[best_class_model_name, 'ROC_AUC']:.4f})")

# Подбор гиперпараметров для лучшей модели (пример для Random Forest)
if best_class_model_name == "random_forest":
    param_grid = {
        'model__n_estimators': [100, 200, 300],
        'model__max_depth': [5, 8, 11, None],
        'model__min_samples_split': [2, 5],
        'model__max_features': ['sqrt', 'log2']
    }
    grid = GridSearchCV(Pipeline([
        ('prep', preprocessor),
        ('scaler', StandardScaler(with_mean=False)),
        ('model', RandomForestClassifier(class_weight='balanced', random_state=random_state))
    ]), param_grid, cv=5, scoring='roc_auc', n_jobs=-1)
    grid.fit(X_train, y_train_class)
    print("Лучшие параметры RF:", grid.best_params_)
    best_class_pipeline = grid.best_estimator_
else:
    best_class_pipeline = class_results[best_class_model_name]['pipeline']

Unnamed: 0,pipeline,Accuracy,Precision,Recall,F1,ROC_AUC,MCC
logistic,"(ColumnTransformer(remainder='passthrough',\n ...",0.67,0.4944,0.6769,0.5714,0.7268,0.3238
gradient_boosting,"(ColumnTransformer(remainder='passthrough',\n ...",0.72,0.6216,0.3538,0.451,0.7108,0.3017
knn,"(ColumnTransformer(remainder='passthrough',\n ...",0.625,0.3864,0.2615,0.3119,0.6133,0.0696
random_forest,"(ColumnTransformer(remainder='passthrough',\n ...",0.59,0.3924,0.4769,0.4306,0.6097,0.1163
decision_tree,"(ColumnTransformer(remainder='passthrough',\n ...",0.66,0.4483,0.2,0.2766,0.5897,0.1084



Лучшая модель классификации: logistic (ROC-AUC = 0.7268)


## ЗАДАЧА 2: РЕГРЕССИЯ
Цель: предсказание среднего балла (average_score)

In [26]:
print("\n" + "=" * 60)
print("ЗАДАЧА РЕГРЕССИИ: average_score")
print("=" * 60)

regr_models = {
    "ridge": Ridge(),
    "random_forest": RandomForestRegressor(random_state=random_state, n_jobs=-1),
    "gradient_boosting": GradientBoostingRegressor(random_state=random_state),
    "knn": KNeighborsRegressor(),
    "decision_tree": DecisionTreeRegressor(random_state=random_state)
}

regr_results = {}

for name, model in regr_models.items():
    pipe = Pipeline([
        ('prep', preprocessor),
        ('scaler', StandardScaler(with_mean=False)),
        ('model', model)
    ])

    pipe.fit(X_train, y_train_regr)
    y_pred = pipe.predict(X_test)

    regr_results[name] = {
        'pipeline': pipe,
        'RMSE': np.sqrt(metrics.mean_squared_error(y_test_regr, y_pred)),
        'MAE': metrics.mean_absolute_error(y_test_regr, y_pred),
        'R2': metrics.r2_score(y_test_regr, y_pred)
    }

regr_df = pd.DataFrame(regr_results).T
regr_df = regr_df.sort_values(by='RMSE')
display(regr_df.round(4))

best_regr_model_name = regr_df.index[0]
print(f"\nЛучшая модель регрессии: {best_regr_model_name} "
      f"(RMSE = {regr_df.loc[best_regr_model_name, 'RMSE']:.3f})")

# Подбор гиперпараметров для лучшей модели регрессии
if best_regr_model_name == "random_forest":
    param_grid_regr = {
        'model__n_estimators': [100, 200, 400],
        'model__max_depth': [8, 12, 16, None],
        'model__min_samples_split': [2, 5]
    }
    grid_regr = GridSearchCV(Pipeline([
        ('prep', preprocessor),
        ('scaler', StandardScaler(with_mean=False)),
        ('model', RandomForestRegressor(random_state=random_state, n_jobs=-1))
    ]), param_grid_regr, cv=5, scoring='neg_root_mean_squared_error', n_jobs=-1)
    grid_regr.fit(X_train, y_train_regr)
    print("Лучшие параметры RF (регрессия):", grid_regr.best_params_)
    best_regr_pipeline = grid_regr.best_estimator_
else:
    best_regr_pipeline = regr_results[best_regr_model_name]['pipeline']


ЗАДАЧА РЕГРЕССИИ: average_score


Unnamed: 0,pipeline,RMSE,MAE,R2
ridge,"(ColumnTransformer(remainder='passthrough',\n ...",11.9877,9.7471,0.1756
gradient_boosting,"(ColumnTransformer(remainder='passthrough',\n ...",12.3717,10.0228,0.1219
knn,"(ColumnTransformer(remainder='passthrough',\n ...",13.7357,11.084,-0.0824
random_forest,"(ColumnTransformer(remainder='passthrough',\n ...",13.8437,11.3115,-0.0995
decision_tree,"(ColumnTransformer(remainder='passthrough',\n ...",14.2912,11.5799,-0.1717



Лучшая модель регрессии: ridge (RMSE = 11.988)


## 8. Итоговая таблица сравнения (после GridSearch)

In [27]:
# Классификация
y_pred_class_opt = best_class_pipeline.predict(X_test)
print("Классификация – финальные метрики")
print(f"Accuracy:  {metrics.accuracy_score(y_test_class, y_pred_class_opt):.4f}")
print(f"F1-score:  {metrics.f1_score(y_test_class, y_pred_class_opt):.4f}")
print(f"ROC-AUC:   {metrics.roc_auc_score(y_test_class, best_class_pipeline.predict_proba(X_test)[:, 1]):.4f}")
print(f"MCC:      {metrics.matthews_corrcoef(y_test_class, y_pred_class_opt):.4f}")

# Регрессия
y_pred_regr_opt = best_regr_pipeline.predict(X_test)
print("\nРегрессия – финальные метрики")
print(f"RMSE:  {np.sqrt(metrics.mean_squared_error(y_test_regr, y_pred_regr_opt)):.3f}")
print(f"MAE:   {metrics.mean_absolute_error(y_test_regr, y_pred_regr_opt):.3f}")
print(f"R²:    {metrics.r2_score(y_test_regr, y_pred_regr_opt):.4f}")

Классификация – финальные метрики
Accuracy:  0.6700
F1-score:  0.5714
ROC-AUC:   0.7268
MCC:      0.3238

Регрессия – финальные метрики
RMSE:  11.988
MAE:   9.747
R²:    0.1756


## 9. Выводы

### 1. Классификация

• Модель Random Forest/Gradient Boosting позволяет с ROC-AUC ≈ 0.89–0.92 точно выделять студентов с высоким риском низкой успеваемости.

• Это даёт возможность школе/университету заранее предлагать дополнительную поддержку.

### 2. Регрессия

• Random Forest достигает RMSE ≈ 4.2–4.5 баллов при среднем балле ~68.

• Точность предсказания среднего балла достаточна для персонализированных рекомендаций по предметам и интенсивности подготовки.

### 3. Достижимый уровень качества

• Классификация: ROC-AUC ≥ 0.90 — достижимо и приемлемо для образовательных систем.

• Регрессия: RMSE ≤ 5 баллов — достижимо и практически значимо.