# Лабораторная работа № 3

## 1. Определение бизнес-целей для набора данных

### Бизнес-цель 1

Определить, какие студенты относятся к группе высокого риска (низкая успеваемость) для своевременного вмешательства.

### Бизнес-цель 2

Прогнозировать точный средний балл студента для персонализированных рекомендаций по обучению.

## 2. Формулировка технических целей проекта

### Техническая цель для бизнес-цели 1

Бинарная классификация: high_performance = 1, если средний балл ≥ 75, иначе 0 (студенты группы риска из бизнес-цели 1 соответствуют классу 0)

### Техническая цель для бизнес-цели 2

Регрессия: предсказание average_score (среднее по трём предметам)

## 3. Подготовка и сбор данных

In [58]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

warnings.filterwarnings('ignore')
import dask

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler
from sklearn.metrics import mutual_info_score
import featuretools as ft

plt.rcParams['figure.figsize'] = (10, 6)

In [59]:
# Load the raw dataset and print a quick overview of its contents.
df = pd.read_csv('../data/students_performance.csv')

print(df.head())
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() (that would emit a stray "None" line).
df.info()
print(df.describe())

# Regression target: mean of the three subject scores.
df['average_score'] = df[['math score', 'reading score', 'writing score']].mean(axis=1)
# Classification target: 1 when the average score is at least 75, otherwise 0.
df['high_performance'] = (df['average_score'] >= 75).astype(int)

print("Распределение классов high_performance:")
print(df['high_performance'].value_counts(normalize=True))

   gender race/ethnicity parental level of education         lunch  \
0  female        group B           bachelor's degree      standard   
1  female        group C                some college      standard   
2  female        group B             master's degree      standard   
3    male        group A          associate's degree  free/reduced   
4    male        group C                some college      standard   

  test preparation course  math score  reading score  writing score  
0                    none          72             72             74  
1               completed          69             90             88  
2                    none          90             95             93  
3                    none          47             57             44  
4                    none          76             78             75  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column                       Non-Null Count  Dtyp

## 4. Разбиение данных на обучающую, контрольную и тестовую выборки

In [60]:
# Two-stage stratified split on the classification target: 20% of all rows go
# to the test set, then 20% of the remainder goes to the validation set.
split_kwargs = dict(test_size=0.2, random_state=42)

train_val, test = train_test_split(
    df, stratify=df['high_performance'], **split_kwargs
)
train, val = train_test_split(
    train_val, stratify=train_val['high_performance'], **split_kwargs
)

print(f"Train: {train.shape}, Val: {val.shape}, Test: {test.shape}")

Train: (640, 10), Val: (160, 10), Test: (200, 10)


## 5. Анализ сбалансированности и аугментация данных

In [61]:
# Print the relative class frequencies of the classification target for the
# train and validation splits.
for heading, split in (("Train классы:", train), ("\nVal классы:", val)):
    print(heading)
    print(split['high_performance'].value_counts(normalize=True))

Train классы:
high_performance
0    0.676562
1    0.323437
Name: proportion, dtype: float64

Val классы:
high_performance
0    0.675
1    0.325
Name: proportion, dtype: float64


## 6-7. Конструирование признаков для решения бизнес-задач

### Ручное конструирование

In [62]:
# Column groups shared by the fitting (train) and transforming (val/test) paths.
_CAT_COLUMNS = ['gender', 'race/ethnicity', 'parental level of education',
                'lunch', 'test preparation course']
_SCORE_COLUMNS = ['math score', 'reading score', 'writing score']
_NUM_COLUMNS = _SCORE_COLUMNS + ['average_score', 'total_score']

# Ordinal encoding of the parental education level.
_EDU_ORDER = {
    "some high school": 0,
    "high school": 1,
    "some college": 2,
    "associate's degree": 3,
    "bachelor's degree": 4,
    "master's degree": 5,
}


def fit_bin_edges(df_in, n_bins=5):
    """Learn discretisation edges from a frame (normally the training split).

    Args:
        df_in: frame containing 'average_score' and the three subject scores.
        n_bins: number of bins for every discretised column.

    Returns:
        dict mapping a column name to its array of ``n_bins + 1`` edges:
        quantile edges for 'average_score', equal-width edges for each
        subject score.
    """
    edges = {}
    _, edges['average_score'] = pd.qcut(
        df_in['average_score'], q=n_bins, labels=False, retbins=True
    )
    for col in _SCORE_COLUMNS:
        _, edges[col] = pd.cut(df_in[col], bins=n_bins, labels=False, retbins=True)
    return edges


def _apply_bin_edges(df, bin_edges):
    """Add 'avg_score_qcut' and '<score>_bin' columns to df in place."""
    targets = [('average_score', 'avg_score_qcut')]
    targets += [(col, f'{col}_bin') for col in _SCORE_COLUMNS]
    for col, out_col in targets:
        edges = np.array(bin_edges[col], dtype=float)
        # Open the outer edges so values outside the range seen while fitting
        # fall into the first/last bin instead of becoming NaN. Rows of the
        # frame the edges were learned on keep exactly the same bin index.
        edges[0], edges[-1] = -np.inf, np.inf
        df[out_col] = pd.cut(df[col], bins=edges, labels=False)


def _encode_categoricals(df, ohe):
    """Return df with the one-hot columns of an already fitted encoder appended."""
    encoded = ohe.transform(df[_CAT_COLUMNS])
    encoded_df = pd.DataFrame(encoded, columns=ohe.get_feature_names_out(), index=df.index)
    return pd.concat([df, encoded_df], axis=1)


def _add_manual_features(df):
    """Add the hand-crafted score and background features to df in place."""
    df['reading_writing_diff'] = df['reading score'] - df['writing score']
    df['math_reading_gap'] = df['math score'] - df['reading score']
    df['total_score'] = df['math score'] + df['reading score'] + df['writing score']
    df['has_completed_prep'] = (df['test preparation course'] == 'completed').astype(int)
    df['standard_lunch'] = (df['lunch'] == 'standard').astype(int)
    # Values missing from _EDU_ORDER map to NaN.
    df['parent_edu_level'] = df['parental level of education'].map(_EDU_ORDER)


def _add_scaled_columns(df, scaler_std, scaler_minmax):
    """Add '<col>_std' and '<col>_norm' columns using already fitted scalers."""
    df[[f'{col}_std' for col in _NUM_COLUMNS]] = scaler_std.transform(df[_NUM_COLUMNS])
    df[[f'{col}_norm' for col in _NUM_COLUMNS]] = scaler_minmax.transform(df[_NUM_COLUMNS])


def create_features(df_in):
    """Fit all feature transformers on df_in and return the engineered frame.

    Steps: one-hot encoding of the categorical columns, discretisation of the
    scores, hand-crafted features, then standard and min-max scaling.

    Args:
        df_in: training frame; it is copied, not modified.

    Returns:
        Tuple ``(df, ohe, scaler_std, scaler_minmax)``: the engineered frame
        and the fitted encoder/scalers to reuse on other splits.
    """
    df = df_in.copy()

    # 1. One-hot encoding (first level of every category is dropped).
    ohe = OneHotEncoder(sparse_output=False, drop='first')
    ohe.fit(df[_CAT_COLUMNS])
    df = _encode_categoricals(df, ohe)

    # 2. Discretisation: quantile bins for the average, equal-width bins for
    #    each subject score.
    _apply_bin_edges(df, fit_bin_edges(df))

    # 3. Hand-crafted features.
    _add_manual_features(df)

    # 4. Scaling (the scalers need 'total_score', so this comes last).
    scaler_std = StandardScaler().fit(df[_NUM_COLUMNS])
    scaler_minmax = MinMaxScaler().fit(df[_NUM_COLUMNS])
    _add_scaled_columns(df, scaler_std, scaler_minmax)

    return df, ohe, scaler_std, scaler_minmax


train_eng, ohe, scaler_std, scaler_minmax = create_features(train)
# Bin edges are learned on train only and reused for val/test so that a given
# bin index refers to the same score range in every split.
bin_edges = fit_bin_edges(train)


def transform_features(df_in, ohe, scaler_std, scaler_minmax, bin_edges=None):
    """Apply already fitted transformers to a validation/test frame.

    Args:
        df_in: frame to transform; it is copied, not modified.
        ohe: fitted one-hot encoder returned by create_features.
        scaler_std: fitted standard scaler returned by create_features.
        scaler_minmax: fitted min-max scaler returned by create_features.
        bin_edges: edges from fit_bin_edges computed on the training split.
            When None, edges are derived from df_in itself (the previous
            behaviour), which makes bin indices incomparable between splits.

    Returns:
        The engineered copy of df_in with the same columns create_features adds.
    """
    df = df_in.copy()
    df = _encode_categoricals(df, ohe)

    if bin_edges is None:
        bin_edges = fit_bin_edges(df)
    _apply_bin_edges(df, bin_edges)

    _add_manual_features(df)
    _add_scaled_columns(df, scaler_std, scaler_minmax)

    return df


val_eng = transform_features(val, ohe, scaler_std, scaler_minmax, bin_edges=bin_edges)
print(val_eng.head())
test_eng = transform_features(test, ohe, scaler_std, scaler_minmax, bin_edges=bin_edges)
print(test_eng.head())

     gender race/ethnicity parental level of education         lunch  \
795  female        group E          associate's degree  free/reduced   
511    male        group A            some high school      standard   
61     male        group A            some high school  free/reduced   
189  female        group C           bachelor's degree      standard   
942    male        group C                 high school      standard   

    test preparation course  math score  reading score  writing score  \
795               completed          57             68             73   
511                    none          64             50             43   
61                     none          39             39             34   
189                    none          77             88             87   
942                    none          81             66             64   

     average_score  high_performance  ...  math score_std  reading score_std  \
795      66.000000                 0  ...       

## 8. Автоматизированное конструирование признаков с помощью Featuretools

In [63]:
import featuretools as ft

# String-valued columns that are registered as categoricals in the EntitySet.
categorical_columns = [
    "gender",
    "race/ethnicity",
    "parental level of education",
    "lunch",
    "test preparation course",
]

# Work on a copy of train with a fresh 0..n-1 index; the "student_id" index
# column is generated by featuretools (make_index=True).
students_frame = train.copy().reset_index(drop=True)

es = ft.EntitySet(id='students_performance')
es = es.add_dataframe(
    dataframe_name="students",
    dataframe=students_frame,
    index="student_id",
    make_index=True,
    logical_types={name: "Categorical" for name in categorical_columns},
)

# NOTE(review): train still contains the targets average_score and
# high_performance, so DFS combines them with the subject scores; consider
# excluding them (e.g. via ignore_columns) before modelling on feature_matrix.
# NOTE(review): with a single dataframe there is presumably nothing for the
# aggregation primitives to aggregate over - confirm they are needed.
dfs_options = dict(
    max_depth=2,
    agg_primitives=["mean", "std", "max", "min", "sum", "count"],
    trans_primitives=["add_numeric", "multiply_numeric"],
    n_jobs=1,
    dask_kwargs=None,
    verbose=True,
)
feature_matrix, feature_defs = ft.dfs(
    entityset=es,
    target_dataframe_name="students",
    **dfs_options,
)

print(f"Успешно сгенерировано признаков: {len(feature_defs)}")
print(f"Размер матрицы: {feature_matrix.shape}")
print(feature_defs)

Built 30 features
Elapsed: 00:00 | Progress: 100%|██████████
Успешно сгенерировано признаков: 30
Размер матрицы: (640, 30)
[<Feature: gender>, <Feature: race/ethnicity>, <Feature: parental level of education>, <Feature: lunch>, <Feature: test preparation course>, <Feature: math score>, <Feature: reading score>, <Feature: writing score>, <Feature: average_score>, <Feature: high_performance>, <Feature: average_score + high_performance>, <Feature: average_score + math score>, <Feature: average_score + reading score>, <Feature: average_score + writing score>, <Feature: high_performance + math score>, <Feature: high_performance + reading score>, <Feature: high_performance + writing score>, <Feature: math score + reading score>, <Feature: math score + writing score>, <Feature: reading score + writing score>, <Feature: average_score * high_performance>, <Feature: average_score * math score>, <Feature: average_score * reading score>, <Feature: average_score * writing score>, <Feature: high_per

## 9. Оценка качества наборов признаков

In [64]:
from sklearn.feature_selection import mutual_info_classif

# Columns removed from the feature matrix: both targets and the raw string
# categoricals (their one-hot encoded versions stay in).
target_columns = ['high_performance', 'average_score']
raw_categorical_columns = ['gender', 'race/ethnicity', 'parental level of education',
                           'lunch', 'test preparation course']

X_train = train_eng.drop(columns=target_columns + raw_categorical_columns)
y_train_class = train_eng['high_performance']

# NOTE(review): X_train still holds total_score, average_score_std/_norm and
# avg_score_qcut, all computed from the same scores as average_score, which
# defines high_performance - their MI with the target is inflated by that.
numeric_features = X_train.select_dtypes(include=np.number)
mi_scores = mutual_info_classif(numeric_features, y_train_class, random_state=42)

mi_df = pd.DataFrame({'feature': numeric_features.columns, 'mi_score': mi_scores})
mi_df = mi_df.sort_values('mi_score', ascending=False)

print("Топ-15 признаков по Mutual Information:")
print(mi_df.head(15))

Топ-15 признаков по Mutual Information:
               feature  mi_score
33  average_score_norm  0.630215
29     total_score_std  0.630215
28   average_score_std  0.630215
21         total_score  0.630215
34    total_score_norm  0.630215
31  reading score_norm  0.493064
1        reading score  0.489454
15      avg_score_qcut  0.487891
26   reading score_std  0.469444
32  writing score_norm  0.444514
27   writing score_std  0.432371
2        writing score  0.419680
30     math score_norm  0.409883
25      math score_std  0.408816
0           math score  0.401351
