In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt

In [None]:
# Read the raw exam results from the CSV file that sits next to the notebook.
DATA_PATH = 'jamb_exam_results.csv'
df = pd.read_csv(DATA_PATH)

# Show the first rows as the cell output for a quick sanity check.
df.head()

In [None]:
# Tidy the frame in a single chain:
#   1. normalise column names to lower snake_case,
#   2. drop the identifier column so it is not used as a feature,
#   3. replace every missing value with 0.
df = (
    df.rename(columns=lambda name: name.lower().replace(' ', '_'))
    .drop(columns='student_id')
    .fillna(0)
)

Разбиение выборки на признаки и целевую переменную

In [None]:
# Separate the regression target from the explanatory columns.
target_column = 'jamb_score'
y = df[target_column]
X = df.drop(columns=[target_column])

Разделение выборки на train/validation/test 60/20/20

In [None]:
# Step 1: hold out 20% of all rows as the test set.
outer_split = train_test_split(X, y, test_size=0.2, random_state=1)
X_train_full, X_test, y_train_full, y_test = outer_split

# Step 2: take 25% of the remaining 80% as validation (0.25 * 0.8 = 0.2),
# which leaves 60% of the original rows for training.
inner_split = train_test_split(
    X_train_full, y_train_full, test_size=0.25, random_state=1
)
X_train, X_val, y_train, y_val = inner_split

Преобразование датафреймов в матрицы с помощью DictVectorizer

In [None]:

# Turn each split into a list of {column: value} records, the input format
# expected by DictVectorizer.
train_dict, val_dict, test_dict = (
    part.to_dict(orient='records') for part in (X_train, X_val, X_test)
)

# Fit the vectorizer on the training records only, then reuse the learned
# vocabulary to encode the validation and test records.
dv = DictVectorizer(sparse=True)
X_train_encoded = dv.fit_transform(train_dict)
X_val_encoded, X_test_encoded = (
    dv.transform(records) for records in (val_dict, test_dict)
)


In [None]:
# Question 1: a depth-1 tree makes at most one split, so the feature stored
# for its first node is the one chosen to split the data.
dt = DecisionTreeRegressor(max_depth=1, random_state=1).fit(X_train_encoded, y_train)

# Map the split's column index back to the vectorizer's feature name.
feature_names = dv.get_feature_names_out()
root_column_index = dt.tree_.feature[0]
root_feature = feature_names[root_column_index]

print(
    "Вопрос 1: Какой признак используется для разбиения данных?",
    f"Ответ: {root_feature}",
    sep="\n",
)


In [None]:
# Question 2: validation RMSE of a 10-tree random forest with a fixed seed.
rf = RandomForestRegressor(n_estimators=10, random_state=1, n_jobs=-1).fit(
    X_train_encoded, y_train
)
y_pred = rf.predict(X_val_encoded)

# RMSE is the square root of the mean squared error.
validation_mse = mean_squared_error(y_val, y_pred)
rmse = np.sqrt(validation_mse)

print(
    "Вопрос 2: Какое значение RMSE у этой модели на валидационных данных?",
    f"Ответ: {rmse:.2f}",
    sep="\n",
)

In [None]:
# Question 3: sweep the forest size from 10 to 200 trees (step 10) and record
# the validation RMSE obtained for every size.
n_estimators_list = list(range(10, 201, 10))
rmse_scores = []

for n_trees in n_estimators_list:
    rf = RandomForestRegressor(n_estimators=n_trees, random_state=1, n_jobs=-1).fit(
        X_train_encoded, y_train
    )
    y_pred = rf.predict(X_val_encoded)
    rmse = np.sqrt(mean_squared_error(y_val, y_pred))
    rmse_scores.append(rmse)

# Find the point after which RMSE stops improving: walk the curve in order and
# remember the last forest size that beat the best score so far by more than
# the improvement threshold.
improvement_threshold = 0.001  # minimum RMSE gain that counts as an improvement
best_rmse = float('inf')
best_n_estimators = 0

for candidate_size, candidate_rmse in zip(n_estimators_list, rmse_scores):
    if candidate_rmse < best_rmse - improvement_threshold:
        best_rmse, best_n_estimators = candidate_rmse, candidate_size

print(
    "Вопрос 3: После какого значения n_estimators RMSE перестает улучшаться?",
    f"Ответ: {best_n_estimators}",
    sep="\n",
)

In [None]:
# Question 4: for every candidate depth, train forests of 10..200 trees
# (step 10) and average their validation RMSE; the depth with the lowest
# average wins.
max_depth_list = [10, 15, 20, 25]
depth_rmse_scores = []

for depth in max_depth_list:
    rmse_scores_depth = []
    for n_trees in range(10, 201, 10):
        forest_params = dict(
            n_estimators=n_trees,
            max_depth=depth,
            random_state=1,
            n_jobs=-1,
        )
        rf = RandomForestRegressor(**forest_params).fit(X_train_encoded, y_train)
        y_pred = rf.predict(X_val_encoded)
        rmse = np.sqrt(mean_squared_error(y_val, y_pred))
        rmse_scores_depth.append(rmse)

    # Average over all forest sizes for this depth.
    mean_rmse = np.mean(rmse_scores_depth)
    depth_rmse_scores.append(mean_rmse)
    print(f"max_depth={depth}, средний RMSE: {mean_rmse:.4f}")

# Position of the smallest mean RMSE (the first one wins on ties).
best_depth_idx = np.asarray(depth_rmse_scores).argmin()
best_depth = max_depth_list[best_depth_idx]

print(
    "\nВопрос 4: Какое значение max_depth оказалось лучшим по среднему RMSE?",
    f"Ответ: {best_depth}",
    sep="\n",
)


In [None]:
# Question 5: fit a 10-tree forest limited to depth 20 and compare the
# importance of four candidate features.
rf_final = RandomForestRegressor(
    n_estimators=10,
    max_depth=20,
    random_state=1,
    n_jobs=-1
)
rf_final.fit(X_train_encoded, y_train)

# One importance value per encoded column, keyed by the vectorizer's name.
feature_importances = rf_final.feature_importances_
feature_importance_dict = dict(zip(feature_names, feature_importances))

target_features = ['study_hours_per_week', 'attendance_rate', 'distance_to_school', 'teacher_quality']
most_important_feature = None
max_importance = -1

# DictVectorizer keeps a numeric column under its own name and expands a
# categorical one into "<column><separator><value>" columns. Match on the exact
# name or on that prefix (never on a bare substring, which could hit an
# unrelated column) and sum over all matching columns so a one-hot-encoded
# feature is credited with its full importance.
one_hot_separator = dv.separator

for feature in target_features:
    one_hot_prefix = feature + one_hot_separator
    matching_columns = [
        column for column in feature_importance_dict
        if column == feature or column.startswith(one_hot_prefix)
    ]
    if not matching_columns:
        # The feature is absent from the encoded matrix; nothing to compare.
        continue

    importance = sum(feature_importance_dict[column] for column in matching_columns)
    if importance > max_importance:
        max_importance = importance
        most_important_feature = feature

print("Вопрос 5: Какой признак оказался самым важным?")
print(f"Ответ: {most_important_feature}")

In [None]:
# Recap of all five answers.
#
# `rmse` cannot be used for Question 2 here: the n_estimators and max_depth
# tuning loops in the earlier cells reassign it, so by now it holds the score
# of the last forest trained in those loops. The Question 2 model
# (n_estimators=10, random_state=1) is the first model of the n_estimators
# sweep, so its validation RMSE is read back from `rmse_scores`.
rmse_question_2 = rmse_scores[n_estimators_list.index(10)]

print("Ответы на вопросы:")
print(f"Вопрос 1: {root_feature}")
print(f"Вопрос 2: {rmse_question_2:.2f}")
print(f"Вопрос 3: {best_n_estimators}")
print(f"Вопрос 4: {best_depth}")
print(f"Вопрос 5: {most_important_feature}")