In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error


In [3]:
# Load the JAMB exam results CSV (path is relative to the notebook's working directory).
df = pd.read_csv(filepath_or_buffer='jamb_exam_results.csv')

In [4]:
# Normalize column names: lowercase them, then replace spaces with underscores.
lowercase_columns = df.columns.str.lower()
df.columns = lowercase_columns.str.replace(' ', '_')

In [5]:
# Preview the first five rows to check the renamed columns.
df.head(n=5)

Unnamed: 0,jamb_score,study_hours_per_week,attendance_rate,teacher_quality,distance_to_school,school_type,school_location,extra_tutorials,access_to_learning_materials,parent_involvement,it_knowledge,student_id,age,gender,socioeconomic_status,parent_education_level,assignments_completed
0,192,22,78,4,12.4,Public,Urban,Yes,Yes,High,Medium,1,17,Male,Low,Tertiary,2
1,207,14,88,4,2.7,Public,Rural,No,Yes,High,High,2,15,Male,High,,1
2,182,29,87,2,9.6,Public,Rural,Yes,Yes,High,Medium,3,20,Female,High,Tertiary,2
3,210,29,99,2,2.6,Public,Urban,No,Yes,Medium,High,4,22,Female,Medium,Tertiary,1
4,199,12,98,3,8.8,Public,Urban,No,Yes,Medium,Medium,5,22,Female,Medium,Tertiary,1


In [6]:
# Drop the student_id column.
df = df.drop(columns=['student_id'])

In [7]:
# Replace every missing value with 0.
# NOTE: this applies to text columns as well (the preview shows a blank
# parent_education_level), so such columns end up holding both strings
# and the number 0.
df = df.fillna(value=0)

In [8]:
# Separate the target column (jamb_score) from the feature columns.
y = df['jamb_score']
X = df.drop(columns=['jamb_score'])

In [9]:
# Split the data into train / validation / test sets.
# Step 1: hold out 20% of all rows as the test set.
X_train_full, X_test, y_train_full, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
# Step 2: take 25% of the remaining 80% as the validation set
# (0.25 * 0.8 = 20% of all rows), which leaves 60% for training.
X_train, X_val, y_train, y_val = train_test_split(X_train_full, y_train_full, test_size=0.25, random_state=1)

In [10]:
# Turn the feature frames into matrices with DictVectorizer.
# The vectorizer is fitted on the training records only; the validation and
# test records are transformed with that already-fitted vectorizer.
dv = DictVectorizer(sparse=True)

train_dict = X_train.to_dict(orient='records')
X_train_encoded = dv.fit_transform(train_dict)

val_dict = X_val.to_dict(orient='records')
X_val_encoded = dv.transform(val_dict)

test_dict = X_test.to_dict(orient='records')
X_test_encoded = dv.transform(test_dict)

In [11]:
# Question 1
# Fit a depth-1 regression tree (at most one split) and report which
# feature the root node splits on.
dt = DecisionTreeRegressor(max_depth=1, random_state=1)
dt.fit(X_train_encoded, y_train)

# Column names of the encoded matrix, in the same order as its columns.
feature_names = dv.get_feature_names_out()

# Node 0 is the root. If the tree could not split at all, the root is a leaf
# and its entry in tree_.feature is a placeholder rather than a real column
# index; indexing feature_names with it would silently report an unrelated
# feature. A node is a leaf when its left and right child ids are equal.
root_is_leaf = dt.tree_.children_left[0] == dt.tree_.children_right[0]
feature_used = None if root_is_leaf else feature_names[dt.tree_.feature[0]]

print("Вопрос 1: Признак, используемый для разбиения:", feature_used)

Вопрос 1: Признак, используемый для разбиения: study_hours_per_week


In [16]:
# Question 2
# Train a random forest with 10 trees and measure its RMSE on the validation set.
rf = RandomForestRegressor(n_estimators=10, random_state=1, n_jobs=-1).fit(X_train_encoded, y_train)
y_pred = rf.predict(X_val_encoded)
mse = mean_squared_error(y_val, y_pred)
rmse = np.sqrt(mse)
print(f"Вопрос 2: RMSE = {rmse:.5f}")

Вопрос 2: RMSE = 42.13724


In [18]:
# Question 3
# Sweep n_estimators from 10 to 200 in steps of 10, record the validation
# RMSE of each forest, and report the setting with the lowest RMSE.
n_estimators_values = range(10, 201, 10)
rmse_by_n = {}

for n in n_estimators_values:
    rf = RandomForestRegressor(n_estimators=n, random_state=1, n_jobs=-1).fit(X_train_encoded, y_train)
    y_pred = rf.predict(X_val_encoded)
    rmse = np.sqrt(mean_squared_error(y_val, y_pred))
    rmse_by_n[n] = rmse
    print(f"n_estimators={n}: RMSE={rmse:.3f}")

# min() returns the first key with the smallest value, so on a tie the
# smaller n_estimators wins.
best_n = min(rmse_by_n, key=rmse_by_n.get)
best_rmse = rmse_by_n[best_n]

print(f"Вопрос 3: После какого значения n_estimators RMSE перестает улучшаться?= {best_n}")

n_estimators=10: RMSE=42.137
n_estimators=20: RMSE=41.461
n_estimators=30: RMSE=41.106
n_estimators=40: RMSE=40.917
n_estimators=50: RMSE=40.852
n_estimators=60: RMSE=40.784
n_estimators=70: RMSE=40.677
n_estimators=80: RMSE=40.539
n_estimators=90: RMSE=40.504
n_estimators=100: RMSE=40.517
n_estimators=110: RMSE=40.593
n_estimators=120: RMSE=40.625
n_estimators=130: RMSE=40.651
n_estimators=140: RMSE=40.595
n_estimators=150: RMSE=40.597
n_estimators=160: RMSE=40.604
n_estimators=170: RMSE=40.628
n_estimators=180: RMSE=40.641
n_estimators=190: RMSE=40.631
n_estimators=200: RMSE=40.601
Вопрос 3: После какого значения n_estimators RMSE перестает улучшаться?= 90


In [25]:
# Question 4
# For each candidate max_depth, average the validation RMSE over every
# n_estimators setting and keep the depth with the lowest average.
# Relies on n_estimators_values defined in the Question 3 cell.
max_depth_values = [10, 15, 20, 25]
avg_rmse_by_depth = {}

for depth in max_depth_values:
    rmse_scores = []
    for n in n_estimators_values:
        rf = RandomForestRegressor(
            n_estimators=n, max_depth=depth, random_state=1, n_jobs=-1
        ).fit(X_train_encoded, y_train)
        y_pred = rf.predict(X_val_encoded)
        rmse = np.sqrt(mean_squared_error(y_val, y_pred))
        rmse_scores.append(rmse)

    avg_rmse = np.mean(rmse_scores)
    avg_rmse_by_depth[depth] = avg_rmse
    print(f"max_depth={depth}: Средний RMSE={avg_rmse:.3f}")

# min() returns the first key with the smallest value, so on a tie the
# shallower depth wins.
best_depth = min(avg_rmse_by_depth, key=avg_rmse_by_depth.get)
best_avg_rmse = avg_rmse_by_depth[best_depth]

print(f"Вопрос 4: Лучший max_depth = {best_depth}")


max_depth=10: Средний RMSE=40.392
max_depth=15: Средний RMSE=40.735
max_depth=20: Средний RMSE=40.740
max_depth=25: Средний RMSE=40.788
Вопрос 4: Лучший max_depth = 10


In [26]:
# Question 5
# Train a forest with n_estimators=10 and max_depth=20, then compare the
# importances of four candidate features.
# Relies on feature_names defined in the Question 1 cell.
rf_final = RandomForestRegressor(
    n_estimators=10, max_depth=20, random_state=1, n_jobs=-1
).fit(X_train_encoded, y_train)

feature_importances = rf_final.feature_importances_
feature_importance_dict = dict(zip(feature_names, feature_importances))

target_features = [
    'study_hours_per_week',
    'attendance_rate',
    'distance_to_school',
    'teacher_quality',
]

# Keep only the candidates that are present among the encoded feature names,
# in the order they are listed above.
candidate_importances = {
    name: feature_importance_dict[name]
    for name in target_features
    if name in feature_importance_dict
}
for feature, importance in candidate_importances.items():
    print(f"{feature}: {importance:.4f}")

# A candidate is selected only if its importance is strictly positive;
# on a tie the first listed candidate wins.
most_important_feature = None
highest_importance = 0
if candidate_importances:
    top_feature = max(candidate_importances, key=candidate_importances.get)
    if candidate_importances[top_feature] > 0:
        most_important_feature = top_feature
        highest_importance = candidate_importances[top_feature]

print(f"Вопрос 5: Самый важный признак: {most_important_feature}")

study_hours_per_week: 0.2484
attendance_rate: 0.1497
distance_to_school: 0.1365
teacher_quality: 0.0827
Вопрос 5: Самый важный признак: study_hours_per_week


In [27]:
# Summary of the answers. These are fixed strings written by hand, not
# values read from the variables computed in the cells above.
answers = [
    "Ответы на вопросы",
    "Вопрос 1: study_hours_per_week",
    "Вопрос 2: 42.13",
    "Вопрос 3: 90 (ближайший из предложенных ответов - 80)",
    "Вопрос 4: 10",
    "Вопрос 5: study_hours_per_week",
]
print("\n".join(answers))


Ответы на вопросы
Вопрос 1: study_hours_per_week
Вопрос 2: 42.13
Вопрос 3: 90 (ближайший из предложенных ответов - 80)
Вопрос 4: 10
Вопрос 5: study_hours_per_week
