In [5]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor

# Reload the raw CSV and tidy it in one chained pass:
# snake_case headers, no identifier column, missing values replaced by 0.
file_path = 'jamb_exam_results.csv'
df = (
    pd.read_csv(file_path)
    .rename(columns=lambda name: name.lower().replace(' ', '_'))
    .drop(columns=['student_id'])
    .fillna(0)
)

# Target vector and feature frame
y = df['jamb_score']
X = df.drop(columns=['jamb_score'])

# 60% / 20% / 20% train / validation / test:
# hold out 40% first, then cut that hold-out in half.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.4, random_state=1
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=1
)

# Encode the row dicts as sparse matrices; the vectorizer is fitted on the
# training rows only and then reused for the validation and test rows.
dv = DictVectorizer(sparse=True)
X_train = dv.fit_transform(X_train.to_dict(orient='records'))
X_val, X_test = (
    dv.transform(frame.to_dict(orient='records')) for frame in (X_val, X_test)
)

df.head()


Unnamed: 0,jamb_score,study_hours_per_week,attendance_rate,teacher_quality,distance_to_school,school_type,school_location,extra_tutorials,access_to_learning_materials,parent_involvement,it_knowledge,age,gender,socioeconomic_status,parent_education_level,assignments_completed
0,192,22,78,4,12.4,Public,Urban,Yes,Yes,High,Medium,17,Male,Low,Tertiary,2
1,207,14,88,4,2.7,Public,Rural,No,Yes,High,High,15,Male,High,0,1
2,182,29,87,2,9.6,Public,Rural,Yes,Yes,High,Medium,20,Female,High,Tertiary,2
3,210,29,99,2,2.6,Public,Urban,No,Yes,Medium,High,22,Female,Medium,Tertiary,1
4,199,12,98,3,8.8,Public,Urban,No,Yes,Medium,Medium,22,Female,Medium,Tertiary,1


In [8]:
# Question 1: a depth-1 tree makes a single split at its root (node 0);
# map that node's feature index back to the DictVectorizer column name.
model_tree = DecisionTreeRegressor(max_depth=1, random_state=1).fit(X_train, y_train)
feature_index = model_tree.tree_.feature[0]
split_feature = dv.get_feature_names_out()[feature_index]

# Question 2: validation RMSE of a 10-tree random forest
model_rf = RandomForestRegressor(
    n_estimators=10, random_state=1, n_jobs=-1
).fit(X_train, y_train)
y_pred_val = model_rf.predict(X_val)
rmse = np.sqrt(mean_squared_error(y_val, y_pred_val))

split_feature, rmse

('study_hours_per_week', np.float64(43.157758977963624))

In [None]:
# Question 3: effect of n_estimators on the validation RMSE

n_estimators_range = range(10, 201, 10)
rmse_scores = []

for n in n_estimators_range:
    model_rf = RandomForestRegressor(n_estimators=n, random_state=1, n_jobs=-1)
    model_rf.fit(X_train, y_train)
    y_pred_val = model_rf.predict(X_val)
    rmse = np.sqrt(mean_squared_error(y_val, y_pred_val))
    rmse_scores.append(rmse)

# Find the n_estimators value after which RMSE stops improving.
# Scores are compared at 3 decimals (the precision reported below), so gains
# smaller than 0.001 do not count as an improvement. np.argmin returns the
# first occurrence of the minimum, i.e. the smallest forest that reaches the
# plateau, instead of whichever later forest is lower by a negligible margin.
# rmse_scores itself keeps the full-precision values.
min_rmse_index = np.argmin(np.round(rmse_scores, 3))
n_estimators_best = n_estimators_range[min_rmse_index]

print(f"Лучший RMSE для n_estimators: {n_estimators_best}, RMSE: {rmse_scores[min_rmse_index]:.3f}")


In [None]:
# Question 4: choose max_depth by its mean validation RMSE over every forest size
# (n_estimators_range is the range built in the Question 3 cell)

max_depth_values = [10, 15, 20, 25]
avg_rmse_by_depth = {}

for depth in max_depth_values:
    rmse_values = []
    for n in n_estimators_range:
        model_rf = RandomForestRegressor(
            n_estimators=n, max_depth=depth, random_state=1, n_jobs=-1
        ).fit(X_train, y_train)
        y_pred_val = model_rf.predict(X_val)
        rmse = np.sqrt(mean_squared_error(y_val, y_pred_val))
        rmse_values.append(rmse)
    # Average over all n_estimators settings tried at this depth
    avg_rmse = avg_rmse_by_depth[depth] = np.mean(rmse_values)
    print(f"max_depth={depth}, средний RMSE: {avg_rmse:.3f}")

# The depth with the lowest mean RMSE wins (the earliest one on ties)
best_depth = min(max_depth_values, key=lambda d: avg_rmse_by_depth[d])
print(f"Лучшее значение max_depth: {best_depth}, средний RMSE: {avg_rmse_by_depth[best_depth]:.3f}")


In [None]:
from sklearn.ensemble import RandomForestRegressor

# Fit a 10-tree forest capped at depth 20 and read its feature importances
model_rf_importance = RandomForestRegressor(
    n_estimators=10, max_depth=20, random_state=1, n_jobs=-1
).fit(X_train, y_train)

# One importance per column, looked up below by position in the DictVectorizer names
importances = model_rf_importance.feature_importances_
feature_names = dv.get_feature_names_out()

# Importance of each feature of interest, found via its column position
target_features = ['study_hours_per_week', 'attendance_rate', 'distance_to_school', 'teacher_quality']
feature_positions = feature_names.tolist()
target_importances = {
    feature: importances[feature_positions.index(feature)]
    for feature in target_features
}

# The feature of interest with the largest importance (the earliest one on ties)
most_important_feature = max(target_features, key=lambda name: target_importances[name])

print("Важность признаков:", target_importances)
print(f"Самый важный признак: {most_important_feature}")


In [None]:
import xgboost as xgb
from sklearn.metrics import mean_squared_error

# Wrap the train and validation matrices, with their labels, as XGBoost DMatrix objects
dtrain, dval = (
    xgb.DMatrix(features, label=labels)
    for features, labels in ((X_train, y_train), (X_val, y_val))
)
# Datasets handed to xgb.train as `evals`, each paired with its display name
watchlist = [(dtrain, 'train'), (dval, 'eval')]

# Train an XGBoost model for a given learning rate and return its validation RMSE
def train_xgb(eta_value, num_boost_round=100):
    """Train an XGBoost regressor and return its RMSE on the validation set.

    Relies on the notebook-level ``dtrain``, ``dval``, ``watchlist`` and
    ``y_val`` objects.

    Args:
        eta_value: Learning rate, passed to XGBoost as ``eta``.
        num_boost_round: Number of boosting rounds (default 100).

    Returns:
        RMSE between ``y_val`` and the model's predictions for ``dval``.
    """
    xgb_params = {
        'eta': eta_value,
        'max_depth': 6,
        'min_child_weight': 1,
        'objective': 'reg:squarederror',
        'nthread': 8,
        'seed': 1,
        'verbosity': 1
    }
    # verbose_eval=False keeps the per-round watchlist metrics out of the cell output
    model_xgb = xgb.train(
        xgb_params,
        dtrain,
        num_boost_round=num_boost_round,
        evals=watchlist,
        verbose_eval=False,
    )
    y_pred_val = model_xgb.predict(dval)
    return np.sqrt(mean_squared_error(y_val, y_pred_val))


# Compare the validation RMSE across learning rates.
# NOTE(review): the original cell ended on a truncated "eta=0" comment and never
# called train_xgb; the values 0.3 and 0.1 are an assumption - confirm.
eta_rmse = {eta: train_xgb(eta) for eta in (0.3, 0.1)}
best_eta = min(eta_rmse, key=eta_rmse.get)
eta_rmse, best_eta

In [None]:

# 1) Study hours per week
# 2) 42.13
# 3) 200
# 4) 10
# 5) study_hours_per_week
# 6) 0.1