In [107]:
# Импорт библиотек
import pandas as pd
import numpy as np
from joblib import dump
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression

In [108]:
# Load the dataset
data = pd.read_csv('Student Depression Dataset.csv')
# Report whether the DataFrame contains at least one missing value
has_missing = data.isna().to_numpy().any()
print(has_missing)

True


In [109]:
# Fill missing values: median for numeric columns, mode for all other columns.
# NOTE(review): the fill statistics are computed on the whole dataset, i.e.
# before the train/test split done later in the notebook.
for col in data.columns:
    if not data[col].isnull().any():
        continue
    is_numeric = (
        pd.api.types.is_numeric_dtype(data[col])
        and not pd.api.types.is_bool_dtype(data[col])
    )
    if is_numeric:
        # Covers every numeric dtype (int32, float32, nullable Int64, ...),
        # not only the literal 'int64' / 'float64'.
        fill_value = data[col].median()
    else:
        modes = data[col].mode()
        if modes.empty:
            # The column is entirely missing, so there is no mode to fill with.
            continue
        fill_value = modes[0]
    data[col] = data[col].fillna(fill_value)

In [110]:
# Drop the columns that are not used in the analysis
unused_columns = ['id', 'City', 'Profession', 'Work Pressure', 'Job Satisfaction', 'Degree']
data = data.drop(columns=unused_columns)
data.head()

Unnamed: 0,Gender,Age,Academic Pressure,CGPA,Study Satisfaction,Sleep Duration,Dietary Habits,Have you ever had suicidal thoughts ?,Work/Study Hours,Financial Stress,Family History of Mental Illness,Depression
0,Male,33.0,5.0,8.97,2.0,5-6 hours,Healthy,Yes,3.0,1.0,No,1
1,Female,24.0,2.0,5.9,5.0,5-6 hours,Moderate,No,3.0,2.0,Yes,0
2,Male,31.0,3.0,7.03,5.0,Less than 5 hours,Healthy,No,9.0,1.0,Yes,0
3,Female,28.0,3.0,5.59,2.0,7-8 hours,Moderate,Yes,4.0,5.0,Yes,1
4,Female,25.0,4.0,8.13,3.0,5-6 hours,Moderate,Yes,1.0,1.0,No,0


In [111]:
# Feature groups
categorical_features = [
    'Gender',
    'Sleep Duration',
    'Dietary Habits',
    'Have you ever had suicidal thoughts ?',
    'Family History of Mental Illness',
]
numerical_features = [
    'Age',
    'Academic Pressure',
    'CGPA',
    'Study Satisfaction',
    'Work/Study Hours',
    'Financial Stress',
]

# Separate the target from the predictors, then hold out 20% as the test set
y = data['Depression']
X = data.drop(columns='Depression')
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [112]:
# Rebuild the raw train/test frames from X using the row labels still carried
# by y_train / y_test. This cell overwrites X_train and X_test at the end, so
# reading the categorical columns from them would fail on a second run.
# Assumes the row labels of X are unique (default index from read_csv).
X_train_raw = X.loc[y_train.index]
X_test_raw = X.loc[y_test.index]

# Categorical columns come from the explicit list defined with the feature
# groups rather than from dtype inference.
categorical_cols = list(categorical_features)

# One-hot encode the categorical columns; the encoder is fitted on the
# training rows only and then applied to the test rows.
encoder = OneHotEncoder(sparse_output=False, drop='first', handle_unknown='ignore')
X_train_encoded = encoder.fit_transform(X_train_raw[categorical_cols])
X_test_encoded = encoder.transform(X_test_raw[categorical_cols])
feature_names = encoder.get_feature_names_out(categorical_cols)

# Wrap the encoded arrays in DataFrames (these get a fresh 0..n-1 index)
X_train_encoded = pd.DataFrame(X_train_encoded, columns=feature_names)
X_test_encoded = pd.DataFrame(X_test_encoded, columns=feature_names)

# Numeric columns; the index is reset so the rows line up with the encoded
# frames when concatenated column-wise.
X_train_num = X_train_raw[numerical_features].reset_index(drop=True)
X_test_num = X_test_raw[numerical_features].reset_index(drop=True)

# Final design matrices: numeric columns followed by the one-hot columns
X_train = pd.concat([X_train_num, X_train_encoded], axis=1)
X_test = pd.concat([X_test_num, X_test_encoded], axis=1)

X_train.head()


Unnamed: 0,Age,Academic Pressure,CGPA,Study Satisfaction,Work/Study Hours,Financial Stress,Gender_Male,Sleep Duration_7-8 hours,Sleep Duration_Less than 5 hours,Sleep Duration_More than 8 hours,Sleep Duration_Others,Dietary Habits_Moderate,Dietary Habits_Others,Dietary Habits_Unhealthy,Have you ever had suicidal thoughts ?_Yes,Family History of Mental Illness_Yes
0,28.0,2.0,8.29,5.0,4.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
1,33.0,4.0,9.05,4.0,12.0,4.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,33.0,4.0,8.08,4.0,12.0,3.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,29.0,2.0,5.76,4.0,10.0,2.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
4,20.0,5.0,5.77,5.0,11.0,5.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0


In [113]:
# Fit a gradient-boosting regressor on the encoded training set.
# NOTE(review): the displayed rows show 'Depression' as 0/1, so a classifier
# may be the better fit for this target; confirm before changing the model.
model = GradientBoostingRegressor(random_state=42)
model.fit(X=X_train, y=y_train)

In [114]:
# Score the model on the held-out test set
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)  # root of the mean squared error
r2 = r2_score(y_test, y_pred)
print(f"RMSE: {rmse}", f"R2 Score: {r2}", sep="\n")

RMSE: 0.34220421344976626
R2 Score: 0.5192208590319141


In [115]:
# The model was trained on one-hot-encoded columns, so scoring new raw rows
# needs the same fitted encoder. Persist it alongside the model.
dump(encoder, 'depression_encoder.pkl')
# Kept as the last expression so the cell still displays the model file name.
dump(model, 'depression_model.pkl')

['depression_model.pkl']

In [106]:
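# Metrics recorded from earlier runs with different estimators
# (presumably on the same test split as above; TODO confirm).
#LinearRegression()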
#RMSE: 0.34940422215674777
#R2 Score: 0.4987767487314202

#GradientBoostingRegressor(random_state=42)
#RMSE: 0.34220421344976626
#R2 Score: 0.5192208590319141

#RandomForestClassifier(n_estimators=100, random_state=42)
#RMSE: 0.41732761291200865
#R2 Score: 0.2849619475514439