In [10]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
import joblib

In [18]:
# Load the object previously saved to 'df_numeric.pkl' (path is relative to the
# notebook's working directory).
# Presumably a pandas DataFrame: later cells index it by column name — TODO confirm.
# NOTE(review): joblib.load unpickles the file, and unpickling can execute
# arbitrary code. Only load this artifact if it comes from a trusted source.
df_numeric = joblib.load('df_numeric.pkl')
# Reached only when the load above did not raise.
print("Data loaded successfully!")

Data loaded successfully!


In [19]:
# Column names used as model inputs. Every one carries the same '_encoded'
# suffix, so the list is derived from the base names to keep that in one place.
age_features = [
    f"{base}_encoded"
    for base in (
        'self_employed',
        'remote_work',
        'tech_company',
        'benefits',
        'care_options',
        'wellness_program',
        'seek_help',
        'leave',
        'mental_health_interview',
        'phys_health_interview',
    )
]

In [20]:
# Design matrix: the columns named in `age_features`.
# Regression target: the 'Age' column.
X, y = df_numeric[age_features], df_numeric['Age']

In [14]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# shuffle (and therefore the split) reproducible between runs.
# NOTE(review): X and y are produced by the preceding cell — run the notebook
# top to bottom so this split reflects the current feature list.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [15]:
# Candidate regressors keyed by the display name used in the results table.
# Insertion order is the order in which they are trained and reported.
models = {}
models["Linear Regression"] = LinearRegression()
# Both tree ensembles use 100 estimators and a fixed seed for repeatable fits.
models["Random Forest"] = RandomForestRegressor(n_estimators=100, random_state=42)
models["XGBoost"] = XGBRegressor(n_estimators=100, random_state=42, learning_rate=0.1)

In [16]:
# Train each candidate on the training split and score it on the held-out
# split. One record per model (name, MSE, RMSE, R²) is appended to `results`.
results = []
for name, model in models.items():
    # Estimators return themselves from fit(), so fit and predict can be chained.
    y_pred = model.fit(X_train, y_train).predict(X_test)

    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)  # square root of the MSE, i.e. in the target's own units
    # R² can be negative: that means the model does worse than always
    # predicting the mean of y_test.
    r2 = r2_score(y_test, y_pred)

    results.append(dict(zip(
        ("Model", "MSE", "RMSE", "R² Score"),
        (name, mse, rmse, r2),
    )))

In [17]:
# Gather the per-model metric records into a single table (one row per model,
# columns taken from the record keys) and print it as plain text.
results_df = pd.DataFrame(data=results)
print(results_df)

               Model        MSE      RMSE  R² Score
0  Linear Regression  43.954411  6.629812  0.027628
1      Random Forest  56.401374  7.510085 -0.247727
2            XGBoost  52.247162  7.228220 -0.155826
