In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
import joblib

In [3]:
# Read the pickled object stored in 'cleaned_data.pkl' and bind it to df_clean.
# NOTE(review): joblib.load unpickles its input, so only point it at files
# that come from a trusted source.
df_clean = joblib.load(filename='cleaned_data.pkl')
# Only reached when the load above did not raise.
print("Data loaded successfully!")

Data loaded successfully!


In [4]:
# Keep every numeric column of df_clean in an independent copy.
# Selecting by the Python types `int`/`float` only matches a subset of numeric
# dtypes, so columns stored as e.g. int8/uint8 codes or pandas nullable
# integers would be dropped without any warning. 'number' covers all of them.
# timedelta64 is a NumPy number subtype, so it is excluded explicitly to keep
# the result limited to plain numeric columns, as before.
df_numeric = df_clean.select_dtypes(include='number', exclude='timedelta').copy()

In [5]:
# Fail fast if the target column is absent from the numeric frame, so the
# problem surfaces here rather than as a KeyError further down.
# (`in` on a DataFrame tests membership in its column labels.)
if 'treatment_encoded' not in df_numeric:
    raise ValueError(
        "'treatment_encoded' column is missing. Please check encoding step."
    )

In [6]:
# Feature columns used as model inputs. Every one carries the `_encoded`
# suffix, so the list is built from the base names to keep the suffix in one
# place; the order below is the column order of the feature matrix.
selected_columns = [
    f"{base_name}_encoded"
    for base_name in (
        'self_employed',
        'mental_health_consequence',
        'family_history',
        'work_interfere',
        'benefits',
        'care_options',
        'wellness_program',
        'seek_help',
        'coworkers',
        'supervisor',
        'mental_health_interview',
    )
]

In [7]:
# X: feature matrix restricted to the selected columns.
# y: the 'treatment_encoded' column, used as the prediction target.
X, y = (
    df_numeric[selected_columns],
    df_numeric['treatment_encoded'],
)

In [8]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# shuffle, and therefore the split, repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [9]:
# Candidate classifiers, keyed by the display name used when reporting results.
models = {
    # max_iter=1000 sets the solver's iteration cap explicitly.
    "Logistic Regression": LogisticRegression(max_iter=1000),
    # Seeded so the forest is reproducible between runs.
    "Random Forest": RandomForestClassifier(random_state=42),
    # `use_label_encoder` is deliberately not passed: the installed XGBoost
    # reports it as an unused parameter and emits a warning on every fit.
    "XGBoost": XGBClassifier(
        n_estimators=100,
        learning_rate=0.1,
        random_state=42,
        eval_metric='logloss',
    ),
}

In [10]:
# Fit each candidate on the training split, score it on the held-out split,
# and collect one {"Model", "Accuracy"} record per model for the summary table.
results = []
for name, model in models.items():
    # fit() returns the fitted estimator, so prediction can be chained on it.
    y_pred = model.fit(X_train, y_train).predict(X_test)

    acc = accuracy_score(y_test, y_pred)
    results.append(dict(Model=name, Accuracy=acc))

    # Per-class precision / recall / f1 for this model.
    print(f"\n{name} Classification Report:")
    print(classification_report(y_test, y_pred))


Logistic Regression Classification Report:
              precision    recall  f1-score   support

           0       0.70      0.80      0.75       108
           1       0.82      0.74      0.78       139

    accuracy                           0.77       247
   macro avg       0.76      0.77      0.76       247
weighted avg       0.77      0.77      0.77       247


Random Forest Classification Report:
              precision    recall  f1-score   support

           0       0.70      0.79      0.74       108
           1       0.82      0.74      0.78       139

    accuracy                           0.76       247
   macro avg       0.76      0.76      0.76       247
weighted avg       0.77      0.76      0.76       247


XGBoost Classification Report:
              precision    recall  f1-score   support

           0       0.71      0.81      0.75       108
           1       0.83      0.74      0.78       139

    accuracy                           0.77       247
   macro avg  

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [11]:
# One row per model, built in a single step from the collected records,
# then printed under a heading (blank line first, to separate it from the
# classification reports).
results_df = pd.DataFrame(data=results)
print("\nAccuracy Summary:", results_df, sep="\n")


Accuracy Summary:
                 Model  Accuracy
0  Logistic Regression  0.765182
1        Random Forest  0.761134
2              XGBoost  0.769231


In [12]:
# Persist the numeric frame to 'df_numeric.pkl'. As the cell's last expression,
# joblib.dump's return value (the list of written file names) is displayed.
joblib.dump(value=df_numeric, filename='df_numeric.pkl')

['df_numeric.pkl']