In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_selection import mutual_info_classif
from sklearn.metrics import accuracy_score

In [3]:
from google.colab import files
uploaded = files.upload()
df = pd.read_csv("cleaned_burnout_dataset.csv")
print(df.head())

Saving cleaned_burnout_dataset.csv to cleaned_burnout_dataset.csv
   EmployeeID  Age      Gender    Country            JobRole Department  \
0        1001   50        Male         UK    Sales Associate         HR   
1        1002   36        Male    Germany  Software Engineer         IT   
2        1003   29  Non-binary      India           IT Admin         IT   
3        1004   42        Male  Australia      HR Specialist         IT   
4        1005   40        Male     Brazil   Customer Support    Support   

   YearsAtCompany  WorkHoursPerWeek RemoteWork  BurnoutLevel  ...  \
0              14                47         No          3.37  ...   
1               1                59     Hybrid          7.39  ...   
2              13                59     Hybrid          7.10  ...   
3              15                31        Yes          4.18  ...   
4               6                34        Yes          8.28  ...   

   CommuteTime  HasMentalHealthSupport  ManagerSupportScore  HasTher

In [4]:
X = df.drop("BurnoutRisk", axis=1)
y = df["BurnoutRisk"]
print("Features and target separated.")
print("Features shape:", X.shape)
print("Target shape:", y.shape)

Features and target separated.
Features shape: (3000, 24)
Target shape: (3000,)


In [5]:
#Convert string columns to numeric
X = pd.get_dummies(X)
print(X.columns)

Index(['EmployeeID', 'Age', 'YearsAtCompany', 'WorkHoursPerWeek',
       'BurnoutLevel', 'JobSatisfaction', 'StressLevel', 'ProductivityScore',
       'SleepHours', 'PhysicalActivityHrs', 'CommuteTime',
       'ManagerSupportScore', 'MentalHealthDaysOff', 'WorkLifeBalanceScore',
       'TeamSize', 'CareerGrowthScore', 'Gender_Female', 'Gender_Male',
       'Gender_Non-binary', 'Gender_Prefer not to say', 'Country_Australia',
       'Country_Brazil', 'Country_Canada', 'Country_Germany', 'Country_India',
       'Country_UK', 'Country_USA', 'JobRole_Customer Support',
       'JobRole_Data Scientist', 'JobRole_HR Specialist', 'JobRole_IT Admin',
       'JobRole_Marketing Manager', 'JobRole_Project Manager',
       'JobRole_Sales Associate', 'JobRole_Software Engineer',
       'Department_Engineering', 'Department_HR', 'Department_IT',
       'Department_Marketing', 'Department_Sales', 'Department_Support',
       'RemoteWork_Hybrid', 'RemoteWork_No', 'RemoteWork_Yes',
       'HasMentalHeal

In [6]:
#Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print("Training samples:", X_train.shape[0])
print("Testing samples:", X_test.shape[0])

Training samples: 2400
Testing samples: 600


In [7]:
#Train Models with All Features

#Decision Tree
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train, y_train)
dt_preds = dt.predict(X_test)

#Random Forest
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)
rf_preds = rf.predict(X_test)

#k-NN (needs scaling)
#k-NN is a distance-based algorithm,If features are on different scales, the distance calculation becomes unfair.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

knn = KNeighborsClassifier()
knn.fit(X_train_scaled, y_train)
knn_preds = knn.predict(X_test_scaled)

print("Decision Tree:", accuracy_score(y_test, dt_preds))
print("Random Forest:", accuracy_score(y_test, rf_preds))
print("k-NN:", accuracy_score(y_test, knn_preds))


Decision Tree: 1.0
Random Forest: 1.0
k-NN: 0.725


In [8]:
# Use mutual information to score features
mi_scores = mutual_info_classif(X, y)
mi_series = pd.Series(mi_scores, index=X.columns).sort_values(ascending=False)
top3_mi = mi_series.head(3)
print("Top 3 Features (Mutual Info):\n", top3_mi)


Top 3 Features (Mutual Info):
 BurnoutLevel             0.631453
JobRole_HR Specialist    0.016094
WorkHoursPerWeek         0.013417
dtype: float64


In [9]:
X_top3 = X[top3_mi.index]
X_train3, X_test3, y_train3, y_test3 = train_test_split(X_top3, y, test_size=0.2, random_state=42)

# Decision Tree
dt.fit(X_train3, y_train3)
dt3_preds = dt.predict(X_test3)

# Random Forest
rf.fit(X_train3, y_train3)
rf3_preds = rf.predict(X_test3)

# k-NN
X_train3_scaled = scaler.fit_transform(X_train3)
X_test3_scaled = scaler.transform(X_test3)
knn.fit(X_train3_scaled, y_train3)
knn3_preds = knn.predict(X_test3_scaled)



In [10]:
print("Decision Tree (Top 3):", accuracy_score(y_test3, dt3_preds))
print("Random Forest (Top 3):", accuracy_score(y_test3, rf3_preds))
print("k-NN (Top 3):", accuracy_score(y_test3, knn3_preds))

Decision Tree (Top 3): 1.0
Random Forest (Top 3): 1.0
k-NN (Top 3): 0.99


| Model         | Full Accuracy | Top 3 Accuracy |
| ------------- | ------------- | -------------- |
| Decision Tree | 1.0           | 1.0            |
| Random Forest | 1.0           | 1.0            |
| k-NN          | 0.725         | ~ 0.99         |


1) Trees don’t care much if we give them the whole forest(all features) or just a few branches(top 3 features) they still did a good job.(100% accuracy)

2) k-NN improved a lot with 3 good features (from 72% to 99%)

3) We can concludee that, removing the useless features helped the model focus better,especially k-NN which is based on distance.

4)Sometimes less is more. smart feature selection can give big results, especially for models like k-NN