In [1]:
# 1. Import Libraries
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler, LabelEncoder
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import LabelEncoder

In [2]:
df=pd.read_csv(r"C:\Users\vansh\Downloads\archive (3)\synthetic_coffee_health_10000.csv")

In [3]:
df.isnull().sum()

ID                            0
Age                           0
Gender                        0
Country                       0
Coffee_Intake                 0
Caffeine_mg                   0
Sleep_Hours                   0
Sleep_Quality                 0
BMI                           0
Heart_Rate                    0
Stress_Level                  0
Physical_Activity_Hours       0
Health_Issues              5941
Occupation                    0
Smoking                       0
Alcohol_Consumption           0
dtype: int64

In [4]:
# Step 1: Separate rows with and without missing Health_Issues
df_known = df[df['Health_Issues'].notnull()].copy()
df_missing = df[df['Health_Issues'].isnull()].copy()

In [5]:
df_known.head(2)

Unnamed: 0,ID,Age,Gender,Country,Coffee_Intake,Caffeine_mg,Sleep_Hours,Sleep_Quality,BMI,Heart_Rate,Stress_Level,Physical_Activity_Hours,Health_Issues,Occupation,Smoking,Alcohol_Consumption
2,3,42,Male,Brazil,5.3,503.7,5.9,Fair,22.7,59,Medium,11.2,Mild,Office,0,0
3,4,53,Male,Germany,2.6,249.2,7.3,Good,24.7,71,Low,6.6,Mild,Other,0,0


In [6]:
df_missing.head(2)

Unnamed: 0,ID,Age,Gender,Country,Coffee_Intake,Caffeine_mg,Sleep_Hours,Sleep_Quality,BMI,Heart_Rate,Stress_Level,Physical_Activity_Hours,Health_Issues,Occupation,Smoking,Alcohol_Consumption
0,1,40,Male,Germany,3.5,328.1,7.5,Good,24.9,78,Low,14.5,,Other,0,0
1,2,33,Male,Germany,1.0,94.1,6.2,Good,20.0,67,Low,11.0,,Service,0,0


In [7]:
df_known.isnull().sum()

ID                         0
Age                        0
Gender                     0
Country                    0
Coffee_Intake              0
Caffeine_mg                0
Sleep_Hours                0
Sleep_Quality              0
BMI                        0
Heart_Rate                 0
Stress_Level               0
Physical_Activity_Hours    0
Health_Issues              0
Occupation                 0
Smoking                    0
Alcohol_Consumption        0
dtype: int64

In [8]:
# Step 2: Encode categorical variables for modeling


categorical_cols = ['Gender','Country','Sleep_Quality','Stress_Level','Occupation']
df_encoded = df_known.copy()

label_encoders = {}
for col in categorical_cols + ['Health_Issues']:
    le = LabelEncoder()
    df_encoded[col] = le.fit_transform(df_encoded[col])
    label_encoders[col] = le


In [9]:
df_encoded.head(2)

Unnamed: 0,ID,Age,Gender,Country,Coffee_Intake,Caffeine_mg,Sleep_Hours,Sleep_Quality,BMI,Heart_Rate,Stress_Level,Physical_Activity_Hours,Health_Issues,Occupation,Smoking,Alcohol_Consumption
2,3,42,1,2,5.3,503.7,5.9,1,22.7,59,2,11.2,0,1,0,0
3,4,53,1,7,2.6,249.2,7.3,2,24.7,71,1,6.6,0,2,0,0


In [10]:
# Step 3: Define features & target
features = ['Age','Coffee_Intake','Caffeine_mg','Sleep_Hours','BMI',
            'Heart_Rate','Physical_Activity_Hours','Smoking','Alcohol_Consumption',
            'Gender','Country','Sleep_Quality','Stress_Level','Occupation']

X = df_encoded[features]
y = df_encoded['Health_Issues']


In [11]:
# Step 4: Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [12]:

# Step 5: Train RandomForest model
rf = RandomForestClassifier(n_estimators=200, random_state=42)
rf.fit(X_train, y_train)


In [13]:

# Step 6: Evaluate
y_pred = rf.predict(X_test)
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Accuracy:", accuracy_score(y_test, y_pred))


Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00       720
           1       0.98      1.00      0.99        88
           2       1.00      0.75      0.86         4

    accuracy                           1.00       812
   macro avg       0.99      0.92      0.95       812
weighted avg       1.00      1.00      1.00       812

Accuracy: 0.9975369458128078


In [14]:
# Step 7: Predict missing values
df_missing_encoded = df_missing.copy()
for col in categorical_cols:
    df_missing_encoded[col] = label_encoders[col].transform(df_missing_encoded[col])


In [15]:
X_missing = df_missing_encoded[features]
pred_missing = rf.predict(X_missing)


In [16]:
# Step 8: Fill missing values back into df
df.loc[df['Health_Issues'].isnull(), 'Health_Issues'] = label_encoders['Health_Issues'].inverse_transform(pred_missing)


In [17]:
df.isnull().sum()

ID                         0
Age                        0
Gender                     0
Country                    0
Coffee_Intake              0
Caffeine_mg                0
Sleep_Hours                0
Sleep_Quality              0
BMI                        0
Heart_Rate                 0
Stress_Level               0
Physical_Activity_Hours    0
Health_Issues              0
Occupation                 0
Smoking                    0
Alcohol_Consumption        0
dtype: int64

In [18]:
df.to_csv('cofee_health.csv', index=False)