## Imports

In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer # Corrected import for SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import numpy as np

## Dataset

In [5]:
# Load the CSV (path is relative to the notebook's working directory) into `df`.
df = pd.read_csv(filepath_or_buffer="social_media_vs_productivity.csv")
# Preview the first five rows, transposed so that each column is shown as a row.
df.head(5).transpose()

Unnamed: 0,0,1,2,3,4
age,56,46,32,60,25
gender,Male,Male,Male,Female,Male
job_type,Unemployed,Health,Finance,Unemployed,IT
daily_social_media_time,4.18094,3.249603,,,
social_platform_preference,Facebook,Twitter,Twitter,Facebook,Telegram
number_of_notifications,61,59,57,59,66
work_hours_per_day,6.753558,9.169296,7.910952,6.355027,6.214096
perceived_productivity_score,8.040464,5.063368,3.861762,2.916331,8.868753
actual_productivity_score,7.291555,5.165093,3.474053,1.774869,
stress_level,4.0,7.0,4.0,6.0,7.0


## Preprocessing

In [7]:
# Feature (predictor) column names and the target column name.
# NOTE(review): the dataset preview and missing-value summary in this notebook
# show NaNs in the target column; rows without a target presumably need to be
# handled before fitting -- confirm in the downstream cells.
target = "actual_productivity_score"
all_features = [
    "daily_social_media_time", "number_of_notifications", "screen_time_before_sleep",
    "breaks_during_work", "uses_focus_apps", "has_digital_wellbeing_enabled",
    "coffee_consumption_per_day", "sleep_hours", "weekly_offline_hours",
    "age", "gender", "job_type", "social_platform_preference",
    "work_hours_per_day",
]

In [15]:
# Missing-value audit: null count and null percentage for every column of `df`.
print("\n--- Initial Missing Values Across All Columns ---")
missing_info = df.isnull().sum()
# Derive the percentage from the counts already computed instead of scanning
# the whole frame for nulls a second time.
missing_percentage = (missing_info / len(df)) * 100
missing_df = pd.DataFrame({'Missing Count': missing_info, 'Missing %': missing_percentage})
# Print the summary for ALL columns (columns with zero missing values are kept),
# sorted by "Missing %" in descending order.
print(missing_df.sort_values(by='Missing %', ascending=False))
print("-" * 50)


--- Initial Missing Values Across All Columns ---
                                Missing Count  Missing %
daily_social_media_time                  2765   9.216667
job_satisfaction_score                   2730   9.100000
sleep_hours                              2598   8.660000
actual_productivity_score                2365   7.883333
screen_time_before_sleep                 2211   7.370000
stress_level                             1904   6.346667
perceived_productivity_score             1614   5.380000
work_hours_per_day                          0   0.000000
number_of_notifications                     0   0.000000
gender                                      0   0.000000
social_platform_preference                  0   0.000000
job_type                                    0   0.000000
breaks_during_work                          0   0.000000
uses_focus_apps                             0   0.000000
has_digital_wellbeing_enabled               0   0.000000
coffee_consumption_per_day           