## Import 

In [75]:
import os
import heapq
import joblib
import random
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer, KNNImputer 
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder

## Dataset

In [43]:
df = pd.read_csv("social_media_vs_productivity.csv")
df.head().T

Unnamed: 0,0,1,2,3,4
age,56,46,32,60,25
gender,Male,Male,Male,Female,Male
job_type,Unemployed,Health,Finance,Unemployed,IT
daily_social_media_time,4.18094,3.249603,,,
social_platform_preference,Facebook,Twitter,Twitter,Facebook,Telegram
number_of_notifications,61,59,57,59,66
work_hours_per_day,6.753558,9.169296,7.910952,6.355027,6.214096
perceived_productivity_score,8.040464,5.063368,3.861762,2.916331,8.868753
actual_productivity_score,7.291555,5.165093,3.474053,1.774869,
stress_level,4.0,7.0,4.0,6.0,7.0


## Column Types

In [44]:
# feature & target
all_features = [
    'daily_social_media_time',
    'number_of_notifications',
    'screen_time_before_sleep',
    'breaks_during_work',
    'uses_focus_apps',
    'has_digital_wellbeing_enabled',
    'coffee_consumption_per_day',
    'sleep_hours',
    'weekly_offline_hours',
    'age',
    'gender',
    'job_type',
    'social_platform_preference',
    'work_hours_per_day'
]
target = 'actual_productivity_score'

X = df[all_features].copy() # Use .copy() to avoid SettingWithCopyWarning
y = df[target]

In [45]:
# numeric & categorical
numerical_cols = X.select_dtypes(include=np.number).columns.tolist()
categorical_cols = X.select_dtypes(include='object').columns.tolist()

## Missing Value

In [46]:
# check missing
print("\n--- Initial Missing Values Across All Columns ---")
missing_info = df.isnull().sum()
missing_percentage = (df.isnull().sum() / len(df)) * 100
missing_df = pd.DataFrame({'Missing Count': missing_info, 'Missing %': missing_percentage})
print(missing_df.sort_values(by='Missing %', ascending=False))
print("-" * 50) 


--- Initial Missing Values Across All Columns ---
                                Missing Count  Missing %
daily_social_media_time                  2765   9.216667
job_satisfaction_score                   2730   9.100000
sleep_hours                              2598   8.660000
actual_productivity_score                2365   7.883333
screen_time_before_sleep                 2211   7.370000
stress_level                             1904   6.346667
perceived_productivity_score             1614   5.380000
work_hours_per_day                          0   0.000000
number_of_notifications                     0   0.000000
gender                                      0   0.000000
social_platform_preference                  0   0.000000
job_type                                    0   0.000000
breaks_during_work                          0   0.000000
uses_focus_apps                             0   0.000000
has_digital_wellbeing_enabled               0   0.000000
coffee_consumption_per_day           

## Tran-Test Split

In [73]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(f"\nTrain set shape: {X_train.shape}, Test set shape: {X_test.shape}")


Train set shape: (24000, 14), Test set shape: (6000, 14)


## Pre-Processing Pipelines

In [50]:
# numeric (imputer, scaler)
numerical_transformer = Pipeline(steps=[
    ('imputer', KNNImputer(n_neighbors=5)),
    ('scaler', StandardScaler())
])

# categorical (imputer, encoder)
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ],
    remainder='passthrough'
)

In [74]:
# fit & transform
print("Fitting and transforming data with preprocessor...")
X_train_transformed = preprocessor.fit_transform(X_train)
X_test_transformed = preprocessor.transform(X_test)
print(f"X_train_transformed shape: {X_train_transformed.shape}")

Fitting and transforming data with preprocessor...
X_train_transformed shape: (24000, 25)


In [67]:
# preview result
feature_names_out = preprocessor.get_feature_names_out()
if hasattr(X_train_transformed, 'toarray'):
    X_transformed_dense = X_train_transformed.toarray()
else:
    X_transformed_dense = X_train_transformed
X_transformed_df = pd.DataFrame(X_transformed_dense, columns=feature_names_out)

print(f"\nShape of Transformed X DataFrame: {X_transformed_df.shape}")
print("\n--- First 5 rows of Transformed X (as DataFrame) ---")
X_transformed_df.head().T


Shape of Transformed X DataFrame: (30000, 25)

--- First 5 rows of Transformed X (as DataFrame) ---


Unnamed: 0,0,1,2,3,4
num__daily_social_media_time,0.535151,0.068479,-0.829395,0.619826,0.316361
num__number_of_notifications,0.134811,-0.124134,-0.383079,-0.124134,0.782174
num__screen_time_before_sleep,-0.957441,-0.559208,-0.633582,0.281729,1.341478
num__breaks_during_work,0.947731,0.63264,-1.572998,-1.257907,-1.257907
num__coffee_consumption_per_day,1.418913,0.000496,0.709705,-1.41792,-0.708712
num__sleep_hours,-0.979844,-0.988801,1.47489,-0.316757,-0.775091
num__weekly_offline_hours,1.588729,-1.423109,-0.005303,1.856513,0.040227
num__age,1.049017,0.326212,-0.685715,1.338138,-1.191679
num__work_hours_per_day,-0.118753,1.090505,0.460609,-0.318248,-0.388795
cat__gender_Female,0.0,0.0,0.0,1.0,0.0


In [None]:
# export proprocessor
PREPROCESSOR_PATH = "model_preprocessor.pkl"
joblib.dump(preprocessor, PREPROCESSOR_PATH)
print(f"Preprocessor saved to: {PREPROCESSOR_PATH}")

## Random Forest Regressor

In [72]:
# fit
print("Training RandomForestRegressor model...")
model = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)
model.fit(X_transformed_df, y_train)
print("RandomForestRegressor model training complete.")

Training RandomForestRegressor model...


NameError: name 'y_train' is not defined

In [None]:
# eval
y_pred = model.predict(X_test_transformed)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"\nModel Performance:")
print(f"  Mean Squared Error (MSE): {mse:.4f}")
print(f"  R2 Score: {r2:.4f}")

## Informed Search