In [None]:
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from sklearn.linear_model import LassoCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.models import Sequential

# load data
# NOTE: machine-specific absolute path -- point it at your local copy of the CSV.
df = pd.read_csv("/home/hanyan/dev/DS3001_project/DATA/sleep_cycle_productivity.csv") # !!change this to your path!!

# feature engineering
# 1. sleep consistency = std dev of total sleep hours per person
#    (the sample std is NaN for a person with a single record)
sleep_consistency = df.groupby('Person_ID')['Total Sleep Hours'].std().rename("Sleep Consistency")
df = df.merge(sleep_consistency, on="Person_ID")

# 2. sleep efficiency = total sleep hours / (sleep end - sleep start)
#    `% 24` wraps windows that cross midnight into the range [0, 24).
#    NOTE(review): assumes both time columns are numeric clock hours -- confirm.
sleep_window = (df["Sleep End Time"] - df["Sleep Start Time"]) % 24

# A zero-length window (end == start) would turn both ratios below into +/-inf,
# which StandardScaler refuses to process. Treat it as missing instead so the
# affected rows carry NaN and can be removed by the missing-value filter later on.
sleep_window = sleep_window.replace(0, np.nan)
df["Sleep Efficiency"] = df["Total Sleep Hours"] / sleep_window

# 3. screen ratio = screen time before bed / sleep duration window
#    (window converted from hours to minutes so the ratio is unitless)
df["Evening Screen Ratio"] = df["Screen Time Before Bed (mins)"] / (sleep_window * 60)

# The window is kept in a local Series only, so there is no intermediate
# column to drop from `df` afterwards.

# one-hot encoding categorical variables
# NOTE(review): `categorical_features` is declared here but no encoding is applied
# in this section; a non-numeric "Gender" column is discarded later by the
# numeric-only filter on X.
categorical_features = ["Gender"]
numerical_features = df.select_dtypes(include=["number"]).drop(columns=["Person_ID"]).columns.tolist()

# handle multicollinearity: remove highly correlated features
# The target is excluded from the candidates: the filter is meant to prune redundant
# predictors, and letting the target take part could either drop the target itself
# (breaking the later lookup of "Productivity Score") or drop a feature precisely
# because it predicts the target well.
target_col = "Productivity Score"
collinearity_candidates = [col for col in numerical_features if col != target_col]
corr_matrix = df[collinearity_candidates].corr().abs()
# Keep only the strict upper triangle so every pair is examined once and the
# later-listed member of a correlated pair is the one that gets dropped.
upper_tri = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
to_drop = [column for column in upper_tri.columns if (upper_tri[column] > 0.8).any()]
df_reduced = df.drop(columns=to_drop)

# standardize numeric features for kNN/ANN
# NOTE(review): the scaler is fitted on every row, i.e. before the train/test split
# further down, and the numeric target is standardized along with the features, so
# downstream losses are expressed in standard-deviation units of the target.
final_numeric = df_reduced.select_dtypes(include=["number"]).drop(columns=["Person_ID"]).columns
scaler = StandardScaler()
df_scaled = df_reduced.copy()
df_scaled[final_numeric] = scaler.fit_transform(df_reduced[final_numeric])

df_scaled.head()

# Assemble the modelling matrix: identifiers and the target are removed first,
# then anything that is not numeric (object/datetime columns) is filtered out.
exclude_cols = ["Person_ID", "Date"]
y = df_scaled["Productivity Score"]
X = (
    df_scaled
    .drop(columns=[*exclude_cols, "Productivity Score"], errors='ignore')
    .select_dtypes(include=[np.number])
)

# Drop incomplete rows jointly so that features and target stay aligned.
Xy = pd.concat([X, y], axis=1).dropna()
y = Xy["Productivity Score"]
X = Xy.drop(columns=["Productivity Score"])

# Hold out 20% of the rows; the fixed seed keeps the partition reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=50,
)

# lasso for feature selection
# max_iter is raised above the default because the recorded run emitted warnings
# from the coordinate-descent solver (presumably non-convergence), which makes the
# zero / non-zero pattern of the coefficients less trustworthy.
lasso = LassoCV(cv=5, random_state=50, max_iter=10000)
lasso.fit(X_train, y_train)
# Features whose coefficient survived the L1 penalty.
lasso_selected_features = X.columns[lasso.coef_ != 0].tolist()

# pca for 95% variance
# Fit the projection on the training rows only so the held-out rows do not
# influence the components, then project every row with that fitted basis.
# X_pca still has one row per row of X, in the same order.
pca = PCA(n_components=0.95)
pca.fit(X_train)
X_pca = pca.transform(X)

lasso_selected_features, X_pca.shape

# dataframe of principal components
pca_columns = [f"PC{i+1}" for i in range(X_pca.shape[1])]
df_pca = pd.DataFrame(X_pca, columns=pca_columns)
df_pca["Productivity Score"] = y.values  # add target back in (positional, index-agnostic)

df_pca.head()

# split pca data
X = df_pca.drop(columns=["Productivity Score"])
y = df_pca["Productivity Score"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=50)

# basic ANN
# An explicit Input layer declares the feature width; passing `input_shape=` to the
# first Dense layer is what triggered the Keras warning seen in the recorded output.
model = Sequential([
    Input(shape=(X.shape[1],)),
    Dense(64, activation='relu'),
    Dense(32, activation='relu'),
    Dense(1)  # regression output
])

model.compile(optimizer='adam', loss='mse', metrics=['mae'])
# validation_split carves the validation rows out of the training data only.
history = model.fit(X_train, y_train, epochs=50, batch_size=32, validation_split=0.2)

# Report performance on the held-out split, which was created above but otherwise
# never used; evaluate() returns [loss, *metrics] in compile order.
test_loss, test_mae = model.evaluate(X_test, y_test, verbose=0)
print(f"Test MSE: {test_loss:.4f} | Test MAE: {test_mae:.4f}")


Epoch 1/50


  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


42/42 ━━━━━━━━━━━━━━━━━━━━ 1s 7ms/step - loss: 1.0517 - mae: 0.8893 - val_loss: 1.0119 - val_mae: 0.8441
Epoch 2/50
42/42 ━━━━━━━━━━━━━━━━━━━━ 0s 4ms/step - loss: 0.9784 - mae: 0.8511 - val_loss: 1.0012 - val_mae: 0.8423
Epoch 3/50
42/42 ━━━━━━━━━━━━━━━━━━━━ 0s 3ms/step - loss: 1.0090 - mae: 0.8732 - val_loss: 1.0036 - val_mae: 0.8460
Epoch 4/50
42/42 ━━━━━━━━━━━━━━━━━━━━ 0s 3ms/step - loss: 0.9727 - mae: 0.8580 - val_loss: 1.0020 - val_mae: 0.8417
Epoch 5/50
42/42 ━━━━━━━━━━━━━━━━━━━━ 0s 4ms/step - loss: 0.9348 - mae: 0.8379 - val_loss: 0.9980 - val_mae: 0.8407
Epoch 6/50
42/42 ━━━━━━━━━━━━━━━━━━━━ 0s 4ms/step - loss: 0.9729 - mae: 0.8595 - val_loss: 0.9982 - val_mae: 0.8434
Epoch 7/50
42/42 ━━━━━━━━━━━━━━━━━━━━ 0s 3ms/step - loss: 0.9398 - mae: 0.845

<keras.src.callbacks.history.History at 0x7fa8c05f09e0>

The model provides only a coarse estimate of productivity, and its predictive power is limited: the validation MSE stays near 1.0 on the standardized target, which is roughly what always predicting the mean would achieve. Further progress likely requires richer or more subjective data.
Still, the pipeline demonstrates an end-to-end architecture from raw behavioral data to interpretable prediction.

