# Learnsync ML use case implementation (Simulation version)

In [None]:
# Import libraries
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
from scipy.stats import skew
from matplotlib_venn import venn3
from scipy.stats import shapiro
from IPython.display import display
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import statsmodels.api as sm

## Generate dataset
- The original dataset we chose, [Time Management and Productivity Insights](https://www.kaggle.com/datasets/hanaksoy/time-management-and-productivity-insights/data), is fictitious and does not contain any relationship between the independent features and the Productivity Score. We therefore generate our own dataset for one of our ML use cases, `Providing time management tips and learning strategies`.
- The columns in our generated dataset are consistent with the original dataset.

In [None]:
def gen_dataset(n_samples=500, seed=42):
    """
    Generate a synthetic time-management / productivity dataset.

    Input:
        n_samples: number of rows (users) to generate
        seed: seed of the random number generator; the default (42)
            yields the same values as seeding NumPy's global generator
            with 42 and drawing in the same order
    Output: dataset in pd.DataFrame format with the columns
        'User ID', 'Age', 'Daily Work Hours', 'Daily Leisure Hours',
        'Daily Exercise Minutes', 'Daily Sleep Hours',
        'Screen Time (hours)', 'Commute Time (hours)', 'Productivity Score'

    Assumption:
    - Factors with positive relationship to productivity:
        daily_work_hours, daily_sleep_hours, exercise_hours, daily_leisure_hours
    - Factors with negative relationship to productivity:
        commute_time, screen_time, age

    The productivity score is generated using a weighted sum of features.
    Each weight reflects how strongly that factor is assumed to influence productivity.
    A small amount of random noise is added to simulate real-world unpredictability.
    """
    # A private RandomState produces the same stream as np.random.seed(seed)
    # followed by np.random.* calls, but leaves the global generator untouched.
    rng = np.random.RandomState(seed)

    # Generate features (the draw order determines the generated values)
    user_id = np.arange(1, n_samples + 1)
    age = rng.randint(20, 60, size=n_samples)
    daily_work_hours = rng.uniform(4, 10, size=n_samples)
    daily_leisure_hours = rng.uniform(0.5, 5, size=n_samples)
    daily_exercise_minutes = rng.uniform(0, 120, size=n_samples)
    daily_sleep_hours = rng.uniform(5, 9, size=n_samples)
    screen_time = rng.uniform(1, 6, size=n_samples)
    commute_time = rng.uniform(0, 2, size=n_samples)

    # Convert exercise time to hours so its weight is comparable to the
    # other hour-based features
    exercise_hours = daily_exercise_minutes / 60.0

    # Assume the productivity score follows a realistic pattern:
    productivity_score = (
        1.5 * daily_work_hours           # More work hours can increase productivity
        + 1.0 * daily_sleep_hours        # Better sleep supports better focus and output
        + 0.8 * exercise_hours           # Physical activity improves energy and alertness
        + 0.4 * daily_leisure_hours      # Moderate leisure helps reduce stress
        - 0.6 * commute_time             # Longer commutes reduce time and energy for work
        - 0.05 * age                     # Slight decline in productivity with age
        - 0.2 * screen_time              # Excessive screen time may distract or tire users
        + rng.normal(0, 2, size=n_samples)  # Add some noise to mimic real-life variance
    )

    # Clip productivity score to 0–100 range
    productivity_score = np.clip(productivity_score, 0, 100)

    # Create the final DataFrame. The target column is named
    # 'Productivity Score' because that is the name the saved CSV and the
    # training code read.
    df = pd.DataFrame({
        'User ID': user_id,
        'Age': age,
        'Daily Work Hours': daily_work_hours,
        'Daily Leisure Hours': daily_leisure_hours,
        'Daily Exercise Minutes': daily_exercise_minutes,
        'Daily Sleep Hours': daily_sleep_hours,
        'Screen Time (hours)': screen_time,
        'Commute Time (hours)': commute_time,
        'Productivity Score': productivity_score,
    })

    return df


Generate 1000 data. 
Shape of dataframe: (1000, 9)


Unnamed: 0,User ID,Age,Daily Work Hours,Daily Leisure Hours,Daily Exercise Minutes,Daily Sleep Hours,Screen Time (hours),Commute Time (hours),Productivity Score
451,452,45,5.353241,3.625027,61.474171,5.425349,1.555754,1.835201,11.919997
677,678,36,7.894448,4.863235,77.109313,6.260042,5.919532,0.811624,18.26601
837,838,30,4.557575,3.354529,0.772225,8.21298,3.706813,1.394896,14.312226
635,636,52,5.130095,2.895159,74.223247,8.846463,2.548344,1.427304,12.70555
493,494,31,4.74532,1.782086,61.604185,6.486186,3.285298,0.743454,13.190941


## Store dataframe into csv file.

In [47]:
# Save the generated dataset to a CSV file
path = "../data/synthetic_productivity_dataset.csv"
# to_csv does not create missing folders, so make sure the target
# directory exists before writing.
os.makedirs(os.path.dirname(path), exist_ok=True)
# index=False keeps the DataFrame index out of the file
gen_data.to_csv(path, index=False)
print(f"Dataset saved as {path}")

Dataset saved as ../data/synthetic_productivity_dataset.csv


## Load dataset again from csv file.

In [48]:
# Read the synthetic dataset back from the CSV file into a DataFrame
path = "../data/synthetic_productivity_dataset.csv"
data = pd.read_csv(filepath_or_buffer=path)

## Split into training and testing set

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Predictor columns; 'User ID' and the target are deliberately left out
features = [
    'Age',
    'Daily Work Hours',
    'Daily Leisure Hours',
    'Daily Exercise Minutes',
    'Daily Sleep Hours',
    'Screen Time (hours)',
    'Commute Time (hours)',
]
X = data.loc[:, features]
y = data.loc[:, 'Productivity Score']

# Hold out 20% of the rows for testing; the fixed random_state makes the
# split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Z-score standardization: the scaler is fitted on the training split only,
# and the same fitted scaler is then applied to both splits
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

## MLR: Multiple linear regression (multi-input regression task)

In [51]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Fit a linear regression model on the scaled training features
model = LinearRegression().fit(X_train_scaled, y_train)

# Predict the Productivity Score for the held-out test rows
y_pred = model.predict(X_test_scaled)

# Evaluate the predictions against the true test targets
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
r2 = r2_score(y_true=y_test, y_pred=y_pred)

# Report the evaluation metrics
print(f"MSE: {mse:.2f}")
print(f"R_square: {r2:.2f}")

# List the fitted coefficient of every feature, largest first
coefficients = pd.DataFrame({'Feature': features, 'Coefficient': model.coef_})
print(coefficients.sort_values('Coefficient', ascending=False))

MSE: 3.62
R_square: 0.76
                  Feature  Coefficient
1        Daily Work Hours     2.579189
4       Daily Sleep Hours     1.214959
2     Daily Leisure Hours     0.508517
3  Daily Exercise Minutes     0.479787
5     Screen Time (hours)    -0.187482
6    Commute Time (hours)    -0.388894
0                     Age    -0.551266
