In [None]:
#1. Data import and cleaning

In [None]:
#Import required libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Read the three CSV files into DataFrames (one read_csv call per file, in order).
df, sleep, weight = map(
    pd.read_csv,
    (
        'dailyActivity_merged.csv',
        'sleepDay_merged.csv',
        'weightLogInfo_merged.csv',
    ),
)

In [None]:
#Basic inspect
# A notebook cell only renders the value of its LAST bare expression, so a
# bare df.head() in the middle of the cell is computed and thrown away.
# display() (provided by IPython in every notebook kernel) renders each one.
display(df.head())      # View first 5 rows
df.info()               # Structure and data types (info() prints by itself)
display(df.describe())  # Summary stats

In [None]:
# Count the missing (NaN/None) entries in every column and print the tally
# under a heading.
print("Missing values per column:", df.isna().sum(), sep="\n")

In [None]:
# Boolean mask: True for every row that exactly repeats an earlier row
# (the first occurrence itself is not flagged).
duplicates = df.duplicated(keep='first')
print(f"Total duplicates: {duplicates.sum()}")

# Uncomment to actually drop the flagged rows:
#df = df.drop_duplicates() #Remove duplicates

In [None]:
# Parse the ActivityDate strings into pandas datetimes, replacing the column in place.
df['ActivityDate'] = df['ActivityDate'].pipe(pd.to_datetime)

In [None]:
# Give five columns shorter names; every column not listed keeps its name.
# Note that only VeryActiveMinutes becomes 'ActiveMinutes' — the Fairly/Lightly
# active columns are left untouched.
df.rename(
    columns=dict(
        ActivityDate='Date',
        TotalSteps='Steps',
        TotalDistance='Distance',
        VeryActiveMinutes='ActiveMinutes',
        SedentaryMinutes='IdleMinutes',
    ),
    inplace=True,
)

In [None]:
#Check column types and names after changes
# Shows every column name with its dtype, so the datetime conversion of the
# date column and the renames applied earlier can be verified at a glance.
df.dtypes

In [None]:
#2 Data Plotting

In [None]:
#Distribution of key metrics
#Steps(TotalSteps): 30-bin histogram with a KDE curve overlaid
import seaborn as sns
import matplotlib.pyplot as plt

steps_ax = sns.histplot(df['Steps'], kde=True, bins=30, color='skyblue')
steps_ax.set_title("Distribution of Total Steps")
steps_ax.set_xlabel("Total Steps")
steps_ax.set_ylabel("Frequency")
plt.show()

In [None]:
#Calories: horizontal boxplot of the Calories column
calories_ax = sns.boxplot(x=df['Calories'], color='salmon')
calories_ax.set_title("Calories Burned - Boxplot")
calories_ax.set_xlabel("Calories")
plt.show()

In [None]:
#Time series trend: steps summed over all rows sharing the same Date
daily_steps = df.groupby('Date')['Steps'].sum()

trend_ax = daily_steps.plot()
trend_ax.set_title("Daily Total Steps Over Time")
trend_ax.set_ylabel("Total Steps")
trend_ax.set_xlabel("Date")
plt.xticks(rotation=45)
trend_ax.grid(True)
plt.show()

In [None]:
#Correlation analysis
# Keep only the numeric columns, then build their pairwise Pearson
# correlation matrix (Pearson is what DataFrame.corr uses by default).
numeric_df = df.select_dtypes(include='number')
correlation = numeric_df.corr(method='pearson')

heatmap_ax = sns.heatmap(correlation, annot=True, cmap='coolwarm', fmt=".2f")
heatmap_ax.set_title("Correlation Heatmap")
plt.show()

In [None]:
#Top active users by average steps
# Mean steps per user, ordered from highest to lowest, first ten kept.
mean_steps_by_user = df.groupby('Id')['Steps'].mean()
top_users = mean_steps_by_user.sort_values(ascending=False).head(10)

users_ax = top_users.plot.bar(color='teal')
users_ax.set_title("Top 10 Users by Average Steps")
users_ax.set_xlabel("User ID")
users_ax.set_ylabel("Average Steps")
plt.xticks(rotation=45)
plt.show()

In [None]:
#Steps by day of the week
# Adds a 'Weekday' column (day name derived from 'Date') to df.
df['Weekday'] = df['Date'].dt.day_name()

# Fix the x-axis order Monday..Sunday instead of order of appearance.
weekday_order = [
    'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'
]
weekday_ax = sns.boxplot(data=df, x='Weekday', y='Steps', order=weekday_order)
weekday_ax.set_title("Steps by Day of the Week")
weekday_ax.set_xlabel("Day of Week")
weekday_ax.set_ylabel("Total Steps")
plt.xticks(rotation=45)
plt.show()

In [None]:
#Sleep vs calories burned
# Use the same file-name casing as every other read of this file in the
# notebook; 'SleepDay_merged.csv' fails on case-sensitive filesystems.
df_sleep = pd.read_csv('sleepDay_merged.csv')
df_sleep['SleepDay'] = pd.to_datetime(df_sleep['SleepDay'])

# .copy() makes df_activity an independent frame, so the column assignment
# below does not raise pandas' SettingWithCopyWarning.
df_activity = df[['Id', 'Date', 'Steps', 'Calories']].copy()
df_activity['Date'] = pd.to_datetime(df_activity['Date'])

# pd.merge defaults to an inner join: only (Id, day) pairs present in both
# the activity frame and the sleep frame are kept.
df_combined = pd.merge(df_activity, df_sleep, left_on=['Id', 'Date'], right_on=['Id', 'SleepDay'])

# Steps drives both the colour and the size of each point.
plt.figure(figsize=(10, 6))
sns.scatterplot(data=df_combined, x='TotalMinutesAsleep', y='Calories',
                hue='Steps', size='Steps', palette='viridis', sizes=(20, 200), alpha=0.7)
plt.title('Sleep vs. Calories Burned with Step Volume')
plt.xlabel('Total Minutes Asleep')
plt.ylabel('Calories Burned')
# Anchor the legend just outside the right edge of the axes.
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, title="Total Steps")
plt.tight_layout()
plt.show()

In [None]:
#Steps trend by user
#Get users at specific positions in table (positions within the unique-Id array)
user_positions = [2, 5, 7, 11]
selected_users = df['Id'].unique()[user_positions]
#Get users by ID value
#selected_users = [1503960366, 1624580081, 1844505072]

# Filter data for selected users (copied so the sample is independent of df)
is_selected = df['Id'].isin(selected_users)
df_sample = df.loc[is_selected].copy()

# Plot: one panel per user, two panels per row
g = sns.FacetGrid(df_sample, col='Id', col_wrap=2, height=4, aspect=1.5)
g.map_dataframe(sns.lineplot, x='Date', y='Steps')
g.set_titles("User ID: {col_name}")
g.set_axis_labels("Date", "Total Steps")
# Leave head-room above the panels for the figure-level title.
g.fig.subplots_adjust(top=0.9)
g.fig.suptitle('Step Trends by User')
plt.show()

In [None]:
#3. Feature Engineering

In [None]:
#Intensity Ratio
# TotalActiveMinutes = very active ('ActiveMinutes') + fairly active + lightly active.
df['TotalActiveMinutes'] = (
    df['ActiveMinutes']
    .add(df['FairlyActiveMinutes'])
    .add(df['LightlyActiveMinutes'])
)
# Share of active time spent in the 'ActiveMinutes' band; NaN results
# (e.g. 0/0 on days with no active minutes) are stored as 0.
df['IntensityRatio'] = df['ActiveMinutes'].div(df['TotalActiveMinutes']).fillna(0)
#df['IntensityRatio']

In [None]:
#Steps per minute of activity
# Division by zero active minutes yields +/-inf or NaN; both are mapped to 0.
df['StepsPerActiveMinute'] = (
    df['Steps']
    .div(df['TotalActiveMinutes'])
    .replace([np.inf, -np.inf], np.nan)
    .fillna(0)
)
#df['StepsPerActiveMinute']

In [None]:
#Sleep efficiency
# Re-read the sleep log, then add the fraction of time in bed spent asleep.
sleep = pd.read_csv('sleepDay_merged.csv')

# NaN ratios are stored as 0.
sleep['SleepEfficiency'] = (
    sleep['TotalMinutesAsleep'].div(sleep['TotalTimeInBed']).fillna(0)
)
#sleep['SleepEfficiency']

In [None]:
#Weight change over time
weight = pd.read_csv('weightLogInfo_merged.csv')

# read_csv leaves 'Date' as text, and text sorts character by character
# rather than chronologically (e.g. a string starting '4/5/' sorts after one
# starting '4/12/'). Parse to datetime first so the per-user diff below
# compares each weigh-in with the chronologically previous one.
# NOTE(review): format is inferred by pandas — confirm it matches the file.
weight['Date'] = pd.to_datetime(weight['Date'])

weight = weight.sort_values(['Id', 'Date'])
# Difference in WeightKg from the same user's previous record
# (NaN for each user's first record).
weight['WeightChange'] = weight.groupby('Id')['WeightKg'].diff()
#weight['WeightChange']


In [None]:
#Days since first record
df['Date'] = pd.to_datetime(df['Date'])

# Earliest Date of each user, broadcast back onto every row of that user.
first_record_date = df.groupby('Id')['Date'].transform('min')
df['DaysSinceStart'] = df['Date'].sub(first_record_date).dt.days
#df['DaysSinceStart']

In [None]:
#Sedentary to active ratio
# IdleMinutes / TotalActiveMinutes, with NaN wherever TotalActiveMinutes is 0
# so a zero denominator never leaves an infinite ratio in the column.
active_minutes = df['TotalActiveMinutes']
df['SedentaryToActiveRatio'] = (
    df['IdleMinutes'].div(active_minutes).where(active_minutes != 0)
)
#df['SedentaryToActiveRatio']
# Filter data: keep ratios below 10 (NaN rows fail the comparison and drop out)
filtered_data = df.loc[df['SedentaryToActiveRatio'].lt(10)]

# Plot
plt.figure(figsize=(8, 5))
ratio_ax = sns.histplot(filtered_data['SedentaryToActiveRatio'], bins=30, kde=True)
ratio_ax.set_title('Distribution of Sedentary to Active Ratio (< 10)')
ratio_ax.set_xlabel('Sedentary / Active Minutes')
ratio_ax.set_ylabel('Frequency')
plt.show()

In [None]:
#Sedentary to active ratio over time for a specific user
# The "specific user" is simply the first Id that appears in df.
user_id = df['Id'].unique()[0]
df_user = df.loc[df['Id'].eq(user_id)]

plt.figure(figsize=(10, 4))
user_ax = sns.lineplot(data=df_user, x='Date', y='SedentaryToActiveRatio')
user_ax.set_title(f'Sedentary to Active Ratio Over Time for User {user_id}')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

In [None]:
#Sedentary to active ratio correlation with calories burnt (all rows, unfiltered)
plt.figure(figsize=(6, 6))
scatter_ax = sns.scatterplot(data=df, x='SedentaryToActiveRatio', y='Calories')
scatter_ax.set_title('Sedentary to Active Ratio vs Calories Burned')
scatter_ax.set_xlabel('Sedentary / Active Minutes')
scatter_ax.set_ylabel('Calories')
plt.show()


In [None]:
# Filter data: same scatter as before, restricted to ratios below 10
filtered_data = df.loc[df['SedentaryToActiveRatio'].lt(10)]

# Plot
plt.figure(figsize=(6, 6))
filtered_ax = sns.scatterplot(data=filtered_data, x='SedentaryToActiveRatio', y='Calories')
filtered_ax.set_title('Sedentary to Active Ratio vs Calories Burned (Ratio < 10)')
filtered_ax.set_xlabel('Sedentary / Active Minutes')
filtered_ax.set_ylabel('Calories')
plt.show()


In [None]:
#Binning
#Create ActivityLevel categories based on SedentaryToActiveRatio
# qcut assigns labels from the lowest-valued bin upwards, so the lowest
# sedentary/active ratios are labelled 'High' and the highest 'Low'.
activity_labels = ['High', 'Medium', 'Low']
df['ActivityLevel'] = pd.qcut(
    df['SedentaryToActiveRatio'],
    q=3,
    labels=activity_labels,
)
#df['ActivityLevel']

In [None]:
#4. Model building and evaluation
#Predicting calories burnt

In [None]:
#Import libraries
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

In [None]:
#Selecting features and target
# Predictor columns fed to every model below.
features = [
    'Steps',
    'ActiveMinutes',
    'FairlyActiveMinutes',
    'LightlyActiveMinutes',
    'IdleMinutes',
    'Distance',
    'SedentaryToActiveRatio',
]

# Column the models learn to predict.
target = 'Calories'

X, y = df[features], df[target]

In [None]:
#Handle missing values
# Replace every NaN in the predictors and in the target with 0.
# NOTE(review): SedentaryToActiveRatio is NaN on days with zero active
# minutes; filling with 0 gives those days the same value as a day with no
# sedentary time — confirm this imputation is intended.
X, y = X.fillna(0), y.fillna(0)

In [None]:
#Train-test split
# 80% train / 20% test; the fixed random_state makes the shuffle repeatable.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [None]:
#Train Linear Regression Model
# fit() returns the fitted estimator, so construction and training chain.
lr_model = LinearRegression().fit(X_train, y_train)
y_pred_lr = lr_model.predict(X_test)

In [None]:
#Evaluate Linear Regression Model
# Both metrics are computed on the held-out test split.
mse_lr, r2_lr = (
    mean_squared_error(y_test, y_pred_lr),
    r2_score(y_test, y_pred_lr),
)

print(f"Linear Regression - Mean Squared Error: {mse_lr:.2f}")
print(f"Linear Regression - R² Score: {r2_lr:.2f}")

In [None]:
#Visualize Predictions vs Actual
plt.figure(figsize=(8, 6))
pred_ax = sns.scatterplot(x=y_test, y=y_pred_lr)
pred_ax.set_xlabel("Actual Calories")
pred_ax.set_ylabel("Predicted Calories")
pred_ax.set_title("Actual vs Predicted Calories - Linear Regression")
# Dashed red y = x reference line: a perfect prediction would fall on it.
calorie_range = [y_test.min(), y_test.max()]
plt.plot(calorie_range, calorie_range, 'r--')
plt.show()

In [None]:
#Train Random Forest Regressor
# Default hyperparameters; random_state pins the forest's randomness.
rf_model = RandomForestRegressor(random_state=42).fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)

In [None]:
#Evaluate Random Forest Regressor
# Both metrics are computed on the held-out test split.
mse_rf, r2_rf = (
    mean_squared_error(y_test, y_pred_rf),
    r2_score(y_test, y_pred_rf),
)

print(f"Random Forest - Mean Squared Error: {mse_rf:.2f}")
print(f"Random Forest - R² Score: {r2_rf:.2f}")

In [None]:
#Compare model performance
# One row per model: (name, test MSE, test R2).
comparison_df = pd.DataFrame(
    [
        ('Linear Regression', mse_lr, r2_lr),
        ('Random Forest', mse_rf, r2_rf),
    ],
    columns=['Model', 'MSE', 'R2 Score'],
)

comparison_df

In [None]:
#Train Decision Tree Model
from sklearn.tree import DecisionTreeRegressor

# Default (unconstrained) tree; random_state pins any tie-breaking randomness.
dt_model = DecisionTreeRegressor(random_state=42).fit(X_train, y_train)
dt_preds = dt_model.predict(X_test)

In [None]:
#Evaluate Decision Tree Model
# Both metrics are computed on the held-out test split.
mse_dt, r2_dt = (
    mean_squared_error(y_test, dt_preds),
    r2_score(y_test, dt_preds),
)

print(f"Decision Tree - Mean Squared Error: {mse_dt:.2f}")
print(f"Decision Tree - R² Score: {r2_dt:.2f}")

In [None]:
#Train XGBoost Regressor
# Requires the xgboost package:
#pip install xgboost
from xgboost import XGBRegressor

# Default hyperparameters; random_state pins the booster's randomness.
xgb_model = XGBRegressor(random_state=42)
xgb_model.fit(X_train, y_train)

xgb_preds = xgb_model.predict(X_test)

In [None]:
#Evaluate XGBoost Model
# Both metrics are computed on the held-out test split.
mse_xgb, r2_xgb = (
    mean_squared_error(y_test, xgb_preds),
    r2_score(y_test, xgb_preds),
)

print(f"XGBoost - Mean Squared Error: {mse_xgb:.2f}")
print(f"XGBoost - R² Score: {r2_xgb:.2f}")

In [None]:
#Compare all model performances
# One row per model: (name, test MSE, test R2).
comparison_df = pd.DataFrame(
    [
        ('Linear Regression', mse_lr, r2_lr),
        ('Random Forest', mse_rf, r2_rf),
        ('Decision Tree', mse_dt, r2_dt),
        ('XGBoost', mse_xgb, r2_xgb),
    ],
    columns=['Model', 'MSE', 'R2 Score'],
)

comparison_df
#Random forest and XGBoost models with lowest MSE and highest R2 -> best performance

In [None]:
#XGBoost model feature importance
# Importances come from the untuned xgb_model, in the column order of X.
importances = xgb_model.feature_importances_
# Use a dedicated name: assigning to `features` here would silently replace
# the notebook-level feature list with a pandas Index.
feature_names = X.columns
# argsort is ascending, so the most important feature ends up as the top bar.
sorted_idx = importances.argsort()

plt.barh(feature_names[sorted_idx], importances[sorted_idx])
plt.title("XGBoost Feature Importance")
plt.show()


In [None]:
#Hyperparameter tuning - improve Decision Tree and XGBoost models
from sklearn.model_selection import GridSearchCV

# Hyperparameter tuning for Decision Tree
# Exhaustive search over 5 * 3 * 3 = 45 combinations, each scored with
# 5-fold cross-validation on the training split only.
dt_params = dict(
    max_depth=[3, 5, 7, 10, None],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

dt_grid = GridSearchCV(
    estimator=DecisionTreeRegressor(random_state=42),
    param_grid=dt_params,
    cv=5,
    scoring='neg_mean_squared_error',  # negated MSE, so larger is better
    n_jobs=-1,                         # use every available CPU core
)
dt_grid.fit(X_train, y_train)

print(f"Best Decision Tree params: {dt_grid.best_params_}")

# Use the best estimator for prediction
best_dt = dt_grid.best_estimator_
y_pred_dt = best_dt.predict(X_test)

# Hyperparameter tuning for XGBoost
# 3 * 3 * 2 * 2 * 2 = 72 combinations, same CV setup as above.
xgb_params = dict(
    max_depth=[3, 5, 7],
    learning_rate=[0.01, 0.1, 0.2],
    n_estimators=[100, 200],
    subsample=[0.8, 1],
    colsample_bytree=[0.8, 1],
)

xgb_grid = GridSearchCV(
    estimator=XGBRegressor(random_state=42, objective='reg:squarederror'),
    param_grid=xgb_params,
    cv=5,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
)
xgb_grid.fit(X_train, y_train)

print(f"Best XGBoost params: {xgb_grid.best_params_}")

# Use the best estimator for prediction
best_xgb = xgb_grid.best_estimator_
y_pred_xgb = best_xgb.predict(X_test)


In [None]:
#Evaluate Decision Tree Model after Hyperparameter tuning
# The `_h` suffix keeps the tuned scores separate from the untuned ones.
mse_dt_h, r2_dt_h = (
    mean_squared_error(y_test, y_pred_dt),
    r2_score(y_test, y_pred_dt),
)

print(f"Decision Tree - Mean Squared Error: {mse_dt_h:.2f}")
print(f"Decision Tree - R² Score: {r2_dt_h:.2f}")

In [None]:
#Evaluate XGBoost Model after Hyperparameter tuning
# Store the tuned scores under `_h` names, as is done for the tuned Decision
# Tree. Reusing mse_xgb / r2_xgb would overwrite the untuned scores, so the
# before/after comparison would be lost and re-running the model-comparison
# cell would silently mix tuned and untuned numbers.
mse_xgb_h = mean_squared_error(y_test, y_pred_xgb)
r2_xgb_h = r2_score(y_test, y_pred_xgb)

print(f"XGBoost - Mean Squared Error: {mse_xgb_h:.2f}")
print(f"XGBoost - R² Score: {r2_xgb_h:.2f}")

#Slightly better evaluation results for both models after hyperparameter tuning

In [None]:
#5 pandas
#more data manipulation

In [None]:
import pandas as pd

# Load data (re-read from disk, so df is back to the original column names)
df = pd.read_csv('dailyActivity_merged.csv')

# Convert date column
df['ActivityDate'] = pd.to_datetime(df['ActivityDate'])

# Rolling average of steps (7-day), computed separately for every user.
# Sorting by date alone interleaves all users' rows, so a 7-row window would
# average rows belonging to different users rather than 7 days of one user.
# The first 6 rows of each user are NaN (window not yet full).
# NOTE(review): a 7-row window equals 7 days only if each user has exactly
# one row per consecutive day — confirm against the data.
df_sorted = df.sort_values(['Id', 'ActivityDate'])
df_sorted['7day_steps_avg'] = (
    df_sorted.groupby('Id')['TotalSteps']
    .transform(lambda user_steps: user_steps.rolling(window=7).mean())
)

# Pivot table: average steps per user
pivot = df.pivot_table(index='Id', values='TotalSteps', aggfunc='mean')

# Merge with sleep data
# Left join: every activity row is kept; sleep columns are NaN where no
# sleep record matches the same user and date.
sleep = pd.read_csv('sleepDay_merged.csv')
sleep['SleepDay'] = pd.to_datetime(sleep['SleepDay'], format='%m/%d/%Y %I:%M:%S %p')
merged = pd.merge(df, sleep, left_on=['Id', 'ActivityDate'], right_on=['Id', 'SleepDay'], how='left')

#df_sorted
#pivot
#merged

In [None]:
#6 numpy

In [None]:
import numpy as np

# Arrays
arr = np.array([1, 2, 3])
zeros = np.zeros(shape=(2, 3))    # 2x3 array filled with 0.0
ones = np.ones(shape=(3, 2))      # 3x2 array filled with 1.0
rand = np.random.rand(2, 2)       # uniform random floats in [0, 1)

print("Array:", arr)
print("Zeros:\n", zeros)
print("Ones:\n", ones)
print("Random:\n", rand)

# Basic metadata of `arr`
print("Shape:", arr.shape)      # tuple of dimension lengths
print("Datatype:", arr.dtype)   # element type
print("Size:", arr.size)        # total number of elements

In [None]:
#Matrix
matrix = np.arange(1, 10).reshape((3, 3))  # 1..9 laid out row by row
flat = matrix.flatten() #matrix in single row (a copy)

print("Matrix:\n", matrix)
print("Flattened:", flat)

print("Second row:", matrix[1]) #slice row
print("Element (2,2):", matrix[1, 1]) #index element

# A bare `matrix + 5` in the middle of a cell is evaluated and discarded
# (only a cell's last expression is rendered), so keep the result and print it.
shifted = matrix + 5  # +5 to each element (broadcasting); matrix is unchanged
print("Plus 5:\n", shifted)

mask = matrix > 5 #masking: boolean array, True where the element exceeds 5
print("Mask:\n", mask)
print("Filtered values:", matrix[mask])

In [None]:
#compare computation speed - squaring numbers
# Both timings square the same 10,000 integers (1..10000); %timeit is an
# IPython line magic, so this cell only runs inside a notebook/IPython kernel.
# Base Python
py_list = list(range(1, 10001))
%timeit [x**2 for x in py_list]

# NumPy
# Same values as an ndarray; `** 2` is applied element-wise in one call.
np_array = np.array(py_list)
%timeit np_array ** 2


In [None]:
#linear algebra
# 2x2 coefficient matrix and right-hand-side vector of the system A @ x = b.
A = np.array([[2, 1],
              [3, 4]])
b = np.array([1, 2])

# Solve Ax = b, and compute the inverse of A
x, inv = np.linalg.solve(A, b), np.linalg.inv(A)

print("Solution x:", x)
print("Inverse:\n", inv)

In [None]:
#simulate 10 coin flips
# Each flip is drawn independently, with 'H' and 'T' equally likely.
coin_sides = ['H', 'T']
coin_flips = np.random.choice(coin_sides, size=10)
print("Flips:", coin_flips)

#simulate a sample of 1000 observations from a normal distribution
# Standard normal: mean 0 (loc), standard deviation 1 (scale).
samples = np.random.normal(0, 1, 1000)
#samples