In [62]:
import pandas as pd
import numpy as np
from sklearn.model_selection import GroupShuffleSplit,GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error,make_scorer
from sklearn.ensemble import RandomForestRegressor
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.optimizers import Adam
import keras_tuner as kt
import warnings
warnings.filterwarnings('ignore')

In [63]:
features = pd.read_csv('data/features.csv')
cleaned = pd.read_csv('cleaned_data.csv')

In [64]:
# Merge the Tmporal Variables in Features with Static Variables from Cleaned Data
cleaned['time'] = pd.to_datetime(cleaned['time'])
cleaned['date'] = cleaned['time'].dt.date

numeric_cols = cleaned.select_dtypes(include='number').columns.tolist()
aggregated = cleaned.groupby(['id', 'date'])[numeric_cols].mean().reset_index()
aggregated = aggregated.rename(columns={'date': 'target_date'})

merged_df = pd.merge(features, aggregated, on=['id', 'target_date'], how='left')

In [65]:
# Drop unwanted columns
appcat_cols = [
    'appCat.builtin', 'appCat.communication', 'appCat.entertainment', 'appCat.finance',
    'appCat.game', 'appCat.office', 'appCat.other', 'appCat.social',
    'appCat.travel', 'appCat.unknown', 'appCat.utilities', 'appCat.weather', 'mood'
]
merged_df = merged_df.drop(columns=[col for col in appcat_cols if col in merged_df.columns])

In [66]:
temporal_features = [
    'mood_day5', 'app_usage_day5', 'screen_day5',
    'mood_day4', 'app_usage_day4', 'screen_day4',
    'mood_day3', 'app_usage_day3', 'screen_day3',
    'mood_day2', 'app_usage_day2', 'screen_day2',
    'mood_day1', 'app_usage_day1', 'screen_day1'
]

In [67]:
features = temporal_features
merged_df = merged_df.dropna(subset=features + ['target_mood'])

In [68]:
#Normalize the Dataset using Standard Scaler 
scaler = StandardScaler()
X_scaled = scaler.fit_transform(merged_df[features])

In [69]:
X_seq = X_scaled.reshape(-1, 5, 3)#Reshape to the number of samples, time steps, feature per step
y_seq = merged_df['target_mood'].values

In [70]:
gss = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_idx, test_idx = next(gss.split(X_seq, y_seq, groups=merged_df['id']))
X_train, X_test = X_seq[train_idx], X_seq[test_idx]
y_train, y_test = y_seq[train_idx], y_seq[test_idx]

#### Hyperparameter Tuning

In [71]:
def build_model(hp): 
    model = Sequential()
    model.add(LSTM(units=hp.Int('units', min_value=32, max_value=128, step=32),
                   input_shape=(X_train.shape[1], X_train.shape[2]),
                   return_sequences=False))
    
    if hp.Boolean('dropout'):
        model.add(Dense(1, activation='linear'))
    else:
        model.add(Dense(1))

    optimizer = Adam(learning_rate=hp.Float('lr', 1e-4, 1e-2, sampling='log'))
    model.compile(optimizer=optimizer, loss=hp.Choice('loss', ['mean_squared_error', 'mean_absolute_error']))
    
    return model

tuner = kt.RandomSearch(
    build_model,
    objective='val_loss',
    max_trials=10,
    executions_per_trial=1,
    overwrite=True,
    directory='tuner_dir',
    project_name='lstm_mood_prediction'
)

In [72]:
tuner.search(X_train, y_train, epochs=10, validation_data=(X_test, y_test), batch_size=32) #Search for Best Model Attributes

Trial 10 Complete [00h 00m 02s]
val_loss: 0.34792351722717285

Best val_loss So Far: 0.33516576886177063
Total elapsed time: 00h 00m 16s


In [73]:
best_model = tuner.get_best_models(num_models=1)[0]
best_trial = tuner.oracle.get_best_trials(num_trials=1)[0]
print("Best Hyperparameters:", best_trial.hyperparameters.values) # Best Model Hyperparameters

Best Hyperparameters: {'units': 128, 'dropout': True, 'lr': 0.007783541188537225, 'loss': 'mean_squared_error'}


In [74]:
y_pred_best = best_model.predict(X_test)
mse_best = mean_squared_error(y_test, y_pred_best)
mae_best = mean_absolute_error(y_test, y_pred_best)

[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step 


In [75]:
print(f"Best Model Error Metrics :")
print(f"MSE: {mse_best:0.4F}")
print(f"MAE: {mae_best:0.4F}")

Best Model Error Metrics :
MSE: 0.3352
MAE: 0.4255


In [76]:
df = pd.read_csv('data/features_with_aggregated_cleaned_data.csv')

In [77]:
df = df.dropna(subset=temporal_features + ['target_mood'])
print(f"Dataset shape after cleaning: {df.shape}")

Dataset shape after cleaning: (1125, 28)


In [78]:
X = df[temporal_features].copy()
y = df['target_mood'].values

In [79]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [80]:
splitter = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_idx, test_idx in splitter.split(X_scaled, groups=df['id']):
    X_train, X_test = X_scaled[train_idx], X_scaled[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]

In [81]:
train_groups = df['id'].iloc[train_idx].values

print(f" Train set: {X_train.shape[0]} samples")
print(f" Test set: {X_test.shape[0]} samples")
print(f" Number of unique participants in train: {np.unique(train_groups).shape[0]}")
print(f" Number of unique participants in test: {np.unique(df['id'].iloc[test_idx]).shape[0]}")

 Train set: 873 samples
 Test set: 252 samples
 Number of unique participants in train: 21
 Number of unique participants in test: 6


In [82]:
def group_cv_iterator(X, y, groups, n_splits=3):
    for i in range(n_splits):
        splitter = GroupShuffleSplit(n_splits=1, test_size=0.25, random_state=i)
        for train_idx, val_idx in splitter.split(X, groups=groups):
            yield train_idx, val_idx

In [83]:
mse_scorer = make_scorer(mean_squared_error, greater_is_better=False)

In [84]:
base_model = RandomForestRegressor(n_estimators=100, random_state=42)
base_model.fit(X_train, y_train)
base_y_pred = base_model.predict(X_test)
base_mse = mean_squared_error(y_test, base_y_pred)
base_mae = mean_absolute_error(y_test, base_y_pred)

In [85]:
print(" Baseline Random Forest Performance:")
print(f" MSE: {base_mse:.4f}")
print(f" MAE: {base_mae:.4f}")

 Baseline Random Forest Performance:
 MSE: 0.3966
 MAE: 0.4649
 RMSE: 0.6298


In [86]:
param_grid = {
    'n_estimators': [50, 100, 150, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2', None]
}# Parameter gird for Searching 

In [87]:
cv = list(group_cv_iterator(X_train, y_train, train_groups, n_splits=3))

# Create and fit GridSearchCV
grid_search = GridSearchCV(
    estimator=RandomForestRegressor(random_state=42),
    param_grid=param_grid,
    scoring=mse_scorer,
    cv=cv,
    n_jobs=-1,
    verbose=1
)

grid_search.fit(X_train, y_train)

Fitting 3 folds for each of 432 candidates, totalling 1296 fits


In [88]:
print(" GridSearchCV results:")
print(f"Best parameters: {grid_search.best_params_}")
print(f"Best CV score: {-grid_search.best_score_:.4f} (MSE)")

 GridSearchCV results:
Best parameters: {'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 4, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.4236 (MSE)


In [89]:
best_grid_model = grid_search.best_estimator_
grid_y_pred = best_grid_model.predict(X_test)
grid_mse = mean_squared_error(y_test, grid_y_pred)
grid_mae = mean_absolute_error(y_test, grid_y_pred)

In [90]:
print(" GridSearchCV best model performance:")
print(f" MSE: {grid_mse:.4f}")
print(f" MAE: {grid_mae:.4f}")

 GridSearchCV best model performance:
 MSE: 0.3411
 MAE: 0.4230
