In [5]:
print("--- [STEP 1] Starting Python 3.11 setup (This takes ~1 minute) ---")

# 1. Install Python 3.11 and its development libraries
!sudo apt-get update -y > /dev/null
!sudo apt-get install python3.11 python3.11-dev -y > /dev/null

# 2. Set Python 3.11 as the default 'python3'
!sudo update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.11 1 > /dev/null

# 3. Install pip for Python 3.11
!sudo apt-get install python3.11-distutils -y > /dev/null
!curl -sS https://bootstrap.pypa.io/get-pip.py | python3.11 > /dev/null

# 4. Install PyCaret 3.3.2 (the latest stable version) using the new Python 3.11
# We also install openpyxl for Task D
print("Installing PyCaret 3.3.2, mlflow, and openpyxl... (This may take a moment)")
# FIX: Added 'mlflow' to the install list to prevent ImportError
!python3.11 -m pip install pycaret[full]==3.3.2 openpyxl mlflow > /dev/null

print("--- Python 3.11 setup complete. ---")
!python3.11 --version

print("\n\n" + "="*70)
print("  >>>>> IMPORTANT: GO TO 'Runtime' -> 'Restart Session' NOW <<<<<  ")
print("="*70 + "\n\n")


--- [STEP 1] Starting Python 3.11 setup (This takes ~1 minute) ---
W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)
Installing PyCaret 3.3.2, mlflow, and openpyxl... (This may take a moment)
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
mizani 0.13.5 requires pandas>=2.2.0, but you have pandas 2.1.4 which is incompatible.
plotnine 0.14.6 requires matplotlib>=3.8.0, but you have matplotlib 3.7.5 which is incompatible.
plotnine 0.14.6 requires pandas>=2.2.0, but you have pandas 2.1.4 which is incompatible.
tensorflow 2.18.0 requires numpy<2.1.0,>=1.26.0, but you have numpy 1.25.2 which is incompatible.
thinc 8.3.6 requires numpy<3.0.0,>=2.0.0, but you have numpy 1.25.2 which is incompatible.
tsfresh 0.21.0 requires sci

In [3]:
# 1. --- CRITICAL VALIDATION ---
# Check the Python version *after* restart.
import sys
print(f"--- Running Python Version: {sys.version_info.major}.{sys.version_info.minor} ---")
if not (sys.version_info.major == 3 and sys.version_info.minor == 11):
    print("="*70)
    print("  >>>>> ERROR: PYTHON 3.11 IS NOT ACTIVE. <<<<<  ")
    print("  Please re-run STEP 1, then RESTART SESSION, then run this cell.")
    print("="*70)
else:
    print("SUCCESS: Python 3.11 is active.")

    # 2. --- Import Libraries (FIXED) ---
    import pandas as pd
    from google.colab import drive
    import os

    # Import PyCaret modules with aliases
    import pycaret.classification as pc
    import pycaret.regression as pr
    import pycaret.clustering as pclub
    import pycaret.anomaly as pa
    import pycaret.time_series as pts

    print("PyCaret modules imported successfully.")

    # 3. --- Mount Google Drive & Load Data ---
    print("Mounting Google Drive...")
    drive.mount('/content/drive', force_remount=True)

    # This is the verified, correct path from our debugging
    DATA_PATH = '/content/drive/MyDrive/fitness_dataset.csv'

    try:
        data = pd.read_csv(DATA_PATH, parse_dates=['timestamp'])
        print("Data loaded successfully:")
        print(data.head())
    except Exception as e:
        print(f"FATAL ERROR: Could not load data from Drive: {e}")
        print(f"Please ensure '{DATA_PATH}' is the correct path.")

    # ----------------------------------------------------------------------
    # [TASK A-1] Running Binary Classification
    # ----------------------------------------------------------------------
    print("\n--- [TASK A-1] Running Binary Classification ---")
    data_binary = data.copy()
    data_binary['high_calorie_burn'] = (data_binary['calories_burned'] > 300).astype(int)
    data_binary = data_binary.drop(columns=['record_id', 'timestamp', 'user_id', 'calories_burned'])

    clf1 = pc.setup(data=data_binary,
                    target='high_calorie_burn',
                    session_id=123,
                    log_experiment=False,
                    verbose=False)

    best_binary = pc.compare_models(n_select=1, verbose=False)
    tuned_binary = pc.tune_model(best_binary, optimize='AUC', verbose=False)

    pc.plot_model(tuned_binary, plot='auc', save=True)
    pc.plot_model(tuned_binary, plot='confusion_matrix', save=True)

    try:
        pc.plot_model(tuned_binary, plot='feature', save=True)
    except TypeError:
        print("Note: Feature Importance plot is not available for the selected model.")

    pc.save_model(tuned_binary, 'high_calorie_model')
    print("--- Binary Classification Complete ---")


    # ----------------------------------------------------------------------
    # [TASK A-2] Running Multiclass Classification
    # ----------------------------------------------------------------------
    print("\n--- [TASK A-2] Running Multiclass Classification ---")
    data_multi = data.copy()
    data_multi = data_multi.drop(columns=['record_id', 'timestamp', 'user_id'])

    clf2 = pc.setup(data=data_multi,
                    target='activity_type',
                    session_id=124,
                    log_experiment=False,
                    verbose=False)

    best_multi = pc.compare_models(n_select=1, verbose=False)
    tuned_multi = pc.tune_model(best_multi, optimize='Accuracy', verbose=False)

    pc.plot_model(tuned_multi, plot='confusion_matrix', save=True)

    try:
        pc.plot_model(tuned_multi, plot='feature', save=True)
    except TypeError:
        print("Note: Feature Importance plot is not available for the selected model.")

    pc.save_model(tuned_multi, 'activity_type_model')
    print("--- Multiclass Classification Complete ---")


    # ----------------------------------------------------------------------
    # [TASK A-3] Running Regression
    # ----------------------------------------------------------------------
    print("\n--- [TASK A-3] Running Regression ---")
    data_reg = data.copy()
    data_reg = data_reg.drop(columns=['record_id', 'timestamp', 'user_id'])

    reg1 = pr.setup(data=data_reg,
                    target='calories_burned',
                    session_id=125,
                    log_experiment=False,
                    verbose=False)

    best_reg = pr.compare_models(n_select=1, verbose=False)
    tuned_reg = pr.tune_model(best_reg, optimize='MAE', verbose=False)

    pr.plot_model(tuned_reg, plot='residuals', save=True)
    pr.plot_model(tuned_reg, plot='error', save=True)

    try:
        pr.plot_model(tuned_reg, plot='feature', save=True)
    except TypeError:
        print("Note: Feature Importance plot is not available for the selected model.")

    pr.save_model(tuned_reg, 'calorie_regression_model')
    print("--- Regression Complete ---")


    # ----------------------------------------------------------------------
    # [TASK B] Running Clustering
    # ----------------------------------------------------------------------
    print("\n--- [TASK B] Running Clustering ---")
    data_clu = data.copy()
    data_clu = data_clu.drop(columns=['record_id', 'timestamp', 'user_id', 'activity_type'])

    clu1 = pclub.setup(data=data_clu,
                       session_id=126,
                       log_experiment=False,
                       verbose=False)

    kmeans = pclub.create_model('kmeans', num_clusters=4, verbose=False)
    pclub.assign_model(kmeans)

    pclub.plot_model(kmeans, plot='cluster', save=True)
    pclub.plot_model(kmeans, plot='elbow', save=True)
    print("--- Clustering Complete ---")


    # ----------------------------------------------------------------------
    # [TASK C] Running Anomaly Detection
    # ----------------------------------------------------------------------
    print("\n--- [TASK C] Running Anomaly Detection ---")
    data_anom = data.copy()
    data_anom = data_anom.drop(columns=['record_id', 'timestamp', 'user_id', 'activity_type'])

    anom1 = pa.setup(data=data_anom,
                     session_id=127,
                     log_experiment=False,
                     verbose=False)

    iforest = pa.create_model('iforest', contamination=0.05, verbose=False)
    pa.assign_model(iforest)

    pa.plot_model(iforest, plot='umap', save=True)
    print("--- Anomaly Detection Complete ---")


    # ----------------------------------------------------------------------
    # [TASK E] Running Time Series Forecasting
    # ----------------------------------------------------------------------
    print("\n--- [TASK E] Running Time Series Forecasting ---")

    # --- Prepare Data for Time Series ---
    ts_data = data.set_index('timestamp').resample('D').agg({
        'steps': 'sum',
        'duration_min': 'sum'
    })
    ts_data = ts_data.asfreq('D').fillna(0) # Fill missing days with 0
    print("Time Series Data (Daily):")
    print(ts_data.head())

    # --- Task E-1: Univariate without Exogenous Variables ---
    print("\n--- [TASK E-1] Running Univariate Forecast (No Exog) ---")

    setup_ts_no_exog = pts.setup(data=ts_data['steps'],
                                 fh=7,  # Forecast 7 days ahead
                                 session_id=129,
                                 log_experiment=False,
                                 verbose=False)

    best_ts_no_exog = pts.compare_models(n_select=1, verbose=False)
    pts.plot_model(best_ts_no_exog, plot='forecast', save=True)
    print("--- Univariate Forecast Complete ---")

    # --- Task E-2: Univariate with Exogenous Variables ---
    print("\n--- [TASK E-2] Running Univariate Forecast (With Exog) ---")

    setup_ts_exog = pts.setup(data=ts_data,
                              target='steps',
                              fh=7,
                              session_id=130,
                              log_experiment=False,
                              verbose=False)

    best_ts_exog = pts.compare_models(n_select=1, verbose=False)
    pts.plot_model(best_ts_exog, plot='forecast', save=True)
    print("--- Univariate Forecast with Exogenous Complete ---")


    print("\n--- Step 2 (Tasks A, B, C, E) Execution Complete ---")

--- Running Python Version: 3.11 ---
SUCCESS: Python 3.11 is active.
PyCaret modules imported successfully.
Mounting Google Drive...
Mounted at /content/drive
Data loaded successfully:
   record_id           timestamp user_id  age  gender      activity_type  \
0          1 2025-07-22 09:22:54    U026   67    Male            Cycling   
1          2 2025-05-28 13:53:54    U039   16  Female  Strength Training   
2          3 2025-08-07 01:36:54    U046   62  Female            Cycling   
3          4 2025-07-17 01:56:54    U048   56  Female           Swimming   
4          5 2025-08-11 03:22:54    U042   42   Other               Yoga   

   duration_min  calories_burned  heart_rate_avg  steps  sleep_hours  \
0             7              322             175   3591          7.0   
1            83              265             113  22312          5.7   
2           118              508              98   5418          6.6   
3            75              270             150  19809          7.9  

Transformation Pipeline and Model Successfully Saved
--- Binary Classification Complete ---

--- [TASK A-2] Running Multiclass Classification ---


Note: Feature Importance plot is not available for the selected model.
Transformation Pipeline and Model Successfully Saved
--- Multiclass Classification Complete ---

--- [TASK A-3] Running Regression ---


Transformation Pipeline and Model Successfully Saved
--- Regression Complete ---

--- [TASK B] Running Clustering ---


--- Clustering Complete ---

--- [TASK C] Running Anomaly Detection ---


--- Anomaly Detection Complete ---

--- [TASK E] Running Time Series Forecasting ---
Time Series Data (Daily):
            steps  duration_min
timestamp                      
2025-04-24  11118           149
2025-04-25  26158           197
2025-04-26      0             0
2025-04-27  37303           209
2025-04-28  11255            23

--- [TASK E-1] Running Univariate Forecast (No Exog) ---
--- Univariate Forecast Complete ---

--- [TASK E-2] Running Univariate Forecast (With Exog) ---
--- Univariate Forecast with Exogenous Complete ---

--- Step 2 (Tasks A, B, C, E) Execution Complete ---
