In [None]:
!pip install --upgrade dask[complete] matplotlib seaborn pandas numpy scikit-learn

In [2]:
import os

import dask
from dask import delayed, compute
import dask.dataframe as dd
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [3]:
# Load datasets
# Directory containing the feature CSVs. It defaults to the original location
# and can be overridden with the LOSO_DATA_DIR environment variable, so the
# notebook is not tied to one machine's absolute path. os.path.join(..., "")
# guarantees a trailing separator, which the loader relies on because it builds
# file names by plain string concatenation (path + file).
path = os.path.join(os.environ.get("LOSO_DATA_DIR", "/home/work/Final/"), "")

# One feature table per device/sensor pair, processed in this order.
# NOTE(review): the "_a" / "_g" suffixes presumably stand for accelerometer and
# gyroscope -- confirm against the feature-extraction step.
dataset_files = [
    "features_watch_a.csv",
    "features_watch_g.csv",
    "features_phone_a.csv",
    "features_phone_g.csv",
]

In [4]:
def average_classification_reports(reports):
    """Average several ``classification_report(..., output_dict=True)`` dicts.

    Parameters
    ----------
    reports : sequence of dict
        One report per fold. Each maps a key (class label, 'accuracy',
        'macro avg', 'weighted avg', ...) to either a scalar or a dict of
        metric name -> value.

    Returns
    -------
    dict
        A report with the same layout. Keys are the union of the keys of all
        input reports, in first-seen order.

        * Dict-valued entries (per-class rows and the '... avg' rows): every
          metric, including 'support', is the mean over the folds whose report
          contains that key and metric. A class that is missing from some
          folds is therefore averaged over the folds where it does appear.
        * Scalar entries (e.g. 'accuracy'): the mean over all folds, with a
          fold that lacks the key counted as 0.

        An empty ``reports`` sequence yields an empty dict.
    """
    if not reports:
        return {}

    # Walk the union of keys rather than only the first report's keys: a class
    # absent from the first fold would otherwise be reported as all zeros even
    # when later folds contain real metrics for it.
    ordered_keys = list(dict.fromkeys(key for report in reports for key in report))

    avg_report = {}
    for key in ordered_keys:
        entries = [report[key] for report in reports if key in report]
        if isinstance(entries[0], dict):
            # dict.fromkeys keeps first-seen metric order, so the output layout
            # is deterministic (a plain set would not be).
            metric_names = dict.fromkeys(m for entry in entries for m in entry)
            avg_report[key] = {
                metric: np.mean([entry[metric] for entry in entries if metric in entry])
                for metric in metric_names
            }
        else:
            avg_report[key] = np.mean([report.get(key, 0) for report in reports])
    return avg_report

# Accumulator for the final summary table: the LOSO loop appends one
# {"Dataset", "Train Accuracy", "Test Accuracy"} dict per processed file.
summary_results = []

In [None]:
# Leave-one-subject-out (LOSO) evaluation of a random forest on every dataset.
#
# Start from an empty accumulator so that re-running this cell does not append
# duplicate rows to the summary table.
summary_results = []

for file in dataset_files:
    print(f"\n=== Running LOSO on {file} ===")
    df_dd = dd.read_csv(path + file)
    # Materialise as pandas. When dask reads a CSV in several partitions the
    # index restarts at 0 in each one, so rebuild a unique positional index.
    df = df_dd.compute().reset_index(drop=True)

    # Each distinct subject becomes the held-out test set of exactly one fold.
    subject_ids = df['sub_id'].unique()

    # Encode on the full table so a class id means the same thing in every fold.
    label_enc = LabelEncoder()
    df['activity_encoded'] = label_enc.fit_transform(df['activity'])

    # Every column that is not an identifier or the target is used as a feature.
    drop_cols = ['sub_id', 'group_id', 'activity', 'activity_encoded']
    X_all = df.drop(columns=drop_cols)
    y_all = df['activity_encoded']

    n_classes = len(label_enc.classes_)
    n_features = X_all.shape[1]

    # Defined inside the loop on purpose: the fold closes over this dataset's
    # df / X_all / y_all / n_classes instead of receiving them as arguments.
    @delayed
    def run_loso_fold(test_sub):
        """Train on every subject except ``test_sub`` and evaluate on ``test_sub``.

        Returns a tuple ``(train_acc, test_acc, cm, report, feature_importances)``
        where ``cm`` is the test confusion matrix over all ``n_classes`` labels,
        ``report`` is the dict form of the test classification report and
        ``feature_importances`` comes from the fitted forest.
        """
        train_mask = df['sub_id'] != test_sub
        test_mask = df['sub_id'] == test_sub

        X_train = X_all.loc[train_mask]
        y_train = y_all.loc[train_mask]
        X_test = X_all.loc[test_mask]
        y_test = y_all.loc[test_mask]

        # Fit the scaler on the training subjects only, then apply it to the
        # held-out subject, so no test statistics leak into training.
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)

        # NOTE(review): dask may run several folds at once while each forest
        # also requests all cores (n_jobs=-1); if the machine ends up
        # oversubscribed, consider n_jobs=1 here -- confirm by profiling.
        rf = RandomForestClassifier(n_estimators=100, max_depth=5, min_samples_leaf=5, min_samples_split=5, random_state=42, n_jobs=-1)
        rf.fit(X_train_scaled, y_train)

        y_train_pred = rf.predict(X_train_scaled)
        y_test_pred = rf.predict(X_test_scaled)

        train_acc = accuracy_score(y_train, y_train_pred)
        test_acc = accuracy_score(y_test, y_test_pred)

        # Fix the label set so every fold yields an (n_classes, n_classes)
        # matrix, even when the held-out subject did not perform every activity.
        cm = confusion_matrix(y_test, y_test_pred, labels=range(n_classes))
        report = classification_report(y_test, y_test_pred, output_dict=True, zero_division=0)
        feature_importances = rf.feature_importances_

        return train_acc, test_acc, cm, report, feature_importances

    # Build one lazy task per subject, run them together, then regroup the
    # per-fold tuples into one sequence per returned quantity.
    results = [run_loso_fold(test_sub) for test_sub in subject_ids]
    train_accs, test_accs, cms, reports, feature_imps = zip(*compute(*results))

    avg_train_acc = np.mean(train_accs)
    avg_test_acc = np.mean(test_accs)
    conf_matrix_sum = np.sum(cms, axis=0)
    avg_feature_importances = np.mean(feature_imps, axis=0)
    # Row-normalise: each row (true class) sums to 1.
    conf_matrix_avg = conf_matrix_sum / conf_matrix_sum.sum(axis=1, keepdims=True)

    summary_results.append({
        "Dataset": file,
        "Train Accuracy": avg_train_acc,
        "Test Accuracy": avg_test_acc
    })

    avg_report = average_classification_reports(reports)

    print(f"Average Train Accuracy: {avg_train_acc:.3f}")
    print(f"Average Test Accuracy:  {avg_test_acc:.3f}")
    print("\nAverage Confusion Matrix (Normalized by True Class):")
    print(conf_matrix_avg)
    print("\nAverage Confusion Matrix (Raw counts):")
    print(conf_matrix_sum)

    plt.figure(figsize=(10, 8))
    sns.heatmap(conf_matrix_sum, annot=True, fmt='d',
                xticklabels=label_enc.classes_, yticklabels=label_enc.classes_)
    plt.title(f"Confusion Matrix (Raw Counts) - {file}")
    plt.xlabel("Predicted Label")
    plt.ylabel("True Label")
    plt.tight_layout()
    plt.show()

    plt.figure(figsize=(10, 8))
    sns.heatmap(conf_matrix_avg, annot=True, fmt='.2f',
                xticklabels=label_enc.classes_, yticklabels=label_enc.classes_)
    plt.title(f"Confusion Matrix (Normalized by True Class) - {file}")
    plt.xlabel("Predicted Label")
    plt.ylabel("True Label")
    plt.tight_layout()
    plt.show()

    print("\nAverage Classification Report:")
    # Report keys are the encoded class ids as strings; map them back to the
    # original activity names for display.
    for label_idx, label_name in enumerate(label_enc.classes_):
        pr = avg_report[str(label_idx)]
        print(f"{label_name}: Precision: {pr['precision']:.3f}, Recall: {pr['recall']:.3f}, F1-score: {pr['f1-score']:.3f}")

    print(f"\nOverall Accuracy: {avg_report['accuracy']:.3f}")

    feature_names = X_all.columns
    indices = np.argsort(avg_feature_importances)[::-1]  # most important first
    # Show at most 45 bars, but never more than there are features: with a
    # fixed 45, plt.bar / plt.xticks fail with a length mismatch on datasets
    # that have fewer feature columns.
    top_n = min(45, n_features)

    plt.figure(figsize=(12, 6))
    plt.title(f"Average Feature Importances (Random Forest LOSO) - {file}")
    plt.bar(range(top_n), avg_feature_importances[indices[:top_n]], align='center')
    plt.xticks(range(top_n), feature_names[indices[:top_n]], rotation=90)
    plt.tight_layout()
    plt.show()


In [6]:
# Gather the per-dataset accuracy rows into one table and print it, without the
# index column, under a heading.
summary_df = pd.DataFrame(data=summary_results)
print(
    "\n=== Summary of Train and Test Accuracies ===",
    summary_df.to_string(index=False),
    sep="\n",
)



=== Summary of Train and Test Accuracies ===
             Dataset  Train Accuracy  Test Accuracy
features_watch_a.csv        0.624856       0.625979
features_watch_g.csv        0.591107       0.557044
features_phone_a.csv        0.408191       0.314701
features_phone_g.csv        0.386601       0.327061
