In [None]:
import pandas as pd
import numpy as np
import os
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

# ------------- CONFIGURABLE PARTS -------------
# strptime format used by parse_time() for the segment Start/End values and
# for the 'ftime' column of each EEG file (hours and minutes only).
TIME_FORMAT = "%H:%M"

# Custom mapping for activity labels
def get_activity_label(task_type, file_num):
    """Map a (task type, file number) pair to its activity label.

    Task type 1 is always "Python". For task types 2 and 3, files 1 and 3
    are "Python"; any other file is "C" (type 2) or "Writing" (type 3).
    Every other task type yields "Unknown".
    """
    # Guard clause: anything outside the three known task types.
    if task_type not in (1, 2, 3):
        return "Unknown"

    # Type 1 is Python regardless of file; types 2/3 only for files 1 and 3.
    if task_type == 1 or file_num in (1, 3):
        return "Python"

    # Remaining files belong to the secondary activity of the task type.
    if task_type == 2:
        return "C"
    return "Writing"

# Path settings
# Directory with one sub-folder per participant; each is expected to hold an "eeg.csv".
PARTICIPANT_FOLDER = "participants" # path containing all participant EEG data folders
# CSV listing the labelled segments (columns: Participant, Type, File, Start, End).
SEGMENT_FILE = "segments.csv"       # segment definitions read by the main analysis

# ------------- FEATURE EXTRACTION -------------
def extract_features_for_segment(eeg_df):
    """Compute ten summary statistics for one block of samples.

    Args:
        eeg_df: Array-like of numeric samples. The caller in this file
            passes a NumPy array of one channel with NaNs already dropped.

    Returns:
        A list of ten values in fixed order: mean, standard deviation,
        minimum, maximum, 25th percentile, 75th percentile, median,
        peak-to-peak range, mean of the first differences, and variance.
        For empty input every entry is NaN, so the vector length stays
        constant. When there are no first differences (e.g. a single
        sample) the mean-difference entry is NaN.
    """
    samples = np.asarray(eeg_df)

    if samples.size == 0:
        # np.min / np.max / np.ptp raise ValueError on empty input; return a
        # same-length NaN vector so callers can detect and filter it.
        return [np.nan] * 10

    # A single sample has no successive differences; avoid NumPy's
    # "Mean of empty slice" warning and report NaN explicitly.
    steps = np.diff(samples)
    mean_step = np.mean(steps) if steps.size else np.nan

    return [
        np.mean(samples), np.std(samples), np.min(samples), np.max(samples),
        np.percentile(samples, 25), np.percentile(samples, 75),
        np.median(samples), np.ptp(samples), mean_step,
        np.var(samples)
    ]

def parse_time(tstr):
    """Parse a clock-time string into a ``datetime`` using TIME_FORMAT.

    Leading and trailing whitespace is ignored. The results are only
    compared against each other, so the times are assumed to fall within
    one day (no midnight crossing) and to be written as HH:MM.
    """
    cleaned = tstr.strip()
    parsed = datetime.strptime(cleaned, TIME_FORMAT)
    return parsed

def extract_all_segment_features(segment_df):
    """Build one feature vector per labelled segment across all participants.

    For each participant found in ``segment_df`` the file
    ``<PARTICIPANT_FOLDER>/<participant>/eeg.csv`` is loaded and its
    ``ftime`` column parsed with ``parse_time``. Each segment row then
    selects the samples with ``Start <= ftime < End``, and the statistics
    from ``extract_features_for_segment`` for every non-``ftime`` column are
    concatenated into a single vector.

    Participants whose file is missing or has no ``ftime`` column are
    skipped with a printed message. Segments for which some channel has no
    samples in the window are also skipped with a message, instead of
    aborting the whole extraction.

    Args:
        segment_df: DataFrame with the columns ``Participant``, ``Type``,
            ``File``, ``Start`` and ``End``.

    Returns:
        A tuple ``(features, labels, meta)``: ``features`` is an array with
        one row per kept segment, ``labels`` is the array of activity labels
        from ``get_activity_label``, and ``meta`` is a list of
        ``(participant, task_type, file_num, start, end)`` tuples in the
        same order.
    """
    features = []
    labels = []
    meta = []
    all_participants = segment_df['Participant'].unique()
    for participant in all_participants:
        part_seg = segment_df[segment_df['Participant'] == participant]
        eeg_file = os.path.join(PARTICIPANT_FOLDER, str(participant), "eeg.csv") # update if naming differs!
        if not os.path.exists(eeg_file):
            print(f"File not found: {eeg_file}")
            continue
        eeg_data = pd.read_csv(eeg_file)
        if 'ftime' not in eeg_data.columns:
            print(f"'ftime' column not found in {eeg_file}")
            continue
        eeg_data['ftime'] = eeg_data['ftime'].map(parse_time)

        # The channel list is the same for every segment of this participant,
        # so compute it once instead of once per row.
        channel_cols = [col for col in eeg_data.columns if col != 'ftime']

        for _, row in part_seg.iterrows():
            task_type = int(row['Type'])
            file_num = int(row['File'])
            start = parse_time(row['Start'])
            end = parse_time(row['End'])
            activity = get_activity_label(task_type, file_num)
            # Select data in [start, end)
            window = eeg_data[(eeg_data['ftime'] >= start) & (eeg_data['ftime'] < end)]

            # Concatenate the per-channel statistics into one vector.
            chan_feats = []
            for col in channel_cols:
                samples = window[col].dropna().values
                if samples.size == 0:
                    # No usable samples (empty window or all-NaN channel):
                    # the statistics are undefined, so drop this segment
                    # rather than crash or emit a partial vector.
                    print(
                        f"No samples for participant {participant}, channel '{col}' "
                        f"in [{row['Start']}, {row['End']}); skipping segment"
                    )
                    break
                chan_feats.extend(extract_features_for_segment(samples))
            else:
                # Reached only when every channel contributed features.
                features.append(chan_feats)
                labels.append(activity)
                meta.append((participant, task_type, file_num, start, end))
    return np.array(features), np.array(labels), meta

# ------------- MAIN ANALYSIS -------------
# 1. Load segmentation file
segments = pd.read_csv(SEGMENT_FILE)  # columns: Participant, Type, File, Start, End

# 2. Extract features for all participants/tasks/segments
X, y, meta = extract_all_segment_features(segments)
print(f"Feature shape: {X.shape}, Labels shape: {y.shape}")

# 3. t-SNE dimensionality reduction and plot
# t-SNE needs a 2-D (n_samples, n_features) matrix with at least two rows;
# fail early with a clear message instead of an opaque error from scikit-learn.
if X.ndim != 2 or X.shape[0] < 2:
    raise ValueError(
        f"Need at least two equally sized feature vectors for t-SNE, got array of shape {X.shape}"
    )

# scikit-learn requires perplexity < n_samples; keep the default of 30 when
# there is enough data and shrink it for small datasets.
perplexity = min(30, X.shape[0] - 1)
tsne = TSNE(n_components=2, random_state=123, perplexity=perplexity, init='pca')
X_embedded = tsne.fit_transform(X)

# Collect the embedding and labels in one frame for seaborn.
df_vis = pd.DataFrame({
    'tSNE1': X_embedded[:,0],
    'tSNE2': X_embedded[:,1],
    'Task': y
})

plt.figure(figsize=(8,6))
sns.scatterplot(data=df_vis, x="tSNE1", y="tSNE2", hue="Task", s=80, alpha=0.75)
plt.title("t-SNE Projection of EEG Features for Python, C, and Writing")
plt.xlabel("t-SNE Dimension 1")
plt.ylabel("t-SNE Dimension 2")
plt.legend(title="Task", fontsize=12)
plt.tight_layout()
plt.show()
