In [None]:
import pandas as pd

# Load the imputed dataset and write a copy without identifier and entropy columns.
file_path = "combined_data_imputed.csv"
df = pd.read_csv(file_path)

# Columns removed from the modelling table, grouped by the reason they are listed.
identifier_cols = ['Timestamp', 'participant_id']
entropy_cols = [
    'eda_phasic_entropy',
    'acc_acc_y_entropy',
    'acc_acc_z_entropy',
    'acc_acc_x_entropy',
    'eda_tonic_entropy',
]
cols_to_drop = identifier_cols + entropy_cols

# `df` itself is left untouched; `cleaned_df` keeps every row and all remaining columns.
cleaned_df = df.drop(columns=cols_to_drop)

cleaned_path = "cleaned_dataset.csv"
cleaned_df.to_csv(cleaned_path, index=False)
print(f"Cleaned dataset saved as '{cleaned_path}'.")


In [None]:
import os
from sklearn.model_selection import GroupKFold

# Split the cleaned dataset into participant-grouped folds and save each fold to disk.
# `cleaned_df` supplies the rows that are written out; the original `df` is still
# needed because it is the only frame that carries 'participant_id'.

# Every column whose name contains 'slider' is treated as a label; the rest are features.
label_cols = cleaned_df.columns[cleaned_df.columns.str.contains('slider')]
X = cleaned_df.drop(columns=label_cols)
y = cleaned_df[label_cols]

# Group labels come from the original frame. This relies on `df` and `cleaned_df`
# having the same row order (cleaned_df was produced by dropping columns only).
groups = df['participant_id']

# Destination folder for the per-fold CSV files.
fold_dir = "folds"
os.makedirs(fold_dir, exist_ok=True)

# 5-way grouped split: a participant's rows never appear in both train and test.
gkf = GroupKFold(n_splits=5)

for fold, (train_idx, test_idx) in enumerate(gkf.split(X, y, groups=groups), start=1):
    train_path = os.path.join(fold_dir, f"fold_{fold}_train.csv")
    test_path = os.path.join(fold_dir, f"fold_{fold}_test.csv")

    # Write the full cleaned rows (features and labels together) for this fold.
    fold_train = cleaned_df.iloc[train_idx].copy()
    fold_train.to_csv(train_path, index=False)

    fold_test = cleaned_df.iloc[test_idx].copy()
    fold_test.to_csv(test_path, index=False)

    print(f"Saved Fold {fold}: Train -> {train_path} | Test -> {test_path}")


In [None]:
import os
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import GroupKFold

# For every fold: standardise the features, project them with PCA, and save the
# scaled tables, the PCA tables, and the PCA loadings. Uses `gkf`, `X`, `y` and
# `groups` from the fold-creation cell, so that cell must have been run first.

# Create folders for transformed data and PCA mappings
transformed_dir = "folds_transformed"
pca_mappings_dir = "pca_mappings"
os.makedirs(transformed_dir, exist_ok=True)
os.makedirs(pca_mappings_dir, exist_ok=True)

# Number of top contributing features reported per principal component (adjust the 5
# as needed). Clamped to the feature count: asking for more than exist would make the
# Top_1..Top_n header wider than the rows and the DataFrame constructor would raise.
top_n = min(5, X.shape[1])
top_cols = [f"Top_{i+1}" for i in range(top_n)]


def _attach_labels(values, columns, labels):
    """Wrap a transformed feature matrix in a DataFrame and append the label columns.

    The labels' index is reset so they line up positionally with the freshly
    built (0..n-1 indexed) feature frame.
    """
    features = pd.DataFrame(values, columns=columns)
    return pd.concat([features, labels.reset_index(drop=True)], axis=1)


# The split is regenerated with the same arguments as in the fold-creation cell;
# this relies on the splitter producing the same partitions on every call.
for fold, (train_idx, test_idx) in enumerate(gkf.split(X, y, groups=groups), 1):
    X_train = X.iloc[train_idx]
    y_train = y.iloc[train_idx]  # y contains multiple 'slider' columns
    X_test = X.iloc[test_idx]
    y_test = y.iloc[test_idx]

    # Scale the data (fit on training set only, then reuse the fit for the test set)
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Save scaled data as CSV files
    train_scaled_df = _attach_labels(X_train_scaled, X.columns, y_train)
    test_scaled_df = _attach_labels(X_test_scaled, X.columns, y_test)

    train_scaled_path = os.path.join(transformed_dir, f"fold_{fold}_train_scaled.csv")
    test_scaled_path = os.path.join(transformed_dir, f"fold_{fold}_test_scaled.csv")
    train_scaled_df.to_csv(train_scaled_path, index=False)
    test_scaled_df.to_csv(test_scaled_path, index=False)

    # Apply PCA on the scaled data (fit on training set only).
    # A fractional n_components asks PCA to keep enough components for that
    # share of explained variance, so the component count can differ per fold.
    pca = PCA(n_components=0.95)
    X_train_pca = pca.fit_transform(X_train_scaled)
    X_test_pca = pca.transform(X_test_scaled)
    pca_cols = [f"PC{i+1}" for i in range(pca.n_components_)]

    # ------ PCA FEATURE MAPPING ------

    # Loadings table: one row per principal component, one column per original feature.
    pca_loadings = pd.DataFrame(pca.components_, columns=X.columns, index=pca_cols)

    # Save full PCA feature mapping
    pca_mapping_path = os.path.join(pca_mappings_dir, f"fold_{fold}_pca_feature_mapping.csv")
    pca_loadings.to_csv(pca_mapping_path)

    # For each component, list the features with the largest absolute loading.
    abs_loadings = pca_loadings.abs()
    top_features_df = pd.DataFrame(
        [abs_loadings.loc[pc].nlargest(top_n).index.tolist() for pc in abs_loadings.index],
        index=abs_loadings.index,
        columns=top_cols,
    )

    # Save top features per PC
    top_features_path = os.path.join(pca_mappings_dir, f"fold_{fold}_top_features_per_pc.csv")
    top_features_df.to_csv(top_features_path)

    print(f"Fold {fold} PCA feature mapping saved: {pca_mapping_path}")
    print(f"Fold {fold} Top features per PC saved: {top_features_path}")

    # ------ END OF PCA FEATURE MAPPING ------

    # Save the PCA-projected data alongside the labels
    train_pca_df = _attach_labels(X_train_pca, pca_cols, y_train)
    test_pca_df = _attach_labels(X_test_pca, pca_cols, y_test)

    train_pca_path = os.path.join(transformed_dir, f"fold_{fold}_train_pca.csv")
    test_pca_path = os.path.join(transformed_dir, f"fold_{fold}_test_pca.csv")
    train_pca_df.to_csv(train_pca_path, index=False)
    test_pca_df.to_csv(test_pca_path, index=False)

    print(f"Fold {fold} transformed data saved:")
    print(f"  Scaled: Train -> {train_scaled_path}, Test -> {test_scaled_path}")
    print(f"  PCA:    Train -> {train_pca_path}, Test -> {test_pca_path}")


In [None]:
import pandas as pd

# Second export: reload the imputed data and remove only the entropy features.
# 'Timestamp' and 'participant_id' are not in the drop list, so they stay in this file.
# NOTE(review): this cell rebinds `df` and `cleaned_df`. If the cell that builds X from
# `cleaned_df` is re-run after this one, it will see these extra columns - confirm
# the intended execution order.
file_path = "combined_data_imputed.csv"
df = pd.read_csv(file_path)

cols_to_drop = [
    'eda_phasic_entropy',
    'acc_acc_y_entropy',
    'acc_acc_z_entropy',
    'acc_acc_x_entropy',
    'eda_tonic_entropy',
]

# All rows are kept; only the listed columns are removed.
cleaned_df = df.drop(columns=cols_to_drop)

output_name = "cleaned_dataset_0307.csv"
cleaned_df.to_csv(output_name, index=False)
print(f"Cleaned dataset saved as '{output_name}'.")
