In [4]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import joblib
import os
import pandas as pd
import numpy as np

# Directory paths
data_dir = "5fold_splits"
processed_data_dir = "5fold_splits_pca"
os.makedirs(processed_data_dir, exist_ok=True)

# Define target aspects (carried through unchanged; never fed to the scaler or PCA)
target_aspects = ["Positive_Emotions", "Negative_Emotions", "Self_Esteem", "Meaning_in_Life", "Social_Support"]

# Derive the feature columns from the first training fold: everything that is
# neither the participant identifier nor a target. The same column list is used
# to index every other fold file, so a fold lacking one of them raises KeyError.
first_train_file = os.path.join(data_dir, "train_fold_0.csv")
first_train_df = pd.read_csv(first_train_file)
feature_cols = [col for col in first_train_df.columns if col not in ["participant_id"] + target_aspects]


def _to_pca_frame(pca_values, source_df):
    """Wrap PCA scores in a DataFrame and re-attach targets and participant IDs.

    Parameters
    ----------
    pca_values : 2-D array returned by ``PCA.transform``; one column per component.
    source_df : DataFrame the scores were computed from; supplies the
        ``target_aspects`` and ``participant_id`` columns, row for row.

    Returns
    -------
    DataFrame with columns ``PC1..PCk``, then the target columns, then
    ``participant_id``.
    """
    pc_names = [f"PC{i+1}" for i in range(pca_values.shape[1])]
    pca_df = pd.DataFrame(pca_values, columns=pc_names)
    # reset_index so the columns line up with the fresh 0..n-1 index of pca_df
    pca_df[target_aspects] = source_df[target_aspects].reset_index(drop=True)
    pca_df["participant_id"] = source_df["participant_id"].reset_index(drop=True)
    return pca_df


# Fit the scaler and PCA separately for every fold, on that fold's training
# split only. Fitting once on train_fold_0 and reusing it everywhere leaks
# information: assuming standard k-fold splits, train_fold_0 contains the rows
# that make up the test sets of folds 1-4, so those test rows would shape the
# scaling statistics and principal axes they are later evaluated with.
for fold in range(5):
    train_file = os.path.join(data_dir, f"train_fold_{fold}.csv")
    test_file = os.path.join(data_dir, f"test_fold_{fold}.csv")

    train_df = pd.read_csv(train_file)
    test_df = pd.read_csv(test_file)

    # Standardize with statistics learned from this fold's training rows
    fold_scaler = StandardScaler()
    train_scaled = fold_scaler.fit_transform(train_df[feature_cols])
    test_scaled = fold_scaler.transform(test_df[feature_cols])

    # Keep 95% variance. Because the fit is per fold, the number of retained
    # components (and therefore of PC columns) may differ between folds.
    fold_pca = PCA(n_components=0.95)
    fold_pca.fit(train_scaled)
    train_pca = fold_pca.transform(train_scaled)
    test_pca = fold_pca.transform(test_scaled)

    # Persist the fitted transformers of this fold next to its data
    joblib.dump(fold_pca, os.path.join(processed_data_dir, f"pca_model_fold_{fold}.pkl"))
    joblib.dump(fold_scaler, os.path.join(processed_data_dir, f"scaler_fold_{fold}.pkl"))

    if fold == 0:
        # Backward compatibility: keep the original artifact names and the
        # `scaler` / `pca` notebook variables bound to the fold-0 fit.
        scaler, pca = fold_scaler, fold_pca
        joblib.dump(pca, os.path.join(processed_data_dir, "pca_model.pkl"))
        joblib.dump(scaler, os.path.join(processed_data_dir, "scaler.pkl"))
        print("Saved PCA and Scaler models.")

    # Convert back to DataFrame and add target variables and participant IDs back
    train_pca_df = _to_pca_frame(train_pca, train_df)
    test_pca_df = _to_pca_frame(test_pca, test_df)

    # Save processed files
    train_pca_file = os.path.join(processed_data_dir, f"train_fold_{fold}_pca.csv")
    test_pca_file = os.path.join(processed_data_dir, f"test_fold_{fold}_pca.csv")

    train_pca_df.to_csv(train_pca_file, index=False)
    test_pca_df.to_csv(test_pca_file, index=False)

    print(f"Saved PCA processed files: {train_pca_file} and {test_pca_file}")


Saved PCA and Scaler models.
Saved PCA processed files: 5fold_splits_pca/train_fold_0_pca.csv and 5fold_splits_pca/test_fold_0_pca.csv
Saved PCA processed files: 5fold_splits_pca/train_fold_1_pca.csv and 5fold_splits_pca/test_fold_1_pca.csv
Saved PCA processed files: 5fold_splits_pca/train_fold_2_pca.csv and 5fold_splits_pca/test_fold_2_pca.csv
Saved PCA processed files: 5fold_splits_pca/train_fold_3_pca.csv and 5fold_splits_pca/test_fold_3_pca.csv
Saved PCA processed files: 5fold_splits_pca/train_fold_4_pca.csv and 5fold_splits_pca/test_fold_4_pca.csv
