In [2]:
import pandas as pd
import os
from sklearn.feature_selection import VarianceThreshold

# Set the path to the email_ano folder
folder_path = r'C:\Users\karun\OneDrive\Documents\RIK\data\TWOS-dataset\email_ano'

# One DataFrame per successfully parsed CSV; concatenated after the loop.
all_data = []

# Loop through each CSV file in the folder.
# os.listdir() returns entries in arbitrary order, so sort (case-insensitively)
# to make the row order of the combined frame reproducible between runs.
for file in sorted(os.listdir(folder_path), key=str.lower):
    # splitext removes only the trailing extension, whereas
    # str.replace('.csv', '') would also strip '.csv' from the middle of a name.
    user_id, extension = os.path.splitext(file)  # e.g., 'user1', 'technical'
    if extension.lower() != '.csv':
        continue

    file_path = os.path.join(folder_path, file)

    # Deliberately best-effort: a file that cannot be parsed is reported and
    # skipped so that one bad file does not abort the whole run.
    try:
        df = pd.read_csv(
            file_path,
            delimiter='|',
            encoding='utf-8',
            engine='python',
            on_bad_lines='skip'  # Skip problematic rows
        )

        # Strip header padding *before* adding 'user', so a padded ' user '
        # header in the file cannot end up as a second, duplicate column.
        df.columns = df.columns.str.strip()
        df['user'] = user_id
        all_data.append(df)

        print(f"Loaded {file} successfully.")

    except Exception as e:
        print(f"Failed to load {file}: {e}")

# pd.concat() on an empty list fails with an unhelpful "No objects to
# concatenate"; report the actual problem instead.
if not all_data:
    raise ValueError(f"No CSV files could be loaded from {folder_path}")

# Combine all user files into a single DataFrame
email_df = pd.concat(all_data, ignore_index=True)

# Drop email body if not needed
email_df = email_df.drop(columns=['body_anon'], errors='ignore')

# Convert labels to binary.
# The raw values are normalised (whitespace stripped, upper-cased) before the
# lookup so that e.g. ' normal ' is still recognised; with an exact-match
# lookup such values would silently turn into NaN.
label_map = {'NORMAL': 0, 'ANOMALY': 1}
email_df['label'] = (
    email_df['label'].astype(str).str.strip().str.upper().map(label_map)
)
n_unmapped = int(email_df['label'].isna().sum())
if n_unmapped:
    print(f"Warning: {n_unmapped} rows have a missing or unrecognised label (left as NaN).")

# Drop zero-variance columns
print("Removing zero-variance columns...")

# Metadata columns that are kept as-is and excluded from the variance filter.
# Only the ones actually present are used, so the split below and the
# reconstruction further down always agree (e.g. when there is no 'id' column).
meta_cols = [col for col in ('user', 'id', 'label') if col in email_df.columns]
features = email_df.drop(columns=meta_cols)

selector = VarianceThreshold(threshold=0.0)
selector.fit(features)
support_mask = selector.get_support()
selected_columns = features.columns[support_mask]
# Names of the feature columns removed by the variance filter.
constant_cols = list(features.columns[~support_mask])

# Reconstruct DataFrame: metadata columns first, then the surviving features.
email_df_cleaned = (
    email_df[meta_cols + list(selected_columns)].reset_index(drop=True)
)

# Save the cleaned DataFrame to CSV
output_path = r'C:\Users\karun\OneDrive\Documents\RIK\outputs\twos_emails_liwc.csv'
# to_csv() does not create missing parent directories.
os.makedirs(os.path.dirname(output_path), exist_ok=True)
email_df_cleaned.to_csv(output_path, index=False)

print("Processing complete.")
print(f"Total rows: {len(email_df_cleaned)}")
print(f"Final feature columns: {len(selected_columns)}")
print(f"Saved to: {output_path}")


Loaded technical.csv successfully.
Loaded User1.csv successfully.
Loaded User10.csv successfully.
Loaded User11.csv successfully.
Loaded User12.csv successfully.
Loaded User13.csv successfully.
Loaded User14.csv successfully.
Loaded User15.csv successfully.
Loaded User16.csv successfully.
Loaded User17.csv successfully.
Loaded User18.csv successfully.
Loaded User19.csv successfully.
Loaded User2.csv successfully.
Loaded User20.csv successfully.
Loaded User21.csv successfully.
Loaded User22.csv successfully.
Loaded User23.csv successfully.
Loaded User3.csv successfully.
Loaded User4.csv successfully.
Loaded User6.csv successfully.
Loaded User7.csv successfully.
Loaded User8.csv successfully.
Loaded User9.csv successfully.
Removing zero-variance columns...
Processing complete.
Total rows: 2221
Final feature columns: 92
Saved to: C:\Users\karun\OneDrive\Documents\RIK\outputs\twos_emails_liwc.csv
