In [None]:
import os
import pandas as pd
import numpy as np

#Reads the raw survey export into a DataFrame
df_mit = pd.read_csv('../mit_benchmark_data/mit_stress_dataset.csv')


def _normalise_header(header):
    """Return *header* lowercased, with each space replaced by an underscore."""
    return header.lower().replace(' ', '_')


#Rewrites every column header in that normalised form so the lookups
#further down can rely on one spelling
df_mit.columns = list(map(_normalise_header, df_mit.columns))

#Renaming dictionary: normalised survey question header -> short analysis name.
#Keys have to match the headers exactly as they look after lowercasing and
#space-to-underscore conversion; rename() skips keys that match no column
#without raising.
mit_rename = {
    'have_you_recently_experienced_stress_in_your_life?': 'stress_self_report_general',
    'have_you_noticed_a_rapid_heartbeat_or_palpitations?': 'heart_rate_symptoms',
    #NOTE: this column holds the answer to the sleep-problems question; it is
    #only turned into an hours value further down in this script.
    'do_you_face_any_sleep_problems_or_difficulties_falling_asleep?': 'sleep_hours',
    'have_you_been_dealing_with_anxiety_or_tension_recently?': 'stress_self_report_anxiety',
    'do_you_find_that_your_relationship_often_causes_you_stress?': 'stress_self_report_relational',
    'do_you_feel_overwhelmed_with_your_academic_workload?': 'stress_self_report_academic',
    'do_you_get_irritated_easily?': 'mood_score_irritability',
    #NOTE(review): every other question key ends in '?', so the header for this
    #question presumably does too. Both spellings are listed so the rename
    #applies whichever one the CSV uses - confirm against the file.
    'have_you_been_feeling_sadness_or_low_mood?': 'mood_score',
    'have_you_been_feeling_sadness_or_low_mood': 'mood_score',
    'gender': 'gender_bin'
}
#Runs the rename action
df_mit.rename(columns=mit_rename, inplace=True)

#Removes exact duplicate rows and reports how many were dropped
initial_count = len(df_mit)
df_mit = df_mit.drop_duplicates()
print(f"Removed {initial_count - len(df_mit)} duplicate rows")

#Defines columns to encode.
#'heart_rate_symptoms' is the name the palpitations question carries at this
#point; a 'heart_rate' column is only created near the end of this script.
cols_to_encode = [
    'stress_self_report_general', 'stress_self_report_anxiety',
    'stress_self_report_relational', 'stress_self_report_academic',
    'heart_rate_symptoms'
]

#Handles binary/categorical mapping for any leftover survey strings
binary_mapping = {'Yes': 1, 'No': 0, 'Maybe': 0.5, 'yes': 1, 'no': 0}

for col in cols_to_encode:
    if col not in df_mit.columns:
        continue
    #Columns that are already numeric contain no survey strings to translate
    if pd.api.types.is_numeric_dtype(df_mit[col]):
        continue
    #Translates only the answers listed in binary_mapping and passes every
    #other value through unchanged. Mapping with the bare dict and filling
    #the gaps with 0 would overwrite every unlisted answer (for example a
    #Likert label such as 'Sometimes') with 0 before the Likert conversion
    #below gets to read it. Missing answers stay missing here; that
    #conversion fills them.
    df_mit[col] = df_mit[col].map(lambda answer: binary_mapping.get(answer, answer))
print("Missing data and duplicate audit complete!")

#Likert labels and the numeric code each one is converted to
likert_map = dict(
    zip(
        ('Not at all', 'Rarely', 'Sometimes', 'Frequently', 'Extremely'),
        (1, 2, 3, 4, 5),
    )
)

#Columns whose answers go through the Likert conversion
likert_cols = [
    'stress_self_report_general',
    'stress_self_report_anxiety',
    'stress_self_report_relational',
    'stress_self_report_academic',
    'mood_score_irritability',
    'heart_rate_symptoms',
]

for col in likert_cols:
    #Columns absent from this dataset are skipped
    if col not in df_mit.columns:
        continue
    answers = df_mit[col]
    #Labels found in likert_map become their code; every other value is
    #carried over exactly as it was
    recoded = answers.map(likert_map).fillna(answers)
    #Whatever still cannot be read as a number turns into NaN and is then
    #stored as 0
    df_mit[col] = pd.to_numeric(recoded, errors='coerce').fillna(0)

#Applies gender coding: the labels in gender_map become 0 or 1
if 'gender_bin' in df_mit.columns:
    gender_map = {'Male': 0, 'Female': 1, 'M': 0, 'F': 1, 'male': 0, 'female':1}
    gender_raw = df_mit['gender_bin']
    #Text labels are translated through gender_map
    gender_from_label = gender_raw.map(gender_map)
    #Values that already are 0/1 codes are kept. gender_map only has text
    #keys, so without this step a column that is already coded would map to
    #NaN throughout and every respondent would end up as 0.
    gender_numeric = pd.to_numeric(gender_raw, errors='coerce')
    gender_from_code = gender_numeric.where(
        (gender_numeric == 0) | (gender_numeric == 1)
    )
    #Anything still unrecognised (or missing) falls back to 0
    df_mit['gender_bin'] = (
        gender_from_label.fillna(gender_from_code).fillna(0).astype(int)
    )

#Composite stress score: the row-wise average of the four stress
#self-report columns listed here
stress_features = [
    'stress_self_report_general',
    'stress_self_report_anxiety',
    'stress_self_report_relational',
    'stress_self_report_academic',
]
stress_item_scores = df_mit.loc[:, stress_features]
df_mit['stress_self_report'] = stress_item_scores.mean(axis='columns')

#Likert Response Scale: Five-point Likert scale ("Not at all" to "Extremely")
#At this point 'sleep_hours' still holds the rating given to the
#sleep-problems question; sleep_hours_map swaps each rating 1-5 for a value
#in the 4-9 hour range.
#NOTE(review): if the ratings follow likert_map (1 = 'Not at all' ...
#5 = 'Extremely'), a higher rating means more trouble sleeping, yet this map
#assigns MORE hours to higher ratings - confirm the direction is intended.
sleep_hours_map = {1: 4, 2: 5, 3: 7, 4: 8, 5: 9}

#Converts any text answers to their 1-5 rating first. This column is not part
#of likert_cols, so raw labels fed straight into sleep_hours_map would match
#no key, yield NaN for every row, and the dropna below would then remove the
#whole dataset. Values that are already numeric pass through unchanged.
sleep_raw = df_mit['sleep_hours']
sleep_rating = pd.to_numeric(
    sleep_raw.map(likert_map).fillna(sleep_raw), errors='coerce'
)

#Applies the map to the numeric ratings
df_mit['sleep_hours'] = sleep_rating.map(sleep_hours_map)

#Forces both categories to numeric
df_mit['sleep_hours'] = pd.to_numeric(df_mit['sleep_hours'], errors='coerce')
df_mit['heart_rate'] = pd.to_numeric(df_mit['heart_rate_symptoms'], errors='coerce')

#Drops rows where either value is missing to keep the dataset clean for
#comparison
df_mit.dropna(subset=['sleep_hours', 'heart_rate'], inplace=True)


Removed 27 duplicate rows
Missing data and duplicate audit complete!
Success! Your clean data has been saved as a CSV file in your Project Folder!
