In [41]:
import os
import pandas as pd
import numpy as np

#loading the MIT stress dataset
df_mit = pd.read_csv('../mit_benchmark_data/mit_stress_dataset.csv')

#Set the background grid (the 'style')
sns.set_style("whitegrid")

#Set the color scheme ('the palette')
sns.set_palette("magma")

#Set the figure size
plt.rcParams['figure.figsize'] = (10,6)


In [None]:
#Data Audit:  Check for missing values and duplicates
initial_count = len(df_mit)
df_mit = df_mit.drop_duplicates()
print(f"Removed {initial_count - len(df_mit)} duplicate rows")

Removed 27 duplicate rows


In [45]:
#Rename columns
mit_rename = {
    'gender': 'gender',
    'Have you recently experienced stress in your life?': 'stress_label',
    'Have you noticed a rapid heartbeat or palpitations?': 'palpitations',
    'Do you face any sleep problems or difficulties falling asleep?': 'sleep_issues',
    'Have you been feeling sadness or low mood?':'low_mood',
    'Which type of stress fo you primarily experience?': 'stress_type'
}

df_mit = df_mit.rename(columns=mit_rename)

#Define the columns
cols_to_encode = ['palpitations', 'sleep_issues', 'low_mood']

#Map the values: convert survey strings to numeric values for plotting
mapping = {'Yes': 1, 'No': 0, 'Maybe': 0.5}

#The "For Loop"
for col in cols_to_encode:
    df_mit[col] = df_mit[col].map(mapping).astype(float)
else:
    print(f"Warning: Column '{col}' not found. Check the renaming step.")

print("Encoding Complete!")
df_mit[cols_to_encode].head()

Encoding Complete!


Unnamed: 0,palpitations,sleep_issues,low_mood
0,,,
1,,,
2,,,
3,,,
4,,,


In [None]:
df_sim.info()

NameError: name 'df_sim' is not defined

In [48]:

df_mit['gender'] = df_mit['gender'].astype(int)


df_mit['gender'] = df_mit['gender'].fillna(0).astype(int)
df_mit['gender_label'] = df_mit['gender'].map({0: 'm', 1: 'f'})

print("MIT Human Variability Audit")
print(df_mit.grouby('gender_label')['low_mood'].mean())

MIT Human Variability Audit


AttributeError: 'DataFrame' object has no attribute 'grouby'

Age Range: 18-21 years
Gender Encoding: 0 for male, 1 for female
Response Scale: Five-point Likert scale ("Not at all" to "Extremely") for stress indicators

In [None]:
#Check dimensions using shape (Rows, Columns)
print(f"Dataset contains {df_mit.shape[0]} rows and {df_mit.shape [1]} columns.")

Dataset contains 843 rows and 26 columns.


In [None]:
#Find missing values
missing_data = df_mit.isnull().sum()
print("\nMissing values per column:")
#only shows columns that have misses.
print(missing_data[missing_data>0])


Missing values per column:
Series([], dtype: int64)


Since the MIT data uses 0 and 1 as gender labels, these will need to be converted to 'M' and 'F' (to match the simulated), making charts and groubys easy to read.
#convert 0/1 to M/F

In [None]:
df_mit['Gender'] = df_mit['Gender'].map({0: 'M', 1: 'F'})

In [None]:
print(df_mit.isnull().sum())

Gender                                                                  0
Age                                                                     0
Have you recently experienced stress in your life?                      0
Have you noticed a rapid heartbeat or palpitations?                     0
Have you been dealing with anxiety or tension recently?                 0
Do you face any sleep problems or difficulties falling asleep?          0
Have you been dealing with anxiety or tension recently?.1               0
Have you been getting headaches more often than usual?                  0
Do you get irritated easily?                                            0
Do you have trouble concentrating on your academic tasks?               0
Have you been feeling sadness or low mood?                              0
Have you been experiencing any illness or health issues?                0
Do you often feel lonely or isolated?                                   0
Do you feel overwhelmed with your acad

Columns like Stress or Mood are float64 (4.0), converting them to int64 (4) makes the comparasion cleaner and easier to work with
#select all numerical columns to convert to int64- will not include "Gender' and "Which type of stress' columns

In [None]:
cols_to_convert = df_mit.select_dtypes(include=['float64']).columns

df_mit[cols_to_convert] = df_mit[cols_to_convert].astype('int64')

df_mit.head()

Unnamed: 0,Gender,Age,Have you recently experienced stress in your life?,Have you noticed a rapid heartbeat or palpitations?,Have you been dealing with anxiety or tension recently?,Do you face any sleep problems or difficulties falling asleep?,Have you been dealing with anxiety or tension recently?.1,Have you been getting headaches more often than usual?,Do you get irritated easily?,Do you have trouble concentrating on your academic tasks?,...,Are you facing any difficulties with your professors or instructors?,Is your working environment unpleasant or stressful?,Do you struggle to find time for relaxation and leisure activities?,Is your hostel or home environment causing you difficulties?,Do you lack confidence in your academic performance?,Do you lack confidence in your choice of academic subjects?,Academic and extracurricular activities conflicting for you?,Do you attend classes regularly?,Have you gained/lost weight?,Which type of stress do you primarily experience?
0,M,20,3,4,2,5,1,2,1,2,...,3,1,4,1,2,1,3,1,2,Eustress (Positive Stress) - Stress that motiv...
1,M,20,2,3,2,1,1,1,1,4,...,3,2,1,1,3,2,1,4,2,Eustress (Positive Stress) - Stress that motiv...
2,M,20,5,4,2,2,1,3,4,2,...,2,2,2,1,4,1,1,2,1,Eustress (Positive Stress) - Stress that motiv...
3,F,20,3,4,3,2,2,3,4,3,...,1,1,2,1,2,1,1,5,3,Eustress (Positive Stress) - Stress that motiv...
4,M,20,3,3,3,2,2,4,4,4,...,2,3,1,2,2,4,2,2,2,Eustress (Positive Stress) - Stress that motiv...


In [None]:
print(df_mit.columns.tolist())

['Gender', 'Age', 'Have you recently experienced stress in your life?', 'Have you noticed a rapid heartbeat or palpitations?', 'Have you been dealing with anxiety or tension recently?', 'Do you face any sleep problems or difficulties falling asleep?', 'Have you been dealing with anxiety or tension recently?.1', 'Have you been getting headaches more often than usual?', 'Do you get irritated easily?', 'Do you have trouble concentrating on your academic tasks?', 'Have you been feeling sadness or low mood?', 'Have you been experiencing any illness or health issues?', 'Do you often feel lonely or isolated?', 'Do you feel overwhelmed with your academic workload?', 'Are you in competition with your peers, and does it affect you?', 'Do you find that your relationship often causes you stress?', 'Are you facing any difficulties with your professors or instructors?', 'Is your working environment unpleasant or stressful?', 'Do you struggle to find time for relaxation and leisure activities?', 'Is 

There are duplicate column topics like anxiety or tension,to align with the simulated dataset, 
we will use mapping, handle the data types and shortens the names.

'Have you been dealing with anxiety or tension recently?', 
       'Do you face any sleep problems or difficulties falling asleep?', 
       'Have you been dealing with anxiety or tension recently?.1', 
       'Have you been getting headaches more often than usual?', 
       'Do you get irritated easily?', 
       'Do you have trouble concentrating on your academic tasks?', 
       'Have you been feeling sadness or low mood?', 
       'Have you been experiencing any illness or health issues?', 
       'Do you often feel lonely or isolated?', 
       'Are you in competition with your peers, and does it affect you?', 
       'Do you find that your relationship often causes you stress?', 
       'Are you facing any difficulties with your professors or instructors?', 
       'Is your working environment unpleasant or stressful?'
       'Is your hostel or home environment causing you difficulties?', 
       'Do you lack confidence in your academic performance?', 
       'Do you lack confidence in your choice of academic subjects?', 
       'Academic and extracurricular activities conflicting for you?', 
       'Do you attend classes regularly?', 
       'Have you gained/lost weight?', 

df_sim columns['Age', 'Gender', 'Heart_Rate', 'Blood_Pressure_Systolic', 'Blood_Pressure_Diastolic', 'Stress_Level_Biosensor', 'Stress_Level_Self_Report', 'Physical_Activity', 'Sleep_Quality', 'Mood', 'Study_Hours', 'Project_Hours', 'Health_Risk_Level']