In [1]:
import pandas as pd
import numpy as np

In [2]:
data = pd.read_csv("Sleep_health_and_lifestyle_dataset.csv")

In [3]:
data

Unnamed: 0,Person ID,Gender,Age,Occupation,Sleep Duration,Quality of Sleep,Physical Activity Level,Stress Level,BMI Category,Blood Pressure,Heart Rate,Daily Steps,Sleep Disorder
0,1,Male,27,Software Engineer,6.1,6,42,6,Overweight,126/83,77,4200,
1,2,Male,28,Doctor,6.2,6,60,8,Normal,125/80,75,10000,
2,3,Male,28,Doctor,6.2,6,60,8,Normal,125/80,75,10000,
3,4,Male,28,Sales Representative,5.9,4,30,8,Obese,140/90,85,3000,Sleep Apnea
4,5,Male,28,Sales Representative,5.9,4,30,8,Obese,140/90,85,3000,Sleep Apnea
...,...,...,...,...,...,...,...,...,...,...,...,...,...
369,370,Female,59,Nurse,8.1,9,75,3,Overweight,140/95,68,7000,Sleep Apnea
370,371,Female,59,Nurse,8.0,9,75,3,Overweight,140/95,68,7000,Sleep Apnea
371,372,Female,59,Nurse,8.1,9,75,3,Overweight,140/95,68,7000,Sleep Apnea
372,373,Female,59,Nurse,8.1,9,75,3,Overweight,140/95,68,7000,Sleep Apnea


# replacing all the nan values with missing values.

In [4]:
null_values_per_column = data.isnull().sum()
print(null_values_per_column)

Person ID                    0
Gender                       0
Age                          0
Occupation                   0
Sleep Duration               0
Quality of Sleep             0
Physical Activity Level      0
Stress Level                 0
BMI Category                 0
Blood Pressure               0
Heart Rate                   0
Daily Steps                  0
Sleep Disorder             219
dtype: int64


In [5]:
unique_values = data['Sleep Disorder'].unique()
print(unique_values)

[nan 'Sleep Apnea' 'Insomnia']


In [6]:
data['Sleep Disorder'].fillna('None', inplace=True)


In [7]:
unique_values = data['Sleep Disorder'].unique()
print(unique_values)

['None' 'Sleep Apnea' 'Insomnia']


# replacing the age with range

In [8]:
print(data['Age'].unique())

[27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 48 49 50 51 52
 53 54 55 56 57 58 59]


In [9]:
age_bins = [26, 34, 44, 54, 59]
age_labels = ['27-34', '35-44', '45-54', '55-59']

data['Age Group'] = pd.cut(data['Age'], bins=age_bins, labels=age_labels, right=True)

In [10]:
data = data.drop('Age', axis=1)

In [11]:
data.dtypes

Person ID                     int64
Gender                       object
Occupation                   object
Sleep Duration              float64
Quality of Sleep              int64
Physical Activity Level       int64
Stress Level                  int64
BMI Category                 object
Blood Pressure               object
Heart Rate                    int64
Daily Steps                   int64
Sleep Disorder               object
Age Group                  category
dtype: object

## replacing sleep data 

In [12]:
print(data['Sleep Duration'].unique())

[6.1 6.2 5.9 6.3 7.8 6.  6.5 7.6 7.7 7.9 6.4 7.5 7.2 5.8 6.7 7.3 7.4 7.1
 6.6 6.9 8.  6.8 8.1 8.3 8.5 8.4 8.2]


In [13]:
sleep_bins = [0, 6, 8, float('inf')]  # Includes all possible sleep durations
sleep_labels = ['Short (<6 hours)', 'Normal (6-8 hours)', 'Long (>8 hours)']

# Discretize the 'Sleep Duration' column
data['Sleep Duration Category'] = pd.cut(data['Sleep Duration'], bins=sleep_bins, labels=sleep_labels, right=False)

In [14]:
data = data.drop('Sleep Duration', axis = 1)

# phisical activity level

In [15]:
data['Physical Activity Level'].unique()

array([42, 60, 30, 40, 75, 35, 45, 50, 32, 70, 80, 55, 90, 47, 65, 85])

In [16]:
num_bins = 3


data['Physical Activity Category'] = pd.qcut(data['Physical Activity Level'], q=num_bins, labels=['Low', 'Medium', 'High'])

In [17]:
data = data.drop('Physical Activity Level', axis =1)

# Heart rate

In [18]:
data['Heart Rate'].unique()

array([77, 75, 85, 82, 70, 80, 78, 69, 72, 68, 76, 81, 65, 84, 74, 67, 73,
       83, 86])

In [19]:
heart_rate_bins = [64, 70, 80, 86] 
heart_rate_labels = ['Low (65-70 bpm)', 'Normal (71-80 bpm)', 'High (81-86 bpm)']

data['Heart Rate Category'] = pd.cut(data['Heart Rate'], bins=heart_rate_bins, labels=heart_rate_labels, right=True)

In [20]:
print(data['Heart Rate Category'].unique())

['Normal (71-80 bpm)', 'High (81-86 bpm)', 'Low (65-70 bpm)']
Categories (3, object): ['Low (65-70 bpm)' < 'Normal (71-80 bpm)' < 'High (81-86 bpm)']


In [21]:
data = data.drop('Heart Rate', axis=1)

# blood pressure

In [22]:
data['Blood Pressure'].unique()

array(['126/83', '125/80', '140/90', '120/80', '132/87', '130/86',
       '117/76', '118/76', '128/85', '131/86', '128/84', '115/75',
       '135/88', '129/84', '130/85', '115/78', '119/77', '121/79',
       '125/82', '135/90', '122/80', '142/92', '140/95', '139/91',
       '118/75'], dtype=object)

In [23]:
data[['Systolic BP', 'Diastolic BP']] = data['Blood Pressure'].str.split('/', expand=True).astype(int)

# Optionally, check the first few rows to confirm the split
print(data[['Blood Pressure', 'Systolic BP', 'Diastolic BP']].head())

  Blood Pressure  Systolic BP  Diastolic BP
0         126/83          126            83
1         125/80          125            80
2         125/80          125            80
3         140/90          140            90
4         140/90          140            90


In [24]:
data = data.drop('Blood Pressure', axis = 1)

In [25]:
data['Systolic BP'].unique()

array([126, 125, 140, 120, 132, 130, 117, 118, 128, 131, 115, 135, 129,
       119, 121, 122, 142, 139])

In [26]:
data['Diastolic BP'].unique()

array([83, 80, 90, 87, 86, 76, 85, 84, 75, 88, 78, 77, 79, 82, 92, 95, 91])

In [27]:
import pandas as pd

def categorize_blood_pressure(systolic, diastolic):
    if systolic < 120 and diastolic < 80:
        return 'Normal'
    elif 120 <= systolic < 130 and diastolic < 80:
        return 'Elevated'
    elif (130 <= systolic < 140) or (80 <= diastolic < 90):
        return 'Hypertension Stage 1'
    elif systolic >= 140 or diastolic >= 90:
        return 'Hypertension Stage 2'
    else:
        return 'Uncategorized' 


data['Blood Pressure Category'] = data.apply(lambda row: categorize_blood_pressure(row['Systolic BP'], row['Diastolic BP']), axis=1)


print(data[['Systolic BP', 'Diastolic BP', 'Blood Pressure Category']].head())


   Systolic BP  Diastolic BP Blood Pressure Category
0          126            83    Hypertension Stage 1
1          125            80    Hypertension Stage 1
2          125            80    Hypertension Stage 1
3          140            90    Hypertension Stage 2
4          140            90    Hypertension Stage 2


In [28]:
data = data.drop('Systolic BP', axis=1)
data = data.drop('Diastolic BP', axis = 1)

In [29]:
data

Unnamed: 0,Person ID,Gender,Occupation,Quality of Sleep,Stress Level,BMI Category,Daily Steps,Sleep Disorder,Age Group,Sleep Duration Category,Physical Activity Category,Heart Rate Category,Blood Pressure Category
0,1,Male,Software Engineer,6,6,Overweight,4200,,27-34,Normal (6-8 hours),Low,Normal (71-80 bpm),Hypertension Stage 1
1,2,Male,Doctor,6,8,Normal,10000,,27-34,Normal (6-8 hours),Medium,Normal (71-80 bpm),Hypertension Stage 1
2,3,Male,Doctor,6,8,Normal,10000,,27-34,Normal (6-8 hours),Medium,Normal (71-80 bpm),Hypertension Stage 1
3,4,Male,Sales Representative,4,8,Obese,3000,Sleep Apnea,27-34,Short (<6 hours),Low,High (81-86 bpm),Hypertension Stage 2
4,5,Male,Sales Representative,4,8,Obese,3000,Sleep Apnea,27-34,Short (<6 hours),Low,High (81-86 bpm),Hypertension Stage 2
...,...,...,...,...,...,...,...,...,...,...,...,...,...
369,370,Female,Nurse,9,3,Overweight,7000,Sleep Apnea,55-59,Long (>8 hours),Medium,Low (65-70 bpm),Hypertension Stage 2
370,371,Female,Nurse,9,3,Overweight,7000,Sleep Apnea,55-59,Long (>8 hours),Medium,Low (65-70 bpm),Hypertension Stage 2
371,372,Female,Nurse,9,3,Overweight,7000,Sleep Apnea,55-59,Long (>8 hours),Medium,Low (65-70 bpm),Hypertension Stage 2
372,373,Female,Nurse,9,3,Overweight,7000,Sleep Apnea,55-59,Long (>8 hours),Medium,Low (65-70 bpm),Hypertension Stage 2


# Sleep

In [30]:
data['Quality of Sleep'].unique()

array([6, 4, 7, 5, 8, 9])

In [31]:
def categorize_sleep_duration(duration):
    if duration  == 4:
        return 'Poor'
    elif duration == 5:
        return 'Fairly_Poor'
    elif duration == 6:
        return 'Fair'
    elif duration == 7:
        return 'Good'
    elif duration == 8:
        return 'Very_good' 
    elif duration == 9:
        return 'Excellent'   
    else:
        return 'Uncategorized'  


data['Sleep Quality'] = data['Quality of Sleep'].apply(categorize_sleep_duration)
# print(data[['Quality of Sleep', 'Sleep Quality']].head())


In [32]:
data

Unnamed: 0,Person ID,Gender,Occupation,Quality of Sleep,Stress Level,BMI Category,Daily Steps,Sleep Disorder,Age Group,Sleep Duration Category,Physical Activity Category,Heart Rate Category,Blood Pressure Category,Sleep Quality
0,1,Male,Software Engineer,6,6,Overweight,4200,,27-34,Normal (6-8 hours),Low,Normal (71-80 bpm),Hypertension Stage 1,Fair
1,2,Male,Doctor,6,8,Normal,10000,,27-34,Normal (6-8 hours),Medium,Normal (71-80 bpm),Hypertension Stage 1,Fair
2,3,Male,Doctor,6,8,Normal,10000,,27-34,Normal (6-8 hours),Medium,Normal (71-80 bpm),Hypertension Stage 1,Fair
3,4,Male,Sales Representative,4,8,Obese,3000,Sleep Apnea,27-34,Short (<6 hours),Low,High (81-86 bpm),Hypertension Stage 2,Poor
4,5,Male,Sales Representative,4,8,Obese,3000,Sleep Apnea,27-34,Short (<6 hours),Low,High (81-86 bpm),Hypertension Stage 2,Poor
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
369,370,Female,Nurse,9,3,Overweight,7000,Sleep Apnea,55-59,Long (>8 hours),Medium,Low (65-70 bpm),Hypertension Stage 2,Excellent
370,371,Female,Nurse,9,3,Overweight,7000,Sleep Apnea,55-59,Long (>8 hours),Medium,Low (65-70 bpm),Hypertension Stage 2,Excellent
371,372,Female,Nurse,9,3,Overweight,7000,Sleep Apnea,55-59,Long (>8 hours),Medium,Low (65-70 bpm),Hypertension Stage 2,Excellent
372,373,Female,Nurse,9,3,Overweight,7000,Sleep Apnea,55-59,Long (>8 hours),Medium,Low (65-70 bpm),Hypertension Stage 2,Excellent


In [33]:
data = data.drop('Quality of Sleep', axis = 1)

# Stress level

In [34]:
data['Stress Level'].unique()

array([6, 8, 7, 4, 3, 5])

In [35]:
stress_bins = [2, 4, 6, 8] 
stress_labels = ['Low Stress', 'Moderate Stress', 'High Stress']

data['Stress Level Category'] = pd.cut(data['Stress Level'], bins=stress_bins, labels=stress_labels, right=True)

print(data[['Stress Level', 'Stress Level Category']].head())


   Stress Level Stress Level Category
0             6       Moderate Stress
1             8           High Stress
2             8           High Stress
3             8           High Stress
4             8           High Stress


In [36]:
# data = data.drop('Quality of Sleep', axis = 1)
data = data.drop('Stress Level', axis = 1)

In [37]:
data = data.drop('Person ID', axis = 1)

In [38]:
data

Unnamed: 0,Gender,Occupation,BMI Category,Daily Steps,Sleep Disorder,Age Group,Sleep Duration Category,Physical Activity Category,Heart Rate Category,Blood Pressure Category,Sleep Quality,Stress Level Category
0,Male,Software Engineer,Overweight,4200,,27-34,Normal (6-8 hours),Low,Normal (71-80 bpm),Hypertension Stage 1,Fair,Moderate Stress
1,Male,Doctor,Normal,10000,,27-34,Normal (6-8 hours),Medium,Normal (71-80 bpm),Hypertension Stage 1,Fair,High Stress
2,Male,Doctor,Normal,10000,,27-34,Normal (6-8 hours),Medium,Normal (71-80 bpm),Hypertension Stage 1,Fair,High Stress
3,Male,Sales Representative,Obese,3000,Sleep Apnea,27-34,Short (<6 hours),Low,High (81-86 bpm),Hypertension Stage 2,Poor,High Stress
4,Male,Sales Representative,Obese,3000,Sleep Apnea,27-34,Short (<6 hours),Low,High (81-86 bpm),Hypertension Stage 2,Poor,High Stress
...,...,...,...,...,...,...,...,...,...,...,...,...
369,Female,Nurse,Overweight,7000,Sleep Apnea,55-59,Long (>8 hours),Medium,Low (65-70 bpm),Hypertension Stage 2,Excellent,Low Stress
370,Female,Nurse,Overweight,7000,Sleep Apnea,55-59,Long (>8 hours),Medium,Low (65-70 bpm),Hypertension Stage 2,Excellent,Low Stress
371,Female,Nurse,Overweight,7000,Sleep Apnea,55-59,Long (>8 hours),Medium,Low (65-70 bpm),Hypertension Stage 2,Excellent,Low Stress
372,Female,Nurse,Overweight,7000,Sleep Apnea,55-59,Long (>8 hours),Medium,Low (65-70 bpm),Hypertension Stage 2,Excellent,Low Stress


# steps

In [40]:
data['Daily Steps'].unique()

array([ 4200, 10000,  3000,  3500,  8000,  4000,  4100,  6800,  5000,
        7000,  5500,  5200,  5600,  3300,  4800,  7500,  7300,  6200,
        6000,  3700])

In [41]:
steps_bins = [0, 4999, 7499, 9999, float('inf')]
steps_labels = ['Sedentary (<5000 steps)', 'Lightly Active (5000-7499 steps)', 'Moderately Active (7500-9999 steps)', 'Very Active (>=10000 steps)']

data['Activity Level'] = pd.cut(data['Daily Steps'], bins=steps_bins, labels=steps_labels, right=True)

In [42]:
data = data.drop('Daily Steps', axis = 1)

In [43]:
data

Unnamed: 0,Gender,Occupation,BMI Category,Sleep Disorder,Age Group,Sleep Duration Category,Physical Activity Category,Heart Rate Category,Blood Pressure Category,Sleep Quality,Stress Level Category,Activity Level
0,Male,Software Engineer,Overweight,,27-34,Normal (6-8 hours),Low,Normal (71-80 bpm),Hypertension Stage 1,Fair,Moderate Stress,Sedentary (<5000 steps)
1,Male,Doctor,Normal,,27-34,Normal (6-8 hours),Medium,Normal (71-80 bpm),Hypertension Stage 1,Fair,High Stress,Very Active (>=10000 steps)
2,Male,Doctor,Normal,,27-34,Normal (6-8 hours),Medium,Normal (71-80 bpm),Hypertension Stage 1,Fair,High Stress,Very Active (>=10000 steps)
3,Male,Sales Representative,Obese,Sleep Apnea,27-34,Short (<6 hours),Low,High (81-86 bpm),Hypertension Stage 2,Poor,High Stress,Sedentary (<5000 steps)
4,Male,Sales Representative,Obese,Sleep Apnea,27-34,Short (<6 hours),Low,High (81-86 bpm),Hypertension Stage 2,Poor,High Stress,Sedentary (<5000 steps)
...,...,...,...,...,...,...,...,...,...,...,...,...
369,Female,Nurse,Overweight,Sleep Apnea,55-59,Long (>8 hours),Medium,Low (65-70 bpm),Hypertension Stage 2,Excellent,Low Stress,Lightly Active (5000-7499 steps)
370,Female,Nurse,Overweight,Sleep Apnea,55-59,Long (>8 hours),Medium,Low (65-70 bpm),Hypertension Stage 2,Excellent,Low Stress,Lightly Active (5000-7499 steps)
371,Female,Nurse,Overweight,Sleep Apnea,55-59,Long (>8 hours),Medium,Low (65-70 bpm),Hypertension Stage 2,Excellent,Low Stress,Lightly Active (5000-7499 steps)
372,Female,Nurse,Overweight,Sleep Apnea,55-59,Long (>8 hours),Medium,Low (65-70 bpm),Hypertension Stage 2,Excellent,Low Stress,Lightly Active (5000-7499 steps)


In [44]:
data.columns = [col.replace(' ', '_') for col in data.columns]

print(data.columns)


Index(['Gender', 'Occupation', 'BMI_Category', 'Sleep_Disorder', 'Age_Group',
       'Sleep_Duration_Category', 'Physical_Activity_Category',
       'Heart_Rate_Category', 'Blood_Pressure_Category', 'Sleep_Quality',
       'Stress_Level_Category', 'Activity_Level'],
      dtype='object')


In [45]:
new_column_order = [
    'Gender', 
    'Age_Group', 
    'Occupation', 
    'Sleep_Duration_Category', 
    'Sleep_Quality',
    'Physical_Activity_Category', 
    'Stress_Level_Category', 
    'BMI_Category', 
    'Blood_Pressure_Category',  
    'Heart_Rate_Category', 
    'Activity_Level', 
    'Sleep_Disorder'
]

data = data[new_column_order]

print(data.head())


  Gender Age_Group            Occupation Sleep_Duration_Category  \
0   Male     27-34     Software Engineer      Normal (6-8 hours)   
1   Male     27-34                Doctor      Normal (6-8 hours)   
2   Male     27-34                Doctor      Normal (6-8 hours)   
3   Male     27-34  Sales Representative        Short (<6 hours)   
4   Male     27-34  Sales Representative        Short (<6 hours)   

  Sleep_Quality Physical_Activity_Category Stress_Level_Category BMI_Category  \
0          Fair                        Low       Moderate Stress   Overweight   
1          Fair                     Medium           High Stress       Normal   
2          Fair                     Medium           High Stress       Normal   
3          Poor                        Low           High Stress        Obese   
4          Poor                        Low           High Stress        Obese   

  Blood_Pressure_Category Heart_Rate_Category               Activity_Level  \
0    Hypertension Stage 1 

In [46]:
data.to_csv('trainingData.csv', index=False)

In [47]:
data

Unnamed: 0,Gender,Age_Group,Occupation,Sleep_Duration_Category,Sleep_Quality,Physical_Activity_Category,Stress_Level_Category,BMI_Category,Blood_Pressure_Category,Heart_Rate_Category,Activity_Level,Sleep_Disorder
0,Male,27-34,Software Engineer,Normal (6-8 hours),Fair,Low,Moderate Stress,Overweight,Hypertension Stage 1,Normal (71-80 bpm),Sedentary (<5000 steps),
1,Male,27-34,Doctor,Normal (6-8 hours),Fair,Medium,High Stress,Normal,Hypertension Stage 1,Normal (71-80 bpm),Very Active (>=10000 steps),
2,Male,27-34,Doctor,Normal (6-8 hours),Fair,Medium,High Stress,Normal,Hypertension Stage 1,Normal (71-80 bpm),Very Active (>=10000 steps),
3,Male,27-34,Sales Representative,Short (<6 hours),Poor,Low,High Stress,Obese,Hypertension Stage 2,High (81-86 bpm),Sedentary (<5000 steps),Sleep Apnea
4,Male,27-34,Sales Representative,Short (<6 hours),Poor,Low,High Stress,Obese,Hypertension Stage 2,High (81-86 bpm),Sedentary (<5000 steps),Sleep Apnea
...,...,...,...,...,...,...,...,...,...,...,...,...
369,Female,55-59,Nurse,Long (>8 hours),Excellent,Medium,Low Stress,Overweight,Hypertension Stage 2,Low (65-70 bpm),Lightly Active (5000-7499 steps),Sleep Apnea
370,Female,55-59,Nurse,Long (>8 hours),Excellent,Medium,Low Stress,Overweight,Hypertension Stage 2,Low (65-70 bpm),Lightly Active (5000-7499 steps),Sleep Apnea
371,Female,55-59,Nurse,Long (>8 hours),Excellent,Medium,Low Stress,Overweight,Hypertension Stage 2,Low (65-70 bpm),Lightly Active (5000-7499 steps),Sleep Apnea
372,Female,55-59,Nurse,Long (>8 hours),Excellent,Medium,Low Stress,Overweight,Hypertension Stage 2,Low (65-70 bpm),Lightly Active (5000-7499 steps),Sleep Apnea


In [48]:
data.columns

Index(['Gender', 'Age_Group', 'Occupation', 'Sleep_Duration_Category',
       'Sleep_Quality', 'Physical_Activity_Category', 'Stress_Level_Category',
       'BMI_Category', 'Blood_Pressure_Category', 'Heart_Rate_Category',
       'Activity_Level', 'Sleep_Disorder'],
      dtype='object')

In [49]:
data

Unnamed: 0,Gender,Age_Group,Occupation,Sleep_Duration_Category,Sleep_Quality,Physical_Activity_Category,Stress_Level_Category,BMI_Category,Blood_Pressure_Category,Heart_Rate_Category,Activity_Level,Sleep_Disorder
0,Male,27-34,Software Engineer,Normal (6-8 hours),Fair,Low,Moderate Stress,Overweight,Hypertension Stage 1,Normal (71-80 bpm),Sedentary (<5000 steps),
1,Male,27-34,Doctor,Normal (6-8 hours),Fair,Medium,High Stress,Normal,Hypertension Stage 1,Normal (71-80 bpm),Very Active (>=10000 steps),
2,Male,27-34,Doctor,Normal (6-8 hours),Fair,Medium,High Stress,Normal,Hypertension Stage 1,Normal (71-80 bpm),Very Active (>=10000 steps),
3,Male,27-34,Sales Representative,Short (<6 hours),Poor,Low,High Stress,Obese,Hypertension Stage 2,High (81-86 bpm),Sedentary (<5000 steps),Sleep Apnea
4,Male,27-34,Sales Representative,Short (<6 hours),Poor,Low,High Stress,Obese,Hypertension Stage 2,High (81-86 bpm),Sedentary (<5000 steps),Sleep Apnea
...,...,...,...,...,...,...,...,...,...,...,...,...
369,Female,55-59,Nurse,Long (>8 hours),Excellent,Medium,Low Stress,Overweight,Hypertension Stage 2,Low (65-70 bpm),Lightly Active (5000-7499 steps),Sleep Apnea
370,Female,55-59,Nurse,Long (>8 hours),Excellent,Medium,Low Stress,Overweight,Hypertension Stage 2,Low (65-70 bpm),Lightly Active (5000-7499 steps),Sleep Apnea
371,Female,55-59,Nurse,Long (>8 hours),Excellent,Medium,Low Stress,Overweight,Hypertension Stage 2,Low (65-70 bpm),Lightly Active (5000-7499 steps),Sleep Apnea
372,Female,55-59,Nurse,Long (>8 hours),Excellent,Medium,Low Stress,Overweight,Hypertension Stage 2,Low (65-70 bpm),Lightly Active (5000-7499 steps),Sleep Apnea
