1.数据处理

In [1]:
import pandas as pd
import numpy as np

# Load the raw sleep/health survey data.
data = pd.read_csv('adjusted_sleep_health_data.csv')

# Drop rows that are exact duplicates of an earlier row.
data = data.drop_duplicates()

# Report how many missing values each column contains.
missing_values = data.isnull().sum()
print("缺失值统计:")
print(missing_values)

# Numeric columns whose missing entries are imputed with the rounded column mean.
numeric_cols = [
    'Sleep Duration (hours)',
    'Quality of Sleep (scale: 1-10)',
    'Physical Activity Level (minutes/day)',
    'Stress Level (scale: 1-10)',
    'Heart Rate (bpm)',
    'Daily Steps',
    'Systolic BP',
    'Diastolic BP',
]
for col in numeric_cols:
    if data[col].isnull().any():
        # Assign the filled column back instead of calling
        # `data[col].fillna(..., inplace=True)`: the in-place form operates on
        # a temporary selection, which warns on recent pandas versions and
        # leaves `data` unchanged once Copy-on-Write is enabled.
        data[col] = data[col].fillna(round(data[col].mean()))

# 异常值处理
# 根据年龄判断心率是否正常
def is_normal_heart_rate(age, heart_rate):
    """Return whether ``heart_rate`` lies inside the accepted range for ``age``.

    Ages up to and including 60 use the inclusive range 60-100; every other
    age (including values that fail the ``<= 60`` comparison) uses 50-90.
    """
    if age <= 60:
        lower, upper = 60, 100
    else:
        lower, upper = 50, 90
    return lower <= heart_rate <= upper
    
# 判断血压是否正常
def is_normal_blood_pressure(systolic_bp, diastolic_bp):
    """Return whether both blood-pressure readings are inside their ranges.

    Systolic must lie in 90-140 and diastolic in 60-90, bounds inclusive.
    """
    systolic_low, systolic_high = 90, 140
    diastolic_low, diastolic_high = 60, 90

    systolic_ok = systolic_low <= systolic_bp <= systolic_high
    # The diastolic comparison is only evaluated when the systolic one passed.
    return systolic_ok and diastolic_low <= diastolic_bp <= diastolic_high

# 检查并处理异常值
def check_and_mark_abnormal(data):
    """Replace abnormal heart-rate and blood-pressure readings with column means.

    A row's heart rate is judged by ``is_normal_heart_rate`` using the row's
    ``Age``; a row's blood pressure is judged as a pair by
    ``is_normal_blood_pressure``. Rows flagged as abnormal get the affected
    column(s) overwritten with that column's mean.

    The means are computed from the frame as passed in, so abnormal readings
    still contribute to them. The heart-rate mean is truncated to an integer.

    Args:
        data: DataFrame with the columns ``Age``, ``Heart Rate (bpm)``,
            ``Systolic BP`` and ``Diastolic BP``.

    Returns:
        The same DataFrame object, modified in place.
    """
    # An empty frame has no mean (int(NaN) would raise) and nothing to fix.
    if data.empty:
        return data

    # Replacement values, taken before any cell is overwritten.
    heart_rate_mean = int(data['Heart Rate (bpm)'].mean())
    systolic_bp_mean = data['Systolic BP'].mean()
    diastolic_bp_mean = data['Diastolic BP'].mean()

    # Build both masks once, from the unmodified readings. Recomputing the
    # blood-pressure mask after the systolic column has been overwritten made
    # the diastolic replacement depend on the already-patched systolic value,
    # so the two columns were treated inconsistently.
    abnormal_heart_rate = ~data.apply(
        lambda row: is_normal_heart_rate(row['Age'], row['Heart Rate (bpm)']),
        axis=1,
    ).astype(bool)
    abnormal_blood_pressure = ~data.apply(
        lambda row: is_normal_blood_pressure(row['Systolic BP'], row['Diastolic BP']),
        axis=1,
    ).astype(bool)

    data.loc[abnormal_heart_rate, 'Heart Rate (bpm)'] = heart_rate_mean
    data.loc[abnormal_blood_pressure, 'Systolic BP'] = systolic_bp_mean
    data.loc[abnormal_blood_pressure, 'Diastolic BP'] = diastolic_bp_mean

    return data

# Replace abnormal heart-rate / blood-pressure readings with column means.
data = check_and_mark_abnormal(data)

# Show the first rows of the cleaned data as a quick sanity check.
preview = data.head()
print(preview)

# Persist the cleaned data set to a new CSV file, without the index column.
output_path = 'cleaned_sleep_health_data.csv'
data.to_csv(output_path, index=False)
print("数据已处理完毕")

缺失值统计:
Person ID                                0
Gender                                   0
Age                                      0
Occupation                               0
Sleep Duration (hours)                   0
Quality of Sleep (scale: 1-10)           0
Physical Activity Level (minutes/day)    0
Stress Level (scale: 1-10)               0
BMI Category                             0
Heart Rate (bpm)                         0
Daily Steps                              0
Sleep Disorder                           0
Systolic BP                              0
Diastolic BP                             0
dtype: int64
   Person ID  Gender   Age         Occupation  Sleep Duration (hours)  \
0          1    Male  25.0  Healthcare Worker                       6   
1          2  Female  35.0       Manual Labor                       6   
2          3  Female  34.0       Manual Labor                       6   
3          4  Female  18.0            Student                       6   
4          5 