2.数据分析

In [1]:
import pandas as pd
data = pd.read_csv('cleaned_sleep_health_data.csv')
# 计算基本统计量
descriptive_stats = data.describe()
print("基本统计量：")
print(descriptive_stats)

# 计算相关系数
correlation_matrix = data.corr()

# 特定变量之间的相关性
print("\n精神压力水平与睡眠质量的相关性：", correlation_matrix.loc['Stress Level (scale: 1-10)', 'Quality of Sleep (scale: 1-10)'])
print("睡眠时长与睡眠质量的相关性：", correlation_matrix.loc['Sleep Duration (hours)', 'Quality of Sleep (scale: 1-10)'])
print("年龄与睡眠质量的相关性：", correlation_matrix.loc['Age', 'Quality of Sleep (scale: 1-10)'])
print("每日步数与睡眠质量的相关性：", correlation_matrix.loc['Daily Steps', 'Quality of Sleep (scale: 1-10)'])
print("心率与睡眠质量的相关性：", correlation_matrix.loc['Heart Rate (bpm)', 'Quality of Sleep (scale: 1-10)'])
print("每日运动时长与睡眠质量的相关性：", correlation_matrix.loc['Physical Activity Level (minutes/day)', 'Quality of Sleep (scale: 1-10)'])
print("血压收缩压与睡眠质量的相关性：", correlation_matrix.loc['Systolic BP', 'Quality of Sleep (scale: 1-10)'])
print("血压舒张压与睡眠质量的相关性：", correlation_matrix.loc['Diastolic BP', 'Quality of Sleep (scale: 1-10)'])


基本统计量：
         Person ID          Age  Sleep Duration (hours)  \
count  10000.00000  10000.00000            10000.000000   
mean    5000.50000     40.37430                6.205500   
std     2886.89568     14.03999                0.749484   
min        1.00000     18.00000                4.000000   
25%     2500.75000     30.00000                6.000000   
50%     5000.50000     40.00000                6.000000   
75%     7500.25000     50.00000                6.000000   
max    10000.00000     90.00000               10.000000   

       Quality of Sleep (scale: 1-10)  Physical Activity Level (minutes/day)  \
count                    10000.000000                            10000.00000   
mean                         6.184800                               40.23050   
std                          1.618985                               20.63526   
min                          1.000000                               10.00000   
25%                          5.000000                        

  correlation_matrix = data.corr()


In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from econml.dml import DML
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.linear_model import LinearRegression
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, classification_report
from sklearn.tree import DecisionTreeClassifier

# 加载数据
data = pd.read_csv('cleaned_sleep_health_data.csv')

# 数据预处理
# 将分类变量转换为数值变量
label_encoders = {}
for column in data.columns:
    if data[column].dtype == 'object':
        le = LabelEncoder()
        data[column] = le.fit_transform(data[column])
        label_encoders[column] = le

# 填充缺失值
imputer = SimpleImputer(strategy='mean')
data_imputed = pd.DataFrame(imputer.fit_transform(data), columns=data.columns)

# 定义因变量和自变量
X = data_imputed.drop(columns=['Quality of Sleep (scale: 1-10)', 'Sleep Disorder'])
y = data_imputed['Quality of Sleep (scale: 1-10)']
treatment = data_imputed['Sleep Disorder']  # 假设以 Sleep Disorder 作为处理变量

# 划分训练集和测试集
X_train, X_test, y_train, y_test, treatment_train, treatment_test = train_test_split(
    X, y, treatment, test_size=0.2, random_state=42
)

# 标准化数据
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
print(X.columns)

Index(['Person ID', 'Gender', 'Age', 'Occupation', 'Sleep Duration (hours)',
       'Physical Activity Level (minutes/day)', 'Stress Level (scale: 1-10)',
       'BMI Category', 'Heart Rate (bpm)', 'Daily Steps', 'Systolic BP',
       'Diastolic BP'],
      dtype='object')


In [3]:
from econml.dml import DML
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier

# 定义机器学习模型
model_y = RandomForestRegressor(n_estimators=100, random_state=42)
model_t = RandomForestClassifier(n_estimators=100, random_state=42)
model_final = RandomForestRegressor(n_estimators=100, random_state=42)

# 构建DML模型
dml = DML(
    
    model_y=model_y,
    model_t=model_t,
    model_final=model_final,
    discrete_treatment=True,
    random_state=42
)

# 拟合DML模型
dml.fit(y_train, treatment_train, X=X_train_scaled)

# 估计因果效应
causal_effect = dml.effect(X_test_scaled)
print("因果效应估计：", causal_effect)

The final model has a nonzero intercept for at least one outcome; it will be subtracted, but consider fitting a model without an intercept if possible.


因果效应估计： [ 1.01834068 -1.07235932 -0.84485932 ... -0.48835932 -1.95025932
 -0.55455932]


In [4]:
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import f_regression  # 修改了导入路径

# 使用 statsmodels 进行回归分析
X_train_sm = sm.add_constant(X_train_scaled)  # 添加常数项
model_sm = sm.OLS(y_train, X_train_sm).fit()
print(model_sm.summary())

# 使用 scikit-learn 进行回归分析
reg_model = LinearRegression()
reg_model.fit(X_train_scaled, y_train)

# 回归系数
print("回归系数：", reg_model.coef_)

# F检验
f_statistic, p_value = f_regression(X_train_scaled, y_train)
print("F统计量：", f_statistic)
print("P值：", p_value)

                                  OLS Regression Results                                  
Dep. Variable:     Quality of Sleep (scale: 1-10)   R-squared:                       0.458
Model:                                        OLS   Adj. R-squared:                  0.457
Method:                             Least Squares   F-statistic:                     561.8
Date:                            Wed, 11 Jun 2025   Prob (F-statistic):               0.00
Time:                                    22:23:26   Log-Likelihood:                -12751.
No. Observations:                            8000   AIC:                         2.553e+04
Df Residuals:                                7987   BIC:                         2.562e+04
Df Model:                                      12                                         
Covariance Type:                        nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------