In [None]:
# 数据分析流程：
# 1. 导入必要的库
# 2. 导入数据
# 3. 数据清洗
# 4. 数据特征构造
# 5. 数据分析

In [20]:
# 1. 企鹅数据分析

# 1. 导入必要的库
import pandas as pd
import numpy as np

# 2. 导入数据，查看数据
df = pd.read_csv('./data/penguins.csv')
# df.head(5)
# df.info()

# 3. 数据清洗
print(df.isna().sum())
df.dropna(inplace=True)
print('-' * 50)
print(df.isna().sum())

# 4. 数据特征构造

# 4.1 性别 字符串→category
df['sex'] = df['sex'].astype('category')
# print(df.info())

# 4.2 求喙的长宽比（构造了新的数据特征）
df['bill_ratio'] = df['bill_length_mm'] / df['bill_depth_mm']

# 5. 数据分析
# 数据分箱-把体重设为三个等级
df['mass_levels'] = pd.cut(df['body_mass_g'], bins=3, labels=['低', '中', '高'])
print(df['mass_levels'].value_counts())

# 聚类分析-按岛屿、性别分组分析
# 1. 单性别
df.groupby(['sex']).agg({
    # agg abbr. aggregation
    'body_mass_g': ['mean', 'count']
})

# 2. 单岛屿
df.groupby(['island']).agg({
    # agg abbr. aggregation
    'body_mass_g': ['mean', 'count']
})

# 3. 性别 + 岛屿
df.groupby(['sex', 'island']).agg({
    # agg abbr. aggregation
    'body_mass_g': ['mean', 'count']
})

species               0
island                0
bill_length_mm        2
bill_depth_mm         2
flipper_length_mm     2
body_mass_g           2
sex                  11
dtype: int64
--------------------------------------------------
species              0
island               0
bill_length_mm       0
bill_depth_mm        0
flipper_length_mm    0
body_mass_g          0
sex                  0
dtype: int64
mass_levels
低    150
中    128
高     55
Name: count, dtype: int64


  df.groupby(['sex']).agg({
  df.groupby(['sex', 'island']).agg({


Unnamed: 0_level_0,Unnamed: 1_level_0,body_mass_g,body_mass_g
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,count
sex,island,Unnamed: 2_level_2,Unnamed: 3_level_2
Female,Biscoe,4319.375,80
Female,Dream,3446.311475,61
Female,Torgersen,3395.833333,24
Male,Biscoe,5104.518072,83
Male,Dream,3987.096774,62
Male,Torgersen,4034.782609,23


In [52]:
# 2. 睡眠质量分析：

# 1. 导入必要的库
# 2. 导入数据
df = pd.read_csv('./data/sleep.csv')
# df.head(5)
# df.info()
# df.describe()

# 3. 数据清洗
# print(df.isna().sum()) # 确实数据太多，不能直接删掉
print(df['sleep_disorder'].value_counts())
# 方法1：填数据
print(df['sleep_disorder'].fillna('unknown', inplace=True))
# 方法2：丢掉那一列
df.drop(columns='sleep_disorder', inplace=True)
print(df)

# 4. 数据特征构造
# 数据类型转换后，分箱、分组聚合会容易
# 4.1 性别 → category
df['gender'] = df['gender'].astype('category')
# print(df.info())

# 4.2 职业 → category
print(df['occupation'].value_counts()) # 先看看能不能分类，太多分不了
df['occupation'] = df['occupation'].astype('category')
# print(df.info())

# 4.3 bmi_category → category
df['bmi_category'] = df['bmi_category'].astype('category')

# 4.4 blood_pressure分开
df[['high', 'low']] = df['blood_pressure'].str.split('/', expand=True)
print(df.head())

# 5. 数据分析
# 5.1 数据分箱
df['sleep_level'] = pd.cut(df['sleep_quality'], bins=3, labels=['低', '中', '高'])
df['age_stage'] = pd.cut(df['age'], bins=3, labels=['少年', '中年', '老年'])
print(df.info())

# 5.2 分组聚合
# 查看不同bmi的人睡眠质量有什么区别
# print(df['bmi_category'].value_counts())
# 多组、多聚合
df.groupby(['bmi_category', 'age_stage']).agg({
    'sleep_duration': 'mean',
    'sleep_quality': 'mean',
    'stress_level': 'mean'
})

sleep_disorder
Insomnia       79
Sleep Apnea    31
Name: count, dtype: int64
None
     person_id  gender  age     occupation  sleep_duration  sleep_quality  \
0            1    Male   29   Manual Labor             7.4            7.0   
1            2  Female   43        Retired             4.2            4.9   
2            3    Male   44        Retired             6.1            6.0   
3            4    Male   29  Office Worker             8.3           10.0   
4            5    Male   67        Retired             9.1            9.5   
..         ...     ...  ...            ...             ...            ...   
395        396  Female   36        Student             4.5            7.9   
396        397  Female   45   Manual Labor             6.0            6.1   
397        398  Female   30        Student             5.3            6.5   
398        399  Female   41        Retired            11.0            9.1   
399        400    Male   37        Retired             5.8            7

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  print(df['sleep_disorder'].fillna('unknown', inplace=True))
  df.groupby(['bmi_category', 'age_stage']).agg({


Unnamed: 0_level_0,Unnamed: 1_level_0,sleep_duration,sleep_quality,stress_level
bmi_category,age_stage,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Normal,少年,8.1,6.332,4.86
Normal,中年,7.422222,6.65,4.944444
Normal,老年,7.42,4.24,4.2
Obese,少年,8.25,6.253448,5.534483
Obese,中年,7.805556,6.216667,5.888889
Obese,老年,7.9,5.025,8.0
Overweight,少年,8.214286,6.171429,5.31746
Overweight,中年,8.246154,5.95641,5.974359
Overweight,老年,8.971429,6.285714,6.714286
Underweight,少年,7.603279,5.883607,5.42623
