# 用 Groupby 做出结论

In [4]:
# Load `winequality_edited.csv`
import pandas as pd
df = pd.read_csv("./winequality_edited.csv")

### 是否有一种特定类型的酒具有较高质量？

In [5]:
df.head(3)

Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,pH,sulphates,alcohol,quality,color
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,red
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5,red
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5,red


In [6]:
# 用 groupby 计算每个酒类型（红葡萄酒和白葡萄酒）的平均质量
df.groupby("color")["quality"].mean()

color
red      5.636023
white    5.877909
Name: quality, dtype: float64

### 哪个酸度水平的平均评分最高？

In [7]:
# 用 Pandas 描述功能查看最小、25%、50%、75% 和 最大 pH 值
df.pH.describe()

count    6497.000000
mean        3.218501
std         0.160787
min         2.720000
25%         3.110000
50%         3.210000
75%         3.320000
max         4.010000
Name: pH, dtype: float64

In [11]:
# 对用于把数据“分割”成组的边缘进行分组
bin_edges = [ 2.71,3.21 ,3.31 ,3.4 ,4.01 ] # 用刚才计算的五个值填充这个列表

In [12]:
# 四个酸度水平组的标签
bin_names = [ "high", "median_high", "mediam", "low"] # 对每个酸度水平类别进行命名

In [13]:
# 创建 acidity_levels 列
df['acidity_levels'] = pd.cut(df['pH'], bin_edges, labels=bin_names)

# 检查该列是否成功创建
df.head()

Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,pH,sulphates,alcohol,quality,color,acidity_levels
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,red,low
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5,red,high
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5,red,median_high
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6,red,high
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,red,low


In [None]:
# 用 groupby 计算每个酸度水平的平均质量
df.groupby("acidity_levels").

In [None]:
# 保存更改，供下一段使用
df.to_csv('winequality_edited.csv', index=False)