# Drawing Conclusions Using Groupby

Use `winequality_edited.csv`. You should've created this data file in the previous section: *Appending Data (cont.)*.

In [1]:
# Load dataset
import pandas as pd
df = pd.read_csv('winequality_edited.csv')
df.head()

Unnamed: 0,alcohol,chlorides,citric_acid,color,density,fixed_acidity,free_sulfur_dioxide,pH,quality,residual_sugar,sulphates,total_sulfur-dioxide,total_sulfur_dioxide,volatile_acidity
0,8.8,0.045,0.36,white,1.001,7.0,45.0,3.0,6,20.7,0.45,,170.0,0.27
1,9.5,0.049,0.34,white,0.994,6.3,14.0,3.3,6,1.6,0.49,,132.0,0.3
2,10.1,0.05,0.4,white,0.9951,8.1,30.0,3.26,6,6.9,0.44,,97.0,0.28
3,9.9,0.058,0.32,white,0.9956,7.2,47.0,3.19,6,8.5,0.4,,186.0,0.23
4,9.9,0.058,0.32,white,0.9956,7.2,47.0,3.19,6,8.5,0.4,,186.0,0.23


### Is a certain type of wine associated with higher quality?

In [12]:
# Find the mean quality of each wine type (red and white) with groupby
df.groupby('color')['quality'].mean()


color
red      5.636023
white    5.877909
Name: quality, dtype: float64

### What level of acidity receives the highest average rating?

In [14]:
# View the min, 25%, 50%, 75%, max pH values with Pandas describe
df['pH'].describe()

count    6497.000000
mean        3.218501
std         0.160787
min         2.720000
25%         3.110000
50%         3.210000
75%         3.320000
max         4.010000
Name: pH, dtype: float64

In [15]:
# Bin edges that will be used to "cut" the data into groups
bin_edges = [ 2.72,3.11,3.21,3.32,4.01] # Fill in this list with five values you just found

In [16]:
# Labels for the four acidity level groups
bin_names = ['High', 'Moderately High', 'Medium', 'Low'] # Name each acidity level category

In [17]:
# Creates acidity_levels column
df['acidity_levels'] = pd.cut(df['pH'], bin_edges, labels=bin_names)

# Checks for successful creation of this column
df.head()

Unnamed: 0,alcohol,chlorides,citric_acid,color,density,fixed_acidity,free_sulfur_dioxide,pH,quality,residual_sugar,sulphates,total_sulfur-dioxide,total_sulfur_dioxide,volatile_acidity,acidity_levels
0,8.8,0.045,0.36,white,1.001,7.0,45.0,3.0,6,20.7,0.45,,170.0,0.27,High
1,9.5,0.049,0.34,white,0.994,6.3,14.0,3.3,6,1.6,0.49,,132.0,0.3,Medium
2,10.1,0.05,0.4,white,0.9951,8.1,30.0,3.26,6,6.9,0.44,,97.0,0.28,Medium
3,9.9,0.058,0.32,white,0.9956,7.2,47.0,3.19,6,8.5,0.4,,186.0,0.23,Moderately High
4,9.9,0.058,0.32,white,0.9956,7.2,47.0,3.19,6,8.5,0.4,,186.0,0.23,Moderately High


In [23]:
# Find the mean quality of each acidity level with groupby
df.groupby('acidity_levels')['quality'].mean()

acidity_levels
High               5.783343
Moderately High    5.784540
Medium             5.850832
Low                5.859593
Name: quality, dtype: float64

In [None]:
# Save changes for the next section
df.to_csv('winequality_edited.csv', index=False)