# Drawing Conclusions Using Groupby

Use `clean_08.csv`. You should've created this data file in the previous section: *Fixing Datatype*.

In [1]:
# Load dataset
import pandas as pd

df_08 = pd.read_csv("clean_08.csv")

In [2]:
# view the columns available
df_08.columns

Index(['model', 'displ', 'cyl', 'trans', 'drive', 'fuel', 'veh_class',
       'air_pollution_score', 'city_mpg', 'hwy_mpg', 'cmb_mpg',
       'greenhouse_gas_score', 'smartway'],
      dtype='object')

### Is a certain type of vehicle associated with a lower air pollution score?

In [3]:
# Find the mean air_pollution_score of each veh_class type with groupby
df_08.groupby('veh_class').mean(numeric_only=True).air_pollution_score

veh_class
SUV              6.646429
large car        6.909091
midsize car      6.884058
minivan          6.764706
pickup           6.506024
small car        6.764264
station wagon    6.575000
van              6.000000
Name: air_pollution_score, dtype: float64

### What level of city_mpg is above the mean air pollution score?

In [4]:
# View the min, 25%, 50%, 75%, max city_mpg values with Pandas describe
df_08.describe().city_mpg

count    987.000000
mean      17.386018
std        4.088018
min        8.000000
25%       15.000000
50%       17.000000
75%       20.000000
max       48.000000
Name: city_mpg, dtype: float64

In [5]:
# View the mean value for the air pollution score
df_08.describe().air_pollution_score

count    987.000000
mean       6.706180
std        0.983016
min        4.000000
25%        6.000000
50%        6.000000
75%        7.000000
max        9.500000
Name: air_pollution_score, dtype: float64

In [6]:
# Bin edges that will be used to "cut" the data into groups
bin_edges = [8.0, 15.0, 17.0, 20.0, 48.0] # Fill in this list with five values you just found

In [7]:
# Labels for the four city_mpg level groups
bin_names = ['low', 'medium', 'mid_high', 'high'] # Name each city_mpg level category

#### Pandas cut
pandas [cut](https://pandas.pydata.org/docs/reference/api/pandas.cut.html) allows you to bucket or group values from a Dataframe column and label them with provided names.

In [8]:
# Creates city_mpg_levels column
df_08['city_mpg_levels'] = pd.cut(df_08['city_mpg'], bin_edges, labels=bin_names)

# Checks for successful creation of this column
df_08.tail()

Unnamed: 0,model,displ,cyl,trans,drive,fuel,veh_class,air_pollution_score,city_mpg,hwy_mpg,cmb_mpg,greenhouse_gas_score,smartway,city_mpg_levels
982,VOLVO XC 90,3.2,6,Auto-S6,2WD,Gasoline,SUV,7.0,14.0,20.0,16.0,4,no,low
983,VOLVO XC 90,3.2,6,Auto-S6,4WD,Gasoline,SUV,7.0,14.0,20.0,16.0,4,no,low
984,VOLVO XC 90,4.4,8,Auto-S6,4WD,Gasoline,SUV,7.0,13.0,19.0,15.0,3,no,low
985,MERCEDES-BENZ C300,3.0,6,Auto-L7,2WD,ethanol,small car,6.0,13.0,19.0,15.0,7,no,low
986,MERCEDES-BENZ C300,3.0,6,Auto-L7,2WD,gas,small car,4.0,18.0,25.0,21.0,6,no,mid_high


In [9]:
# Find the mean air_pollution_score of each city_mpg level with groupby
df_08.groupby('city_mpg_levels').mean(numeric_only=True).air_pollution_score

city_mpg_levels
low         6.327744
medium      6.578049
mid_high    6.835227
high        7.339572
Name: air_pollution_score, dtype: float64