# Data Manipulation - Groups and Joins

In [1]:
import pandas as pd

df = pd.DataFrame({'nome':['Edmar','Edmar','Edmar','Dayana','Edmar', 'Edmar', 'Rai','Vamp', 'Vamp', 'Dayana'],
                   'nota' :[9, 7, 9, 7, 9, 7, 8, 10, 9, 8]})


df

Unnamed: 0,nome,nota
0,Edmar,9
1,Edmar,7
2,Edmar,9
3,Dayana,7
4,Edmar,9
5,Edmar,7
6,Rai,8
7,Vamp,10
8,Vamp,9
9,Dayana,8


In [3]:
df['nome'] == 'Edmar'

0     True
1     True
2     True
3    False
4     True
5     True
6    False
7    False
8    False
9    False
Name: nome, dtype: bool

In [4]:
mask = df['nome'] == 'Edmar'
mask

0     True
1     True
2     True
3    False
4     True
5     True
6    False
7    False
8    False
9    False
Name: nome, dtype: bool

In [5]:
df.loc[mask, :]

Unnamed: 0,nome,nota
0,Edmar,9
1,Edmar,7
2,Edmar,9
4,Edmar,9
5,Edmar,7


In [6]:
df.loc[mask, :].mean()

nota    8.2
dtype: float64

In [7]:
mask = df['nome'] == 'Dayana'
mask

0    False
1    False
2    False
3     True
4    False
5    False
6    False
7    False
8    False
9     True
Name: nome, dtype: bool

In [8]:
df.loc[mask, :]

Unnamed: 0,nome,nota
3,Dayana,7
9,Dayana,8


In [9]:
df.loc[mask, :].mean()

nota    7.5
dtype: float64

In [10]:
df.nome.unique()

array(['Edmar', 'Dayana', 'Rai', 'Vamp'], dtype=object)

# Como fazer essa operação para todos os nomes únicos?

- `.groupby()` é uma forma de **agregar** todos os resultados para cada chave única
- sempre que você faz uma **agregação**, o resultado final terá 1 linha para cada valor pelo qual você agregou, portanto, é obrigatório que se aplique uma função agregadora para que todos os valores sejam sumarizados em um único valor associado àquela chave.

Por exemplo, se tivermos:

Nome | Nota
-----|-----
Andre | 10
Andre | 8 
Andre | 6
Joao  | 10
Joao  | 4

O resultado de um `.groupby` por 'Nome' resultaria em 2 linhas

Nome | xxxx
----|-----
Andre| *
Joao | *

O asterisco representa o valor agregado. Isto é, não há como trazer os valores 10, 8 e 6 associados à Andre. Temos, obrigatoriamente, que sumarizá-los em um único dado. Para isso, podemos fazer a média entre 10,8,6 (que seria 8), a soma (que seria 24), ou qualquer outra função agregadora. Assim teríamos um único valor sumarizado para a chave 'Nome'

In [11]:
df.groupby(by='nome')

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x0000017C1C8F1F40>

In [21]:
df.groupby(by='nome').mean()

Unnamed: 0_level_0,nota
nome,Unnamed: 1_level_1
Dayana,7.5
Edmar,8.2
Rai,8.0
Vamp,9.5


In [22]:
df.groupby(by='nome').max()

Unnamed: 0_level_0,nota
nome,Unnamed: 1_level_1
Dayana,8
Edmar,9
Rai,8
Vamp,10


In [23]:
df.groupby(by='nome').min()

Unnamed: 0_level_0,nota
nome,Unnamed: 1_level_1
Dayana,7
Edmar,7
Rai,8
Vamp,9


## Aggregating methods

- `.mean()`
- `.median()`
- `.max()`
- `.min()`
- `.sum()`
- `.count()`
- `.describe()`
- `.agg()`

### More than one aggregation

In [24]:
# read the `.agg()` help (shift+TAB) to learn which aggregation methods it can handle

df.groupby(by='nome').agg({'max', 'min', 'mean'})

Unnamed: 0_level_0,nota,nota,nota
Unnamed: 0_level_1,mean,min,max
nome,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
Dayana,7.5,7,8
Edmar,8.2,7,9
Rai,8.0,8,8
Vamp,9.5,9,10


In [19]:
df.groupby(by='nome').agg({'max', 'min', 'mean'})

Unnamed: 0_level_0,nota,nota,nota
Unnamed: 0_level_1,mean,min,max
nome,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
Dayana,7.5,7,8
Edmar,8.2,7,9
Rai,8.0,8,8
Vamp,9.5,9,10


In [39]:
df_grouped = df_grouped.reset_index()

AttributeError: 'function' object has no attribute 'reset_index'

In [38]:
df_grouped

<bound method DataFrame.reset_index of         nota_max  nota_min  nota_avg
nome                                
Dayana         8         7       7.5
Edmar          9         7       8.2
Rai            8         8       8.0
Vamp          10         9       9.5>

## Named aggregation

In [27]:
pd.__version__

'1.0.5'

In [43]:
df_grouped = df.groupby(by='nome', as_index=False).agg(nota_max = ('nota','max'), 
                          nota_min = ('nota', min),
                          nota_avg = ('nota','mean'))
df_grouped

Unnamed: 0,nota_max,nota_min,nota_avg
0,Dayana,8,7
1,Edmar,9,7
2,Rai,8,8
3,Vamp,10,9


# Group by 
>    - Aggregating function
>    - Named aggregation
>    - `as_index = False`

In [50]:
data = pd.read_csv('vehicles.csv')
data.rename(columns={'Make':'Manufacturer'}, inplace=True)

In [51]:
data.head()

Unnamed: 0,Manufacturer,Model,Year,Engine Displacement,Cylinders,Transmission,Drivetrain,Vehicle Class,Fuel Type,Fuel Barrels/Year,City MPG,Highway MPG,Combined MPG,CO2 Emission Grams/Mile,Fuel Cost/Year
0,AM General,DJ Po Vehicle 2WD,1984,2.5,4.0,Automatic 3-spd,2-Wheel Drive,Special Purpose Vehicle 2WD,Regular,19.388824,18,17,17,522.764706,1950
1,AM General,FJ8c Post Office,1984,4.2,6.0,Automatic 3-spd,2-Wheel Drive,Special Purpose Vehicle 2WD,Regular,25.354615,13,13,13,683.615385,2550
2,AM General,Post Office DJ5 2WD,1985,2.5,4.0,Automatic 3-spd,Rear-Wheel Drive,Special Purpose Vehicle 2WD,Regular,20.600625,16,17,16,555.4375,2100
3,AM General,Post Office DJ8 2WD,1985,4.2,6.0,Automatic 3-spd,Rear-Wheel Drive,Special Purpose Vehicle 2WD,Regular,25.354615,13,13,13,683.615385,2550
4,ASC Incorporated,GNX,1987,3.8,6.0,Automatic 4-spd,Rear-Wheel Drive,Midsize Cars,Premium,20.600625,14,21,16,555.4375,2550


In [53]:
data.groupby(by='Manufacturer')

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x0000017C1FF6B790>

In [54]:
data.groupby(by='Manufacturer').mean()

Unnamed: 0_level_0,Year,Engine Displacement,Cylinders,Fuel Barrels/Year,City MPG,Highway MPG,Combined MPG,CO2 Emission Grams/Mile,Fuel Cost/Year
Manufacturer,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
AM General,1984.500000,3.350000,5.000000,22.674670,15.000000,15.000000,14.750000,611.358244,2287.500000
ASC Incorporated,1987.000000,3.800000,6.000000,20.600625,14.000000,21.000000,16.000000,555.437500,2550.000000
Acura,2003.493377,2.834768,5.231788,15.673371,18.890728,25.940397,21.506623,422.585325,1852.483444
Alfa Romeo,1991.878049,2.556098,5.317073,17.208234,17.097561,23.902439,19.512195,463.952115,1962.195122
American Motors Corporation,1984.590909,3.813636,5.545455,18.758092,16.045455,20.181818,17.681818,505.758823,1893.181818
...,...,...,...,...,...,...,...,...,...
Volkswagen,2002.928367,2.236008,4.595033,14.594784,21.226361,28.985673,24.093601,392.741721,1579.417383
Volvo,2002.182706,2.504742,4.945607,16.186996,17.981869,25.064156,20.605300,435.803755,1812.273361
Wallace Environmental,1991.500000,4.315625,7.812500,24.404196,12.437500,16.000000,13.875000,657.990029,2996.875000
Yugo,1988.375000,1.200000,4.000000,13.206218,23.000000,28.250000,25.000000,356.068256,1350.000000


In [57]:
data.groupby(by='Manufacturer').mean()[['Engine Displacement']].reset_index()

Unnamed: 0,Manufacturer,Engine Displacement
0,AM General,3.350000
1,ASC Incorporated,3.800000
2,Acura,2.834768
3,Alfa Romeo,2.556098
4,American Motors Corporation,3.813636
...,...,...
122,Volkswagen,2.236008
123,Volvo,2.504742
124,Wallace Environmental,4.315625
125,Yugo,1.200000


In [58]:
data.Drivetrain.unique()

array(['2-Wheel Drive', 'Rear-Wheel Drive', 'Front-Wheel Drive',
       '4-Wheel or All-Wheel Drive', 'All-Wheel Drive', '4-Wheel Drive',
       'Part-time 4-Wheel Drive', '2-Wheel Drive, Front'], dtype=object)

In [59]:
data.groupby(by='Cylinders')

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x0000017C25A3DAC0>

In [60]:
data.groupby(by='Cylinders').mean()

Unnamed: 0_level_0,Year,Engine Displacement,Fuel Barrels/Year,City MPG,Highway MPG,Combined MPG,CO2 Emission Grams/Mile,Fuel Cost/Year
Cylinders,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2.0,1997.0625,1.239583,17.503468,17.0,22.9375,19.104167,471.734739,2004.166667
3.0,1996.38806,1.052239,9.160623,34.00995,40.323383,36.572139,246.695246,962.437811
4.0,1999.780643,2.06657,14.120702,21.560323,28.272417,24.075441,380.939902,1487.879798
5.0,2002.64177,2.636653,16.514187,17.785615,24.68603,20.334716,444.828844,1813.278008
6.0,2001.294242,3.439342,18.086572,16.328946,22.661261,18.606189,487.609906,1943.19624
8.0,2000.77832,5.222581,22.3254,13.323331,18.537134,15.206302,604.159066,2414.734934
10.0,2008.777778,5.911765,24.182393,11.653595,18.366013,13.941176,652.086493,2926.797386
12.0,2006.218861,5.907473,25.831975,10.893238,16.969751,13.014235,696.034399,3143.149466
16.0,2011.125,8.0,32.961,8.0,14.625,10.0,873.0625,4050.0


In [61]:
data.groupby(by='Cylinders').mean()[['Fuel Cost/Year']]

Unnamed: 0_level_0,Fuel Cost/Year
Cylinders,Unnamed: 1_level_1
2.0,2004.166667
3.0,962.437811
4.0,1487.879798
5.0,1813.278008
6.0,1943.19624
8.0,2414.734934
10.0,2926.797386
12.0,3143.149466
16.0,4050.0


In [62]:
data.groupby(by='Cylinders').agg(avg_fuel = ('Fuel Cost/Year', 'mean'),
                                 qtd_fuel = ('Fuel Cost/Year', 'count'))

Unnamed: 0_level_0,avg_fuel,qtd_fuel
Cylinders,Unnamed: 1_level_1,Unnamed: 2_level_1
2.0,2004.166667,48
3.0,962.437811,201
4.0,1487.879798,13494
5.0,1813.278008,723
6.0,1943.19624,12765
8.0,2414.734934,7998
10.0,2926.797386,153
12.0,3143.149466,562
16.0,4050.0,8


In [63]:
data.groupby(by='Cylinders').agg(avg_fuel = ('Fuel Cost/Year', 'mean'),
                                 median_fuel = ('Fuel Cost/Year', 'median')).reset_index()

Unnamed: 0,Cylinders,avg_fuel,median_fuel
0,2.0,2004.166667,1950
1,3.0,962.437811,1000
2,4.0,1487.879798,1450
3,5.0,1813.278008,1850
4,6.0,1943.19624,1950
5,8.0,2414.734934,2400
6,10.0,2926.797386,2900
7,12.0,3143.149466,3100
8,16.0,4050.0,4050


In [64]:
avg_fuel = data.groupby(by='Cylinders').agg(avg_fuel = ('Fuel Cost/Year', 'mean'),
                                            median_fuel = ('Fuel Cost/Year', 'median')).reset_index()

In [65]:
avg_mpg_two_keys = data.groupby(by=['Year','Cylinders'], as_index=False).mean()[['Year','Cylinders', 'Combined MPG']]

In [66]:
avg_mpg_two_keys

Unnamed: 0,Year,Cylinders,Combined MPG
0,1984,4.0,21.333333
1,1984,6.0,15.838235
2,1984,8.0,13.848101
3,1985,2.0,17.000000
4,1985,3.0,39.000000
...,...,...,...
249,2017,4.0,27.233766
250,2017,6.0,21.171004
251,2017,8.0,17.568750
252,2017,10.0,16.285714


-----

# JOINs

How to merge dataframes based on a specific column

In [67]:
df

Unnamed: 0,nome,nota
0,Edmar,9
1,Edmar,7
2,Edmar,9
3,Dayana,7
4,Edmar,9
5,Edmar,7
6,Rai,8
7,Vamp,10
8,Vamp,9
9,Dayana,8


In [68]:
df_estados = pd.DataFrame({'nome':['Andre', 'Rai','Edmar','Dayana','Rodrigo'], 
                           'estado':['SP','DF','SP','BA','SP']})

df_estados

Unnamed: 0,nome,estado
0,Andre,SP
1,Rai,DF
2,Edmar,SP
3,Dayana,BA
4,Rodrigo,SP


In [69]:
pd.merge(left=df, right=df_estados, on='nome')

Unnamed: 0,nome,nota,estado
0,Edmar,9,SP
1,Edmar,7,SP
2,Edmar,9,SP
3,Edmar,9,SP
4,Edmar,7,SP
5,Dayana,7,BA
6,Dayana,8,BA
7,Rai,8,DF


## Types of Joins

![image-asset.png](data/image.png)

In [71]:
pd.merge(left=df, right=df_estados, on='nome', how='left')

Unnamed: 0,nome,nota,estado
0,Edmar,9,SP
1,Edmar,7,SP
2,Edmar,9,SP
3,Dayana,7,BA
4,Edmar,9,SP
5,Edmar,7,SP
6,Rai,8,DF
7,Vamp,10,
8,Vamp,9,
9,Dayana,8,BA


In [72]:
pd.merge(left=df, right=df_estados, on='nome', how='outer')

Unnamed: 0,nome,nota,estado
0,Edmar,9.0,SP
1,Edmar,7.0,SP
2,Edmar,9.0,SP
3,Edmar,9.0,SP
4,Edmar,7.0,SP
5,Dayana,7.0,BA
6,Dayana,8.0,BA
7,Rai,8.0,DF
8,Vamp,10.0,
9,Vamp,9.0,


## Using our vehicles dataframe

In [73]:
data

Unnamed: 0,Manufacturer,Model,Year,Engine Displacement,Cylinders,Transmission,Drivetrain,Vehicle Class,Fuel Type,Fuel Barrels/Year,City MPG,Highway MPG,Combined MPG,CO2 Emission Grams/Mile,Fuel Cost/Year
0,AM General,DJ Po Vehicle 2WD,1984,2.5,4.0,Automatic 3-spd,2-Wheel Drive,Special Purpose Vehicle 2WD,Regular,19.388824,18,17,17,522.764706,1950
1,AM General,FJ8c Post Office,1984,4.2,6.0,Automatic 3-spd,2-Wheel Drive,Special Purpose Vehicle 2WD,Regular,25.354615,13,13,13,683.615385,2550
2,AM General,Post Office DJ5 2WD,1985,2.5,4.0,Automatic 3-spd,Rear-Wheel Drive,Special Purpose Vehicle 2WD,Regular,20.600625,16,17,16,555.437500,2100
3,AM General,Post Office DJ8 2WD,1985,4.2,6.0,Automatic 3-spd,Rear-Wheel Drive,Special Purpose Vehicle 2WD,Regular,25.354615,13,13,13,683.615385,2550
4,ASC Incorporated,GNX,1987,3.8,6.0,Automatic 4-spd,Rear-Wheel Drive,Midsize Cars,Premium,20.600625,14,21,16,555.437500,2550
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35947,smart,fortwo coupe,2013,1.0,3.0,Auto(AM5),Rear-Wheel Drive,Two Seaters,Premium,9.155833,34,38,36,244.000000,1100
35948,smart,fortwo coupe,2014,1.0,3.0,Auto(AM5),Rear-Wheel Drive,Two Seaters,Premium,9.155833,34,38,36,243.000000,1100
35949,smart,fortwo coupe,2015,1.0,3.0,Auto(AM5),Rear-Wheel Drive,Two Seaters,Premium,9.155833,34,38,36,244.000000,1100
35950,smart,fortwo coupe,2016,0.9,3.0,Auto(AM6),Rear-Wheel Drive,Two Seaters,Premium,9.155833,34,39,36,246.000000,1100


In [74]:
avg_fuel

Unnamed: 0,Cylinders,avg_fuel,median_fuel
0,2.0,2004.166667,1950
1,3.0,962.437811,1000
2,4.0,1487.879798,1450
3,5.0,1813.278008,1850
4,6.0,1943.19624,1950
5,8.0,2414.734934,2400
6,10.0,2926.797386,2900
7,12.0,3143.149466,3100
8,16.0,4050.0,4050


In [75]:
pd.merge(left=data, right=avg_fuel, on='Cylinders')

Unnamed: 0,Manufacturer,Model,Year,Engine Displacement,Cylinders,Transmission,Drivetrain,Vehicle Class,Fuel Type,Fuel Barrels/Year,City MPG,Highway MPG,Combined MPG,CO2 Emission Grams/Mile,Fuel Cost/Year,avg_fuel,median_fuel
0,AM General,DJ Po Vehicle 2WD,1984,2.5,4.0,Automatic 3-spd,2-Wheel Drive,Special Purpose Vehicle 2WD,Regular,19.388824,18,17,17,522.764706,1950,1487.879798,1450
1,AM General,Post Office DJ5 2WD,1985,2.5,4.0,Automatic 3-spd,Rear-Wheel Drive,Special Purpose Vehicle 2WD,Regular,20.600625,16,17,16,555.437500,2100,1487.879798,1450
2,Acura,2.2CL/3.0CL,1997,2.2,4.0,Automatic 4-spd,Front-Wheel Drive,Subcompact Cars,Regular,14.982273,20,26,22,403.954545,1500,1487.879798,1450
3,Acura,2.2CL/3.0CL,1997,2.2,4.0,Manual 5-spd,Front-Wheel Drive,Subcompact Cars,Regular,13.733750,22,28,24,370.291667,1400,1487.879798,1450
4,Acura,2.3CL/3.0CL,1998,2.3,4.0,Automatic 4-spd,Front-Wheel Drive,Subcompact Cars,Regular,14.982273,19,27,22,403.954545,1500,1487.879798,1450
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35947,Bugatti,Veyron,2011,8.0,16.0,Automatic (S7),All-Wheel Drive,Two Seaters,Premium,32.961000,8,15,10,888.700000,4050,4050.000000,4050
35948,Bugatti,Veyron,2012,8.0,16.0,Automatic (S7),All-Wheel Drive,Two Seaters,Premium,32.961000,8,15,10,888.700000,4050,4050.000000,4050
35949,Bugatti,Veyron,2013,8.0,16.0,Auto(AM-S7),All-Wheel Drive,Two Seaters,Premium,32.961000,8,15,10,847.000000,4050,4050.000000,4050
35950,Bugatti,Veyron,2014,8.0,16.0,Auto(AM-S7),All-Wheel Drive,Two Seaters,Premium,32.961000,8,15,10,847.000000,4050,4050.000000,4050


## What if we had different names?

In [76]:
avg_fuel.rename(columns={'Cylinders':'cyl'}, inplace=True)

In [77]:
avg_fuel.head(2)

Unnamed: 0,cyl,avg_fuel,median_fuel
0,2.0,2004.166667,1950
1,3.0,962.437811,1000


In [78]:
data.head(2)

Unnamed: 0,Manufacturer,Model,Year,Engine Displacement,Cylinders,Transmission,Drivetrain,Vehicle Class,Fuel Type,Fuel Barrels/Year,City MPG,Highway MPG,Combined MPG,CO2 Emission Grams/Mile,Fuel Cost/Year
0,AM General,DJ Po Vehicle 2WD,1984,2.5,4.0,Automatic 3-spd,2-Wheel Drive,Special Purpose Vehicle 2WD,Regular,19.388824,18,17,17,522.764706,1950
1,AM General,FJ8c Post Office,1984,4.2,6.0,Automatic 3-spd,2-Wheel Drive,Special Purpose Vehicle 2WD,Regular,25.354615,13,13,13,683.615385,2550


In [79]:
pd.merge(left=data, right=avg_fuel, left_on='Cylinders', right_on='cyl')

Unnamed: 0,Manufacturer,Model,Year,Engine Displacement,Cylinders,Transmission,Drivetrain,Vehicle Class,Fuel Type,Fuel Barrels/Year,City MPG,Highway MPG,Combined MPG,CO2 Emission Grams/Mile,Fuel Cost/Year,cyl,avg_fuel,median_fuel
0,AM General,DJ Po Vehicle 2WD,1984,2.5,4.0,Automatic 3-spd,2-Wheel Drive,Special Purpose Vehicle 2WD,Regular,19.388824,18,17,17,522.764706,1950,4.0,1487.879798,1450
1,AM General,Post Office DJ5 2WD,1985,2.5,4.0,Automatic 3-spd,Rear-Wheel Drive,Special Purpose Vehicle 2WD,Regular,20.600625,16,17,16,555.437500,2100,4.0,1487.879798,1450
2,Acura,2.2CL/3.0CL,1997,2.2,4.0,Automatic 4-spd,Front-Wheel Drive,Subcompact Cars,Regular,14.982273,20,26,22,403.954545,1500,4.0,1487.879798,1450
3,Acura,2.2CL/3.0CL,1997,2.2,4.0,Manual 5-spd,Front-Wheel Drive,Subcompact Cars,Regular,13.733750,22,28,24,370.291667,1400,4.0,1487.879798,1450
4,Acura,2.3CL/3.0CL,1998,2.3,4.0,Automatic 4-spd,Front-Wheel Drive,Subcompact Cars,Regular,14.982273,19,27,22,403.954545,1500,4.0,1487.879798,1450
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35947,Bugatti,Veyron,2011,8.0,16.0,Automatic (S7),All-Wheel Drive,Two Seaters,Premium,32.961000,8,15,10,888.700000,4050,16.0,4050.000000,4050
35948,Bugatti,Veyron,2012,8.0,16.0,Automatic (S7),All-Wheel Drive,Two Seaters,Premium,32.961000,8,15,10,888.700000,4050,16.0,4050.000000,4050
35949,Bugatti,Veyron,2013,8.0,16.0,Auto(AM-S7),All-Wheel Drive,Two Seaters,Premium,32.961000,8,15,10,847.000000,4050,16.0,4050.000000,4050
35950,Bugatti,Veyron,2014,8.0,16.0,Auto(AM-S7),All-Wheel Drive,Two Seaters,Premium,32.961000,8,15,10,847.000000,4050,16.0,4050.000000,4050


# How to concatenate dataframes?

In [80]:
small_cars = avg_fuel.loc[avg_fuel['cyl'] < 7, :]

In [81]:
big_cars = avg_fuel.loc[avg_fuel['cyl'] >= 7, :]

In [82]:
small_cars

Unnamed: 0,cyl,avg_fuel,median_fuel
0,2.0,2004.166667,1950
1,3.0,962.437811,1000
2,4.0,1487.879798,1450
3,5.0,1813.278008,1850
4,6.0,1943.19624,1950


In [83]:
big_cars

Unnamed: 0,cyl,avg_fuel,median_fuel
5,8.0,2414.734934,2400
6,10.0,2926.797386,2900
7,12.0,3143.149466,3100
8,16.0,4050.0,4050


In [87]:
pd.concat([small_cars, big_cars], axis=1).reset_index(drop=True)

Unnamed: 0,cyl,avg_fuel,median_fuel,cyl.1,avg_fuel.1,median_fuel.1
0,2.0,2004.166667,1950.0,,,
1,3.0,962.437811,1000.0,,,
2,4.0,1487.879798,1450.0,,,
3,5.0,1813.278008,1850.0,,,
4,6.0,1943.19624,1950.0,,,
5,,,,8.0,2414.734934,2400.0
6,,,,10.0,2926.797386,2900.0
7,,,,12.0,3143.149466,3100.0
8,,,,16.0,4050.0,4050.0


# Bins
> ```pd.cut``` vs ```pd.qcut```
> - Specify cutoffs

> - Use case
>     - Scores ~ decis (0,1,2,3,4,5,6,7,8,9)


Suppose I want to break the values of the variable `Combined MPG` into 5 categories: From Very Low to Very High

In [88]:
data['Fuel Barrels/Year']

0        19.388824
1        25.354615
2        20.600625
3        25.354615
4        20.600625
           ...    
35947     9.155833
35948     9.155833
35949     9.155833
35950     9.155833
35951     9.417429
Name: Fuel Barrels/Year, Length: 35952, dtype: float64

In [89]:
data['Fuel Barrels/Year'].describe()

count    35952.000000
mean        17.609056
std          4.467283
min          0.060000
25%         14.699423
50%         17.347895
75%         20.600625
max         47.087143
Name: Fuel Barrels/Year, dtype: float64

In [90]:
data['cat_barrel_year'] = pd.cut(data['Fuel Barrels/Year'], 5, labels=['MB','B','M','A','MA'])

In [91]:
data.groupby(by='cat_barrel_year').mean()['Engine Displacement']

cat_barrel_year
MB    1.863077
B     2.696333
M     4.642560
A     5.755868
MA    6.000000
Name: Engine Displacement, dtype: float64

In [92]:
data['cat_barrel_year'].value_counts()

B     23804
M     11062
A       605
MB      455
MA       26
Name: cat_barrel_year, dtype: int64

In [95]:
pd.qcut(data['Fuel Barrels/Year'], 5, labels=['MB','B','M','A','MA']).value_counts()

M     9871
B     7196
MB    7192
MA    7080
A     4613
Name: Fuel Barrels/Year, dtype: int64

In [94]:
data.head()

Unnamed: 0,Manufacturer,Model,Year,Engine Displacement,Cylinders,Transmission,Drivetrain,Vehicle Class,Fuel Type,Fuel Barrels/Year,City MPG,Highway MPG,Combined MPG,CO2 Emission Grams/Mile,Fuel Cost/Year,cat_barrel_year
0,AM General,DJ Po Vehicle 2WD,1984,2.5,4.0,Automatic 3-spd,2-Wheel Drive,Special Purpose Vehicle 2WD,Regular,19.388824,18,17,17,522.764706,1950,M
1,AM General,FJ8c Post Office,1984,4.2,6.0,Automatic 3-spd,2-Wheel Drive,Special Purpose Vehicle 2WD,Regular,25.354615,13,13,13,683.615385,2550,M
2,AM General,Post Office DJ5 2WD,1985,2.5,4.0,Automatic 3-spd,Rear-Wheel Drive,Special Purpose Vehicle 2WD,Regular,20.600625,16,17,16,555.4375,2100,M
3,AM General,Post Office DJ8 2WD,1985,4.2,6.0,Automatic 3-spd,Rear-Wheel Drive,Special Purpose Vehicle 2WD,Regular,25.354615,13,13,13,683.615385,2550,M
4,ASC Incorporated,GNX,1987,3.8,6.0,Automatic 4-spd,Rear-Wheel Drive,Midsize Cars,Premium,20.600625,14,21,16,555.4375,2550,M


In [96]:
mpg_labels = ['Very Low', 'Low', 'Moderate', 'High', 'Very High']

In [97]:
# performing the pd.cut operation without specifiyng the labels outputs the intervals
bins = pd.cut(data['Fuel Cost/Year'], 5)
bins

0        (1640.0, 2680.0]
1        (1640.0, 2680.0]
2        (1640.0, 2680.0]
3        (1640.0, 2680.0]
4        (1640.0, 2680.0]
               ...       
35947     (594.8, 1640.0]
35948     (594.8, 1640.0]
35949     (594.8, 1640.0]
35950     (594.8, 1640.0]
35951     (594.8, 1640.0]
Name: Fuel Cost/Year, Length: 35952, dtype: category
Categories (5, interval[float64]): [(594.8, 1640.0] < (1640.0, 2680.0] < (2680.0, 3720.0] < (3720.0, 4760.0] < (4760.0, 5800.0]]

In [98]:
# performing the pd.cut operation using the labels argument outputs your labels
bins = pd.cut(data['Fuel Cost/Year'], 5, labels=mpg_labels)
bins.head(10)

0         Low
1         Low
2         Low
3         Low
4         Low
5    Very Low
6    Very Low
7         Low
8    Very Low
9    Very Low
Name: Fuel Cost/Year, dtype: category
Categories (5, object): [Very Low < Low < Moderate < High < Very High]

In [99]:
bins = pd.qcut(data['Fuel Cost/Year'],5, labels=mpg_labels)
bins.head(10)

0     Moderate
1    Very High
2         High
3    Very High
4    Very High
5          Low
6     Very Low
7          Low
8          Low
9     Very Low
Name: Fuel Cost/Year, dtype: category
Categories (5, object): [Very Low < Low < Moderate < High < Very High]

In [100]:
bins.value_counts()

Low          8827
Very Low     7646
High         7486
Very High    6210
Moderate     5783
Name: Fuel Cost/Year, dtype: int64

In [None]:
cutoffs = [1000,1500,2000,2500,3000,3500]
bins = pd.cut(data['Fuel Cost/Year'], cutoffs, labels=mpg_labels)
bins.head(10)

In [None]:
bins.value_counts(sort=False)

# Convert categorical variables columns

>    - dummies
>    - One hot encoding

In [101]:
data[['cat_barrel_year']]
# count the values within each category

Unnamed: 0,cat_barrel_year
0,M
1,M
2,M
3,M
4,M
...,...
35947,MB
35948,MB
35949,MB
35950,MB


In [102]:
data['cat_barrel_year'].unique()

[M, B, MB, A, MA]
Categories (5, object): [MB < B < M < A < MA]

In [103]:
data[['cat_barrel_year']].head()

Unnamed: 0,cat_barrel_year
0,M
1,M
2,M
3,M
4,M


In [104]:
drivetrain = pd.get_dummies(data['cat_barrel_year'])
drivetrain

Unnamed: 0,MB,B,M,A,MA
0,0,0,1,0,0
1,0,0,1,0,0
2,0,0,1,0,0
3,0,0,1,0,0
4,0,0,1,0,0
...,...,...,...,...,...
35947,1,0,0,0,0
35948,1,0,0,0,0
35949,1,0,0,0,0
35950,1,0,0,0,0
