# 0) Warmup

In [2]:
!ls groupby_and_aggregation

ls: 无法访问 'groupby_and_aggregation': 没有那个文件或目录


In [3]:
import pandas as pd
import numpy as np

df = pd.read_csv('./data/large_countries_2015.csv', index_col=0)

df['population'] = df['population'] / 1000000
# 小数点
df['population'] = round(df['population'], 1)

In [7]:
df.head()

Unnamed: 0,population,fertility,continent
Bangladesh,161.0,2.12,Asia
Brazil,207.8,1.78,South America
China,1376.0,1.57,Asia
India,1311.1,2.43,Asia
Indonesia,257.6,2.28,Asia


## 0.1) Calculate the average population size of countries in the dataset

In [4]:
print(f'The average population size of large countries in 2015 was {round(df.population.mean(), 2)} million')

The average population size of large countries in 2015 was 375.35 million


## 0.2) Calculate the average fertility rate in the dataset

In [5]:
# If no person dies, 
# what would be the number of people living in large countries in the next generation?
df['pop_next_gen'] = df['population'] + df['population'] * 0.5 * df['fertility']
df.head()

Unnamed: 0,population,fertility,continent,pop_next_gen
Bangladesh,161.0,2.12,Asia,331.66
Brazil,207.8,1.78,South America,392.742
China,1376.0,1.57,Asia,2456.16
India,1311.1,2.43,Asia,2904.0865
Indonesia,257.6,2.28,Asia,551.264


In [6]:
population = round(df['pop_next_gen'].sum(), 2)
print(f'The sum of people living in large countries in the next generation is {population}')

The sum of people living in large countries in the next generation is 9459.89


In [7]:
# The average fertility rate would be the fertility rate that would lead to exactly this number
# if we applied it to every country
df['pop_next_arithmetic_mean'] = df['population'] + df['population'] * 0.5 * df['fertility'].mean()
df.head()

Unnamed: 0,population,fertility,continent,pop_next_gen,pop_next_arithmetic_mean
Bangladesh,161.0,2.12,Asia,331.66,357.21875
Brazil,207.8,1.78,South America,392.742,461.05625
China,1376.0,1.57,Asia,2456.16,3053.0
India,1311.1,2.43,Asia,2904.0865,2909.003125
Indonesia,257.6,2.28,Asia,551.264,571.55


In [8]:
population_arithmetic_mean = round(df['pop_next_arithmetic_mean'].sum(), 2)
print(f'The sum of people living in large countries in the next generation is {population_arithmetic_mean}')

The sum of people living in large countries in the next generation is 9993.69


In [9]:
# Taking the arithmetic mean does not provide us with the right number of people in the next 
# generation
# The reason is that the influence of countries du to differences in population size are not 
# taken into account by the arithmetic mean

In [10]:
# Weighted arithmetic mean
# Weights will be the population size
weighted_average_fertility = np.average(df.fertility, weights=df.population)

In [11]:
df['pop_next_weighted_arithmetic_mean'] = df['population'] + df['population'] * 0.5 * weighted_average_fertility
df.head()

Unnamed: 0,population,fertility,continent,pop_next_gen,pop_next_arithmetic_mean,pop_next_weighted_arithmetic_mean
Bangladesh,161.0,2.12,Asia,331.66,357.21875,338.138354
Brazil,207.8,1.78,South America,392.742,461.05625,436.429503
China,1376.0,1.57,Asia,2456.16,3053.0,2889.927794
India,1311.1,2.43,Asia,2904.0865,2909.003125,2753.622333
Indonesia,257.6,2.28,Asia,551.264,571.55,541.021366


In [12]:
population_weighted_arithmetic_mean = round(df['pop_next_weighted_arithmetic_mean'].sum(), 2)
print(f'The sum of people living in large countries in the next generation is {population_weighted_arithmetic_mean}')

The sum of people living in large countries in the next generation is 9459.89


Now imagine that we are interested in the averages per continent. How can we do that?

# 1) Groupby and aggregation

Today we are going to talk about 3 different but related topics.

1. Aggregate functions in pandas
2. The groupby method of a pandas.DataFrame
3. Applying functions or transformations to pandas.Series

## 1.1) Aggregate functions in pandas

**Aggregate function**: takes multiple rows as input and returns a single value

In [13]:
df.mean()

population                           375.350000
fertility                              2.437500
pop_next_gen                         788.324417
pop_next_arithmetic_mean             832.807813
pop_next_weighted_arithmetic_mean    788.324417
dtype: float64

In [14]:
df[['population', 'fertility']].sum()

population    4504.20
fertility       29.25
dtype: float64

In [15]:
df.count()

population                           12
fertility                            12
continent                            12
pop_next_gen                         12
pop_next_arithmetic_mean             12
pop_next_weighted_arithmetic_mean    12
dtype: int64

In [16]:
df.max()

population                                    1376
fertility                                     5.89
continent                            South America
pop_next_gen                               2904.09
pop_next_arithmetic_mean                      3053
pop_next_weighted_arithmetic_mean          2889.93
dtype: object

In [17]:
df.min()

population                             100.7
fertility                               1.45
continent                             Africa
pop_next_gen                         218.385
pop_next_arithmetic_mean             223.428
pop_next_weighted_arithmetic_mean    211.494
dtype: object

In [18]:
df.median()

population                           185.550000
fertility                              2.125000
pop_next_gen                         434.385000
pop_next_arithmetic_mean             411.689062
pop_next_weighted_arithmetic_mean    389.699202
dtype: float64

In [19]:
df.quantile(.1)

population                           126.640000
fertility                              1.574000
pop_next_gen                         251.570450
pop_next_arithmetic_mean             280.982500
pop_next_weighted_arithmetic_mean    265.974168
Name: 0.1, dtype: float64

In [20]:
df.prod()

population                           4.000891e+28
fertility                            1.715701e+04
pop_next_gen                         4.163947e+32
pop_next_arithmetic_mean             5.694520e+32
pop_next_weighted_arithmetic_mean    2.946995e+32
dtype: float64

In [21]:
df.std()

population                            456.517642
fertility                               1.200781
pop_next_gen                          903.267856
pop_next_arithmetic_mean             1012.898518
pop_next_weighted_arithmetic_mean     958.795800
dtype: float64

In [None]:
df.var()

In [25]:
df.sem()

population                           131.785292
fertility                              0.346636
pop_next_gen                         260.750970
pop_next_arithmetic_mean             292.398616
pop_next_weighted_arithmetic_mean    276.780507
dtype: float64

In [33]:
df.describe()

Unnamed: 0,population,fertility,pop_next_gen,pop_next_arithmetic_mean,pop_next_weighted_arithmetic_mean
count,12.0,12.0,12.0,12.0,12.0
mean,375.35,2.4375,788.324417,832.807813,788.324417
std,456.517642,1.200781,903.267856,1012.898518,958.7958
min,100.7,1.45,218.385,223.428125,211.493989
25%,139.375,1.7375,261.445625,309.238281,292.720702
50%,185.55,2.125,434.385,411.689062,389.699202
75%,273.65,2.5675,658.7745,607.160937,574.730189
max,1376.0,5.89,2904.0865,3053.0,2889.927794


In [34]:
# apply a customized set of aggregate functions
#SEM：标准误（Standard Error of Mean）
df.agg(['count','mean', 'std','sem'])

Unnamed: 0,population,fertility,continent,pop_next_gen,pop_next_arithmetic_mean,pop_next_weighted_arithmetic_mean
count,12.0,12.0,12.0,12.0,12.0,12.0
mean,375.35,2.4375,,788.324417,832.807813,788.324417
std,456.517642,1.200781,,903.267856,1012.898518,958.7958
sem,131.785292,0.346636,,260.75097,292.398616,276.780507


In [24]:
# Aggregation with different functions for different columns
df.agg({'population': 'mean', 'fertility': 'var'})

population    375.350000
fertility       1.441875
dtype: float64

## 1.2) Group By in pandas

Data Aggregation in Python is very closely linked to the ``DataFrame.groupby()`` statement.

- Splitting the data into groups based on some criteria.
- Applying a(n aggregate) function to each group independently.
- Combining the results into a data structure.

![Split-Apply-Combine](split_apply_combine.png)

## 1.2.1) Splitting

In [35]:
df = df[['population', 'fertility', 'continent']]

In [36]:
# Inspect the type of groupby
type(df.groupby('continent'))

pandas.core.groupby.generic.DataFrameGroupBy

In [37]:
# Assign the groupby object to a variable name
g = df.groupby('continent')

In [38]:
# Inspect the attributes and methods of the object
g.groups

{'Africa': Index(['Nigeria'], dtype='object'),
 'Asia': Index(['Bangladesh', 'China', 'India', 'Indonesia', 'Japan', 'Pakistan',
        'Philippines'],
       dtype='object'),
 'Europe': Index(['Russia'], dtype='object'),
 'North America': Index(['Mexico', 'United States'], dtype='object'),
 'South America': Index(['Brazil'], dtype='object')}

In [41]:
# iterrate over the DataFrames
for a, b in g:
    print(a)
    print(b)
    print('\n')

Africa
         population  fertility continent
Nigeria       182.2       5.89    Africa


Asia
             population  fertility continent
Bangladesh        161.0       2.12      Asia
China            1376.0       1.57      Asia
India            1311.1       2.43      Asia
Indonesia         257.6       2.28      Asia
Japan             126.6       1.45      Asia
Pakistan          188.9       3.04      Asia
Philippines       100.7       2.98      Asia


Europe
        population  fertility continent
Russia       143.5       1.61    Europe


North America
               population  fertility      continent
Mexico              127.0       2.13  North America
United States       321.8       1.97  North America


South America
        population  fertility      continent
Brazil       207.8       1.78  South America




In [42]:
# iterrate over the DataFrames
for group, df_group in g:
    print(group)
    print(df_group)
    print('\n')

Africa
         population  fertility continent
Nigeria       182.2       5.89    Africa


Asia
             population  fertility continent
Bangladesh        161.0       2.12      Asia
China            1376.0       1.57      Asia
India            1311.1       2.43      Asia
Indonesia         257.6       2.28      Asia
Japan             126.6       1.45      Asia
Pakistan          188.9       3.04      Asia
Philippines       100.7       2.98      Asia


Europe
        population  fertility continent
Russia       143.5       1.61    Europe


North America
               population  fertility      continent
Mexico              127.0       2.13  North America
United States       321.8       1.97  North America


South America
        population  fertility      continent
Brazil       207.8       1.78  South America




In [43]:
# get the DataFrame of a specific group
g.get_group('Asia')

Unnamed: 0,population,fertility,continent
Bangladesh,161.0,2.12,Asia
China,1376.0,1.57,Asia
India,1311.1,2.43,Asia
Indonesia,257.6,2.28,Asia
Japan,126.6,1.45,Asia
Pakistan,188.9,3.04,Asia
Philippines,100.7,2.98,Asia


There are different ways of splitting a ``pandas.DataFrame``

In [44]:
# 1. By one column
df.groupby('continent')

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7fe191d7f850>

In [45]:
# 2. By two (or more) columns
df.groupby(['continent', 'population'])

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7fe191c5d8b0>

In [46]:
# Apply a aggregate function to it
df.groupby('continent')['population'].mean()

continent
Africa           182.200000
Asia             503.128571
Europe           143.500000
North America    224.400000
South America    207.800000
Name: population, dtype: float64

### More sophisticated applications

``.apply()`` and ``.transform()``

``.transform()`` takes a ``pd.DataFrame`` or a ``pd.Series`` as input, applies a specified function to each ``pd.Series`` it receives and returns a ``pd.DataFrame`` or ``pd.Series`` of equal shape.
> **input:** ``pd.DataFrame`` or ``pd.Series``<br>
> **operates on:** each ``pd.Series``separately<br>
> **output:** ``pd.DataFrame`` or ``pd.Series`` of the same shape

In [47]:
g = df.groupby('continent')

In [48]:
g['population'].mean()

continent
Africa           182.200000
Asia             503.128571
Europe           143.500000
North America    224.400000
South America    207.800000
Name: population, dtype: float64

In [49]:
g['population'].transform('mean')

Bangladesh       503.128571
Brazil           207.800000
China            503.128571
India            503.128571
Indonesia        503.128571
Japan            503.128571
Mexico           224.400000
Nigeria          182.200000
Pakistan         503.128571
Philippines      503.128571
Russia           143.500000
United States    224.400000
Name: population, dtype: float64

In [52]:
# Transform can be incredibly powerful to create new columns - or in order to fill in missing values with .fillna()
df['continent_population'] = g['population'].transform('sum')
df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['continent_population'] = g['population'].transform('sum')


Unnamed: 0,population,fertility,continent,continent_population,weighted_fertility
Bangladesh,161.0,2.12,Asia,3521.9,0.096914
Brazil,207.8,1.78,South America,207.8,1.78
China,1376.0,1.57,Asia,3521.9,0.613396
India,1311.1,2.43,Asia,3521.9,0.904618
Indonesia,257.6,2.28,Asia,3521.9,0.166765


In [53]:
df['weighted_fertility'] = df['fertility'] * df['population'] / df['continent_population']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['weighted_fertility'] = df['fertility'] * df['population'] / df['continent_population']


In [54]:
df.head()

Unnamed: 0,population,fertility,continent,continent_population,weighted_fertility
Bangladesh,161.0,2.12,Asia,3521.9,0.096914
Brazil,207.8,1.78,South America,207.8,1.78
China,1376.0,1.57,Asia,3521.9,0.613396
India,1311.1,2.43,Asia,3521.9,0.904618
Indonesia,257.6,2.28,Asia,3521.9,0.166765


In [55]:
df.groupby('continent')['weighted_fertility'].sum()

continent
Africa           5.890000
Asia             2.082073
Europe           1.610000
North America    2.015276
South America    1.780000
Name: weighted_fertility, dtype: float64

``.apply()`` takes a ``pd.DataFrame`` or a ``pd.Series`` as input, applies a specified function, either to the ``pd.DataFrame`` or to each ``pd.Series``. It returns, depending on the specified funtion, a ``pd.DataFrame`` of flexible size, a ``pd.Series`` or a scalar.
> **input:** ``pd.DataFrame``<br>
> **operates on:** the whole ``pd.DataFrame`` or each ``pd.Series``, depending on the function<br>
> **output:** ``pd.DataFrame``, ``pd.Series`` or scalar

In [56]:
def weighted_average(df):
    '''Calculates the weighted average fertility'''
    weights = df['population']/df['population'].sum()
    weighted_fertility = df['fertility'] * weights
    weighted_average_fertility = weighted_fertility.sum()
    return weighted_average_fertility

In [57]:
weighted_average(df)

2.2004764442076286

In [60]:
df.groupby('continent').apply(weighted_average) # .groupby().apply(name_of_function)

continent
Africa           5.890000
Asia             2.082073
Europe           1.610000
North America    2.015276
South America    1.780000
dtype: float64

### Combine

Nothing to do here. Pandas is doing that for us.