# Setup 

In [2]:
import numpy as np
import pandas as pd
%pip install gapminder
from gapminder import gapminder

# constants 
rows = 10

gapminder.head(rows)

Collecting gapminder
  Downloading gapminder-0.1-py3-none-any.whl (32 kB)
Installing collected packages: gapminder
Successfully installed gapminder-0.1
Note: you may need to restart the kernel to use updated packages.


Unnamed: 0,country,continent,year,lifeExp,pop,gdpPercap
0,Afghanistan,Asia,1952,28.801,8425333,779.445314
1,Afghanistan,Asia,1957,30.332,9240934,820.85303
2,Afghanistan,Asia,1962,31.997,10267083,853.10071
3,Afghanistan,Asia,1967,34.02,11537966,836.197138
4,Afghanistan,Asia,1972,36.088,13079460,739.981106
5,Afghanistan,Asia,1977,38.438,14880372,786.11336
6,Afghanistan,Asia,1982,39.854,12881816,978.011439
7,Afghanistan,Asia,1987,40.822,13867957,852.395945
8,Afghanistan,Asia,1992,41.674,16317921,649.341395
9,Afghanistan,Asia,1997,41.763,22227415,635.341351


# Filter 

Select a subset of columns, either by exact name (`items`) or by a substring of the column name (`like`).

In [8]:
# Keep only the listed columns (matched by exact label) and preview them.
(
    gapminder
    .filter(items=["continent", "year", "pop"])
    .head(n=rows)
)

Unnamed: 0,continent,year,pop
0,Asia,1952,8425333
1,Asia,1957,9240934
2,Asia,1962,10267083
3,Asia,1967,11537966
4,Asia,1972,13079460
5,Asia,1977,14880372
6,Asia,1982,12881816
7,Asia,1987,13867957
8,Asia,1992,16317921
9,Asia,1997,22227415


In [9]:
# `like` keeps every column whose label contains the given substring.
(
    gapminder
    .filter(like="co")
    .head(n=rows)
)

Unnamed: 0,country,continent
0,Afghanistan,Asia
1,Afghanistan,Asia
2,Afghanistan,Asia
3,Afghanistan,Asia
4,Afghanistan,Asia
5,Afghanistan,Asia
6,Afghanistan,Asia
7,Afghanistan,Asia
8,Afghanistan,Asia
9,Afghanistan,Asia


# Rename

Renaming columns

In [10]:
# Map the old column labels to their display names; columns that are not
# listed in the mapping keep their current label.
(
    gapminder
    .rename(columns=dict(year="Year", lifeExp="Life Expectancy"))
    .head(n=rows)
)

Unnamed: 0,country,continent,Year,Life Expectancy,pop,gdpPercap
0,Afghanistan,Asia,1952,28.801,8425333,779.445314
1,Afghanistan,Asia,1957,30.332,9240934,820.85303
2,Afghanistan,Asia,1962,31.997,10267083,853.10071
3,Afghanistan,Asia,1967,34.02,11537966,836.197138
4,Afghanistan,Asia,1972,36.088,13079460,739.981106
5,Afghanistan,Asia,1977,38.438,14880372,786.11336
6,Afghanistan,Asia,1982,39.854,12881816,978.011439
7,Afghanistan,Asia,1987,40.822,13867957,852.395945
8,Afghanistan,Asia,1992,41.674,16317921,649.341395
9,Afghanistan,Asia,1997,41.763,22227415,635.341351


# sort_values()

Sort the rows of the dataset by one or more columns.

In [11]:
# Select three columns, then order the rows by year (ascending is the default).
(
    gapminder
    .filter(items=["continent", "year", "pop"])
    .sort_values(by="year")
    .head(n=rows)
)

Unnamed: 0,continent,year,pop
0,Asia,1952,8425333
528,Europe,1952,42459667
540,Africa,1952,420702
1656,Asia,1952,1030585
552,Africa,1952,284320
564,Europe,1952,69145952
576,Africa,1952,5581001
588,Europe,1952,7733250
600,Americas,1952,3146381
612,Africa,1952,2664249


In [18]:
# Two sort keys with one direction each: lowest life expectancy first, and
# for equal life expectancy the later year first.
(
    gapminder
    .filter(items=["continent", "year", "lifeExp"])
    .sort_values(
        by=["lifeExp", "year"],
        ascending=[True, False],
    )
    .head(n=rows)
)

Unnamed: 0,continent,year,lifeExp
1292,Africa,1992,23.599
0,Asia,1952,28.801
552,Africa,1952,30.0
36,Africa,1952,30.015
1344,Africa,1952,30.331
1,Asia,1957,30.332
221,Asia,1977,31.22
1032,Africa,1952,31.286
1345,Africa,1957,31.57
192,Africa,1952,31.975


# Query 

Filter rows in the dataset with `query()`, which takes the condition as a string expression.

In [20]:
# Keep only the 1972 rows, preview them, and apply CSS padding to the
# rendered table cells through the Styler.
(
    gapminder
    .filter(items=["continent", "year", "lifeExp"])
    .query("year == 1972")
    .head(n=rows)
    .style
    .set_properties(**{"padding": "10px 10px 10px 180px"})
)

Unnamed: 0,continent,year,lifeExp
4,Asia,1972,36.088
16,Europe,1972,67.69
28,Africa,1972,54.518
40,Africa,1972,37.928
52,Americas,1972,67.065
64,Oceania,1972,71.93
76,Europe,1972,70.63
88,Asia,1972,63.3
100,Asia,1972,45.252
112,Europe,1972,71.44


In [21]:
# Keep the 1972 rows whose life expectancy is below average.
# The two conditions are deliberately separate query() steps: the year filter
# runs first, so lifeExp.mean() is evaluated on the 1972 rows only.
(
    gapminder
    .filter(items=["country", "year", "lifeExp"])
    .query("year == 1972")
    .query("lifeExp < lifeExp.mean()")
    .head(n=rows)
)

Unnamed: 0,country,year,lifeExp
4,Afghanistan,1972,36.088
28,Algeria,1972,54.518
40,Angola,1972,37.928
100,Bangladesh,1972,45.252
124,Benin,1972,47.014
136,Bolivia,1972,46.714
160,Botswana,1972,56.024
196,Burkina Faso,1972,43.591
208,Burundi,1972,44.057
220,Cambodia,1972,40.317


In [22]:
# Filter rows with the year 1972 and with a life expectancy below average,
# and with the country being either Bolivia or Angola.
#
# Each condition is its own query() step, so the average is taken over the
# 1972 rows before the country filter is applied.
(
    gapminder
    .filter(items = ["country", "year", "lifeExp"])
    .query("year == 1972")
    .query("lifeExp < lifeExp.mean()")
    # Membership test instead of chaining `==` with `|`, which only works
    # because query() gives `|` the precedence of `or`.
    .query("country in ['Bolivia', 'Angola']")
    .head(rows)
)

Unnamed: 0,country,year,lifeExp
40,Angola,1972,37.928
136,Bolivia,1972,46.714


# Assign() and lambda

Create a column that combines continent and country information, and another column that shows the rounded lifeExp information.

In [23]:
# Order by year and population, derive two columns from the sorted frame,
# then keep only the source columns next to their derived counterparts.
(
    gapminder
    .sort_values(by=["year", "pop"])
    .assign(
        # "<continent> - <country>" label built by string concatenation.
        con_country=lambda df: df["continent"] + " - " + df["country"],
        # Round to zero decimals first, then cast to integer.
        rn_lifeExp=lambda df: df["lifeExp"].round(0).astype(int),
    )
    .filter(items=["continent", "country", "con_country", "lifeExp", "rn_lifeExp"])
    .head(n=rows)
)

Unnamed: 0,continent,country,con_country,lifeExp,rn_lifeExp
1296,Africa,Sao Tome and Principe,Africa - Sao Tome and Principe,46.471,46
420,Africa,Djibouti,Africa - Djibouti,34.812,35
84,Asia,Bahrain,Asia - Bahrain,50.939,51
684,Europe,Iceland,Europe - Iceland,72.49,72
312,Africa,Comoros,Africa - Comoros,40.715,41
852,Asia,Kuwait,Asia - Kuwait,55.565,56
480,Africa,Equatorial Guinea,Africa - Equatorial Guinea,34.482,34
1260,Africa,Reunion,Africa - Reunion,52.724,53
552,Africa,Gambia,Africa - Gambia,30.0,30
1452,Africa,Swaziland,Africa - Swaziland,41.407,41


# groupby() and agg()

Group the dataset and create summary calculations

In [28]:
# One row per continent; each keyword names an output column and states
# which input column is aggregated with which function.
(
    gapminder
    .groupby(by="continent")
    .agg(
        pop_mean=pd.NamedAgg(column="pop", aggfunc="mean"),
        pop_sd=pd.NamedAgg(column="pop", aggfunc="std"),
        le_mean=pd.NamedAgg(column="lifeExp", aggfunc="mean"),
        le_sd=pd.NamedAgg(column="lifeExp", aggfunc="std"),
    )
)

Unnamed: 0_level_0,pop_mean,pop_sd,le_mean,le_sd
continent,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Africa,9916003.0,15490920.0,48.86533,9.15021
Americas,24504790.0,50979430.0,64.658737,9.345088
Asia,77038720.0,206885200.0,60.064903,11.864532
Europe,17169760.0,20519440.0,71.903686,5.433178
Oceania,8874672.0,6506342.0,74.326208,3.795611


In [29]:
# Restrict to years after 1989, then summarise population and life expectancy
# for every (continent, year) pair.
(
    gapminder
    .query("year > 1989")
    .groupby(by=["continent", "year"])
    .agg(
        pop_mean=pd.NamedAgg(column="pop", aggfunc="mean"),
        pop_sd=pd.NamedAgg(column="pop", aggfunc="std"),
        le_mean=pd.NamedAgg(column="lifeExp", aggfunc="mean"),
        le_sd=pd.NamedAgg(column="lifeExp", aggfunc="std"),
    )
    .head(n=rows)
)

Unnamed: 0_level_0,Unnamed: 1_level_0,pop_mean,pop_sd,le_mean,le_sd
continent,year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Africa,1992,12674640.0,17562720.0,53.629577,9.461071
Africa,1997,14304480.0,19873010.0,53.598269,9.103387
Africa,2002,16033150.0,22303000.0,53.325231,9.586496
Africa,2007,17875760.0,24917730.0,54.806038,9.630781
Americas,1992,29570960.0,58109220.0,69.56836,5.167104
Americas,1997,31876020.0,62032820.0,71.15048,4.887584
Americas,2002,33990910.0,65601550.0,72.42204,4.799705
Americas,2007,35954850.0,68833780.0,73.60812,4.440948
Asia,1992,94948250.0,244960400.0,66.537212,8.075549
Asia,1997,102523800.0,262349700.0,68.020515,8.091171
