# South America

Analysis and visualisation of data about South America.

## Imports

In [47]:
import pandas as pd
import altair as alt

## Data Sourcing

In [3]:
sa = pd.read_csv("https://sigma-resources-public.s3.eu-west-2.amazonaws.com/datasets/south_america_data.csv")

In [4]:
sa.head(2)

Unnamed: 0,year,name,life_expectancy_female,life_expectancy_male,birth_rate,population
0,2000,Argentina,77.22,70.58,19.37,37070774
1,2000,Brazil,73.43,66.26,19.84,175873720


## Data Exploration

In [8]:
sa["name"].unique()

array(['Argentina', 'Brazil', 'Bolivia', 'Chile', 'Colombia', 'Ecuador',
       'Guyana', 'Paraguay', 'Peru', 'Suriname', 'Venezuela, RB',
       'Uruguay'], dtype=object)

In [10]:
sa.tail()

Unnamed: 0,year,name,life_expectancy_female,life_expectancy_male,birth_rate,population
175,2021,Paraguay,73.39,67.43,20.63,6703799
176,2021,Peru,74.75,70.12,17.62,33715471
177,2021,Suriname,73.55,67.21,18.15,612985
178,2021,"Venezuela, RB",75.21,66.26,15.88,28199867
179,2021,Uruguay,79.26,71.65,10.47,3426260


### Filtering

1. Check every row against a condition (producing a column of True/False values)
2. Use the produced column as a filter

In [12]:
bool_filter = sa["name"] == "Argentina"

In [14]:
sa[bool_filter]

Unnamed: 0,year,name,life_expectancy_female,life_expectancy_male,birth_rate,population
0,2000,Argentina,77.22,70.58,19.37,37070774
12,2001,Argentina,77.59,70.75,18.98,37480493
24,2002,Argentina,78.08,70.75,18.76,37885028
36,2003,Argentina,77.28,70.83,18.45,38278164
48,2004,Argentina,78.1,71.53,18.35,38668796
60,2005,Argentina,78.53,71.69,18.35,39070501
72,2006,Argentina,78.72,72.07,18.19,39476851
84,2014,Argentina,80.0,73.42,17.5,42669500
96,2015,Argentina,80.19,73.29,17.35,43131966
108,2016,Argentina,79.67,72.93,16.82,43590368


In [15]:
sa[sa["name"] == "Argentina"]

Unnamed: 0,year,name,life_expectancy_female,life_expectancy_male,birth_rate,population
0,2000,Argentina,77.22,70.58,19.37,37070774
12,2001,Argentina,77.59,70.75,18.98,37480493
24,2002,Argentina,78.08,70.75,18.76,37885028
36,2003,Argentina,77.28,70.83,18.45,38278164
48,2004,Argentina,78.1,71.53,18.35,38668796
60,2005,Argentina,78.53,71.69,18.35,39070501
72,2006,Argentina,78.72,72.07,18.19,39476851
84,2014,Argentina,80.0,73.42,17.5,42669500
96,2015,Argentina,80.19,73.29,17.35,43131966
108,2016,Argentina,79.67,72.93,16.82,43590368


In [16]:
sa.head()

Unnamed: 0,year,name,life_expectancy_female,life_expectancy_male,birth_rate,population
0,2000,Argentina,77.22,70.58,19.37,37070774
1,2000,Brazil,73.43,66.26,19.84,175873720
2,2000,Bolivia,64.39,60.42,29.72,8592656
3,2000,Chile,79.98,73.71,15.96,15351799
4,2000,Colombia,75.54,67.31,22.11,39215135


In [19]:
# Filter for all countries 

sa[sa["year"] < 2006]

Unnamed: 0,year,name,life_expectancy_female,life_expectancy_male,birth_rate,population
0,2000,Argentina,77.22,70.58,19.37,37070774
1,2000,Brazil,73.43,66.26,19.84,175873720
2,2000,Bolivia,64.39,60.42,29.72,8592656
3,2000,Chile,79.98,73.71,15.96,15351799
4,2000,Colombia,75.54,67.31,22.11,39215135
...,...,...,...,...,...,...
67,2005,Paraguay,73.72,67.61,24.05,5476878
68,2005,Peru,75.02,71.05,22.33,28147267
69,2005,Suriname,70.42,63.98,22.27,516220
70,2005,"Venezuela, RB",76.67,69.14,22.33,26668785


## Challenge

1. Create a variable called `y2018` that contains just rows from 2018
2. Create a variable called `cBrazil` that contains just Brazil rows

In [24]:
y2018 = sa[sa["year"] == 2018]
cBrazil = sa[sa["name"] == "Brazil"]

In [30]:
cBrazil["birth_rate"].min()

np.float64(12.88)

In [33]:
cBrazil.sort_values(by="life_expectancy_female", ascending=False).head(3)

Unnamed: 0,year,name,life_expectancy_female,life_expectancy_male,birth_rate,population
145,2019,Brazil,78.47,72.2,13.63,211782878
133,2018,Brazil,78.27,71.96,14.13,210166592
121,2017,Brazil,78.03,71.64,14.16,208504960


## Challenge

What's the integer difference between the lowest male life expectancy and the highest female life expectancy ever across all of South America?

In [42]:
sa["life_expectancy_female"].max() - sa["life_expectancy_male"].min()

np.float64(22.289999999999992)

In [43]:
sa["life_expectancy_female"].max() - sa["life_expectancy_male"].min()

np.float64(22.289999999999992)

In [46]:
sa["life_expectancy_female"].sort_values(ascending=False)

147    82.71
123    82.60
111    82.57
135    82.53
99     82.38
       ...  
50     66.24
38     65.98
26     65.56
14     64.99
2      64.39
Name: life_expectancy_female, Length: 180, dtype: float64

## Visualisation

1. Make a chart
2. Feed the chart the dataset
3. State what kind of chart it is
4. Explain how the different columns will be mapped onto the chart

In [49]:
alt.Chart(y2018).mark_bar().encode(
    x = 'name',
    y = 'population'
)

### Challenge

- Make a scatterplot (`mark_point()`) of `birth_rate` and `population` for 2108

In [51]:
alt.Chart(y2018).mark_point().encode(
    y="birth_rate",
    x="population"
)

In [55]:
alt.Chart(y2018).mark_point().encode(
    x=alt.X("birth_rate").sort("-y"),
    y="population",
    color="name"
)