In [None]:
import os
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
from bs4 import BeautifulSoup

from utils import *

# Use seaborn's 'whitegrid' style for the figures drawn in this notebook.
sns.set_style('whitegrid')

In [None]:
# Directory (relative to the notebook) where raw downloads are written and read back.
data_dir = '../data/'
# Timestamp of this run; its first four characters (the year) label a plot title below.
ts = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')
ts

### Countries 
Source of the country list: https://www.countries-ofthe-world.com/countries-of-africa.html 

In [None]:
# Raw country list, one name per line, grouped under single-letter alphabetical
# headers (A, B, C, ...).
# NOTE(review): the names are kept exactly as pasted (e.g. "Central African Republic (CAR)",
# "Congo, Democratic Republic of the"). is_country_in_str matches them as literal
# substrings, so confirm that the scraped pages spell these countries the same way.
countries_raw = '''A
Algeria
Angola
B
Benin
Botswana
Burkina Faso
Burundi
C
Cabo Verde
Cameroon
Central African Republic (CAR)
Chad
Comoros
Congo, Democratic Republic of the
Congo, Republic of the
Cote d'Ivoire
D
Djibouti
E
Egypt
Equatorial Guinea
Eritrea
Eswatini (formerly Swaziland)
Ethiopia
G
Gabon
Gambia
Ghana
Guinea
Guinea-Bissau
K
Kenya
L
Lesotho
Liberia
Libya
M
Madagascar
Malawi
Mali
Mauritania
Mauritius
Morocco
Mozambique
N
Namibia
Niger
Nigeria
R
Rwanda
S
Sao Tome and Principe
Senegal
Seychelles
Sierra Leone
Somalia
South Africa
South Sudan
Sudan
T
Tanzania
Togo
Tunisia
U
Uganda
Z
Zambia
Zimbabwe
'''

In [None]:
# Keep only real country names: blank lines and the one-letter alphabetical
# headers have length <= 1 and are dropped.
countries = [
    name
    for name in countries_raw.split('\n')
    if len(name) > 1
]
countries

In [None]:
# Print one name per line to eyeball the cleaned list.
for c in countries:
    print(c)

In [None]:
# Sanity check: the cleaned list must contain exactly 54 names.
assert len(countries) == 54, f'there are 54 countries in Africa, you got {len(countries)}'

In [None]:
def is_country_in_str(s):
    '''Check whether any known country name occurs in ``s``.

    Every entry of the module-level ``countries`` list is tested as a plain,
    case-sensitive substring of ``s``. When several names match (for example
    "Niger" inside "Nigeria"), the longest one is chosen; names of equal length
    are resolved in favour of the one that comes first in ``countries``.

    Returns a tuple ``(found, name)``: ``(True, <longest matching name>)``, or
    ``(False, '')`` when no name occurs in ``s``.
    '''
    matches = [name for name in countries if name in s]
    if not matches:
        return False, ''
    # max() returns the first of equally long candidates, so list order breaks ties.
    return True, max(matches, key=len)

### Population 

#### download 

In [None]:
# File name (inside data_dir) for the raw HTML of the Wikipedia population page.
population_file_raw = 'population_africa_wikipedia_raw.html'

In [None]:
# Download the page with curl; IPython expands the braces to the target path before the shell runs.
!curl https://en.wikipedia.org/wiki/List_of_African_countries_by_population > {data_dir + population_file_raw}

#### parse 

In [None]:
# Read the downloaded file as bytes and keep str() of the bytes object.
# NOTE: str(bytes) is the "b'...'" repr, so line breaks show up as the two characters
# backslash + n and non-ASCII bytes as \x.. escapes; the parsing code below relies on
# this and splits on the literal '\\n' sequence.
with open(data_dir + population_file_raw, 'rb') as fo:
    population_raw = str(fo.read())

In [None]:
# Split the page on the opening tag of left-aligned table cells and keep only the
# chunks that mention a known country.
# NOTE: this rebinds population_raw from str to list, so the cell cannot be re-run
# on its own; re-run the file-reading cell first.
population_raw = [line for line in population_raw.split('<td style="text-align:left;">') if is_country_in_str(line)[0]]
population_raw[0], f'length {len(population_raw)}'

In [None]:
def parse_year(year_blob:str):
    '''Find the year in the string.

    The year is assumed to be the first run of digits in ``year_blob``.

    Returns the matched digits as a ``str`` (they are not converted to ``int``),
    or ``None`` if the string contains no digits.
    '''
    # Raw string: '\d' inside a plain literal is an invalid escape sequence.
    match = re.search(r'\d+', year_blob)
    return match.group(0) if match is not None else None

def parse_population_cell(blob, verbose=0):
    '''Extract the population figure and its reference year from one page chunk.

    The text after the last ``title=`` in ``blob`` is split on the literal
    two-character sequence backslash + ``n`` (the way line breaks appear in
    ``str(bytes)``). Field 1 is read as the population; fields 2 and 3 are
    searched, in that order, for the year.

    Parameters
    ----------
    blob : str
        One chunk of the raw page text.
    verbose : int
        ``> 0`` prints the split fields and the parsed values;
        ``> 1`` additionally prints ``blob`` itself.

    Returns
    -------
    tuple
        ``(population, year)``: ``population`` is an ``int`` built from all
        digits of field 1; ``year`` is whatever ``parse_year`` returns for the
        first of fields 2-3 that yields a result, or ``None`` if neither does
        (including when those fields are absent).

    Raises
    ------
    IndexError
        If the chunk has no population field (fewer than two fields).
    ValueError
        If the population field contains no digits.
    '''
    if verbose > 1:
        print(blob)

    data_raw = blob.split('title=')[-1].split('\\n')
    if verbose:
        print(data_raw)

    if len(data_raw) < 2:
        raise IndexError(f'no population field found in chunk: {data_raw!r}')

    # parse population: join every digit group, dropping thousands separators
    population_field = data_raw[1]
    p_str = ''.join(re.findall(r'\d+', population_field))

    # parse year: try field 2, then fall back to field 3; the slice tolerates
    # chunks that are too short to contain either field
    y = None
    for year_field in data_raw[2:4]:
        y = parse_year(year_field)
        if y is not None:
            break

    if verbose:
        print(p_str, y)

    if not p_str:
        raise ValueError(f'no digits in population field: {population_field!r}')
    return int(p_str), y


In [None]:
# Parse every country chunk into {country: {'population': ..., 'update': ...}}.
parsed_population = {}
for i, blob in enumerate(population_raw):
    # Stop at the first chunk containing 'footer' (treated as the end of the table).
    if 'footer' in blob:
        print('end of table, reached the footer')
        break
    # Second element of the result is the longest country name found in the chunk.
    c = is_country_in_str(blob)[1]
    print(f'Country {c} {i+1}/{len(population_raw)}')

    population, update = parse_population_cell(blob, verbose=1)
    # A country matched by more than one chunk keeps the values of the last one.
    parsed_population[c] = dict(population=population, update=update)


In [None]:
# Inspect the values left over from the last loop iteration above.
c, population, update

In [None]:
# Spot-check a single entry.
parsed_population['South Africa']

In [None]:
# One row per country. from_dict(orient='index') infers a dtype per column, whereas
# DataFrame(parsed_population).T transposes a frame whose columns mix ints and
# strings and therefore leaves every column, population included, as object dtype.
df_population = (
    pd.DataFrame.from_dict(parsed_population, orient='index')
    .reset_index()
    .rename(columns={'index': 'country'})
)
df_population.head()

In [None]:
# Population expressed in millions; used as the bar height in the plot below.
df_population['population_mil'] = df_population.population / 10**6

#### Visualize

In [None]:
# Bar chart of population (in millions) per country; the title carries the run year.
fig, ax = plt.subplots(figsize=(12, 8))

sns.barplot(data=df_population, x='country', y='population_mil', ax=ax)
plt.xticks(rotation=90)

# Annotate the bars with their values (helper presumably comes from the utils star-import).
show_values_on_bars(ax)

ax.set_title(f'Population of African countries, Wikipedia ({ts[:4]})')

fig.tight_layout()
plt.show()

### Business indicators

In [None]:
# File name (inside data_dir) for the raw business-indicators download.
business_file_raw = 'business_indicators_africa_raw.html'

In [None]:
# Download the indicators table with curl; IPython expands the braces to the target path.
!curl https://africadata.com/wp-content/themes/fruitful/topIndicatorsajax.php > {data_dir + business_file_raw}

In [None]:
# Read the downloaded file as bytes and keep str() of the bytes object.
# NOTE: str(bytes) is the "b'...'" repr (escaped line breaks and non-ASCII bytes);
# that text is what gets handed to BeautifulSoup below.
with open(data_dir + business_file_raw, 'rb') as fo:
    business_raw = str(fo.read())

In [None]:
# Parse the raw text and collect the table header cells (<th> tags).
soup = BeautifulSoup(business_raw, 'html.parser')

columns = soup.find_all('th')
columns

In [None]:
# Reduce each <th> tag to the text between the end of its opening tag and the next '<'.
# NOTE: columns is rebound from tags to plain strings, so re-running this cell alone
# fails (a bare name contains no '>', making split('>')[1] raise IndexError).
columns = [str(col).split('>')[1].split('<')[0] for col in columns]
columns

In [None]:
# Keep, as strings, only the table rows (<tr> tags) that mention a known country.
rows = [
    str(row) for row in soup.find_all('tr')
    if is_country_in_str(str(row))[0]
]
len(rows)

In [None]:
# Peek at the first few raw rows.
rows[:3]

In [None]:
# Sample row for interactive experimentation; no later cell shown here reads this global.
row = rows[0]
def parse_business_indicator_row(row:str):
    '''
    Split one HTML table row into its cell texts.

    The row is cut at every closing ``</span>``; from each piece only the text
    after its last ``>`` is kept, and empty results are dropped.

    returns a list of strings
    '''
    cells = []
    for piece in row.split('</span>'):
        # Everything up to and including the last '>' is markup.
        text = piece.rsplit('>', 1)[-1]
        if text:
            cells.append(text)
    return cells

In [None]:
# Turn every raw row into a list of cell texts and preview the first three.
rows_parsed = [parse_business_indicator_row(row) for row in rows]
rows_parsed[:3]

In [None]:
# Assemble the parsed rows into a frame, using the header texts as column names.
df_business_indicators = pd.DataFrame(rows_parsed, columns = columns)
df_business_indicators.head()

In [None]:
# Convert the textual Value column to floats.
# 'N/A' is mapped to NaN explicitly: replace('N/A', None) either forward-fills the
# previous row's value (older pandas reads value=None as "use method='pad'") or
# inserts None, on which a per-element str.replace call fails.
df_business_indicators.Value = (
    df_business_indicators.Value
    .replace('N/A', np.nan)
    .astype(str)                        # also lets the cell be re-run on already-converted floats
    .str.replace(',', '', regex=False)  # drop thousands separators
    .astype(float)                      # the text 'nan' parses back to NaN
)

df_business_indicators.head()

In [None]:
# Bar chart of one indicator across countries; the title states the date of the
# data, or the date range when the rows do not all share the same date.
plt.figure(figsize=(12, 8), facecolor='w')

indicator = 'GDP per Capita (US$)'
# Boolean mask instead of a string-built query(): safe for indicator names containing quotes.
df_temp = df_business_indicators[df_business_indicators['Indicators'] == indicator]
mm_ts = df_temp['Value Date'].min(), df_temp['Value Date'].max()

sns.barplot(
    data=df_temp,
    x='Country', y='Value'
)

plt.xticks(rotation=90)

# Annotate the bars with their values (helper presumably comes from the utils star-import).
show_values_on_bars(plt.gca())

# Equality, not substring containment: a single date is shown only when min and max coincide.
tstr = mm_ts[0] if mm_ts[0] == mm_ts[1] else f'data varies {mm_ts[0]}-{mm_ts[1]}'

plt.title(f'African {indicator} ({tstr})')

plt.tight_layout()
plt.show()
