In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np

In [3]:
base_url = 'https://www.numbeo.com/cost-of-living/historical-data-city-selector'

In [4]:
page = requests.get(base_url)
numbeo_city_soup = BeautifulSoup(page.content, "html.parser")
results = numbeo_city_soup.find('table', class_='related_links')
#print(results())

In [5]:
list_cities = results.find_all('a')
list_cities[0]

<a href="https://www.numbeo.com/cost-of-living/city-history/in/Aachen">Aachen, Germany</a>

In [6]:
city_pages = []
for city in list_cities:
    link_title = city.text
    location = link_title.split(",")
    link_url = city["href"]
    city_obj = {'City': location[0], 
                'Country':location[1], 
                'Url':link_url}
    city_pages.append(city_obj)

In [7]:
city_pages[0]

{'City': 'Aachen',
 'Country': ' Germany',
 'Url': 'https://www.numbeo.com/cost-of-living/city-history/in/Aachen'}

In [8]:
df = pd.DataFrame(city_pages)
df.head()

Unnamed: 0,City,Country,Url
0,Aachen,Germany,https://www.numbeo.com/cost-of-living/city-his...
1,Aalborg,Denmark,https://www.numbeo.com/cost-of-living/city-his...
2,Aarhus,Denmark,https://www.numbeo.com/cost-of-living/city-his...
3,Abbotsford,Canada,https://www.numbeo.com/cost-of-living/city-his...
4,Aberdeen,United Kingdom,https://www.numbeo.com/cost-of-living/city-his...


In [9]:
def get_tables(city):
    page = requests.get(df.iloc[df.loc[df['City'] == city].index[0]]["Url"])
    one_city_soup = BeautifulSoup(page.content, "html.parser")
    inner_width = one_city_soup.find_all('div', class_='innerWidth')
    results = inner_width[2].find_all('table')
    return results

In [10]:
cape_town_data = get_tables("Cape Town")
len(cape_town_data) 

14

In [11]:
nairobi_data = get_tables("Nairobi")
len(nairobi_data)

14

In [12]:
from io import StringIO

In [13]:
reader_converter = lambda x: pd.DataFrame(pd.read_html(StringIO(str(x)))[0])
cape_df_list = [reader_converter(table) for table in cape_town_data]
nairobi_df_list =[reader_converter(table) for table in nairobi_data]

In [19]:
cape_df_list[3]

Unnamed: 0,Year,Apples (1kg),Oranges (1kg),Potato (1kg),Lettuce (1 head),"Rice (white), (1kg)",Tomato (1kg),Banana (1kg),Onion (1kg)
0,2023,26.43,23.56,22.00,18.48,28.66,26.08,28.76,24.90
1,2022,25.17,22.55,18.48,17.34,26.18,25.40,24.39,18.28
2,2021,22.86,23.72,16.59,15.72,20.88,22.25,22.18,17.27
3,2020,24.64,23.57,20.21,15.45,22.65,22.54,23.92,18.86
4,2019,25.90,28.71,23.12,18.11,25.40,20.89,23.82,22.67
5,2018,21.60,19.50,14.77,13.93,23.00,17.86,17.78,13.58
6,2017,18.95,19.00,15.00,13.39,19.00,18.00,17.50,13.39
7,2016,20.82,18.43,18.73,14.67,17.15,19.20,16.31,16.40
8,2015,16.23,13.23,11.16,10.72,18.57,15.25,13.51,11.03
9,2014,16.89,17.36,9.61,9.69,18.45,14.89,-,-


In [15]:
nairobi_df_list[3]

Unnamed: 0,Year,Apples (1kg),Oranges (1kg),Potato (1kg),Lettuce (1 head),"Rice (white), (1kg)",Tomato (1kg),Banana (1kg),Onion (1kg)
0,2023,303.47,213.31,119.71,72.00,221.04,103.44,128.00,117.63
1,2022,291.83,229.00,112.25,80.22,188.68,114.58,98.67,126.08
2,2021,246.14,195.11,91.54,71.02,169.6,110.27,108.48,101.88
3,2020,266.50,175.00,102.00,65.00,155.23,101.46,93.22,97.62
4,2019,261.55,200.00,162.86,93.17,190.13,146.67,131.43,107.50
5,2018,293.00,-,-,-,208.57,136.67,119.00,132.00
6,2017,274.83,190.00,87.50,-,159.0,91.67,122.00,94.00
7,2016,281.75,227.86,103.92,62.00,123.85,92.67,158.79,106.89
8,2015,308.33,206.67,100.00,80.83,163.11,118.33,188.00,86.00
9,2014,233.33,250.00,98.57,60.00,167.14,131.43,-,-


In [16]:
import os

def save_df(df_list, city): 
    titles = ['eat-out-meal', 'beverage', 'basic-grocery', 'fresh-produce', 'deli', 'rental', 'property',
              'salary', 'transport-daily', 'transport-monthly', 'mortgage', 'utilities', 'entertainment', 'clothing']
    # Create directory if it doesn't exist
    directory = f'data/{city}'
    if not os.path.exists(directory):
        os.makedirs(directory)
    for index, frame in enumerate(df_list):
        # Ensure the title index is within bounds
        if index < len(titles):
            # Save DataFrame to CSV
            frame.to_csv(f'{directory}/{titles[index]}.csv', sep=',', index=False, encoding='utf-8')


In [17]:
save_df(cape_df_list, 'cape-town')

In [18]:
save_df(nairobi_df_list, 'nairobi')