- https://newsapi.org/
- https://geocode.xyz/
- https://restcountries.com/
- https://api.openweathermap.org

In [1]:
import requests
import json
from urllib import parse
import pandas as pd

## Countries

In [3]:
def get_countries(timeout=30):
    """Download all country records from the REST Countries v3.1 ``/all`` endpoint.

    Args:
        timeout: Seconds to wait for the server before giving up. Forwarded to
            ``requests.get`` so a stalled connection cannot block forever.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection errors or when the timeout expires.
    """
    res = requests.get("https://restcountries.com/v3.1/all", timeout=timeout)
    # Fail here instead of handing an error payload to the processing step.
    res.raise_for_status()
    return res.json()

# Fetch the raw country records when this cell runs (performs a live HTTP request).
countries_res = get_countries()

In [8]:
def expand_currency(df):
    """Split the list-valued ``currency`` column into three scalar columns.

    Each ``currency`` cell is expected to hold ``[key, name, symbol]`` or an
    empty list. Rows with fewer than three items are padded with ``None`` so
    the result always has the same three columns, even when every row is empty
    or ``df`` has no rows at all.

    Args:
        df: DataFrame with a ``currency`` column.

    Returns:
        A new DataFrame: ``df`` plus the columns ``currency_key``,
        ``currency_name`` and ``currency_symbol``, aligned on ``df.index``.
    """
    fields = ['currency_key', 'currency_name', "currency_symbol"]
    width = len(fields)
    blank = [None] * width
    # Pad every row to a fixed width: building the frame straight from ragged
    # lists fails when no row carries any currency data.
    rows = [
        (list(entry) + blank)[:width] if isinstance(entry, (list, tuple)) else list(blank)
        for entry in df['currency']
    ]
    expanded = pd.DataFrame(rows, index=df.index, columns=fields)
    return pd.concat([df, expanded], axis=1)

def expand_timezones(df):
    """Append one ``timezone_<n>`` column per position of the ``timezones`` lists.

    The list in each ``timezones`` cell is spread across columns numbered from
    1; shorter lists leave the trailing columns empty for that row.

    Args:
        df: DataFrame with a list-valued ``timezones`` column.

    Returns:
        A new DataFrame made of ``df`` followed by the ``timezone_<n>`` columns.
    """
    wide = pd.DataFrame(df["timezones"].tolist(), index=df.index)
    # Positional labels start at 0; the output columns are numbered from 1.
    wide = wide.rename(columns=lambda position: f'timezone_{position + 1}')
    return pd.concat([df, wide], axis="columns")

def expand_languages(df):
    """Append one ``lang_<n>`` column per position of the ``languages`` lists.

    The list in each ``languages`` cell is spread across columns numbered from
    1; shorter lists leave the trailing columns empty for that row.

    Args:
        df: DataFrame with a list-valued ``languages`` column.

    Returns:
        A new DataFrame made of ``df`` followed by the ``lang_<n>`` columns.
    """
    spread = pd.DataFrame(df["languages"].tolist(), index=df.index)
    # Positional labels start at 0; the output columns are numbered from 1.
    spread = spread.rename(columns=lambda position: f'lang_{position + 1}')
    return pd.concat([df, spread], axis="columns")

def _item_at(value, position, default):
    """Return ``value[position]`` when ``value`` is a list that long, else ``default``."""
    if isinstance(value, list) and len(value) > position:
        return value[position]
    return default


def _currency_fields(currencies):
    """Return ``[key, name, symbol]`` for the first currency, or ``[]`` when there is none."""
    if not isinstance(currencies, dict) or not currencies:
        return []
    key, details = next(iter(currencies.items()))
    if not isinstance(details, dict):
        details = {}
    return [key, details.get("name", ""), details.get("symbol", "")]


def _country_code(tld):
    """Return the first TLD entry minus its leading character, or ``pd.NA`` if unavailable."""
    first = _item_at(tld, 0, None)
    return first[1:] if isinstance(first, str) else pd.NA


def process_countries_res(countries_res):
    """Flatten the raw countries response into a table of scalar columns.

    The nested fields are reduced as follows:

    * ``name`` -> ``common_name`` and ``oficial_name`` (column spelling kept
      as-is so the output schema does not change).
    * ``currencies`` -> first entry only, expanded by ``expand_currency``.
    * ``continents`` / ``capital`` -> first list element.
    * ``latlng`` -> ``lat`` and ``lon``.
    * ``languages`` / ``timezones`` -> first three values, expanded by
      ``expand_languages`` / ``expand_timezones``.
    * ``tld`` -> ``country_code`` (first entry without its leading character).

    Missing values, empty lists and empty dicts yield a missing value for the
    derived column instead of raising ``IndexError``.

    Args:
        countries_res: Records accepted by ``pd.DataFrame`` that provide the
            twelve source fields selected below.

    Returns:
        A DataFrame without the intermediate nested columns.

    Raises:
        KeyError: If one of the selected source fields is absent from the data.
    """
    nan = float("nan")
    countries = pd.DataFrame(countries_res)
    return (
        countries
        [[
            "name",
            "capital",
            "independent",
            "currencies",
            "continents",
            "latlng",
            "population",
            "region",
            "timezones",
            "area",
            "languages",
            "tld"
        ]]
        .assign(
            common_name=lambda df: df["name"].apply(lambda x: x.get("common", "")),
            oficial_name=lambda df: df["name"].apply(lambda x: x.get("official", "")),
            currency=lambda df: df["currencies"].apply(_currency_fields),
            continents=lambda df: df["continents"].apply(lambda x: _item_at(x, 0, pd.NA)),
            lat=lambda df: df["latlng"].apply(lambda x: _item_at(x, 0, nan)),
            lon=lambda df: df["latlng"].apply(lambda x: _item_at(x, 1, nan)),
            # Non-list values (e.g. a missing capital) are passed through unchanged.
            capital=lambda df: df["capital"].apply(
                lambda x: _item_at(x, 0, pd.NA) if isinstance(x, list) else x
            ),
            languages=lambda df: df["languages"].apply(
                lambda x: list(x.values())[:3] if isinstance(x, dict) else []
            ),
            timezones=lambda df: df["timezones"].apply(
                lambda x: x[:3] if isinstance(x, list) else []
            ),
            country_code=lambda df: df["tld"].apply(_country_code),
        )
        .pipe(expand_currency)
        .pipe(expand_timezones)
        .pipe(expand_languages)
        .drop(columns=['name', 'currencies', 'latlng', 'timezones', 'languages', 'currency', 'tld'])
    )


In [None]:
# Build the flattened countries table from the raw response fetched earlier.
countries_clean = process_countries_res(countries_res)

In [9]:
# Persist the cleaned table as CSV; index=False keeps the DataFrame index out of the file.
countries_clean.to_csv(
     "../data/etl/countries.csv",
     index=False
)

## News

In [14]:
# NOTE(review): this key is committed in plain text; consider loading it from
# configuration or the environment instead.
NEWS_API_KEY = '35686b6ca73247b5a9f3a94c10e2e34d'

# Country codes sent as the `country` parameter when downloading top headlines.
cc = """
ae ar at au be
bg br ca ch cn
co cu cz de
eg fr gb gr
hk hu id ie
il in it jp
kr lt lv ma
mx my ng nl
no nz ph pl
pt ro rs ru
sa se sg si
sk th tr tw
ua us ve za
""".split()

def get_news_by_country(country, timeout=30):
    """Request the top headlines for one country from the NewsAPI v2 endpoint.

    Args:
        country: Country code sent as the ``country`` query parameter.
        timeout: Seconds to wait for the server before giving up. Forwarded to
            ``requests.get`` so a stalled connection cannot block forever.

    Returns:
        The decoded JSON body of the response. The HTTP status is not checked,
        so an error payload returned by the API is passed through as-is.
    """
    res = requests.get(
        'https://newsapi.org/v2/top-headlines',
        # Let requests build and encode the query string.
        params={'country': country, 'apiKey': NEWS_API_KEY},
        timeout=timeout,
    )
    return res.json()

def process_news_res(news_res, country):
    """Turn one headlines response into a flat DataFrame of articles.

    The nested ``source`` dict of every article is replaced by the scalar
    columns ``source_id`` and ``source_name``, and the requested country is
    recorded in ``country_code``.

    Args:
        news_res: Decoded response containing an ``articles`` list.
        country: Country code the response was requested for.

    Returns:
        A DataFrame with one row per article. When ``articles`` is empty the
        result has no rows and only the three derived columns.

    Raises:
        KeyError: If ``news_res`` has no ``articles`` entry.
    """
    articles = news_res['articles']
    if not articles:
        # With no articles the frame has no `source` column to unpack, so
        # return an empty frame that still concatenates with the others.
        return pd.DataFrame(columns=['country_code', 'source_id', 'source_name'])

    news = pd.DataFrame(articles)
    news['country_code'] = country
    sources = news.pop('source')
    news['source_id'] = sources.apply(
        lambda src: src.get('id') if isinstance(src, dict) else None
    )
    news['source_name'] = sources.apply(
        lambda src: src.get('name') if isinstance(src, dict) else None
    )
    return news


In [54]:
# Download and flatten the top headlines of every country in `cc`, then stack
# the per-country tables into a single DataFrame.
news_data = []
for country in cc:
    news_res = get_news_by_country(country)
    news = process_news_res(news_res,country)
    news_data.append(news)

# ignore_index=True renumbers the rows: each per-country frame is numbered from
# 0, so the combined frame would otherwise carry repeated index labels.
news_data = pd.concat(news_data, ignore_index=True)


In [None]:
# Preview the first rows of the combined headlines table.
news_data.head()

In [None]:
# Persist the combined headlines as CSV; index=False keeps the DataFrame index out of the file.
news_data.to_csv("../data/etl/news.csv", index=False)

In [11]:
def get_news_by_keyword(keyword, timeout=30):
    """Search NewsAPI's v2 ``everything`` endpoint for articles matching a keyword.

    Args:
        keyword: Search text sent as the ``q`` query parameter.
        timeout: Seconds to wait for the server before giving up. Forwarded to
            ``requests.get`` so a stalled connection cannot block forever.

    Returns:
        The decoded JSON body of the response. The HTTP status is not checked,
        so an error payload returned by the API is passed through as-is.
    """
    res = requests.get(
        'https://newsapi.org/v2/everything',
        # requests URL-encodes the keyword, so no manual quoting is needed.
        params={'q': keyword, 'apiKey': NEWS_API_KEY},
        timeout=timeout,
    )
    return res.json()


In [19]:
# Example: search for articles mentioning "madrid" (performs a live request to newsapi.org).
get_news_by_keyword("madrid")

{'status': 'ok',
 'totalResults': 39340,
 'articles': [{'source': {'id': 'bbc-news', 'name': 'BBC News'},
   'author': 'https://www.facebook.com/bbcnews',
   'title': 'Flambé fire kills two in Madrid restaurant',
   'description': 'Another ten people are injured after plastic plants caught fire as a waiter flambéed a dish.',
   'url': 'https://www.bbc.co.uk/news/world-europe-65360859',
   'urlToImage': 'https://ichef.bbci.co.uk/news/1024/branded_news/15CCE/production/_129449298_emergenciasmadrid.jpg',
   'publishedAt': '2023-04-22T16:24:49Z',
   'content': 'At least two people have died and another ten were injured after a waiter flambéed a dish, accidentally setting fire to an Italian restaurant on a busy Friday evening in Madrid. \r\nOne of the injured … [+1066 chars]'},
  {'source': {'id': 'bbc-news', 'name': 'BBC News'},
   'author': None,
   'title': 'Novak Djokovic has withdrawn from Madrid Open, say organisers',
   'description': "World number one Novak Djokovic pulls out of nex

In [21]:
# NOTE(review): both keys are committed in plain text; consider loading them
# from configuration or the environment instead.
WEATHER_API_KEY = '8831d413bb40311275e912234c1df8cb'
# The stray trailing space was removed: it would otherwise be sent as part of
# the `auth` value in every geocoding request.
GEOCODE_API_KEY = '205787102155036129079x23712'

def get_forward_geocoding(address, timeout=30):
    """Look up a free-text address on geocode.xyz.

    Args:
        address: Address text; it is percent-encoded and placed in the URL path.
        timeout: Seconds to wait for the server before giving up. Forwarded to
            ``requests.get`` so a stalled connection cannot block forever.

    Returns:
        The decoded JSON body of the response.
    """
    # safe='' also escapes "/", so an address containing a slash cannot change
    # the path structure of the request URL.
    q = parse.quote(address, safe='')
    res = requests.get(
        f'https://geocode.xyz/{q}',
        params={'geoit': 'JSON', 'auth': GEOCODE_API_KEY},
        timeout=timeout,
    )
    return res.json()

def get_reverse_geocoding(lat, lon, timeout=30):
    """Look up the location at a latitude/longitude pair on geocode.xyz.

    Args:
        lat: Latitude, formatted into the URL path before the comma.
        lon: Longitude, formatted into the URL path after the comma.
        timeout: Seconds to wait for the server before giving up. Forwarded to
            ``requests.get`` so a stalled connection cannot block forever.

    Returns:
        The decoded JSON body of the response.
    """
    res = requests.get(
        f'https://geocode.xyz/{lat},{lon}',
        # Let requests build and encode the query string.
        params={'geoit': 'JSON', 'auth': GEOCODE_API_KEY},
        timeout=timeout,
    )
    return res.json()

def get_weather(lat, lon, timeout=30):
    """Request the weather at a latitude/longitude pair from OpenWeatherMap.

    Calls the ``data/2.5/weather`` endpoint with ``lat``, ``lon`` and the
    module-level API key.

    Args:
        lat: Latitude sent as the ``lat`` query parameter.
        lon: Longitude sent as the ``lon`` query parameter.
        timeout: Seconds to wait for the server before giving up. Forwarded to
            ``requests.get`` so a stalled connection cannot block forever.

    Returns:
        The decoded JSON body of the response. The HTTP status is not checked,
        so an error payload returned by the API is passed through as-is.
    """
    res = requests.get(
        'https://api.openweathermap.org/data/2.5/weather',
        # Let requests build and encode the query string.
        params={'lat': lat, 'lon': lon, 'appid': WEATHER_API_KEY},
        timeout=timeout,
    )
    return res.json()

In [22]:
# Example: geocode a street address (performs a live request to geocode.xyz).
get_forward_geocoding(address="Calle Tres Peces 26, Madrid, Spain")

{'standard': {'stnumber': '25',
  'addresst': 'Tres Peces',
  'statename': 'Comunidad de Madrid',
  'postal': '28012',
  'region': 'Comunidad de Madrid',
  'city': 'Madrid',
  'prov': 'ES',
  'countryname': 'Spain',
  'confidence': '0.9'},
 'longt': '-3.69883',
 'alt': {},
 'elevation': {},
 'remaining_credits': '-1',
 'latt': '40.41099'}

In [23]:
# Example: reverse-geocode a coordinate pair (performs a live request to geocode.xyz).
get_reverse_geocoding(lat=40.41099, lon=-3.69883)

{'statename': {},
 'distance': '0.000',
 'elevation': '647',
 'osmtags': {'wikipedia': 'es:Embajadores (Madrid)',
  'wikidata': 'Q2449844',
  'boundary': 'administrative',
  'name': 'Embajadores',
  'type': 'boundary',
  'landuse': 'residential',
  'admin_level': '10'},
 'state': 'Comunidad de Madrid',
 'latt': '40.41099',
 'city': 'Madrid',
 'prov': 'ES',
 'intersection': {'distance': '0.099',
  'xlat': '40.4101008',
  'xlon': '-3.69876075',
  'street2': 'BUENAVISTA',
  'street1': 'CL ZURITA'},
 'geocode': 'MADRID-KVUZC',
 'geonumber': '3162553972580',
 'country': 'Spain',
 'stnumber': '25',
 'staddress': 'CL TRES PECES',
 'inlatt': '40.41099',
 'alt': {'loc': [{'staddress': 'CL TRES PECES',
    'stnumber': '25',
    'postal': '28012',
    'latt': '40.41099',
    'city': 'Madrid',
    'prov': 'Comunidad de Madrid',
    'longt': '-3.69883',
    'class': {}},
   {'staddress': 'TRES PECES',
    'stnumber': '25',
    'postal': {},
    'latt': '40.41099',
    'city': 'Comunidad de Madrid',

In [24]:
# Example: fetch the weather for a coordinate pair (performs a live request to api.openweathermap.org).
get_weather(lat=40.41099, lon=-3.69883)

{'coord': {'lon': -3.6988, 'lat': 40.411},
 'weather': [{'id': 800,
   'main': 'Clear',
   'description': 'clear sky',
   'icon': '01d'}],
 'base': 'stations',
 'main': {'temp': 299.23,
  'feels_like': 299.23,
  'temp_min': 297.66,
  'temp_max': 301.17,
  'pressure': 1017,
  'humidity': 27},
 'visibility': 10000,
 'wind': {'speed': 1.34, 'deg': 179, 'gust': 3.58},
 'clouds': {'all': 0},
 'dt': 1682594291,
 'sys': {'type': 2,
  'id': 2007545,
  'country': 'ES',
  'sunrise': 1682572781,
  'sunset': 1682622301},
 'timezone': 7200,
 'id': 3117735,
 'name': 'Madrid',
 'cod': 200}