## Extract, Transform and Load with Python

### Importing libraries

In [1]:
import requests
import pandas as pd
#from sqlalchemy import create_engine

### Extract


In [2]:
def extract () -> dict:
    """Download the Brazilian university records from the Hipolabs API.

    Sends a GET request to http://universities.hipolabs.com filtered by
    ``country=Brazil`` and returns the decoded JSON body.

    Returns:
        The decoded JSON payload.
        NOTE(review): the annotation says ``dict``, but ``transform`` reports
        ``len(data)`` as the number of universities and feeds it to
        ``pd.DataFrame``, which suggests a list of records -- confirm.

    Raises:
        requests.exceptions.Timeout: the server did not answer in time.
        requests.exceptions.HTTPError: the server answered with a 4xx/5xx status.
    """

    API_URL = "http://universities.hipolabs.com/search?country=Brazil"
    # Bound the wait: without a timeout an unresponsive server blocks the cell forever.
    response = requests.get(API_URL, timeout=30)
    # Surface HTTP errors here instead of trying to parse an error body as data.
    response.raise_for_status()
    return response.json()

### Transform


In [3]:
def transform(data:dict) -> pd.DataFrame:
    """Keep only the universities whose name mentions Rio de Janeiro.

    Builds a DataFrame from ``data``, filters the rows whose ``name`` contains
    the literal text "Rio de Janeiro", flattens the ``domains`` and
    ``web_pages`` collections into comma-separated strings, and renumbers the
    index from zero. Two progress lines are printed along the way.

    Args:
        data: Records to transform; anything ``pd.DataFrame`` accepts, as long
            as it yields ``name``, ``country``, ``domains`` and ``web_pages``
            columns, with the last two holding iterables.

    Returns:
        A DataFrame with the columns ``domains``, ``country``, ``web_pages``
        and ``name`` (in that order) and a fresh 0..n-1 index.

    Raises:
        KeyError: if one of the required columns is missing from ``data``.
    """
    df = pd.DataFrame(data)
    print(f"Total Number of universities from API {len(data)}")

    # regex=False: the search text is a literal, not a pattern.
    # na=False: a record without a name is simply not a match (otherwise the
    # mask contains NaN and boolean indexing raises ValueError).
    in_rio = df["name"].str.contains("Rio de Janeiro", regex=False, na=False)

    # Copy the filtered rows so the column assignments below write to an
    # independent frame rather than to a slice of the original.
    df = df[in_rio].copy()
    print(f"Number of universities in Rio de Janeiro {len(df)}")

    # Each cell holds a collection of values; collapse it into one string.
    df["domains"] = [",".join(map(str, items)) for items in df["domains"]]
    df["web_pages"] = [",".join(map(str, items)) for items in df["web_pages"]]

    df = df.reset_index(drop=True)
    return df[["domains", "country", "web_pages", "name"]]

### Load

In [8]:
def load_csv(df:pd.DataFrame, file_path: str) -> None:
    """Save the transformed data to a CSV file.

    Args:
        df: The DataFrame to persist.
        file_path: Destination of the CSV file; an existing file is overwritten.

    The index is not written, so the file contains only the frame's columns.
    """
    # Write to the path the caller asked for (previously a fixed file name
    # was used and this argument was ignored).
    df.to_csv(file_path, index=False)

### Running ETL pipeline

In [10]:
# Run the three stages in order: fetch the raw records, reshape them,
# then persist the result to disk.
data = extract()
df = transform(data=data)
load_csv(df=df, file_path='uni_br.csv')


Total Number of universities from API 175
Number of universities in Rio de Janeiro 5


In [12]:
# Print the transformed frame as plain text to inspect the pipeline result.
print(df)

      domains country               web_pages  \
0  puc-rio.br  Brazil  http://www.puc-rio.br/   
1     uerj.br  Brazil     http://www.uerj.br/   
2     ufrj.br  Brazil     http://www.ufrj.br/   
3    ufrrj.br  Brazil    http://www.ufrrj.br/   
4   unirio.br  Brazil   http://www.unirio.br/   

                                                name  
0  Pontifícia Universidade Católica do Rio de Jan...  
1           Universidade do Estado do Rio de Janeiro  
2             Universidade Federal do Rio de Janeiro  
3       Universidade Federal Rural do Rio de Janeiro  
4                     Universidade do Rio de Janeiro  
