In [1]:
class Constants:
    """Static configuration for the Euromonitor market-size crawler.

    Groups the destination schema/table names, the key and temporary
    directory derived from them, and the Euromonitor API base URL and
    endpoint paths used by the functions in this file.
    """

    # Destination schema and table names.
    schema = "euromonitor"
    table = "marketsize"

    # Key passed to ``call_redis`` and reused as the temp-directory name.
    key_name = "_".join(("org_raw", schema, table))
    # Local scratch directory (note the trailing slash).
    tmp = "/tmp/" + key_name + "/"

    # API host and the endpoint paths appended to it.
    base_url = "https://api.euromonitor.com"
    endpoint_auth = "/authentication/connect/token"
    endpoint_category = "/catalog/category"
    endpoint_geography = "/catalog/geography"
    endpoint_marketsize = "/statistics/marketsizes/"

In [3]:
def get_token(
    email: str,
    password: str,
    subscription_key: str,
    timeout: float = 60.0,
) -> str:
    """Request an access token from the Euromonitor authentication endpoint.

    Args:
        email: Account e-mail, sent as the ``username`` form field.
        password: Account password, sent as the ``password`` form field.
        subscription_key: Value for the ``Ocp-Apim-Subscription-Key`` header.
        timeout: Seconds ``requests`` waits for the server before giving up,
            so a stalled connection cannot hang the crawler indefinitely.

    Returns:
        The string ``"<token_type> <access_token>"`` built from the response.

    Raises:
        Exception: If the request fails, the status is not 200, the body is
            empty, or the payload lacks usable ``access_token``/``token_type``
            entries. The original error is attached as ``__cause__``.
    """
    logging.info("Obtendo token de autenticação")

    header = {
        "Accept": "application/json; api-version=1.0",
        "Content-Type": "application/x-www-form-urlencoded",
        "Ocp-Apim-Subscription-Key": subscription_key,
    }
    # NOTE(review): grant_type is sent as the literal "string"; OAuth2 password
    # flows usually send "password" — confirm against the API documentation.
    body = {"grant_type": "string", "username": email, "password": password}

    try:
        response = requests.post(
            Constants.base_url + Constants.endpoint_auth,
            headers=header,
            data=body,
            timeout=timeout,
        )

        if response.status_code != 200:
            raise Exception(
                'API Euromonitor retornou um status inválido '
                + f'{response.status_code}: {response.reason}!\n'
                + response.text
            )
        if not response.content:
            raise Exception(
                "API Euromonitor não retornou um conteúdo para chamda HTTP!"
            )

        content = response.json()

        if not all(key in content for key in ("access_token", "token_type")):
            raise Exception("API Euromonitor não retornou um token de acesso!")

        # Built inside the try so a malformed payload (e.g. null values) is
        # reported through the same wrapped error as every other failure.
        api_token = " ".join([content["token_type"], content["access_token"]])
    except Exception as err:
        raise Exception(
            f"Falha ao tentar obter token de consulta. Erro: {err}"
        ) from err

    logging.info('Token obtido com sucesso')
    return api_token


In [2]:
def chunker(iterable, n: int = 1):
    """Yield consecutive slices of ``iterable`` with at most ``n`` items each.

    ``iterable`` must support ``len()`` and slicing. The last slice is shorter
    when the length is not a multiple of ``n``.
    """
    size = len(iterable)
    for start in range(0, size, n):
        stop = start + n
        # Clamp the upper bound to the sequence length for the final slice.
        yield iterable[start: stop if stop < size else size]


In [None]:
def _post_marketsize_page(url, header, payload, timeout):
    """POST one market-size query page and return the HTTP response.

    Returns ``None`` when the API answers 403 or 204, which the crawler treats
    as "nothing to read for this query". Any other failure is re-raised as an
    ``Exception`` with the original error attached as its cause.
    """
    try:
        response = requests.post(url, headers=header, data=payload, timeout=timeout)

        if response.status_code == 403:
            logging.warning('Sem autorização para essa query.')
            return None
        if response.status_code == 204:
            logging.warning('Sem Registros para essa query.')
            return None
        if response.status_code != 200:
            raise Exception(
                'API Euromonitor retornou um status inválido '
                + f'{response.status_code}: {response.reason}!\n'
                + response.text
            )
        if not response.content:
            raise Exception(
                "API Euromonitor não retornou um conteúdo para chamda HTTP!"
            )
    except Exception as err:
        raise Exception(
            f"Falha ao tentar obter dados da API Euromonitor. Erro: {err}"
        ) from err

    return response


def _collect_marketsize_frames(
    url, header, cat_ids, geo_ids, chunk_size=100, limit=10000, timeout=300
):
    """Page through the market-size endpoint and return one DataFrame per page.

    Category and geography ids are sent in chunks of ``chunk_size``; each
    chunk pair is paged with ``limit``/``offset`` until the reported ``total``
    is covered or the API answers 403/204.
    """
    frames = []
    for cat_no, cat_id_chunk in enumerate(chunker(cat_ids, chunk_size), start=1):
        for geo_no, geo_id_chunk in enumerate(chunker(geo_ids, chunk_size), start=1):
            offset = 0
            while True:
                logging.info(f'Offset: {offset}')
                payload = {
                    # Both chunks are already lists; send them the same way.
                    "categoryIds": cat_id_chunk,
                    "geographyIds": geo_id_chunk,
                    "limit": str(limit),
                    "offset": str(offset),
                }
                logging.info(
                    f'Extraindo dados da API Euromonitor. CatChunck: {cat_no}. GeoChunk: {geo_no}'
                )

                response = _post_marketsize_page(url, header, payload, timeout)
                if response is None:
                    # Brief pause before moving on to the next chunk pair.
                    sleep(1)
                    break

                logging.info('Transformando resposta em JSON')
                data = json.loads(response.text)
                # Assumes 'total' is the overall row count for the query — TODO confirm.
                total = int(data['total'])

                logging.info('Transformando JSON em dataframe do PANDAS')
                # NOTE(review): the record prefix looks like a search/replace
                # artifact; it defines the output column names, so it is kept as-is.
                frames.append(
                    pd.json_normalize(
                        data["marketSizes"],
                        "data",
                        [
                            "researchYear",
                            "geographyId",
                            "geographyName",
                            "categoryId",
                            "categoryName",
                            "industry",
                            "dataTypeId",
                            "dataType",
                            "unitName",
                            "inflationType",
                            "exchangeRateName",
                            "perCapitaName",
                            "unitMultiplier",
                            "isDefaultDataType",
                        ],
                        record_prefix="issue_level_problem_",
                    )
                )
                logging.info('Realizando merge do dataframe')

                # Stop once the page just read reaches the reported total; this
                # avoids an extra empty request when the remainder is exactly
                # one full page.
                offset += limit
                if offset >= total:
                    break

        logging.info('Chunk Categoria concluída')
    return frames


def main(**kwargs):
    """Extract Euromonitor market-size data and upload it as a parquet file.

    Authenticates against the Euromonitor API with secrets read from the bot's
    secret client, downloads the market-size records for the hard-coded
    category and geography ids, upper-cases the column names, writes the result
    to a parquet file under ``Constants.tmp`` and hands it to ``upload_file``
    after calling ``drop_directory`` for the same schema/table. The temporary
    directory is always removed at the end.

    Keyword Args:
        reset: When ``True``, the Redis key ``Constants.key_name`` is deleted
            before running. Optional.
        reload: Not supported; ``True`` raises. Optional, treated as ``False``
            when omitted.

    Raises:
        Exception: If ``reload`` is requested, if authentication or any data
            request fails, or if no query returned data.
    """
    bot = initialize()
    LND: str = bot.lnd
    adl: FileSystemClient = bot.adl

    os.makedirs(Constants.tmp, mode=0o777, exist_ok=True)

    try:
        reload_requested = kwargs.get("reload", False)

        if kwargs.get("reset") is True:
            call_redis("delete", Constants.key_name)

        if reload_requested is True:
            raise Exception("Este crawler não suporta a opção de reload")

        # The stored date is only reported; the extraction below is always full.
        if call_redis("exists", Constants.key_name):
            _date_str = call_redis("get", Constants.key_name).decode()
            last_update = datetime.strptime(_date_str, "%Y-%m-%d").date()
        else:
            last_update = None
        logging.info(f'Última atualização registrada: {last_update}')

        secret_client = bot.secret_client
        email = secret_client.get_secret('euromonitor-email').value
        password = secret_client.get_secret('euromonitor-pw').value
        subscription_key = secret_client.get_secret('euromonitor-key').value

        api_token = get_token(email, password, subscription_key)

        header = {
            "Accept": "application/json; api-version=1.0",
            "Content-Type": "application/x-www-form-urlencoded",
            "Ocp-Apim-Subscription-Key": subscription_key,
            "Authorization": api_token,
        }

        # Main market-size query.
        cat_ids = ['84']
        geo_ids = ['195', '259', '310', '380', '182', '389']

        # The endpoint has no placeholders, so the URL is built once.
        url = Constants.base_url + Constants.endpoint_marketsize

        frames = _collect_marketsize_frames(url, header, cat_ids, geo_ids)
        if not frames:
            # Fail before drop_directory so the existing upload is left untouched.
            raise Exception(
                "API Euromonitor não retornou registros para nenhuma das consultas!"
            )

        # Single concat instead of growing a DataFrame page by page.
        df = pd.concat(frames)
        df.columns = df.columns.str.upper()

        drop_directory(LND, adl, schema=Constants.schema, table=Constants.table)

        parquet_file_name = Constants.tmp + f'{Constants.schema}_{Constants.table}.parquet'

        df.to_parquet(parquet_file_name, index=False)

        upload_file(
            LND,
            adl,
            schema=Constants.schema,
            table=Constants.table,
            file=parquet_file_name,
        )

        if reload_requested is False:
            call_redis("set", Constants.key_name, str(date.today()))

        log_status("ok")
    finally:
        shutil.rmtree(Constants.tmp)


# Script entry point: run the crawler when this file is executed directly.
# NOTE(review): `kwargs` is not defined in the visible part of this file —
# confirm it is provided by an import or an earlier cell; otherwise this call
# raises NameError.
if __name__ == "__main__":
    main(**kwargs)
