# Retrieve Role Normalization API logs

Retrieve Role Normalization API logs from Loglake for a given date range, sort them by timestamp, and save them to a local CSV file (plain and gzip-compressed).

In [1]:
import aips_gathering as aips
import gzip
import os
import pandas as pd
import pyathena

In [2]:
# Base S3 location under which the per-day parquet files are stored.
s3_data_folder = "s3://datascience-lab-canastra-vg/luis/"

# First and last day of logs to retrieve.
start_date = aips.date(2023, 5, 28)
end_date = aips.date(2023, 6, 3)
# One entry per day to process; the recorded run handled 7 files for this
# range, i.e. both the start and the end date are included.
target_dates = [*aips.date_range(start_date, end_date)]

# Local output files, named after the requested date range.
logs_csv_filename = ".".join(["role_norm_logs", str(start_date), str(end_date), "csv"])
logs_gzip_filename = logs_csv_filename + ".gz"

## Retrieve logs

Retrieve logs from Loglake and save them in S3 as parquet files.

In [3]:
def get_role_norm_logs_by_date(conn, query_date):
    """Query Loglake for one day of Role Normalization API calls.

    Selects the successful (HTTP status 200 or 204) POST requests logged by
    the 'catho-role-normalization' engine for ``query_date``. Identical
    request/response pairs logged on the same day are collapsed into a single
    row that keeps the earliest timestamp.

    Args:
        conn: Unused by this function.
            NOTE(review): presumably required by the callback signature that
            ``aips.download_by_date`` expects -- confirm.
        query_date: Day to query. Must provide ``isoformat()`` returning a
            ``YYYY-MM-DD`` string (e.g. a ``datetime.date``).

    Returns:
        Whatever ``aips.query_athena_catho`` returns for the query. The result
        columns are ``log_datetime`` (formatted as text in the
        America/Sao_Paulo time zone), ``api_request`` and ``api_response``.
    """
    # Filter on year/month/day individually instead of CONCAT(year, '-', ...).
    # The selected rows are the same, but plain equality predicates on the
    # columns let the engine prune on them directly rather than evaluating an
    # expression first.
    # NOTE(review): year/month/day are presumably partition columns of
    # loglake.logs (and the day boundary presumably UTC, since log_datetime is
    # converted to Sao Paulo time separately) -- confirm.
    year, month, day = query_date.isoformat().split("-", 2)
    return aips.query_athena_catho(
        f"""
        SELECT
            date_format(MIN(timestamp) AT TIME ZONE 'America/Sao_Paulo', '%Y-%m-%d %H:%i:%s.%f') AS log_datetime,
            request_body AS api_request,
            response_body AS api_response
        FROM (
            SELECT
                year,
                month,
                day,
                request_body,
                response_body_custom AS response_body,
                from_iso8601_timestamp("@timestamp") AS timestamp
            FROM
                loglake.logs
            WHERE
                engine = 'catho-role-normalization'
                AND year = '{year}'
                AND month = '{month}'
                AND day = '{day}'
                AND (status = '200' OR status = '204')
                AND request_method = 'POST'
            )
        GROUP BY
            year,
            month,
            day,
            request_body,
            response_body
        """
    )

In [5]:
# Run the per-day query for every target date; the recorded output shows one
# parquet file per date being saved under the "role_norm_api_logs" S3 folder.
# NOTE(review): the leading None is presumably forwarded as `conn` to the query
# function (which ignores it), and overwrite=False presumably skips dates whose
# file already exists -- confirm against aips.download_by_date.
aips.download_by_date(
    None,
    get_role_norm_logs_by_date,
    os.path.join(s3_data_folder, "role_norm_api_logs"),
    target_dates,
    desc="Retrieving Role Normalization API logs",
    overwrite=False,
)

Retrieving Role Normalization API logs:   0%|          | 0/7 [00:00<?, ?it/s]

FILE: s3://datascience-lab-canastra-vg/luis/role_norm_api_logs/2023-05-28.parquet... SAVED
FILE: s3://datascience-lab-canastra-vg/luis/role_norm_api_logs/2023-05-29.parquet... SAVED
FILE: s3://datascience-lab-canastra-vg/luis/role_norm_api_logs/2023-05-30.parquet... SAVED
FILE: s3://datascience-lab-canastra-vg/luis/role_norm_api_logs/2023-05-31.parquet... SAVED
FILE: s3://datascience-lab-canastra-vg/luis/role_norm_api_logs/2023-06-01.parquet... SAVED
FILE: s3://datascience-lab-canastra-vg/luis/role_norm_api_logs/2023-06-02.parquet... SAVED
FILE: s3://datascience-lab-canastra-vg/luis/role_norm_api_logs/2023-06-03.parquet... SAVED


## Read logs

Read the log parquet files and combine them into a single DataFrame.

In [6]:
# Load the per-day parquet files for the target dates into one DataFrame.
df_role_norm_logs = aips.read_by_date(
    os.path.join(s3_data_folder, "role_norm_api_logs"),
    target_dates,
    desc="Reading Role Normalization API logs",
)

# Preview the rows, then check column dtypes and non-null counts.
display(df_role_norm_logs)
df_role_norm_logs.info()

Reading Role Normalization API logs:   0%|          | 0/7 [00:00<?, ?it/s]

READING: s3://datascience-lab-canastra-vg/luis/role_norm_api_logs/2023-05-28.parquet
READING: s3://datascience-lab-canastra-vg/luis/role_norm_api_logs/2023-05-29.parquet
READING: s3://datascience-lab-canastra-vg/luis/role_norm_api_logs/2023-05-30.parquet
READING: s3://datascience-lab-canastra-vg/luis/role_norm_api_logs/2023-05-31.parquet
READING: s3://datascience-lab-canastra-vg/luis/role_norm_api_logs/2023-06-01.parquet
READING: s3://datascience-lab-canastra-vg/luis/role_norm_api_logs/2023-06-02.parquet
READING: s3://datascience-lab-canastra-vg/luis/role_norm_api_logs/2023-06-03.parquet


Unnamed: 0,log_datetime,api_request,api_response
0,2023-05-27 23:37:50.807000,"{""titles"":[""Auxiliar Contábil""]}","{""Auxiliar Contábil"": [{""normalized_role"": ""Au..."
1,2023-05-28 00:05:54.138000,"{""titles"":[""CFO - Chief Financial Operational""]}",
2,2023-05-28 00:06:34.607000,"{""titles"":[""Gerente administrativo""]}","{""Gerente administrativo"": [{""normalized_role""..."
3,2023-05-27 21:32:38.159000,"{""titles"":[""Ajudante Geral""]}","{""Ajudante Geral"": [{""normalized_role"": ""Ajuda..."
4,2023-05-28 00:07:52.768000,"{""titles"":[""Recepcionista e Tecnica de Enferma...",
...,...,...,...
81059,2023-06-03 10:09:59.716000,"{""titles"":[""Analista Júnior de RH""]}",
81060,2023-06-03 10:10:10.602000,"{""titles"":[""Administrativa ""]}","{""Administrativa "": [{""normalized_role"": ""Admi..."
81061,2023-06-03 00:41:59.789000,"{""titles"":[""Secretária Executiva - Voluntariad...",
81062,2023-06-02 21:25:27.856000,"{""titles"":[""Programador Java""]}","{""Programador Java"": [{""normalized_role"": ""Pro..."


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 81064 entries, 0 to 81063
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   log_datetime  81064 non-null  object
 1   api_request   81064 non-null  object
 2   api_response  81064 non-null  object
dtypes: object(3)
memory usage: 1.9+ MB


### Cast log_datetime column to datetime

In [7]:
# log_datetime is read back as text (object dtype); parse it into datetime64.
df_role_norm_logs['log_datetime'] = pd.to_datetime(df_role_norm_logs['log_datetime'])
df_role_norm_logs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 81064 entries, 0 to 81063
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   log_datetime  81064 non-null  datetime64[ns]
 1   api_request   81064 non-null  object        
 2   api_response  81064 non-null  object        
dtypes: datetime64[ns](1), object(2)
memory usage: 1.9+ MB


### Order by datetime column

In [8]:
# Order the logs chronologically and renumber the rows from 0.
df_role_norm_logs = (
    df_role_norm_logs
    .sort_values(by='log_datetime', ascending=True)
    .reset_index(drop=True)
)
display(df_role_norm_logs)

Unnamed: 0,log_datetime,api_request,api_response
0,2023-05-27 21:00:01.576,"{""titles"":[""Motorista de aplicativo""]}",
1,2023-05-27 21:00:05.120,"{""titles"":[""Analista de Faturamento""]}","{""Analista de Faturamento"": [{""normalized_role..."
2,2023-05-27 21:00:13.271,"{""titles"":[""Analista de Compras""]}","{""Analista de Compras"": [{""normalized_role"": ""..."
3,2023-05-27 21:00:13.533,"{""titles"":[""Assistente Administrativo""]}","{""Assistente Administrativo"": [{""normalized_ro..."
4,2023-05-27 21:00:16.885,"{""titles"":[""Consultora de Pricing | Coordenado...",
...,...,...,...
81059,2023-06-03 20:57:57.756,"{""titles"":[""Recepcionista ou secretaria ""]}","{""Recepcionista ou secretaria "": [{""normalized..."
81060,2023-06-03 20:58:13.631,"{""titles"":[""ajudante auxliar de estoque""]}",
81061,2023-06-03 20:58:45.818,"{""titles"":[""Procurement Apprentice""]}",
81062,2023-06-03 20:58:57.434,"{""titles"":[""Especialista em Licitações""]}",


## Save to file

In [9]:
# Show the names of the local files that the next cells write.
print('Logs CSV file: {}'.format(logs_csv_filename))
print('Logs GZIP file : {}'.format(logs_gzip_filename))

Logs CSV file: role_norm_logs.2023-05-28.2023-06-03.csv
Logs GZIP file : role_norm_logs.2023-05-28.2023-06-03.csv.gz


In [10]:
# Write the logs to a local comma-separated, UTF-8 encoded CSV file,
# leaving out the DataFrame index.
df_role_norm_logs.to_csv(
    logs_csv_filename,
    sep=',',
    encoding='utf-8',
    index=False,
)

In [11]:
# Compress the CSV written above into a gzip file next to it; the plain CSV
# is kept as well.
with open(logs_csv_filename, 'rb') as csv_file:
    with gzip.open(logs_gzip_filename, 'wb') as gzip_file:
        for csv_line in csv_file:
            gzip_file.write(csv_line)