This notebook counts the number of requests to each GFW Data API dataset, using the API access logs stored in CloudWatch.

The logs currently have a 1-month retention period, so by default the notebook pulls the last 30 days; to look at a subset of that window, adjust `start_date` and `end_date`.

You need to point the `AWS_PROFILE` environment variable at your production credentials so the notebook can access the CloudWatch logs.

If `save=True`, you also need to set `results_dir` and `file_name` to specify where the results file is saved.

In [1]:
from datetime import datetime
import os

from dateutil.relativedelta import relativedelta
from time import sleep

import boto3
import requests
import pandas as pd

In [3]:
# Query window: the last 30 days by default. Both bounds are derived from a
# single `now()` call so the window is exactly 30 days wide.
end_date = datetime.now()
start_date = end_date - relativedelta(days=30)

# CloudWatch takes the window as seconds since the Unix epoch (UTC).
# `timestamp()` interprets a naive datetime as local time and converts it to
# true epoch seconds; subtracting `datetime(1970, 1, 1)` from a naive local
# datetime would instead shift the window by the machine's UTC offset.
start_date_seconds = start_date.timestamp() # cloudwatch query start
end_date_seconds = end_date.timestamp() # cloudwatch query end
log_group = '/aws/ecs/gfw-data-api-log'

# Output settings: when `save` is True the results are written to
# `results_dir`/`file_name`. NOTE: `results_dir` is a machine-specific
# absolute path and must be changed before running this elsewhere.
save=True
results_dir = '/Users/solomon.negusse/wri/'
file_name = 'temp.csv'

# Endpoint used to list the datasets whose requests are counted.
datasets_url = 'https://data-api.globalforestwatch.org/datasets'

In [6]:
# Fetch the dataset list. Use a timeout so the cell cannot hang forever, and
# fail loudly on an HTTP error instead of hitting a confusing KeyError/JSON
# error when the response body is not the expected payload.
datasets_response = requests.get(datasets_url, timeout=60)
datasets_response.raise_for_status()
datasets = datasets_response.json()['data']


In [7]:
# Tabulate the dataset records and index them by the 'dataset' column.
datasets_df = pd.DataFrame(datasets).set_index('dataset')

In [8]:
# CloudWatch Logs client for the us-east-1 region, used to run the log queries.
cloudwatch = boto3.client(service_name='logs', region_name='us-east-1')

In [169]:
# Count successful (HTTP 200) GET requests per dataset, excluding requests whose
# path contains "latest", by running one Logs Insights query per chunk of
# dataset ids and concatenating the result rows into `results`.

# regex length can't be longer than 10k characters
chunk_size = 100
poll_interval_seconds = 5
pending_statuses = {'Scheduled', 'Running'}

results = []

for start in range(0, datasets_df.index.size, chunk_size):
    dataset_ids = '|'.join(datasets_df.index[start:start + chunk_size])

    # Raw f-string so the regex backslashes are passed through verbatim (no
    # invalid-escape warnings). The trailing `\/` is part of the shared pattern
    # so `parse` attributes a request to the full dataset id rather than to a
    # shorter id in the same chunk that happens to be a prefix of it.
    dataset_pattern = rf'GET \/dataset\/(?<datasetId>{dataset_ids})\/'
    query_string = (
        'fields @timestamp, @message'
        f' | filter @message like /{dataset_pattern}(?!.*latest).*200$/'
        f' | parse @message /{dataset_pattern}/'
        ' | stats count() as datasetViewCount by datasetId'
        ' | sort datasetViewCount desc'
    )

    query = cloudwatch.start_query(
        logGroupName=log_group,
        queryString=query_string,
        startTime=int(start_date_seconds),
        endTime=int(end_date_seconds),
    )
    query_id = query['queryId']

    # Poll until the query leaves the Scheduled/Running states.
    while True:
        sleep(poll_interval_seconds)
        query_response = cloudwatch.get_query_results(queryId=query_id)
        if query_response['status'] not in pending_statuses:
            break

    # A query that did not complete would otherwise contribute no rows and
    # silently undercount every dataset in this chunk.
    if query_response['status'] != 'Complete':
        raise RuntimeError(
            f"CloudWatch query {query_id} for datasets {start}-{start + chunk_size - 1} "
            f"ended with status {query_response['status']}"
        )

    results += query_response['results']
 

In [174]:
# Build the per-dataset view-count table from the raw query rows and
# optionally save it as CSV.

# Each result row is a list of {'field': ..., 'value': ...} dicts; key the
# values by field name rather than relying on their position in the row.
view_count_records = [
    {field['field']: field['value'] for field in rec} for rec in results
]

# Explicit `columns` keeps the frame well-formed (and `set_index` working) when
# there are no results. Values arrive as strings, so convert to int before
# sorting to get a numeric, most-requested-first ordering.
view_count_df = (
    pd.DataFrame(
        [
            {'dataset': rec['datasetId'], 'view_count': rec['datasetViewCount']}
            for rec in view_count_records
        ],
        columns=['dataset', 'view_count'],
    )
    .set_index('dataset')
    .astype(int)
    .sort_values(by='view_count', ascending=False)
)

if save:
    view_count_df.to_csv(
        os.path.join(results_dir, file_name),
        index_label='dataset',
    )

In [173]:
# Preview the first 20 rows of the view-count table.
view_count_df.head(n=20)

Unnamed: 0_level_0,view_count
dataset,Unnamed: 1_level_1
gadm__tcl__adm2_summary,2731233
gadm__tcl__adm2_change,2151190
gadm__tcl__adm1_summary,559844
gfw_integrated_alerts,471164
gadm__tcl__adm1_change,388675
gadm__tcl__iso_summary,377614
gadm__tcl__iso_change,317773
gadm__viirs__adm2_daily_alerts,289680
geostore__glad__daily_alerts,277344
gadm__viirs__adm2_weekly_alerts,263199
