In [1]:
!pip install pygithub backoff

Collecting pygithub
  Downloading PyGithub-1.51-py3-none-any.whl (260 kB)
[K     |████████████████████████████████| 260 kB 14.0 MB/s eta 0:00:01
[?25hCollecting backoff
  Downloading backoff-1.10.0-py2.py3-none-any.whl (31 kB)
Collecting pyjwt
  Downloading PyJWT-1.7.1-py2.py3-none-any.whl (18 kB)
Installing collected packages: pyjwt, pygithub, backoff
Successfully installed backoff-1.10.0 pygithub-1.51 pyjwt-1.7.1


In [2]:
from github import Github, RateLimitExceededException
from datacube import Datacube
import time
import os
import backoff
import pandas as pd
import psycopg2

In [3]:
# Prefer a token supplied through the GITHUB_TOKEN environment variable so the
# secret never has to be typed into (and saved with) the notebook. The
# placeholder is kept as a fallback for anyone who still edits it by hand.
ACCESS_TOKEN = os.environ.get('GITHUB_TOKEN') or '<access token here>'

In [4]:
# PyGithub client built from ACCESS_TOKEN; search_github() reads it as a
# module-level global.
g = Github(ACCESS_TOKEN)

In [5]:
def search_github(keyword, max_size=100, url_contains='GeoscienceAustralia'):
    """Search GitHub code for a YAML file containing ``keyword``.

    Prints the remaining search-API quota, runs a code search for the quoted
    keyword restricted to ``.yaml`` files, and returns the download URL of the
    first hit whose URL contains ``url_contains``.

    Parameters
    ----------
    keyword : str
        Text to look for; it is wrapped in double quotes in the query.
    max_size : int, optional
        Maximum number of search hits to inspect (default 100).
    url_contains : str, optional
        Substring a hit's ``download_url`` must contain to be accepted
        (default ``'GeoscienceAustralia'``).

    Returns
    -------
    str or None
        The matching download URL, or ``None`` when the search quota is
        exhausted (nothing is raised in that case) or no hit matches.
    """
    rate = g.get_rate_limit().search
    if rate.remaining == 0:
        print(f'You have 0/{rate.limit} API calls remaining. Reset time: {rate.reset}')
        return None
    print(f'You have {rate.remaining}/{rate.limit} API calls remaining')

    query = f'"{keyword}" in:file extension:yaml'
    result = g.search_code(query, order='desc')
    print(f'Found {result.totalCount} file(s)')

    # Stop at the first acceptable hit instead of materialising every match.
    # NOTE(review): reading download_url on a search hit appears to cost an
    # extra API request per file in PyGithub -- confirm; if so this also saves
    # quota.
    for index, file in enumerate(result):
        if index >= max_size:
            break
        url = file.download_url
        # Skip hits without a download URL rather than failing on `in None`.
        if url and url_contains in url:
            return url
    return None

In [6]:
# Datacube handle; the cells below call dc.list_products() on it to enumerate
# product names.
dc = Datacube()

In [7]:
# 'name' column of the product listing; it becomes the 'product' column of the
# final table.
all_products = dc.list_products()['name']

In [8]:
@backoff.on_exception(backoff.expo, RateLimitExceededException, max_time=300)
def search_product_yaml(product):
    """Look up the URL of the YAML definition for one product name.

    Searches GitHub for the text ``name: <product>`` via ``search_github`` and
    returns whatever it returns (a download URL or ``None``). If a
    ``RateLimitExceededException`` is raised, the call is retried with
    exponential back-off, configured with ``max_time=300``.
    """
    # Fixed pause before every search -- presumably to stay under GitHub's
    # search rate limit; confirm the value against the current quota.
    time.sleep(10)
    # Pass the keyword as a plain string: wrapping it in a list made the
    # list's repr (brackets and quotes included) part of the search query.
    return search_github(f'name: {product}')

In [9]:
# Reuse the product names already fetched into all_products instead of
# querying the datacube again, so every column of the final table is derived
# from the same Series (and therefore shares its index).
product_url = all_products.apply(search_product_yaml)

You have 30/30 API calls remaining
Found 0 file(s)
You have 27/30 API calls remaining
Found 3 file(s)
You have 25/30 API calls remaining
Found 2 file(s)
You have 23/30 API calls remaining
Found 2 file(s)
You have 21/30 API calls remaining
Found 2 file(s)
You have 19/30 API calls remaining
You have 19/30 API calls remaining
Found 2 file(s)
You have 28/30 API calls remaining
Found 1 file(s)
You have 26/30 API calls remaining
Found 1 file(s)
You have 24/30 API calls remaining
Found 1 file(s)
You have 22/30 API calls remaining
You have 22/30 API calls remaining
You have 22/30 API calls remaining
Found 0 file(s)
You have 27/30 API calls remaining
Found 2 file(s)
You have 25/30 API calls remaining
Found 4 file(s)
You have 23/30 API calls remaining
Found 4 file(s)
You have 21/30 API calls remaining
Found 1 file(s)
You have 19/30 API calls remaining
Found 2 file(s)
You have 28/30 API calls remaining
Found 1 file(s)
You have 26/30 API calls remaining
Found 5 file(s)
You have 24/30 API calls rem

In [10]:
# Database connection settings are read from the environment so that no
# credentials are stored in the notebook.
username = os.getenv('DB_USERNAME')
hostname = os.getenv('DB_HOSTNAME')
password = os.getenv('DB_PASSWORD')
target_db = os.getenv('DB_DATABASE')
# Supply the credentials as keyword arguments instead of interpolating them
# into the URI: a password containing characters such as '@', ':' or '/'
# would otherwise corrupt the URI, and an unset variable would be sent as the
# literal string "None".
conn = psycopg2.connect(
    f"postgresql://{hostname}/{target_db}",
    user=username,
    password=password,
)

In [11]:
def get_dataset_locations(product):
    """Return every ``agdc.dataset_location`` row belonging to one product.

    The query resolves ``product`` to its ``agdc.dataset_type`` id, collects
    the ids of all ``agdc.dataset`` rows of that type, and selects the
    location rows that reference those datasets. It runs on the module-level
    ``conn`` connection.

    Parameters
    ----------
    product : str
        Product name, matched exactly against ``agdc.dataset_type.name``.

    Returns
    -------
    pandas.DataFrame
        All columns of the matching ``agdc.dataset_location`` rows.
    """
    # The product name is sent as a bound parameter rather than formatted into
    # the SQL text, so names containing quotes cannot break or alter the query.
    query = """
    with products as
    (select name,id from agdc.dataset_type where name=%(product)s ),
    dataset_ids as
    (select id from agdc.dataset where dataset_type_ref in (select id from products))
    select * from agdc.dataset_location where dataset_ref in (select id from dataset_ids)
    """
    return pd.read_sql(query, conn, params={'product': product})

In [12]:
def get_common_location(product):
    """Return the longest leading string shared by all of a product's location URIs.

    Reads the ``uri_body`` column of ``get_dataset_locations(product)`` and
    returns its common prefix.
    """
    locations = get_dataset_locations(product)
    uri_bodies = list(locations['uri_body'])
    # commonprefix compares character by character, so the result may end in
    # the middle of a path component.
    return os.path.commonprefix(uri_bodies)

In [None]:
# Reuse the product names already fetched into all_products instead of
# querying the datacube again, so this column shares its index with the
# others when they are concatenated.
product_location = all_products.apply(get_common_location)

In [None]:
# The common prefix is computed per character and can stop partway through a
# file or directory name; dirname trims it back to the last '/'.
product_location_dir = product_location.apply(os.path.dirname)

In [None]:
# Build the summary table, one row per product: its name, the URL of its YAML
# definition, and the directory shared by its dataset locations.
df = pd.concat(
    [all_products, product_url, product_location_dir],
    axis=1,
    keys=['product', 'definition', 'location'],
)
df

In [None]:
# Write the summary table to a CSV file in the working directory, without the
# index column.
df.to_csv('sandbox-products.csv',index=False)