In [5]:
from fair_research_login import NativeClient
import requests
import pickle
import json
import time 

# --- Xtract service ----------------------------------------------------------
# Root URL of the Xtract crawler service; endpoint paths are appended to it.
xtract_base_url = 'http://xtract-crawler-4.eba-ghixpmdf.us-east-1.elasticbeanstalk.com'

# --- Crawl target: MDF Materials Data at NCSA --------------------------------
# Endpoint ID sent to the crawler as 'eid'.
# Alternative endpoint ID kept for reference:
#   source_ep_id = "82f1b5c6-6e9b-11e5-ba47-22000b92c6ec"
source_ep_id = 'e38ee745-6d04-11e5-ba46-22000b92c6ec'
# Directory on that endpoint where the crawl starts.
folder_to_crawl = '/MDF/mdf_connect/prod'
# HTTPS base URL forwarded to the crawler inside 'https_info'.
base_url = 'https://data.materialsdatafacility.org'

# --- Grouping ----------------------------------------------------------------
# Only relevant when crawled files should be grouped together.
grouper = 'matio'

In [6]:
# Do Globus NativeClient Authentication and keep the resulting access tokens.
# All potentially relevant scopes are requested to be safe.
# With no_local_server/no_browser set, the login does not start a local
# callback server or open a browser window.
client = NativeClient(client_id='7414f0b4-7d05-4bb6-bb00-076fa3f17cf5')
tokens = client.login(
    requested_scopes=['https://auth.globus.org/scopes/56ceac29-e98a-440a-a594-b41e7a084b62/all',
                      'urn:globus:auth:scope:transfer.api.globus.org:all',
                      "https://auth.globus.org/scopes/facd7ccc-c5f4-42aa-916b-a0e270e2c2a9/all",
                      'email', 'openid'],
    no_local_server=True,
    no_browser=True)

# One access token per resource server returned by the login.
auth_token = tokens["petrel_https_server"]['access_token']
transfer_token = tokens['transfer.api.globus.org']['access_token']
funcx_token = tokens['funcx_service']['access_token']

# The Petrel token is used both as the bearer credential and under 'Petrel'.
headers = {'Authorization': f"Bearer {auth_token}", 'Transfer': transfer_token, 'FuncX': funcx_token, 'Petrel': auth_token}

# Never print live tokens: notebook outputs are saved with the file and are
# easily committed or shared. Show only which headers were populated.
redacted_headers = {name: '<REDACTED>' for name in headers}
print(f"Headers: {redacted_headers}")

Headers: {'Authorization': '<REDACTED>', 'Transfer': '<REDACTED>', 'FuncX': '<REDACTED>', 'Petrel': '<REDACTED>'}


In [7]:
# Initialize the crawl. This kicks off the Globus EP crawling service on the backend.
crawl_url = f'{xtract_base_url}/crawl'
print(f"Crawl URL is : {crawl_url}")

# Request body for the crawl endpoint, one field per line for readability.
# NOTE(review): 'Authorization' carries the funcX token here, whereas the
# `headers` dict built during login uses the Petrel token as the bearer
# credential. Confirm which one the service expects.
crawl_payload = {
    'repo_type': "GLOBUS",
    'eid': source_ep_id,
    'dir_path': folder_to_crawl,
    'Transfer': transfer_token,
    'Authorization': funcx_token,
    'grouper': grouper,
    'https_info': {'base_url': base_url},
}
crawl_req = requests.post(crawl_url, json=crawl_payload)

# Surface HTTP errors (4xx/5xx) here with their status code instead of
# failing later with an opaque JSON decode error or KeyError.
crawl_req.raise_for_status()

# The crawl ID is needed by every follow-up status / fetch request.
crawl_id = crawl_req.json()['crawl_id']
print(f"Crawl ID: {crawl_id}")

Crawl URL is : http://xtract-crawler-4.eba-ghixpmdf.us-east-1.elasticbeanstalk.com/crawl
Crawl ID: 6f889b2f-3c5c-4289-bc64-ede2def32a63


In [9]:
# Wait for the crawl to finish before we can start fetching our metadata.
# The loop exits only when the service reports 'SUCCEEDED'; on success it
# defines `files_crawled`, which the metadata-fetch cell relies on.
poll_interval_s = 2   # seconds to sleep between status requests
# Optional upper bound on the total wait, in seconds. None keeps the original
# behavior of polling indefinitely; set a number to raise TimeoutError instead
# of having to interrupt the kernel when a crawl never reaches 'SUCCEEDED'.
max_wait_s = None

poll_started = time.monotonic()
while True:
    crawl_status = requests.get(f'{xtract_base_url}/get_crawl_status', json={'crawl_id': crawl_id})
    # Stop on HTTP errors rather than trying to parse an error page as JSON.
    crawl_status.raise_for_status()
    crawl_content = crawl_status.json()
    print(f"Crawl Status: {crawl_content}")

    if crawl_content['crawl_status'] == 'SUCCEEDED':
        files_crawled = crawl_content['files_crawled']
        print("Our crawl has succeeded!")
        break

    if max_wait_s is not None and time.monotonic() - poll_started > max_wait_s:
        raise TimeoutError(
            f"Crawl {crawl_id} did not reach 'SUCCEEDED' within {max_wait_s} seconds "
            f"(last status: {crawl_content['crawl_status']})")

    print("Sleeping before re-polling...")
    time.sleep(poll_interval_s)

<Response [200]>
Crawl Status: {'bytes_crawled': 0, 'crawl_id': '6f889b2f-3c5c-4289-bc64-ede2def32a63', 'crawl_status': 'STARTING', 'files_crawled': 0, 'groups_crawled': 0}
Sleeping before re-polling...
<Response [200]>
Crawl Status: {'bytes_crawled': 0, 'crawl_id': '6f889b2f-3c5c-4289-bc64-ede2def32a63', 'crawl_status': 'STARTING', 'files_crawled': 0, 'groups_crawled': 0}
Sleeping before re-polling...


KeyboardInterrupt: 

In [56]:
# Now we fetch our metadata. `fetch_batch_size` is sent as 'n', the maximum
# number of messages requested at once.
# Requires `crawl_id` from the crawl-init cell and `files_crawled` from the
# polling cell (only defined once that cell has seen 'SUCCEEDED').
fetch_batch_size = 2

file_ls = []
fetched_files = 0
while fetched_files < files_crawled:
    fetch_mdata = requests.get(f'{xtract_base_url}/fetch_crawl_mdata', json={'crawl_id': crawl_id, 'n': fetch_batch_size})
    # Stop on HTTP errors rather than trying to parse an error page as JSON.
    fetch_mdata.raise_for_status()
    fetch_content = fetch_mdata.json()

    # Collect this batch; the running count is simply the list length.
    file_ls.extend(fetch_content['file_ls'])
    fetched_files = len(file_ls)

    # An empty queue with files still outstanding means the service has not
    # published them yet, so back off before asking again.
    if fetch_content['queue_empty']:
        print("Queue is empty!")
        print("Continuing...")
        time.sleep(2)

print("All files have been fetched!")
print(f"Files: {file_ls}")
    

Queue is empty!
Continuing...
All files have been fetched!
Files: ['/thurston_selfassembled_peptide_spectra_v1.1/DFT/MoleculeConfigs/di_30_-10.xyz/POSCAR', '/thurston_selfassembled_peptide_spectra_v1.1/DFT/MoleculeConfigs/di_30_-10.xyz/INCAR', '/thurston_selfassembled_peptide_spectra_v1.1/DFT/MoleculeConfigs/di_30_-10.xyz/DOSCAR', '/thurston_selfassembled_peptide_spectra_v1.1/DFT/MoleculeConfigs/di_30_-10.xyz/OUTCAR', '/thurston_selfassembled_peptide_spectra_v1.1/DFT/MoleculeConfigs/di_30_-10.xyz/KPOINTS', '/thurston_selfassembled_peptide_spectra_v1.1/DFT/MoleculeConfigs/di_30_-10.xyz/vasprun.xml', '/thurston_selfassembled_peptide_spectra_v1.1/DFT/MoleculeConfigs/di_30_-10.xyz/Transmatrix', '/thurston_selfassembled_peptide_spectra_v1.1/DFT/MoleculeConfigs/di_30_-10.xyz/PROCAR']
