## Transferring Deep Blue data between Petrel and Jetstream

In [1]:
import getpass
import os
import time

import globus_sdk
import pandas as pd

### Set Up Globus Transfer

In [2]:
# Globus endpoint IDs (UUIDs) for the two sides of the transfer.
# petrel_endpoint and jetstream_endpoint are passed, in that order, to
# globus_sdk.TransferData when the transfer request is built later on.
petrel_endpoint = "4f99675c-ac1f-11ea-bee8-0e716405a293"
jetstream_endpoint = "49f1efac-6049-11eb-87c8-02187389bd35"

In [3]:
# Log in to Globus Auth: start the native-app OAuth2 flow and print the URL
# the user has to open in a browser to obtain an authorization code.
native_auth_client = globus_sdk.NativeAppAuthClient(
    '7414f0b4-7d05-4bb6-bb00-076fa3f17cf5'
)
native_auth_client.oauth2_start_flow()

authorize_url = native_auth_client.oauth2_get_authorize_url()
print("Login Here:\n\n{0}".format(authorize_url))

Login Here:

https://auth.globus.org/v2/oauth2/authorize?client_id=7414f0b4-7d05-4bb6-bb00-076fa3f17cf5&redirect_uri=https%3A%2F%2Fauth.globus.org%2Fv2%2Fweb%2Fauth-code&scope=openid+profile+email+urn%3Aglobus%3Aauth%3Ascope%3Atransfer.api.globus.org%3Aall&state=_default&response_type=code&code_challenge=REyZrx5THxVn7wMGxZioT05K8MKs99VHyYex2KD3X8E&code_challenge_method=S256&access_type=online


In [90]:
# Authorization code
# The code obtained from the login URL is a single-use credential, so it is
# prompted for on every run instead of being hard-coded in the notebook
# (a stored code would fail on re-run and would be committed with the file).
# getpass keeps the pasted value out of the saved cell output.
auth_code = getpass.getpass("Paste the Globus authorization code: ").strip()
if not auth_code:
    raise ValueError("No authorization code was entered; open the login URL and try again.")

# Create transfer client
# Exchange the code for tokens, pick out the access token issued for the
# Transfer API, and wrap it in an authorizer for the TransferClient.
token_response = native_auth_client.oauth2_exchange_code_for_tokens(auth_code)
transfer_access_token = token_response.by_resource_server['transfer.api.globus.org']['access_token']
transfer_authorizer = globus_sdk.AccessTokenAuthorizer(transfer_access_token)
transfer_client = globus_sdk.TransferClient(authorizer=transfer_authorizer)

### Transfer Data

In [4]:
# Import Deep Blue crawl data
deep_blue_crawl_df = pd.read_csv("../data/deep_blue_xtract_crawl.csv")

# Create mapping of file to file UUID.
# Columns are addressed explicitly by position with .iloc (column 0 is used as
# the file path key, column 4 as the file UUID value). This avoids integer keys
# on a label-indexed row Series, which pandas has deprecated, and builds the
# dict in one pass instead of iterating row by row. If a path occurs more than
# once, the last occurrence wins.
file_uuid_mapping = dict(
    zip(deep_blue_crawl_df.iloc[:, 0], deep_blue_crawl_df.iloc[:, 4])
)

# Filter files: keep only rows whose extension is "zip", ordered by size_bytes
# ascending so the smallest archives come first.
filtered_files = deep_blue_crawl_df[deep_blue_crawl_df.extension == "zip"].sort_values(by=["size_bytes"])

In [131]:
# Load files already transferred (one file UUID per line).
# On the very first run the bookkeeping file does not exist yet; treat that as
# "nothing transferred so far" instead of failing with FileNotFoundError.
transferred_files_file = "data/transferred_files.txt"
transferred_files = []
if os.path.exists(transferred_files_file):
    with open(transferred_files_file, "r") as f:
        transferred_files = [line.strip("\n") for line in f]

# Set copy for constant-time membership checks inside the selection loop.
already_transferred = set(transferred_files)

# Pick which files to transfer
transfer_job_size = 0
files_to_transfer = []
max_size_threshold = 5 * 10 ** 9 # Just to make sure we don't blow up the Jetstream instance

# Column 0 is used as the file path and column 1 as the file size in bytes.
# NOTE(review): stopping at the first file that would exceed the threshold
# relies on filtered_files being sorted by ascending size (it is sorted by
# size_bytes where it is built) - confirm column 1 is that size_bytes column.
for file_path, file_size in zip(filtered_files.iloc[:, 0], filtered_files.iloc[:, 1]):
    if file_uuid_mapping[file_path] in already_transferred:
        continue
    if transfer_job_size + file_size > max_size_threshold:
        break
    files_to_transfer.append(file_path)
    transfer_job_size += file_size

print(f"{len(files_to_transfer)} files to transfer")
print(f"Total size: {transfer_job_size / (10 ** 9)} GB")

472 files to transfer
Total size: 4.972882032 GB


In [134]:
# Transfer data

# Describe a transfer task between the two endpoints.
label = "Deep Blue transfer"
tdata = globus_sdk.TransferData(
    transfer_client,
    petrel_endpoint,
    jetstream_endpoint,
    label=label,
)

# Each file is written under its file UUID (rather than its original name)
# so that two source files with the same name cannot collide.
for source_path in files_to_transfer:
    destination_name = os.path.basename(file_uuid_mapping[source_path])
    tdata.add_item(source_path, f"~/ryan/deep_blue_data/{destination_name}")

# Auto-activate both endpoints before submitting the task.
for endpoint_id in (petrel_endpoint, jetstream_endpoint):
    transfer_client.endpoint_autoactivate(endpoint_id)

submit_result = transfer_client.submit_transfer(tdata)
print("Task ID:", submit_result["task_id"])

Task ID: 8e09bbaa-60be-11eb-8c2f-0eb1aa8d4337


In [135]:
# Poll Globus for results

task_id = submit_result['task_id']
r = transfer_client.get_task(task_id)

# Keep polling until the task reaches a terminal state. Checking only for
# "SUCCEEDED" would loop forever if the task ends up "FAILED".
while r.data["status"] not in ("SUCCEEDED", "FAILED"):
    print("_________")
    print(f"Status: {r.data['status']}")
    print(f"Bytes transferred: {r['bytes_transferred']}, Files transferred: {r['files_transferred']}, Transfer rate: {r['effective_bytes_per_second']}")
    # Wait first, then refresh, so the loop condition always sees fresh status.
    time.sleep(10)
    r = transfer_client.get_task(task_id)

# Do not record anything as transferred unless the task actually succeeded.
if r.data["status"] != "SUCCEEDED":
    raise RuntimeError(f"Globus transfer task {task_id} ended with status {r.data['status']}")

# Write files that were transferred (one file UUID per line), appending to the
# same bookkeeping file that the selection cell reads.
with open(transferred_files_file, "a") as f:
    f.writelines(file_uuid_mapping[file] + "\n" for file in files_to_transfer)

_________
Status: ACTIVE
Bytes transferred: 0, Files transferred: 0, Transfer rate: 0
_________
Status: ACTIVE
Bytes transferred: 0, Files transferred: 0, Transfer rate: 0
_________
Status: ACTIVE
Bytes transferred: 0, Files transferred: 0, Transfer rate: 0
_________
Status: ACTIVE
Bytes transferred: 0, Files transferred: 0, Transfer rate: 0
_________
Status: ACTIVE
Bytes transferred: 0, Files transferred: 0, Transfer rate: 0
_________
Status: ACTIVE
Bytes transferred: 0, Files transferred: 0, Transfer rate: 0
_________
Status: ACTIVE
Bytes transferred: 0, Files transferred: 0, Transfer rate: 0
_________
Status: ACTIVE
Bytes transferred: 4835255423, Files transferred: 344, Transfer rate: 75582445
_________
Status: ACTIVE
Bytes transferred: 4835255423, Files transferred: 344, Transfer rate: 65280503
