# Fragalysis download notebook

This notebook makes it easy to use the Fragalysis download API to fetch the data for a target so that you can work with it.

In [None]:
# Define the variables that we are going to need.
# Would these be accessible as environment variables?
# FRAGALYSIS_HOST is the URL passed to download(); FRAGALYSIS_TARGET and
# FRAGALYSIS_TAS become the 'target_name' and 'target_access_string'
# entries of the download payload.
FRAGALYSIS_HOST = "https://fragalysis.diamond.ac.uk"
FRAGALYSIS_TARGET = ""  # e.g. A71EV2A
FRAGALYSIS_TAS = ""     # e.g. lb32627-66

# The user probably has to define this themselves:
# 1. Log in to Fragalysis (e.g. authenticate using CAS).
# 2. Find your session token at https://fragalysis.diamond.ac.uk/api/token/
#    (or the equivalent page of whatever Fragalysis URL you are using).
# 3. Copy the value of the sessionid and paste it as the value of the
#    FRAGALYSIS_AUTH_TOKEN variable (it is sent as the 'sessionid' cookie).
# NOTE: the session token only has a limited lifespan. Once it expires you need to repeat this process.
FRAGALYSIS_AUTH_TOKEN = "<paste-token-here>"

In [None]:
# Name of the directory that the downloaded zip archive is extracted into.
DATA_DIR = "fragalysis-data"

In [None]:
# this defines the download code that will be used later

from pathlib import Path
from urllib.parse import urljoin, urlsplit

import requests
import zipfile

# Paths of the Fragalysis endpoints; download() joins DOWNLOAD_URL and
# LANDING_PAGE_URL onto the host URL.
# NOTE(review): LOGIN_URL is not referenced anywhere in the visible code.
LOGIN_URL = "/accounts/login/"
DOWNLOAD_URL = "/api/download_structures/"
LANDING_PAGE_URL = '/viewer/react/landing/'


# Sent as the User-Agent header of every request.
# this needs to be kept more or less up to date
USER_AGENT = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"


def download(url, auth_token=None, payload=None, data_dir=None):
    """Request a download archive from Fragalysis, fetch it and unzip it.

    The landing page is visited first so that the session picks up a
    ``csrftoken`` cookie, then ``payload`` is POSTed to the download API.
    The response to that POST carries the ``file_url`` of the generated
    zip file, which is streamed into the current working directory and
    extracted.

    Args:
        url: URL of the Fragalysis instance including the scheme, e.g.
            ``https://fragalysis.diamond.ac.uk``. Only the scheme and host
            part are used.
        auth_token: Optional value for the ``sessionid`` cookie.
        payload: Dict describing the requested download; sent as form data.
        data_dir: Directory the archive is extracted into. Defaults to the
            module-level ``DATA_DIR``.

    Returns:
        None. When the server rejects the download request, the status and
        the response body are printed and nothing is downloaded.

    Raises:
        ValueError: If ``url`` has no scheme or host.
        requests.HTTPError: If fetching the generated archive fails.
    """
    if data_dir is None:
        data_dir = DATA_DIR

    print('Downloading from', url)
    splits = urlsplit(url)
    if not (splits.scheme and splits.netloc):
        # Without scheme and host the endpoint URLs below would be
        # malformed and fail later with a far less helpful error.
        raise ValueError(
            f'Expected an absolute URL such as https://host, got {url!r}'
        )
    base_url = f'{splits.scheme}://{splits.netloc}'
    download_api_url = urljoin(base_url, DOWNLOAD_URL)
    landing_page_url = urljoin(base_url, LANDING_PAGE_URL)

    with requests.Session() as session:
        session.headers.update(
            {
                "User-Agent": USER_AGENT,
                "Referer": landing_page_url,
                "Referrer-policy": "same-origin",
            }
        )

        print('Getting Fragalysis landing page')
        session.get(landing_page_url)  # sets csrftoken

        # forward the CSRF cookie as a header, if the server handed one out
        csrftoken = session.cookies.get('csrftoken', None)
        if csrftoken:
            session.headers.update({"X-CSRFToken": csrftoken})

        if auth_token:
            session.cookies.update({"sessionid": auth_token})

        # this will initiate zipfile creation process. Response is
        # returned when the file is ready (this is not an async
        # operation on the server, may take some time)
        print('Initiating creation of download')
        start_download_process_response = session.post(
            download_api_url,
            data=payload,
        )
        print(start_download_process_response)

        if not start_download_process_response.ok:
            # Error responses are not guaranteed to be JSON (they may be
            # HTML pages), so only decode the body opportunistically.
            try:
                details = start_download_process_response.json()
            except ValueError:
                details = start_download_process_response.text[:500]
            print(
                'Download request failed with HTTP status',
                start_download_process_response.status_code,
            )
            print(details)
            return

        # successful response contains the file_url, something like this
        # {'file_url': '/code/media/downloads/c1b21660-b2ff-4e82-b928-e6f2d19582c7/A71EV2A.zip'}
        file_url_response = start_download_process_response.json()
        print(file_url_response)

        file_url = file_url_response['file_url']
        print('Downloading file:', file_url)

        # the archive is saved under its own name in the working directory
        local_filename = Path(file_url).name
        with session.get(
                download_api_url,
                params=file_url_response,
                stream=True,
        ) as r:
            r.raise_for_status()
            with open(local_filename, 'wb') as f:
                for chunk in r.iter_content(chunk_size=8192):
                    f.write(chunk)

        print('Downloaded complete')

        print('Unzipping ...')
        with zipfile.ZipFile(local_filename, 'r') as zip_ref:
            zip_ref.extractall(data_dir)

        print('Finished. Files can be found in', data_dir)
            

In [None]:
# This is probably what you're most interested in: the dict that
# defines the download contents. Flipping a boolean switch to
# True includes that file type in the download. Avoid including map
# and other huge files unless you really need them.

# The purpose of file_url in the initial request is unclear; it has
# to be '' (not False) for the request to work. This is probably
# a glitch in the serializer.

# If you're interested in individual observations, set the 'proteins'
# key to a comma-separated list of their shortcodes.


# this defines what types of file you want to download
payload = {
    'target_name': FRAGALYSIS_TARGET,
    'target_access_string': FRAGALYSIS_TAS,
    'proteins': '',
    'all_aligned_structures': True,
    'pdb_info': True,
    'cif_info': False,
    'mtz_info': False,
    'diff_file': False,
    'event_file': False,
    'sigmaa_file': False,
    'map_info': False,
    'single_sdf_file': False,
    'metadata_info': False,
    'static_link': False,
    'file_url': '',
    'trans_matrix_info': False,
}

# this runs the download
download(
    url=FRAGALYSIS_HOST,
    auth_token=FRAGALYSIS_AUTH_TOKEN,
    payload=payload,
)