In [1]:
from google.colab import drive
import os

# Mount Google Drive at /content/drive so later cells can work inside it.
# NOTE(review): the output saved with this cell is a MessageError ("credential
# propagation was unsuccessful"), i.e. the mount did not succeed in the recorded run;
# re-run and authorize before executing the cells that chdir into Drive.
drive.mount('/content/drive')

MessageError: Error: credential propagation was unsuccessful

In [None]:
import os
import time
import requests
import pandas as pd
import numpy as np
from multiprocessing import Pool, cpu_count
from tqdm.notebook import tqdm

In [None]:
# Work from the project folder on the mounted Drive so the relative ./data paths
# used by the following cells resolve. Raises if Drive is not mounted.
os.chdir('/content/drive/MyDrive/BRI Analysis')

In [None]:
# @title Read back in data

clean_output_file = './data/cleaned_connective_chains.csv'

# Read the identifier columns as text so pandas never infers a numeric dtype for
# them (a PDB ID such as "1E10" is a valid float literal, and numeric-looking
# chain IDs would otherwise come back as e.g. 1.0 once NaNs are present).
final_clean_df = pd.read_csv(
    clean_output_file,
    dtype={'pdb_id': str, 'chain_id': str},
)

# pandas treats the literal text "NA" as a missing value by default, so a chain
# named "NA" is read back as NaN; restore it as the string 'NA'.
final_clean_df['chain_id'] = final_clean_df['chain_id'].fillna('NA').astype(str)

# Unique PDB IDs to query; rows with a missing ID are skipped so NaN is never
# sent to the API.
pdb_ids = final_clean_df['pdb_id'].dropna().unique()

In [None]:
# --- Configuration ---
OUTPUT_FILE = "./data/PDB727K_webscrape_meta_data.csv" # Destination CSV, relative to the working directory
BATCH_SIZE = 250   # Number of IDs sent in ONE request (original note: API handles ~500 max reliably — TODO confirm)
SAVE_INTERVAL = 20 # Checkpoint to disk every 20 batches (20 * 250 = 5,000 IDs)

# --- GraphQL Query ---
# One query fetches every field below for a whole LIST of entry IDs ($ids).
# The field names requested here are the keys that fetch_batch_metadata reads
# back out of each returned entry, so the two must be kept in sync.
GRAPHQL_QUERY = """
query ($ids: [String!]!) {
  entries(entry_ids: $ids) {
    rcsb_id
    exptl {
      method
    }
    rcsb_entry_info {
      resolution_combined
    }
    rcsb_accession_info {
      deposit_date
      initial_release_date
    }
    refine {
      ls_R_factor_R_work
      ls_R_factor_R_free
      ls_R_factor_obs
    }
    rcsb_primary_citation {
      title
      rcsb_authors
      pdbx_database_id_DOI
    }
  }
}
"""

def _first_item_field(items, key):
    """Return ``items[0][key]``, or "NA" if any step of that lookup is missing or null."""
    if not isinstance(items, list) or not items:
        return "NA"
    first = items[0]
    if not isinstance(first, dict):
        return "NA"
    # dict.get(key, default) only applies the default when the key is ABSENT;
    # a key present with a null value would otherwise leak None instead of "NA".
    value = first.get(key)
    return "NA" if value is None else value


def _nested_field(root, *keys):
    """Walk ``root`` through ``keys``; return "NA" if a level is missing, null or not a dict."""
    value = root
    for key in keys:
        if not isinstance(value, dict):
            return "NA"
        value = value.get(key)
    return "NA" if value is None else value


def _parse_entry(entry):
    """Flatten one item of the GraphQL ``entries`` list into a flat record dict.

    Raises AttributeError/KeyError if ``entry`` is not a dict or has no ``rcsb_id``;
    the caller counts and skips such entries.
    """
    # Only the first element of the `refine` list is used for the R-values.
    refine = entry.get('refine')

    # The citation may be null or absent; treat anything that is not a dict as empty.
    citation = entry.get('rcsb_primary_citation')
    if not isinstance(citation, dict):
        citation = {}

    authors = citation.get('rcsb_authors')
    if isinstance(authors, list) and authors:
        authors_str = "; ".join(str(a) for a in authors if a is not None) or "NA"
    else:
        authors_str = "NA"

    return {
        'pdb_id': entry['rcsb_id'],
        'Method': _first_item_field(entry.get('exptl'), 'method'),
        # Stored exactly as returned (the query field is `resolution_combined`).
        'Resolution': _nested_field(entry, 'rcsb_entry_info', 'resolution_combined'),
        'R-Value Free': _first_item_field(refine, 'ls_R_factor_R_free'),
        'R-Value Work': _first_item_field(refine, 'ls_R_factor_R_work'),
        'R-Value Observed': _first_item_field(refine, 'ls_R_factor_obs'),
        'citation': _nested_field(citation, 'title'),
        'doi': _nested_field(citation, 'pdbx_database_id_DOI'),
        'authors': authors_str,
        'deposition_date': _nested_field(entry, 'rcsb_accession_info', 'deposit_date'),
        'release_date': _nested_field(entry, 'rcsb_accession_info', 'initial_release_date'),
    }


def fetch_batch_metadata(batch_ids):
    """
    Fetches metadata for a list of PDB IDs using the RCSB GraphQL API.

    Args:
        batch_ids: list of entry-ID strings, sent as the ``ids`` variable of
            GRAPHQL_QUERY in a single POST request.

    Returns:
        A list with one flat dict per successfully parsed entry (keys: pdb_id,
        Method, Resolution, R-Value Free, R-Value Work, R-Value Observed,
        citation, doi, authors, deposition_date, release_date). Every missing or
        null field is the string "NA". Returns [] when the request fails or the
        response carries no entries; this function never raises, so one bad
        batch cannot abort a whole run.
    """
    url = "https://data.rcsb.org/graphql"

    try:
        response = requests.post(
            url,
            json={'query': GRAPHQL_QUERY, 'variables': {'ids': batch_ids}},
            timeout=30
        )
        response.raise_for_status()
        payload = response.json()

        # A GraphQL response may carry "data": null alongside an "errors" list,
        # so every level is treated as optional.
        entries = (payload.get('data') or {}).get('entries') or []
        if not entries:
            if payload.get('errors'):
                print(f"Batch returned no entries; GraphQL errors: {payload['errors']}")
            return []
    except Exception as e:
        print(f"Batch failed: {e}")
        return []

    parsed_records = []
    skipped = 0
    for entry in entries:
        try:
            parsed_records.append(_parse_entry(entry))
        except Exception:
            # Best effort: a malformed entry must not discard the rest of the
            # batch, but it is counted instead of being dropped silently.
            skipped += 1

    if skipped:
        print(f"Skipped {skipped} malformed entries in a batch of {len(batch_ids)} IDs")

    return parsed_records



Starting API fetch for 110328 IDs using GraphQL...
Processing 442 batches with 2 cores.


Fetching Data:   0%|          | 0/442 [00:00<?, ?it/s]


✅ Completed in 48.92 seconds.
Fetched 110298 records.
Saved to: ./data/PDB727K_webscrape_meta_data.csv


In [None]:
# --- Main Execution ---
if __name__ == '__main__':

    # 1. Setup Input List
    # `pdb_ids` comes from the data-loading cell. Missing values are dropped and
    # every ID is coerced to a stripped string: NaN is not a valid JSON value, so
    # a single missing ID would otherwise break the request for its whole batch.
    # dict.fromkeys de-duplicates while keeping the original order.
    pdb_id_list = list(dict.fromkeys(
        cleaned
        for cleaned in (str(pid).strip() for pid in pdb_ids if pd.notna(pid))
        if cleaned
    ))
    total_ids = len(pdb_id_list)

    # Create output directory (skipped when OUTPUT_FILE has no directory part,
    # because os.makedirs('') raises).
    output_dir = os.path.dirname(OUTPUT_FILE)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    print(f"Starting API fetch for {total_ids} IDs using GraphQL...")

    # 2. Create Batches
    # Split list into chunks of BATCH_SIZE
    batches = [pdb_id_list[i:i + BATCH_SIZE] for i in range(0, total_ids, BATCH_SIZE)]

    # Cap at 8 workers to avoid rate limits; report the number actually used
    # rather than the machine's core count.
    n_workers = max(1, min(cpu_count(), 8))
    print(f"Processing {len(batches)} batches with {n_workers} worker processes.")

    # 3. Process in Parallel
    all_results = []
    start_time = time.time()

    with Pool(processes=n_workers) as pool:

        # imap_unordered yields results as they come in, so the row order of the
        # output does not follow the order of pdb_id_list.
        results_iter = pool.imap_unordered(fetch_batch_metadata, batches)

        for i, batch_res in enumerate(tqdm(results_iter, total=len(batches), desc="Fetching Data")):
            if batch_res:
                all_results.extend(batch_res)

            # Intermediate Save: rewrite the full checkpoint every SAVE_INTERVAL
            # batches so a crash loses at most that many batches of work.
            if (i + 1) % SAVE_INTERVAL == 0:
                temp_df = pd.DataFrame(all_results)
                temp_df.to_csv(OUTPUT_FILE, index=False)

    # 4. Final Save
    final_df = pd.DataFrame(all_results)
    final_df.to_csv(OUTPUT_FILE, index=False)

    duration = time.time() - start_time
    print(f"\n✅ Completed in {duration:.2f} seconds.")
    print(f"Fetched {len(final_df)} records.")
    print(f"Saved to: {OUTPUT_FILE}")

    # 5. Report IDs that produced no record (failed batch or ID unknown to the
    # API) instead of dropping them silently. Compared case-insensitively.
    fetched_ids = {str(rec['pdb_id']).upper() for rec in all_results}
    missing_ids = [pid for pid in pdb_id_list if pid.upper() not in fetched_ids]
    if missing_ids:
        print(f"⚠️ {len(missing_ids)} IDs returned no record. First few: {missing_ids[:10]}")

In [None]:
# Display the fetched metadata table; pandas truncates the rendering to the
# first and last rows.
# NOTE(review): the output saved with this cell has an "Unnamed: 0" column, which
# the fetch cell (to_csv(..., index=False)) would not produce — presumably it was
# rendered from a frame re-read from an older CSV; re-run to confirm.
final_df

Unnamed: 0,pdb_id,Method,Resolution,R-Value Free,R-Value Work,R-Value Observed,citation,doi,authors,deposition_date,release_date
0,1A53,X-RAY DIFFRACTION,[2.0],0.212,0.159,0.159,The catalytic mechanism of indole-3-glycerol p...,10.1016/S0022-2836(02)00378-9,"Hennig, M.; Darimont, B.D.; Jansonius, J.N.; K...",1998-02-19T00:00:00Z,1999-03-23T00:00:00Z
1,1A54,X-RAY DIFFRACTION,[1.6],0.208,0.177,,Crystal structure of phosphate binding protein...,10.1021/bi980428z,"Hirshberg, M.; Henrick, K.; Haire, L.L.; Vasis...",1998-02-19T00:00:00Z,1998-10-14T00:00:00Z
2,1A55,X-RAY DIFFRACTION,[2.4],0.227,0.173,,Crystal structure of phosphate binding protein...,10.1021/bi980428z,"Hirshberg, M.; Henrick, K.; Haire, L.L.; Vasis...",1998-02-19T00:00:00Z,1998-10-14T00:00:00Z
3,1A56,SOLUTION NMR,,,,,Primary sequence and solution conformation of ...,,"Timkovich, R.; Bergmann, D.; Arciero, D.M.; Ho...",1998-02-20T00:00:00Z,1998-10-21T00:00:00Z
4,1A57,SOLUTION NMR,,,,,The three-dimensional structure of a helix-les...,,"Steele, R.A.; Emmert, D.A.; Kao, J.; Hodsdon, ...",1998-02-20T00:00:00Z,1998-05-27T00:00:00Z
...,...,...,...,...,...,...,...,...,...,...,...
110293,9RSA,X-RAY DIFFRACTION,[1.8],,,0.196,Crystal structure of two covalent nucleoside d...,10.1021/bi00456a012,"Nachman, J.; Miller, M.; Gilliland, G.L.; Cart...",1989-08-28T00:00:00Z,1991-04-15T00:00:00Z
110294,9RUB,X-RAY DIFFRACTION,[2.6],,,0.199,"Crystal structure of activated ribulose-1,5-bi...",,"Lundqvist, T.; Schneider, G.",1990-11-28T00:00:00Z,1993-01-15T00:00:00Z
110295,9WGA,X-RAY DIFFRACTION,[1.8],,,0.175,2.2 A resolution structure analysis of two ref...,10.1016/S0022-2836(05)80174-3,"Wright, C.S.",1990-04-20T00:00:00Z,1990-10-15T00:00:00Z
110296,9XIA,X-RAY DIFFRACTION,[1.9],,,0.141,X-ray analysis of D-xylose isomerase at 1.9 A:...,10.1073/pnas.86.12.4440,"Carrell, H.L.; Glusker, J.P.; Burger, V.; Manf...",1990-10-11T00:00:00Z,1991-10-15T00:00:00Z
