In [1]:
"""
Trim protein sequences to the Pfam domain boundaries using the InterPro API.

$ python trim_sequence.py > output.fa
"""

import sys
from time import sleep

import requests

def main(pfam_id="PF02216"):
    """Download the proteins for one Pfam entry and print their trimmed regions.

    Builds the InterPro API URL for ``pfam_id`` (requesting 200 records per
    page with the protein sequence included), fetches all pages, trims every
    sequence to its reported fragments and writes the FASTA records to
    standard output.

    Args:
        pfam_id: Pfam accession to query. Defaults to the example entry
            ``PF02216`` so that calling ``main()`` with no arguments behaves
            as before.
    """
    api_url = (
        "https://www.ebi.ac.uk/interpro/api/protein/uniprot/entry/pfam/"
        f"{pfam_id}?page_size=200&extra_fields=sequence"
    )
    api_output = get_interpro_data(api_url)
    trimmed = trim_sequence(api_output)
    # Skip the print entirely when nothing was found, so that redirecting
    # stdout to a FASTA file does not leave a lone blank line in it.
    if trimmed:
        print("\n".join(trimmed))

def get_interpro_data(url, *, max_attempts=3, timeout=60):
    """Fetch every page of a paginated JSON API query and collect the records.

    Starting at ``url``, each page is requested as JSON. The page's
    ``results`` list is appended to the output (or the whole payload, when it
    has no non-empty ``results``), and the page's ``next`` link is followed
    until there is none.

    Diagnostics are written to standard error so they never mix with data
    the caller prints to standard output.

    Args:
        url: Address of the first page.
        max_attempts: How many times one page is requested when the server
            keeps answering HTTP 408 before giving up.
        timeout: Seconds to wait for the server before the request is
            abandoned (passed to ``requests.get``).

    Returns:
        A list with everything collected so far. Fetching stops early, and
        the partial list is returned, on HTTP 204 (no data), on any
        ``requests`` error, or when the 408 retries are exhausted.
    """
    output = []

    while url:
        for attempt in range(1, max_attempts + 1):
            try:
                response = requests.get(
                    url,
                    headers={"Accept": "application/json"},
                    timeout=timeout,
                )
                if response.status_code == 408:
                    print(
                        f"Received 408 Timeout. Retrying {attempt}/{max_attempts}...",
                        file=sys.stderr,
                    )
                    # Only wait when another attempt will actually follow.
                    if attempt < max_attempts:
                        sleep(61)
                    continue
                if response.status_code == 204:
                    # No data: stop paginating. Returning (rather than only
                    # leaving the retry loop) matters because ``url`` is
                    # unchanged and would otherwise be requested forever.
                    return output
                response.raise_for_status()
                data = response.json()
            except requests.exceptions.RequestException as e:
                print(f"Request failed: {e}", file=sys.stderr)
                # Same reasoning as for 204: give up instead of re-requesting
                # the failing URL in an endless loop.
                return output
            break
        else:
            # The retry loop ran out without a usable response.
            print("Max retries reached for URL:", url, file=sys.stderr)
            return output

        results = data.get("results")
        if results:
            output.extend(results)
        else:
            output.append(data)

        url = data.get("next", "")
        if url:
            # Brief pause between consecutive page requests.
            sleep(1)

    return output

def trim_sequence(api_output):
    """Cut each protein sequence down to its reported fragments.

    Args:
        api_output: Iterable of protein records (dicts). Each record must
            provide ``extra_fields["sequence"]``. ``metadata["accession"]``
            is optional (defaults to an empty string). Fragment coordinates
            are read from
            ``entries[*]["entry_protein_locations"][*]["fragments"][*]``
            via their ``start`` and ``end`` keys; any of those containers may
            be missing or null, in which case it contributes no fragments.

    Returns:
        A list of FASTA records, one string per fragment: a header line
        ``>{accession}/{start}-{end}``, a newline, then the subsequence.
        Proteins keep their input order; fragments within one protein are
        ordered by ``(start, end)``.

    Raises:
        KeyError: If a record has no ``extra_fields``/``sequence`` or a
            fragment has no ``start``/``end``.
    """
    all_trimmed = []
    for protein in api_output:
        sequence = protein["extra_fields"]["sequence"]
        protein_acc = (protein.get("metadata") or {}).get("accession", "")

        # ``or []`` covers both absent keys and keys present with a null
        # value, which would otherwise fail when iterated.
        fragments = sorted(
            (int(frag["start"]), int(frag["end"]))
            for entry in protein.get("entries") or []
            for loc in entry.get("entry_protein_locations") or []
            for frag in loc.get("fragments") or []
        )

        for start, end in fragments:
            # Coordinates are treated as 1-based and inclusive, hence the
            # shift of ``start`` for Python's 0-based, end-exclusive slicing.
            frag_seq = sequence[start - 1:end]
            all_trimmed.append(f">{protein_acc}/{start}-{end}\n{frag_seq}")
    return all_trimmed

# Run the pipeline only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()


>A0A077UI41/37-88
AQHDEDQQNAFYQVLNMPNLNADQRNGFIQSLKDDPSQSANVLGEAKKLNDS
>A0A077UI41/97-149
QNNFNKDQQSAFYEILNMPNLNEAQRNGFIQSLKDDPSQSTNVLGEAKKLNES
>A0A077UI41/155-207
DNNFNKDQQNAFYEILNMPNLNEEQRNGFIQSLKDDPSQSANLLAEAKKLNES
>A0A077UI41/213-265
DNKFNKEQQNAFYEILHLPNLNEEQRNGFIQSLKDDPSQSANLLAEAKKLNDA
>A0A077UI41/271-323
DNKFNKEQQNAFYEILHLPNLNEEQRNGFIQSLKDDPSVSKEILAEAKKLNDA
>A0A077UKV7/44-94
QNNYVTDQQKAFYQVLHLKGIAEEQRDQYIKTLREHPERAQEVFSESLKDS
>A0A077UKV7/96-147
NPERRVAQQNAFYDIHNNKNLTEAEKNNYIAQIKENPDRSQQVWVESIQSPK
>A0A0A7LVG6/1-51
QHDEAQQNAFYQVLNMPNLNADQRNGFIQSLKDDPSQSANVLGEAQKLNDS
>A0A0A7LVG6/60-112
QNNFNKDQQSAFYEILNMPNLNEAQRNGFIQSLKDDPSQSTNVLGEAKKLNES
>A0A0A7LVG6/118-163
DNNFNKEQQNAFYEILNMPNLNEEQRNGFIQSLKDDPSQSANLLSE
>A0A0D3QAF5/44-94
QNNYVTDQQKAFYQVLHLKGITEEQRNQYIKTLREHPERAQEVFSESLKDS
>A0A0D3QAF5/96-148
NPDRRVAQQNAFYNVLKNDNLTEQEKNNYIAQIKENPDRSQQVWVESVQSSKA
>A0A0E1XAJ8/49-100
AQHDEAQQNAFYQVLNMPNLNADQRNGFIQSLKDDPSQSANVLGEAQKLNDS
>A0A0E1XAJ8/109-161
QNKFNKDQQSAFYEILNMPNLNEEQRNGFIQSLKDDPSQSTNVLGEAK