In [1]:
# import library
import json
import pandas as pd
import requests

from itertools import groupby
from operator import itemgetter


In [2]:
def get_mobidb_disordered_data(uniprot):
    """
    Fetch the disorder annotations that MobiDB holds for one UniProt accession.

    The MobiDB download API is queried for ``uniprot``. For every annotation
    key listed in ``keywords`` that is present in the JSON reply, the
    ``regions`` value stored under that key is collected.

    Args:
        uniprot (str): UniProt accession appended to the MobiDB download URL.

    Returns:
        list or str: one of three shapes, all of which ``merge_data`` accepts:
            - ``[]`` when the HTTP status is not 200 or the body is not valid JSON.
            - the string ``"Fully structured"`` when the reply is valid JSON but
              contains none of the keys in ``keywords``.
            - otherwise a list holding one tuple per matching key, in the order
              of ``keywords``; each tuple contains that key's regions
              (presumably ``[start, end]`` pairs - confirm against the MobiDB
              schema).

    Example:
        regions_per_source = get_mobidb_disordered_data("P12345")
        compact_regions = merge_data(regions_per_source)
    """
    # The information about these triplets can be found here: https://mobidb.org/help#vocabulary
    # 'derived-mobile_context_dependent-th_90' is intentionally not queried.
    keywords = ['curated-disorder-priority',
                'homology-disorder-priority',
                'derived-missing_residues-priority',
                'prediction-disorder-priority',
                'derived-mobile-th_90']

    url = 'https://mobidb.org/api/download?format=json&acc=' + uniprot
    res = requests.get(url)

    if res.status_code != 200:
        # Any non-200 reply is treated as "accession not available".
        return []

    try:
        result = res.json()
    except ValueError:
        # The JSON decode errors raised by Response.json() derive from
        # ValueError; catching only that keeps KeyboardInterrupt and genuine
        # programming errors visible instead of hiding them behind this message.
        print("ID DOES NOT EXITS IN THE DATABASE")
        return []

    # One tuple of regions per annotation source that is present in the reply.
    disordered_regions = [tuple(result[key]['regions'])
                          for key in keywords
                          if key in result]

    if not disordered_regions:
        print("NO DISORDER REGION FOUND")
        # Sentinel string (not a list): merge_data() maps it to an empty result.
        return "Fully structured"
    return disordered_regions


In [3]:
def merge_data(data_sets):
    """
    Merge and compact intervals coming from several data sets.

    All intervals of all data sets are pooled, sorted by start position and
    then fused whenever two of them overlap or are directly adjacent
    (``previous_end + 1 == next_start``).

    Args:
        data_sets (list or str): A list of data sets, each an iterable of
            ``[start, end]`` intervals (lists or tuples). A string (the
            "Fully structured" sentinel produced by
            ``get_mobidb_disordered_data``) is also accepted.

    Returns:
        tuple or list: A tuple of merged ``[start, end]`` lists sorted by start.
        An empty list is returned when ``data_sets`` is a string, is empty, or
        contains no interval at all.

    Example:
        data_sets = [([1, 56],), ([1, 56],), ([59, 68],), ([57, 58], [150, 160])]
        merge_data(data_sets)
        # -> ([1, 68], [150, 160])

    Note:
        The input is never modified: every interval is copied before merging.
    """
    if isinstance(data_sets, str):
        # A string means the protein was reported as fully structured.
        return []
    if len(data_sets) == 0:
        return []

    # Pool all intervals, copying each one so that the in-place end updates
    # below cannot leak into the caller's data (and so tuples work as well).
    all_intervals = [list(interval)
                     for intervals in data_sets
                     for interval in intervals]
    if not all_intervals:
        # Every data set was empty; there is nothing to merge.
        return []

    all_intervals.sort(key=lambda interval: interval[0])

    merged_intervals = []
    for interval in all_intervals:
        # "+ 1" makes directly adjacent intervals (e.g. [1, 56] and [57, 58])
        # fuse in the same pass as overlapping ones.
        if merged_intervals and interval[0] <= merged_intervals[-1][1] + 1:
            merged_intervals[-1][1] = max(merged_intervals[-1][1], interval[1])
        else:
            merged_intervals.append(interval)

    return tuple(merged_intervals)


In [4]:
def count_overlap(first_range, second_ranges):
    """
    Count, per region, how many positions a reference range shares with it.

    Both the reference range and every region are treated as inclusive
    ``(start, end)`` pairs.

    Args:
        first_range (tuple): The reference range as (start, end).
        second_ranges (tuple or list): The regions to compare against, each
            indexable as (start, end).

    Returns:
        list: One count per region of ``second_ranges``, in the same order;
        0 where a region does not intersect ``first_range``. An empty list is
        returned when ``second_ranges`` is empty.

    Example:
        count_overlap((57, 160), ([1, 68], [150, 160]))  # -> [12, 11]
    """
    # No regions at all (e.g. a fully structured protein): nothing to count.
    if not second_ranges:
        return []

    shared_counts = []
    for region in second_ranges:
        lower = max(first_range[0], region[0])
        upper = min(first_range[1], region[1])
        # Inclusive bounds: a single shared position counts as 1.
        shared_counts.append(upper - lower + 1 if lower <= upper else 0)
    return shared_counts


In [5]:
def classify_idr_regions(length_of_idr, length_in_ped):
    """
    Classify a construct from the lengths of its intrinsically disordered regions.

    Args:
        length_of_idr (list of int): Length of each individual IDR.
        length_in_ped (int): Total length used as the denominator for the
            "fully disordered" test.

    Returns:
        str: The highest-priority label that any single region qualifies for:
            - "Not Classified"   if ``length_in_ped`` is below 20 (checked first).
            - "Fully IDP"        if a region is >= 50 long and covers >= 95% of
                                 ``length_in_ped``.
            - "Long IDR"         if a region is >= 20 long.
            - "Short IDR"        if a region is between 5 and 19 long (inclusive).
            - "Fully Structured" otherwise (including an empty list).

    Examples:
        >>> classify_idr_regions([1, 10, 69], 69)
        'Fully IDP'
        >>> classify_idr_regions([1, 10, 30], 69)
        'Long IDR'

    Note:
        Each region is judged on its own; the coverage test uses the length of
        a single region, not the sum of all regions.
    """
    if length_in_ped < 20:
        return "Not Classified"

    # Labels ordered from lowest to highest priority; the index is the rank.
    labels = ("Fully Structured", "Short IDR", "Long IDR", "Fully IDP")

    best_rank = 0
    for region_length in length_of_idr:
        if region_length >= 50 and (region_length / length_in_ped) >= 0.95:
            rank = 3
        elif region_length >= 20:
            rank = 2
        elif 5 <= region_length <= 19:
            rank = 1
        else:
            rank = 0
        if rank > best_rank:
            best_rank = rank

    return labels[best_rank]


In [6]:
def shift_and_filter_disorder_regions(disorder_regions_uniprot, segment_in_uniprot, pdb_indices):
    """
    Translate disorder regions from UniProt numbering to PDB numbering and clip them.

    The offset between the two numberings is taken from the start positions of
    the matching segments. Every disorder region is moved by that offset and
    then clipped to the PDB segment; regions that fall entirely outside the
    segment are dropped.

    Parameters:
    - disorder_regions_uniprot (iterable): Disorder regions as (start, end) in UniProt numbering.
    - segment_in_uniprot (sequence): Start and end of the segment of interest in UniProt numbering.
    - pdb_indices (sequence): Start and end of the corresponding segment in PDB numbering.

    Returns:
    - list of [start, end] lists: the disorder regions in PDB numbering, limited to ``pdb_indices``.

    Note:
    - When the two segments differ in length a message is printed, but the
      conversion still proceeds using the offset of the start positions.
    """
    uniprot_span = segment_in_uniprot[1] - segment_in_uniprot[0]
    pdb_span = pdb_indices[1] - pdb_indices[0]
    if uniprot_span != pdb_span:
        # Warn only; the caller still receives a best-effort result.
        print("The lengths of the segments in UniProt and PDB must be the same.")

    # Amount to subtract from a UniProt position to obtain its PDB position.
    offset = segment_in_uniprot[0] - pdb_indices[0]

    # First move every region into PDB numbering ...
    shifted_regions = [[region[0] - offset, region[1] - offset]
                       for region in disorder_regions_uniprot]

    # ... then keep only the part of each region inside the PDB segment.
    clipped_regions = []
    for shifted_start, shifted_end in shifted_regions:
        clipped_start = max(shifted_start, pdb_indices[0])
        clipped_end = min(shifted_end, pdb_indices[1])
        if clipped_start <= clipped_end:
            clipped_regions.append([clipped_start, clipped_end])

    return clipped_regions




In [7]:
def combine_fragment_pdb_indices(data_sets):
    """
    Merge a flat list of intervals into the smallest set of compact intervals.

    The intervals are sorted by start position and fused whenever two of them
    overlap or are directly adjacent (``previous_end + 1 == next_start``).

    Args:
        data_sets (list): A flat list of intervals, each indexable as
            ``[start, end]``.

    Returns:
        list: The merged intervals as ``[start, end]`` lists, sorted by start.
        An empty list is returned for empty input.

    Example:
        data_sets = [[1, 15], [16, 252], [253, 259], [260, 578], [579, 588], [589, 825]]
        combine_fragment_pdb_indices(data_sets)
        # -> [[1, 825]]

    Note:
        The input is left untouched: neither the order of ``data_sets`` nor the
        contents of its intervals are changed.
    """
    if not data_sets:
        return []

    # sorted() on copies: the caller's list keeps its order and its intervals
    # keep their values even though the ends are updated in place below.
    ordered_intervals = sorted((list(interval) for interval in data_sets),
                               key=lambda interval: interval[0])

    merged_intervals = []
    for interval in ordered_intervals:
        # "+ 1" fuses directly adjacent fragments (e.g. [1, 15] and [16, 252])
        # in the same pass as overlapping ones.
        if merged_intervals and interval[0] <= merged_intervals[-1][1] + 1:
            merged_intervals[-1][1] = max(merged_intervals[-1][1], interval[1])
        else:
            merged_intervals.append(interval)

    return merged_intervals


In [8]:
def get_ped_stats(PEDID):
    """
    Collect, per chain of a PED entry, its PDB ranges, linkers and disorder regions.

    The entry is fetched from the Protein Ensemble Database deposition API.
    For each chain, every fragment is processed as follows:
        - its PDB start/end positions are recorded;
        - if the fragment has no UniProt accession it is recorded as a linker;
        - otherwise the disorder regions of that accession are fetched with
          ``get_mobidb_disordered_data``, compacted with ``merge_data`` and
          converted to PDB numbering with ``shift_and_filter_disorder_regions``.
    The per-fragment PDB ranges and disorder regions are finally compacted
    with ``combine_fragment_pdb_indices``.

    Args:
        PEDID (str): The PED entry identifier, e.g. "PED00301".

    Returns:
        list of dict or None: One dict per chain with the keys
            - "entry": ``PEDID`` for single-chain entries, otherwise
              ``PEDID + '_' + chain_name``;
            - "linker": PDB ``[start, end]`` of every fragment without UniProt accession;
            - "disorder_regions": compacted disorder regions in PDB numbering;
            - "pdb_indices": compacted PDB ranges covered by the chain.
        ``None`` is returned (after printing a message) when the request does
        not succeed.

    Example:
        get_ped_stats("PED00301")

    Note:
        - Requires the 'requests' library and network access.
        - Per-fragment IDR classification is not part of the returned data and
          is therefore not computed here.
    """
    url = "https://deposition.proteinensemble.org/api/v1/entries/" + PEDID
    response = requests.get(url)

    if response.status_code == 404:
        print(f"Entry {PEDID} does not exist")
        return None
    if response.status_code != 200:
        # Previously any other status fell through without a trace.
        print(f"Request for entry {PEDID} failed with HTTP status {response.status_code}")
        return None

    construct_chains = response.json()['construct_chains']
    single_chain = len(construct_chains) == 1

    chain_records = []
    for chain in construct_chains:
        # Multi-chain entries get the chain name appended to stay unique.
        entry = PEDID if single_chain else PEDID + '_' + chain['chain_name']

        whole_disorder_pdb = []  # disorder regions of all fragments, in PDB numbering
        linker_pdb = []          # PDB ranges of fragments without a UniProt accession
        whole_pdb = []           # PDB ranges of all fragments

        for fragment, fragment_stats in zip(chain['fragments'], chain['fragments_stats']):
            pdb_indices = [fragment_stats['start_position_pdb'], fragment_stats['end_position_pdb']]
            whole_pdb.append(pdb_indices)

            uniprot = fragment_stats['uniprot']
            if uniprot is None:
                # No accession means this fragment is a linker; it carries no
                # MobiDB annotation. A separate list object is stored so later
                # processing of whole_pdb cannot alter the linker entry.
                linker_pdb.append(list(pdb_indices))
                mobi_disorder_regions = tuple()
            else:
                mobi_disorder_regions = merge_data(get_mobidb_disordered_data(uniprot))

            # Segment of the UniProt sequence that this fragment corresponds to.
            pdb_region = (fragment['start_position'], fragment['end_position'])

            whole_disorder_pdb.extend(
                shift_and_filter_disorder_regions(mobi_disorder_regions, pdb_region, pdb_indices)
            )

        chain_records.append({
            "entry": entry,
            "linker": linker_pdb,
            "disorder_regions": combine_fragment_pdb_indices(whole_disorder_pdb),
            "pdb_indices": combine_fragment_pdb_indices(whole_pdb),
        })

    return chain_records
    

In this code, we need to print out, for each entry, the range of residues in the PDB file and the disorder regions expressed in PDB indices.

In [16]:
# Collect the per-chain records of PED00301 ... PED00433 into one flat list.
final_res = []
for ped_number in range(301, 434):  # loop variable no longer shadows the builtin `id`
    PEDID = f'PED{ped_number:05d}'
    print(PEDID)
    entry_res = get_ped_stats(PEDID)
    # get_ped_stats returns None for entries that could not be fetched.
    if entry_res:
        final_res.extend(entry_res)
    

PED00301
PED00302
PED00303
PED00304
PED00305
PED00306
PED00307
PED00308
The lengths of the segments in UniProt and PDB must be the same.
PED00309
The lengths of the segments in UniProt and PDB must be the same.
PED00310
PED00311
PED00312
PED00313
PED00314
PED00315
PED00316
PED00317
ID DOES NOT EXITS IN THE DATABASE
PED00318
PED00319
ID DOES NOT EXITS IN THE DATABASE
The lengths of the segments in UniProt and PDB must be the same.
ID DOES NOT EXITS IN THE DATABASE
PED00320
PED00321
The lengths of the segments in UniProt and PDB must be the same.
PED00322
PED00323
PED00324
PED00325
PED00326
The lengths of the segments in UniProt and PDB must be the same.
The lengths of the segments in UniProt and PDB must be the same.
PED00327
NO DISORDER REGION FOUND
PED00328
The lengths of the segments in UniProt and PDB must be the same.
PED00329
The lengths of the segments in UniProt and PDB must be the same.
PED00330
The lengths of the segments in UniProt and PDB must be the same.
PED00331
PED00332


In [17]:
# Emit one JSON object per line, each followed by a comma, so the output can
# be pasted into a JSON array.
for item in final_res:
    serialized = json.dumps(item)
    print(f"{serialized},")

{"entry": "PED00301", "linker": [], "disorder_regions": [[1, 33], [52, 77], [91, 102], [108, 136]], "pdb_indices": [[1, 136]]},
{"entry": "PED00302", "linker": [[1, 2]], "disorder_regions": [[36, 62], [81, 90]], "pdb_indices": [[1, 90]]},
{"entry": "PED00303", "linker": [], "disorder_regions": [[34, 45]], "pdb_indices": [[1, 45]]},
{"entry": "PED00304", "linker": [], "disorder_regions": [[1, 2], [29, 65]], "pdb_indices": [[1, 86]]},
{"entry": "PED00305_A", "linker": [], "disorder_regions": [[38, 83]], "pdb_indices": [[1, 83]]},
{"entry": "PED00305_B", "linker": [], "disorder_regions": [[138, 183]], "pdb_indices": [[101, 183]]},
{"entry": "PED00306", "linker": [[41, 44]], "disorder_regions": [[45, 61]], "pdb_indices": [[41, 115]]},
{"entry": "PED00307_A", "linker": [[1, 1]], "disorder_regions": [[4, 4]], "pdb_indices": [[1, 82]]},
{"entry": "PED00307_B", "linker": [[1, 1]], "disorder_regions": [[14, 14]], "pdb_indices": [[1, 90]]},
{"entry": "PED00307_C", "linker": [], "disorder_regions