In [2]:
import requests

# Function to get UniProt mappings for a PDB code
def get_uniprot_mappings(pdb_code):
    url = f'https://www.ebi.ac.uk/pdbe/api/mappings/uniprot/{pdb_code}'
    response = requests.get(url)
    data = response.json()
    if pdb_code in data:
        return data[pdb_code]['UniProt']
    return None

# Example list of CATH domains
cath_domains = ['1oaiA00', '1go5A00']  # Example CATH domains

# List to store mapping results
mapping = []

# Loop through CATH domains to get UniProt mappings
for domain in cath_domains:
    pdb_code = domain[:4]  # Extract PDB code
    chain = domain[4]      # Extract chain
    uniprot_mappings = get_uniprot_mappings(pdb_code)
    
    if uniprot_mappings:
        # Loop through the UniProt entries and their mappings
        for uniprot_id, details in uniprot_mappings.items():
            for entry in details['mappings']:
                if entry['chain_id'] == chain:
                    mapping.append({
                        'CATH Domain': domain,
                        'PDB Code': pdb_code,
                        'Chain': chain,
                        'UniProt ID': uniprot_id,
                        'UniProt Name': details.get('identifier', ''),
                        'UniProt Start': entry['unp_start'],
                        'UniProt End': entry['unp_end']
                    })

# Print the results
for result in mapping:
    print(result)


{'CATH Domain': '1oaiA00', 'PDB Code': '1oai', 'Chain': 'A', 'UniProt ID': 'Q9UBU9', 'UniProt Name': 'NXF1_HUMAN', 'UniProt Start': 561, 'UniProt End': 619}
{'CATH Domain': '1go5A00', 'PDB Code': '1go5', 'Chain': 'A', 'UniProt ID': 'Q9UBU9', 'UniProt Name': 'NXF1_HUMAN', 'UniProt Start': 551, 'UniProt End': 619}


In [3]:
import pandas as pd

# Define column names based on the data structure
column_names = [
    'Domain', 'Class', 'Architecture', 'Topology', 'Homologous_superfamily',
    'S35', 'S60', 'S95', 'S100_cluster', 'S00_count', 'Domain_length', 'Resolution'
]

# Read the file, skipping lines that start with '#'
file_path = '../datasets/cath-domain-list.txt'
df = pd.read_csv(file_path, delim_whitespace=True, comment='#', header=None, names=column_names)


df['Domain']


  df = pd.read_csv(file_path, delim_whitespace=True, comment='#', header=None, names=column_names)


0         1oaiA00
1         1go5A00
2         3frhA01
3         3friA01
4         3b89A01
           ...   
500233    4aybQ00
500234    3hkzY00
500235    3hkzZ00
500236    3zbeA00
500237    3duzA04
Name: Domain, Length: 500238, dtype: object

In [4]:
df

Unnamed: 0,Domain,Class,Architecture,Topology,Homologous_superfamily,S35,S60,S95,S100_cluster,S00_count,Domain_length,Resolution
0,1oaiA00,1,10,8,10,1,1,1,1,1,59,1.00
1,1go5A00,1,10,8,10,1,1,1,1,2,69,999.00
2,3frhA01,1,10,8,10,2,1,1,1,1,58,1.20
3,3friA01,1,10,8,10,2,1,1,1,2,54,1.80
4,3b89A01,1,10,8,10,2,1,1,2,1,54,2.60
...,...,...,...,...,...,...,...,...,...,...,...,...
500233,4aybQ00,6,20,450,10,1,1,1,2,1,50,3.20
500234,3hkzY00,6,20,450,10,1,1,2,1,1,45,3.40
500235,3hkzZ00,6,20,450,10,1,1,2,1,2,45,3.40
500236,3zbeA00,6,20,450,20,1,1,1,1,1,71,999.00


In [None]:
df[['Architecture','Topology','Homologous_superfamily','S95', 'Domain_length']].apply(str, axis=1).nunique()

In [None]:
df.apply(str, axis=1).nunique()

Domain                    500238
Class                          5
Architecture                  26
Topology                     520
Homologous_superfamily       671
S35                          873
S60                           21
S95                          116
S100_cluster                 217
S00_count                   1022
Domain_length                737
Resolution                   355
dtype: int64

In [4]:
# Function to get UniProt mappings for a PDB code
def get_uniprot_mappings(pdb_code):
    url = f'https://www.ebi.ac.uk/pdbe/api/mappings/uniprot/{pdb_code}'
    response = requests.get(url)
    data = response.json()
    if pdb_code in data:
        return data[pdb_code]['UniProt']
    return None

# Convert df['Domain'] into a list
cath_domains = df['Domain'].tolist()[:50]

# List to store mapping results
mapping = []

# Loop through CATH domains to get UniProt mappings
for domain in cath_domains:
    pdb_code = domain[:4]  # Extract PDB code
    chain = domain[4]      # Extract chain
    uniprot_mappings = get_uniprot_mappings(pdb_code)
    
    if uniprot_mappings:
        # Loop through the UniProt entries and their mappings
        for uniprot_id, details in uniprot_mappings.items():
            for entry in details['mappings']:
                if entry['chain_id'] == chain:
                    mapping.append({
                        'CATH_Domain': domain,
                        'PDB_Code': pdb_code,
                        'Chain': chain,
                        'UniProt_ID': uniprot_id,
                        'UniProt_Name': details.get('identifier', ''),
                        'UniProt_Start': entry['unp_start'],
                        'UniProt_End': entry['unp_end']
                    })

# Convert the results into a DataFrame for better readability or further analysis
mapping_df = pd.DataFrame(mapping)

mapping_df

Unnamed: 0,CATH_Domain,PDB_Code,Chain,UniProt_ID,UniProt_Name,UniProt_Start,UniProt_End
0,1oaiA00,1oai,A,Q9UBU9,NXF1_HUMAN,561,619
1,1go5A00,1go5,A,Q9UBU9,NXF1_HUMAN,551,619
2,3frhA01,3frh,A,Q763K9,Q763K9_ECOLX,1,251
3,3friA01,3fri,A,Q763K9,Q763K9_ECOLX,1,251
4,3b89A01,3b89,A,Q763K9,Q763K9_ECOLX,2,251
5,4g3oA00,4g3o,A,Q9UKV5,AMFR_HUMAN,456,498
6,4heoA00,4heo,A,O55778,PHOSP_HENDH,654,707
7,4heoB00,4heo,B,O55778,PHOSP_HENDH,654,707
8,1oksA00,1oks,A,P03422,PHOSP_MEASE,459,507
9,4wp2F00,4wp2,F,G0SET4,G0SET4_CHATD,600,657


In [5]:
import pandas as pd
import json

# Load your datasets (replace 'file1.csv' and 'file2.csv' with actual file paths)
family_df = pd.read_csv('../datasets/deeploc_lookup_labels_family.csv')  # Table 1 with protein family data

family_df.rename(columns={'ID': 'UniProt_ID', 'family': 'Family', 'sub_family': 'Sub_family'}, inplace=True)
family_df
# Merge the two tables on 'UniProt_ID' using an inner join
merged_data = pd.merge(family_df, mapping_df, on='UniProt_ID', how='inner')

In [6]:
merged_data

Unnamed: 0,UniProt_ID,Location,Family_raw,Family,Sub_family,CATH_Domain,PDB_Code,Chain,UniProt_Name,UniProt_Start,UniProt_End
0,Q9UBU9,Cytoplasm,NXF family,NXF family,,1oaiA00,1oai,A,NXF1_HUMAN,561,619
1,Q9UBU9,Cytoplasm,NXF family,NXF family,,1go5A00,1go5,A,NXF1_HUMAN,551,619


In [7]:
df.rename(columns={'Domain': 'CATH_Domain'}, inplace=True)

merged_result = pd.merge(merged_data, df, on='CATH_Domain', how='inner')

merged_result


Unnamed: 0,UniProt_ID,Location,Family_raw,Family,Sub_family,CATH_Domain,PDB_Code,Chain,UniProt_Name,UniProt_Start,...,Architecture,Topology,Homologous_superfamily,S35,S60,S95,S100_cluster,S00_count,Domain_length,Resolution
0,Q9UBU9,Cytoplasm,NXF family,NXF family,,1oaiA00,1oai,A,NXF1_HUMAN,561,...,10,8,10,1,1,1,1,1,59,1.0
1,Q9UBU9,Cytoplasm,NXF family,NXF family,,1go5A00,1go5,A,NXF1_HUMAN,551,...,10,8,10,1,1,1,1,2,69,999.0


In [8]:
merged_result.to_csv('../datasets/deeploc_lookup_labels_family_cath_prelim.csv', index=False)

In [9]:
# # Group by 'UniProt_ID' and organize the PDB entries under each UniProt_ID
# grouped_data = {}

# for uni_id, group in merged_data.groupby('UniProt_ID'):
#     # For each UniProt_ID, create a structure with family info and nested PDB codes
#     protein_info = {
#         "Location": group["Location"].iloc[0] if pd.notna(group["Location"].iloc[0]) else "",
#         "Family_raw": group["Family_raw"].iloc[0] if pd.notna(group["Family_raw"].iloc[0]) else "",
#         "Family": group["Family"].iloc[0] if pd.notna(group["Family"].iloc[0]) else "",
#         "Sub_family": group["Sub_family"].iloc[0] if pd.notna(group["Sub_family"].iloc[0]) else "",
#         "PDB_Codes": []
#     }
    
#     # For each PDB code related to the current UniProt_ID, add to the PDB_Codes list
#     for _, row in group.iterrows():
#         pdb_info = {
#             "PDB_Code": row["PDB_Code"] if pd.notna(row["PDB_Code"]) else "",
#             "Chain": row["Chain"] if pd.notna(row["Chain"]) else "",
#             "UniProt_Name": row["UniProt_Name"] if pd.notna(row["UniProt_Name"]) else "",
#             "UniProt_Start": row["UniProt_Start"] if pd.notna(row["UniProt_Start"]) else None,
#             "UniProt_End": row["UniProt_End"] if pd.notna(row["UniProt_End"]) else None
#         }
#         protein_info["PDB_Codes"].append(pdb_info)
    
#     # Add the protein info for this UniProt_ID to the final dictionary
#     grouped_data[uni_id] = protein_info

# # Save the structured data as a JSON object
# with open('../datasets/mapped_data.json', 'w') as json_file:
#     json.dump(grouped_data, json_file, indent=4)

# # Optional: Print the grouped data for verification
# print(json.dumps(grouped_data, indent=4))

In [10]:
import requests
import pandas as pd
import concurrent.futures
import time

# Function to get UniProt mappings for a PDB code
def get_uniprot_mappings(pdb_code):
    url = f'https://www.ebi.ac.uk/pdbe/api/mappings/uniprot/{pdb_code}'
    response = requests.get(url)
    data = response.json()
    if pdb_code in data:
        return data[pdb_code]['UniProt']
    return None

# Convert df['Domain'] into a list
cath_domains = df['CATH_Domain'].tolist()

# Function to process each domain and fetch the mapping
def process_domain(domain):
    pdb_code = domain[:4]  # Extract PDB code
    chain = domain[4]      # Extract chain
    uniprot_mappings = get_uniprot_mappings(pdb_code)
    
    result = []
    if uniprot_mappings:
        # Loop through the UniProt entries and their mappings
        for uniprot_id, details in uniprot_mappings.items():
            for entry in details['mappings']:
                if entry['chain_id'] == chain:
                    result.append({
                        'CATH_Domain': domain,
                        'PDB_Code': pdb_code,
                        'Chain': chain,
                        'UniProt_ID': uniprot_id,
                        'UniProt_Name': details.get('identifier', ''),
                        'UniProt_Start': entry['unp_start'],
                        'UniProt_End': entry['unp_end']
                    })
    return result

# Timer start
start_time = time.time()

# Use ThreadPoolExecutor to run the tasks in parallel
mapping = []
with concurrent.futures.ThreadPoolExecutor() as executor:
    results = executor.map(process_domain, cath_domains)
    for result in results:
        mapping.extend(result)  # Append the result to the mapping list

# Timer end
end_time = time.time()

# Convert the results into a DataFrame for better readability or further analysis
mapping_df = pd.DataFrame(mapping)

# Display time taken and the DataFrame
print(f"Time taken: {end_time - start_time} seconds")
mapping_df


Time taken: 5319.642820596695 seconds


Unnamed: 0,CATH_Domain,PDB_Code,Chain,UniProt_ID,UniProt_Name,UniProt_Start,UniProt_End
0,1oaiA00,1oai,A,Q9UBU9,NXF1_HUMAN,561,619
1,1go5A00,1go5,A,Q9UBU9,NXF1_HUMAN,551,619
2,3frhA01,3frh,A,Q763K9,Q763K9_ECOLX,1,251
3,3friA01,3fri,A,Q763K9,Q763K9_ECOLX,1,251
4,3b89A01,3b89,A,Q763K9,Q763K9_ECOLX,2,251
...,...,...,...,...,...,...,...
462275,4aybQ00,4ayb,Q,B8YB65,RPO13_SACSH,1,104
462276,3hkzY00,3hkz,Y,Q980B8,RPO13_SACS2,1,104
462277,3hkzZ00,3hkz,Z,Q980B8,RPO13_SACS2,1,104
462278,3zbeA00,3zbe,A,Q8XAD5,Q8XAD5_ECO57,2,63


In [11]:
mapping_df.to_csv('../datasets/family_cath_mapping.csv', index=False)