In [1]:
import requests
import concurrent.futures
import multiprocessing
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Code to Obtain Mus Musculus Complexome Data from Complex Portal

In [None]:
# Get the number of CPUs available
num_cpus = multiprocessing.cpu_count()

# Define the API endpoint
# API below is to Search with a wildcard to retrieve all the information
url = 'https://www.ebi.ac.uk/intact/complex-ws/search/*'
# Make a GET request to the API
response = requests.get(url)

if response.status_code == 200:
  data = response.json()
  #filter for complexes annotated with Mus Musculus as the organism
  mouse_complexes = [element for element in data['elements'] if element['organismName']=='Mus musculus; 10090']
  #get the IDs of these complexes
  mouse_AC = [item['complexAC'] for item in mouse_complexes]
else:
  print(f"Failed to retrieve {response.status_code}")

In [None]:
def fetch_complex_data(mouse_complex):
    url_complex = f"https://www.ebi.ac.uk/intact/complex-ws/complex/{mouse_complex}"
    response = requests.get(url_complex)
    if response.status_code == 200:
        # Get desired info from the json stuctured data
        # participants is the key that contains a dictionary of the participating proteins in the complex
        # crossReferences key contains pubmed, GO, GCO and other such annotations about the complex
        # We are interested in the GO pathways the complex is associated with
        data = response.json()
        desired_keys = ['systematicName', 'ligands', 'complexAssemblies', 'participants']
        output = {key: data.get(key) for key in desired_keys}
        keys = ['qualifier', 'identifier', 'description'] #keys of interest in the crossReferences dictionary
        output['GO'] = [[dbi.get(key) for key in keys] for dbi in data['crossReferences'] if dbi['database'] == 'gene ontology']
        return output
    else:
        return f"Failed to retrieve {response.status_code}"

#initialize output dictionary
complex_data = {}

# Use ThreadPoolExecutor to fetch data concurrently
with concurrent.futures.ThreadPoolExecutor(max_workers=num_cpus) as executor:
    future_to_mouse_complex = {executor.submit(fetch_complex_data, mouse_complex): mouse_complex for mouse_complex in mouse_AC}
    for future in concurrent.futures.as_completed(future_to_mouse_complex):
        mouse_complex = future_to_mouse_complex[future]
        try:
            complex_data[mouse_complex] = future.result()
        except Exception as exc:
            print(f'{mouse_complex} generated an exception: {exc}')

In [None]:
# check the output
complex_data['CPX-679']

{'systematicName': 'Nr1h2:Rxra',
 'ligands': ['9-cis-retinoic acid (CHEBI:50648)', 'oxysterol (CHEBI:53030)'],
 'complexAssemblies': ['Heterodimer'],
 'participants': [{'interactorAC': 'EBI-12513495',
   'identifier': 'P28700',
   'identifierLink': 'https://www.uniprot.org/uniprotkb/P28700/entry',
   'name': 'Rxra',
   'description': 'Retinoic acid receptor RXR-alpha',
   'stochiometry': 'minValue: 1, maxValue: 1',
   'bioRole': 'unspecified role',
   'bioRoleMI': 'MI:0499',
   'bioRoleDefinition': 'Role not specified or not applicable to the data.',
   'interactorType': 'protein',
   'interactorTypeMI': 'MI:0326',
   'interactorTypeDefinition': 'A linear polymer of amino acids joined by peptide bonds in a specific sequence.',
   'linkedFeatures': [{'participantId': 'Q60644',
     'featureType': 'binding region',
     'featureTypeMI': 'MI:0117',
     'featureTypeDefinition': 'A region of a molecule or a component of a complex identified as being involved in an interaction. This may or 

In [None]:
len(complex_data)

746

In [None]:
#store the output in a json file
file_path = "mus_musculus_protein_complex_compiled_data.json"
with open(file_path, "w") as outfile:
  json.dump(complex_data, outfile)

# Mapping Complexome to Proteome and Downstream Analysis

In [None]:
'''
Load in the Mouse Protein Complex JSON file created above
'''
file_path = '/content/mus_musculus_protein_complex_compiled_data.json'
with open(file_path, 'r') as f:
  data = json.load(f)

'''
Get UniProt Ids of participants in each complex
We will then use this data to see how many protein complexes are represented
  in the proteomics dataset
'''
#Get Uniprot IDs for each participant in a complex
# We have filtered for only getting the protein interactors
complex_protein_interactors = {key: [participant['identifier'] for participant in data[key]['participants'] if participant['interactorType']=='protein'] for key in list(data.keys())}

In [2]:
'''
Load in Proteomics Data - Ctrl & Iso for all Mouse Strains
Map the proteins in the proteomics dataset to the mouse complexome built from ComplexPortal database (above)
'''

strains = ['dba','fvb','cej','c57', 'balbc','aj']
conditions = ['iso', 'ctrl']
imputations = ['before', 'after']
pathway = '/content/'

all_data = list()
for strain in strains :
  for cond in conditions:
    for imp in imputations:
      filename = pathway + cond + "_" + strain + "_" + imp + ".csv"
      dat = pd.read_csv(filename)
      dat["cond"] = cond
      dat["strain"] = strain
      dat["imp"] = imp
      all_data.append(dat)

# Combine all dataframes in the list `all_data`
combined_data = pd.concat(all_data)

# Filter data for ISO experimental group
iso_data = combined_data[combined_data["cond"] == "iso"]

# Filter data for CTRL group
ctrl_data = combined_data[combined_data["cond"] == "ctrl"]

In [6]:
prots_of_interest = "P10922" , "Q3UQ17"
print([prot for prot in prots_of_interest if prot in iso_data.UniProt.unique()])
print([prot for prot in prots_of_interest if prot in ctrl_data.UniProt.unique()])

['P10922']
['P10922']


In [None]:
# unique proteins in the ISO proteomics data
iso_proteome = iso_data.UniProt.unique()
print(len(iso_proteome))
# unique proteins in the CTRL proteomics data
ctrl_proteome = ctrl_data.UniProt.unique()
len(ctrl_proteome)

4181


4011

## Iso Analysis

In [None]:
'''
iso_complexome will contain only those complexes for which all proteins are present in iso_proteome
'''

iso_complexome = {}  # Initialize the iso_complexome dictionary

for complx, proteins in complex_protein_interactors.items():
    # Check if all proteins in the complex are represented in iso_proteome
    if all(protein in iso_proteome for protein in proteins):
        # If every single protein in the complex is present, save info to iso_complexome
        iso_complexome[complx] = {'interactors' : proteins,
                                  'assembly':''.join(data[complx]['complexAssemblies']).replace('.', '').lower(),
                                  'GO' : data[complx]['GO']}

# print(len(iso_complexome))
# with open('iso_complexome.json', 'w') as out:
#   json.dump(iso_complexome, out)

In [None]:
# What types of complex assemblies are present in the data
{iso['assembly'] for cpx, iso in iso_complexome.items()}

{'',
 'heterodimer',
 'heterododecamer',
 'heterohexamer',
 'heterononamer',
 'heterooctamer',
 'heterooctomer',
 'heteropentamer',
 'heterotetramer',
 'heterotrimer',
 'homodecamer',
 'homodimer',
 'homohexamer',
 'homooligomer',
 'homopentamer',
 'homotetramer',
 'homotrimer'}

In [None]:
'''
ISO Complexome Annotations
'''

complex_names = []
interactor_lists = []
assembly_list = []
go_bp_list = []
go_cc_list = []
go_mf_list = []

# Iterate over each complex
for complex_name, details in iso_complexome.items():
    # Append complex name
    complex_names.append(complex_name)
    # Join interactors into a string and append
    interactor_lists.append(', '.join(details['interactors']))
    # Append complex assembly type to list
    assembly_list.append(details['assembly'])
    # Flatten GO annotations and append
    ### Cellular Component
    go_cc = '; '.join([', '.join(go) for go in details['GO'] if go[0] == 'cellular component'])
    go_cc_list.append(go_cc)
    ### Molecular Function
    go_mf = '; '.join([', '.join(go) for go in details['GO'] if go[0] == 'molecular function'])
    go_mf_list.append(go_mf)
    ### Biological Process
    go_bp = '; '.join([', '.join(go) for go in details['GO'] if go[0] == 'biological process'])
    go_bp_list.append(go_bp)

df_iso = pd.DataFrame({
    'Complex': complex_names,
    'Assembly': assembly_list,
    'Interactors': interactor_lists,
    'GO Cellular Component': go_cc_list,
    'GO Molecular Function': go_mf_list,
    'GO Biological Process': go_bp_list
})


# Create a list of proteins by splitting the 'Interactors' column
df_iso['Interactors'] = df_iso['Interactors'].str.split(', ')

# Filter the DataFrame to include only complexes where all interactors have hetero assembly for proteins
df_hetero_complexes = df_iso[
    (df_iso['Assembly'].str.contains('hetero')) &
    (df_iso['Interactors'].apply(lambda x: len(x) > 1))
]


df_hetero_complexes

Unnamed: 0,Complex,Assembly,Interactors,GO Cellular Component,GO Molecular Function,GO Biological Process
0,CPX-689,heterodimer,"[P29416, P20060]","cellular component, GO:1905379, beta-N-acetylh...","molecular function, GO:0004563, beta-N-acetylh...","biological process, GO:0030203, glycosaminogly..."
4,CPX-3287,heterotetramer,"[Q61081, P11499]","cellular component, GO:1990565, HSP90-CDC37 ch...","molecular function, GO:0019887, protein kinase...","biological process, GO:0032435, negative regul..."
5,CPX-3289,heterotetramer,"[Q61081, P07901]","cellular component, GO:1990565, HSP90-CDC37 ch...","molecular function, GO:0019887, protein kinase...","biological process, GO:0022417, protein matura..."
7,CPX-16,heterotrimer,"[P19123, P48787, P50752]","cellular component, GO:0030017, sarcomere","molecular function, GO:0005509, calcium ion bi...","biological process, GO:0060048, cardiac muscle..."
10,CPX-4301,heterodimer,"[O08529, O88456]","cellular component, GO:0110158, calpain complex","molecular function, GO:0005509, calcium ion bi...","biological process, GO:0006508, proteolysis"
...,...,...,...,...,...,...
120,CPX-3011,heterotrimer,"[P02468, Q61292, Q60675]","cellular component, GO:0005609, laminin-4 complex","molecular function, GO:0005201, extracellular ...","biological process, GO:0030334, regulation of ..."
121,CPX-3015,heterotrimer,"[P02469, P02468, P97927]","cellular component, GO:0043257, laminin-8 complex","molecular function, GO:0005201, extracellular ...","biological process, GO:0030334, regulation of ..."
122,CPX-3031,heterotrimer,"[P02468, Q61292, P97927]","cellular component, GO:0043258, laminin-9 complex","molecular function, GO:0005201, extracellular ...","biological process, GO:0030334, regulation of ..."
123,CPX-3016,heterotrimer,"[P02469, P02468, Q61001]","cellular component, GO:0043259, laminin-10 com...","molecular function, GO:0005201, extracellular ...","biological process, GO:0030334, regulation of ..."


In [None]:
'''
We loop over each protein interactor
For each such protein, we select the subset of complexes where this protein is an interactor.
We then loop through these subsets and find the complex that hasn't been added yet with protein subsets that haven't been added yet
'''

# Identify interactor counts
interactor_counts = df_hetero_complexes.Interactors.explode().value_counts()
interactor_counts

# Initialize an empty DataFrame to store the unique complexes
unique_complexes_df = pd.DataFrame(columns=df_hetero_complexes.columns)

# Initialize sets to keep track of already added complexes and interactors
added_complexes = set()
added_interactors = set()

# Iterate over each interactor in the interactors_repeat Series
for interactor, count in interactor_counts.items():

    # Get the subset of df_hetero_complexes where the interactor is present
    interactor_subset = df_hetero_complexes[df_hetero_complexes['Interactors'].apply(lambda x: interactor in x)]

    # Iterate over the rows in this subset to find the first complex not already added
    for idx, row in interactor_subset.iterrows():

        if row['Complex'] not in added_complexes and not any(item in added_interactors for item in row['Interactors']):
            # Add the complex to the unique_complexes_df
            unique_complexes_df = unique_complexes_df.append(row)
            # Add the complex to the set of added complexes
            added_complexes.add(row['Complex'])
            # Add the interactors to the set of added interactors
            added_interactors.update(row['Interactors'])

all(unique_complexes_df.Interactors.explode().value_counts() == 1)

  unique_complexes_df = unique_complexes_df.append(row)
  unique_complexes_df = unique_complexes_df.append(row)
  unique_complexes_df = unique_complexes_df.append(row)
  unique_complexes_df = unique_complexes_df.append(row)
  unique_complexes_df = unique_complexes_df.append(row)
  unique_complexes_df = unique_complexes_df.append(row)
  unique_complexes_df = unique_complexes_df.append(row)
  unique_complexes_df = unique_complexes_df.append(row)
  unique_complexes_df = unique_complexes_df.append(row)
  unique_complexes_df = unique_complexes_df.append(row)
  unique_complexes_df = unique_complexes_df.append(row)
  unique_complexes_df = unique_complexes_df.append(row)
  unique_complexes_df = unique_complexes_df.append(row)
  unique_complexes_df = unique_complexes_df.append(row)
  unique_complexes_df = unique_complexes_df.append(row)
  unique_complexes_df = unique_complexes_df.append(row)
  unique_complexes_df = unique_complexes_df.append(row)
  unique_complexes_df = unique_complexes_df.appe

True

In [None]:
unique_complexes_df

Unnamed: 0,Complex,Assembly,Interactors,GO Cellular Component,GO Molecular Function,GO Biological Process
119,CPX-3009,heterotrimer,"[P02469, P02468, Q60675]","cellular component, GO:0005607, laminin-2 complex","molecular function, GO:0005201, extracellular ...","biological process, GO:0030334, regulation of ..."
53,CPX-3027,heterodimer,"[P09055, P11688]","cellular component, GO:0034674, alpha5-beta1 i...","molecular function, GO:0004888, transmembrane ...","biological process, GO:0007229, integrin-media..."
82,CPX-1054,heterodimer,"[P52293, P70168]","cellular component, GO:0005829, cytosol; cellu...","molecular function, GO:0061608, nuclear import...","biological process, GO:0006606, protein import..."
31,CPX-5862,heterotrimer,"[P10630, P63073, Q6NZJ6]","cellular component, GO:0016281, eukaryotic tra...","molecular function, GO:0003743, translation in...","biological process, GO:0006413, translational ..."
23,CPX-5849,heterotrimer,"[Q91WG5, Q9R078, Q5EG47]","cellular component, GO:0031588, nucleotide-act...","molecular function, GO:0004679, AMP-activated ...","biological process, GO:0031669, cellular respo..."
24,CPX-5851,heterotrimer,"[Q8BRK8, O54950, Q6PAM0]","cellular component, GO:0031588, nucleotide-act...","molecular function, GO:0004679, AMP-activated ...","biological process, GO:0031669, cellular respo..."
48,CPX-2055,heterotetramer,"[P12382, P47857]","cellular component, GO:0005945, 6-phosphofruct...","molecular function, GO:0003872, 6-phosphofruct...","biological process, GO:0006096, glycolysis"
54,CPX-3035,heterodimer,"[O54890, P43406]","cellular component, GO:0034683, alphav-beta3 i...","molecular function, GO:0004888, transmembrane ...","biological process, GO:0007229, integrin-media..."
19,CPX-5152,heterotetramer,"[P62743, P84091, P17426, Q9DBG3]","cellular component, GO:0009898, cytoplasmic si...","molecular function, GO:0030276, clathrin bindi...","biological process, GO:0072583, clathrin-depen..."
41,CPX-2185,heterodimer,"[Q61239, Q8K2I1]","cellular component, GO:0005965, protein farnes...","molecular function, GO:0004660, protein farnes...","biological process, GO:0018343, protein farnes..."


In [None]:
### Proteomics Data for ISO Group ###
# Compute the log2 of median.K
iso_data = iso_data.assign(
    log2_median_K = np.log2(iso_data['median.K'])
).rename(columns={'UniProt': 'Interactors'})

# Group the data by 'UniProt' and 'imp' and calculate the mean and SEM of the turunover data for all conditions and strains before and after imputation
average_data = (
    iso_data
    .groupby(['Interactors', 'imp'])
    .agg(
        AvgTurnover=('log2_median_K', 'mean'),  # Calculate average turnover rate
        SEM=('log2_median_K', lambda x: np.std(x, ddof=1) / np.sqrt(x.count()) if x.count() > 1 else 0),  # Calculate SEM
    )
    .reset_index()
    .fillna(0)
)

In [None]:
# # List of desired GO annotations
# # I am interested in all of the complexes since the hetero complexes are not a lot, I will manually review the annotations for complexes where the protein interactor shows up in multiple complexes
# cellular_components_of_interest = df_hetero_complexes['GO Cellular Component']

# df_iso_interest = df_hetero_complexes[
#     df_hetero_complexes['GO Cellular Component'].apply(
#         lambda x: any(obj_type in x for obj_type in cellular_components_of_interest)
#     )
# ]

# Explode the 'Interactors' column to create a new row for each interactor
expanded_df = unique_complexes_df.explode('Interactors')

iso_complexome_turnovers = pd.merge(
    expanded_df,
    average_data,
    left_on='Interactors',
    right_on='Interactors',
    how='left'
)
iso_complexome_turnovers


Unnamed: 0,Complex,Assembly,Interactors,GO Cellular Component,GO Molecular Function,GO Biological Process,imp,AvgTurnover,SEM
0,CPX-3009,heterotrimer,P02469,"cellular component, GO:0005607, laminin-2 complex","molecular function, GO:0005201, extracellular ...","biological process, GO:0030334, regulation of ...",after,-4.204458,0.097913
1,CPX-3009,heterotrimer,P02469,"cellular component, GO:0005607, laminin-2 complex","molecular function, GO:0005201, extracellular ...","biological process, GO:0030334, regulation of ...",before,-4.227526,0.084927
2,CPX-3009,heterotrimer,P02468,"cellular component, GO:0005607, laminin-2 complex","molecular function, GO:0005201, extracellular ...","biological process, GO:0030334, regulation of ...",after,-4.198551,0.072064
3,CPX-3009,heterotrimer,P02468,"cellular component, GO:0005607, laminin-2 complex","molecular function, GO:0005201, extracellular ...","biological process, GO:0030334, regulation of ...",before,-4.299389,0.076716
4,CPX-3009,heterotrimer,Q60675,"cellular component, GO:0005607, laminin-2 complex","molecular function, GO:0005201, extracellular ...","biological process, GO:0030334, regulation of ...",after,-4.796800,0.106358
...,...,...,...,...,...,...,...,...,...
183,CPX-2956,heterotrimer,Q01149,"cellular component, GO:0005584, collagen type ...","molecular function, GO:0005201, extracellular ...","biological process, GO:0030199, collagen fibri...",before,-4.677923,0.100245
184,CPX-3129,heterodimer,P05555,"cellular component, GO:0034688, alphaM-beta2 i...","molecular function, GO:0004888, transmembrane ...","biological process, GO:0007229, integrin-media...",after,-0.407227,0.000000
185,CPX-3129,heterodimer,P05555,"cellular component, GO:0034688, alphaM-beta2 i...","molecular function, GO:0004888, transmembrane ...","biological process, GO:0007229, integrin-media...",before,-0.103365,0.000000
186,CPX-3129,heterodimer,P11835,"cellular component, GO:0034688, alphaM-beta2 i...","molecular function, GO:0004888, transmembrane ...","biological process, GO:0007229, integrin-media...",after,-0.278256,1.079828


In [None]:
# Sort the DataFrame to arrange the interactors grouped by complex
iso_complexome_turnovers['Interactor_ID'] = iso_complexome_turnovers.groupby(['Complex']).cumcount() + 1
iso_complexome_turnovers.sort_values(by=['Complex', 'Interactor_ID'], inplace=True)
iso_complexome_turnovers.to_csv('iso_complexome_turnovers.csv')

In [None]:
'''
Store combined complexome data and turnover values for ALL of the heterocomplexes represented in the proteome data
'''
# expanded_df = df_hetero_complexes.explode('Interactors')

# iso_complexome_turnovers = pd.merge(
#     expanded_df,
#     average_data,
#     left_on='Interactors',
#     right_on='Interactors',
#     how='left'
# )
# iso_complexome_turnovers

# # Sort the DataFrame to arrange the interactors grouped by complex
# iso_complexome_turnovers['Interactor_ID'] = iso_complexome_turnovers.groupby(['Complex']).cumcount() + 1
# iso_complexome_turnovers.sort_values(by=['Complex', 'Interactor_ID'], inplace=True)
# iso_complexome_turnovers.to_csv('iso_turnovers_all_heterocomplexes.csv')

'\nStore combined complexome data and turnover values for ALL of the heterocomplexes represented in the proteome data \n'

## CTRL Analysis

In [None]:
ctrl_data

Unnamed: 0,UniProt,median.K,cond,strain,imp
0,A2A8U2,0.101636,ctrl,dba,before
1,A2AAJ9,0.085461,ctrl,dba,before
2,A2ADF7,0.326856,ctrl,dba,before
3,A2AGT5,0.593946,ctrl,dba,before
4,A2AIL4,0.076795,ctrl,dba,before
...,...,...,...,...,...
2604,Q9Z2W0,0.095750,ctrl,aj,after
2605,Q9Z2X1,0.314000,ctrl,aj,after
2606,Q9Z2Y3,0.325714,ctrl,aj,after
2607,Q9Z2Y8,0.118833,ctrl,aj,after


In [None]:
### Proteomics Data for CTRL Group ###
# Compute the log2 of median.K
ctrl_data = ctrl_data.assign(
    log2_median_K = np.log2(ctrl_data['median.K'])
).rename(columns={'UniProt': 'Interactors'})

# Group the data by 'UniProt' and 'imp' and calculate the mean and SEM of the turunover data for all conditions and strains before and after imputation
average_data_ctrl = (
    ctrl_data
    .groupby(['Interactors', 'imp'])
    .agg(
        AvgTurnover=('log2_median_K', 'mean'),  # Calculate average turnover rate
        SEM=('log2_median_K', lambda x: np.std(x, ddof=1) / np.sqrt(x.count()) if x.count() > 1 else 0),  # Calculate SEM
    )
    .reset_index()
    .fillna(0)
)

In [None]:
average_data_ctrl

Unnamed: 0,Interactors,imp,AvgTurnover,SEM
0,A2A5R2,after,-1.859362,0.938346
1,A2A8U2,after,-2.309588,0.319581
2,A2A8U2,before,-2.389285,0.598205
3,A2A8Z1,after,-1.070499,0.526779
4,A2AAE1,after,-3.502914,0.076289
...,...,...,...,...
6939,Q9Z2Y8,after,-2.363193,0.644634
6940,Q9Z2Y8,before,-2.417988,0.652535
6941,Q9Z2Z6,after,-4.712402,0.054872
6942,Q9Z2Z6,before,-4.793228,0.070296


In [None]:
'''
ctrl_complexome will contain only those complexes for which all proteins are present in ctrl_proteome
'''

ctrl_complexome = {}  # Initialize the ctrl_complexome dictionary

for complx, proteins in complex_protein_interactors.items():
    # Check if all proteins in the complex are represented in ctrl_proteome
    if all(protein in ctrl_proteome for protein in proteins):
        # If every single protein in the complex is present, save info to ctrl_complexome
        ctrl_complexome[complx] = {'interactors' : proteins,
                                  'assembly':''.join(data[complx]['complexAssemblies']).replace('.', '').lower(),
                                  'GO' : data[complx]['GO']}


# len(ctrl_complexome)
# with open('ctrl_complexome.json', 'w') as out:
#   json.dump(ctrl_complexome, out)

In [None]:
# What types of complex assemblies are present in the data
{ctrl['assembly'] for cpx, ctrl in ctrl_complexome.items()}

{'',
 'heterodimer',
 'heterododecamer',
 'heterohexamer',
 'heterononamer',
 'heterooctamer',
 'heterooctomer',
 'heteropentamer',
 'heterotetramer',
 'heterotrimer',
 'homodimer',
 'homooligomer',
 'homopentamer',
 'homotetramer',
 'homotrimer'}

In [None]:
'''
CTRL Complexome Annotations
'''

complex_names = []
interactor_lists = []
assembly_list = []
go_bp_list = []
go_cc_list = []
go_mf_list = []

# Iterate over each complex
for complex_name, details in ctrl_complexome.items():
    # Append complex name
    complex_names.append(complex_name)
    # Join interactors into a string and append
    interactor_lists.append(', '.join(details['interactors']))
    # Append complex assembly type to list
    assembly_list.append(details['assembly'])
    # Flatten GO annotations and append
    ### Cellular Component
    go_cc = '; '.join([', '.join(go) for go in details['GO'] if go[0] == 'cellular component'])
    go_cc_list.append(go_cc)
    ### Molecular Function
    go_mf = '; '.join([', '.join(go) for go in details['GO'] if go[0] == 'molecular function'])
    go_mf_list.append(go_mf)
    ### Biological Process
    go_bp = '; '.join([', '.join(go) for go in details['GO'] if go[0] == 'biological process'])
    go_bp_list.append(go_bp)

df_ctrl = pd.DataFrame({
    'Complex': complex_names,
    'Assembly': assembly_list,
    'Interactors': interactor_lists,
    'GO Cellular Component': go_cc_list,
    'GO Molecular Function': go_mf_list,
    'GO Biological Process': go_bp_list
})


# Create a list of proteins by splitting the 'Interactors' column
df_ctrl['Interactors'] = df_ctrl['Interactors'].str.split(', ')

# Filter the DataFrame to include only complexes where all interactors have hetero assembly for proteins
df_hetero_complexes_ctrl = df_ctrl[
    (df_ctrl['Assembly'].str.contains('hetero')) &
    (df_ctrl['Interactors'].apply(lambda x: len(x) > 1))
]


df_hetero_complexes_ctrl

Unnamed: 0,Complex,Assembly,Interactors,GO Cellular Component,GO Molecular Function,GO Biological Process
0,CPX-689,heterodimer,"[P29416, P20060]","cellular component, GO:1905379, beta-N-acetylh...","molecular function, GO:0004563, beta-N-acetylh...","biological process, GO:0030203, glycosaminogly..."
4,CPX-3287,heterotetramer,"[Q61081, P11499]","cellular component, GO:1990565, HSP90-CDC37 ch...","molecular function, GO:0019887, protein kinase...","biological process, GO:0032435, negative regul..."
5,CPX-3289,heterotetramer,"[Q61081, P07901]","cellular component, GO:1990565, HSP90-CDC37 ch...","molecular function, GO:0019887, protein kinase...","biological process, GO:0022417, protein matura..."
6,CPX-16,heterotrimer,"[P19123, P48787, P50752]","cellular component, GO:0030017, sarcomere","molecular function, GO:0005509, calcium ion bi...","biological process, GO:0060048, cardiac muscle..."
9,CPX-4301,heterodimer,"[O08529, O88456]","cellular component, GO:0110158, calpain complex","molecular function, GO:0005509, calcium ion bi...","biological process, GO:0006508, proteolysis"
...,...,...,...,...,...,...
109,CPX-3011,heterotrimer,"[P02468, Q61292, Q60675]","cellular component, GO:0005609, laminin-4 complex","molecular function, GO:0005201, extracellular ...","biological process, GO:0030334, regulation of ..."
110,CPX-3015,heterotrimer,"[P02469, P02468, P97927]","cellular component, GO:0043257, laminin-8 complex","molecular function, GO:0005201, extracellular ...","biological process, GO:0030334, regulation of ..."
111,CPX-3031,heterotrimer,"[P02468, Q61292, P97927]","cellular component, GO:0043258, laminin-9 complex","molecular function, GO:0005201, extracellular ...","biological process, GO:0030334, regulation of ..."
112,CPX-3016,heterotrimer,"[P02469, P02468, Q61001]","cellular component, GO:0043259, laminin-10 com...","molecular function, GO:0005201, extracellular ...","biological process, GO:0030334, regulation of ..."


In [None]:
# Explode the 'Interactors' column to create a new row for each interactor
expanded_df = df_hetero_complexes_ctrl.explode('Interactors')

ctrl_complexome_turnovers = pd.merge(
    expanded_df,
    average_data_ctrl,
    left_on='Interactors',
    right_on='Interactors',
    how='left'
)

# Sort the DataFrame to arrange the interactors grouped by complex
ctrl_complexome_turnovers['Interactor_ID'] = ctrl_complexome_turnovers.groupby(['Complex']).cumcount() + 1
ctrl_complexome_turnovers.sort_values(by=['Complex', 'Interactor_ID'], inplace=True)
ctrl_complexome_turnovers.to_csv('ctrl_turnovers_all_heterocomplexes.csv')

In [None]:
ctrl_complexome_turnovers

Unnamed: 0,Complex,Assembly,Interactors,GO Cellular Component,GO Molecular Function,GO Biological Process,imp,AvgTurnover,SEM,Interactor_ID
183,CPX-1054,heterodimer,P52293,"cellular component, GO:0005829, cytosol; cellu...","molecular function, GO:0061608, nuclear import...","biological process, GO:0006606, protein import...",after,0.240364,0.210608,1
184,CPX-1054,heterodimer,P52293,"cellular component, GO:0005829, cytosol; cellu...","molecular function, GO:0061608, nuclear import...","biological process, GO:0006606, protein import...",before,-0.171632,0.610362,2
185,CPX-1054,heterodimer,P70168,"cellular component, GO:0005829, cytosol; cellu...","molecular function, GO:0061608, nuclear import...","biological process, GO:0006606, protein import...",after,-2.739862,0.084905,3
186,CPX-1054,heterodimer,P70168,"cellular component, GO:0005829, cytosol; cellu...","molecular function, GO:0061608, nuclear import...","biological process, GO:0006606, protein import...",before,-2.734632,0.078640,4
187,CPX-1056,heterodimer,P70168,"cellular component, GO:0005829, cytosol; cellu...","molecular function, GO:0061608, nuclear import...","biological process, GO:0006606, protein import...",after,-2.739862,0.084905,1
...,...,...,...,...,...,...,...,...,...,...
225,CPX-865,heterodimer,Q99LC5,"cellular component, GO:0005759, mitochondrial ...","molecular function, GO:0009055, electron carri...","biological process, GO:0009063, cellular amino...",before,-4.718469,0.038950,4
226,CPX-898,heterotetramer,P08207,"cellular component, GO:0098797, plasma membran...","molecular function, GO:0005544, calcium-depend...","biological process, GO:1905686, positive regul...",after,-3.397634,0.069720,1
227,CPX-898,heterotetramer,P08207,"cellular component, GO:0098797, plasma membran...","molecular function, GO:0005544, calcium-depend...","biological process, GO:1905686, positive regul...",before,-3.549620,0.000000,2
228,CPX-898,heterotetramer,P07356,"cellular component, GO:0098797, plasma membran...","molecular function, GO:0005544, calcium-depend...","biological process, GO:1905686, positive regul...",after,-3.835912,0.085787,3


In [None]:
# filter df_hetero_complexes_ctrl for complexes in unique_complexes_df
df_hetero_complexes_ctrl_filtered = df_hetero_complexes_ctrl[df_hetero_complexes_ctrl['Complex'].isin(unique_complexes_df['Complex'])]
df_hetero_complexes_ctrl_filtered

Unnamed: 0,Complex,Assembly,Interactors,GO Cellular Component,GO Molecular Function,GO Biological Process
0,CPX-689,heterodimer,"[P29416, P20060]","cellular component, GO:1905379, beta-N-acetylh...","molecular function, GO:0004563, beta-N-acetylh...","biological process, GO:0030203, glycosaminogly..."
4,CPX-3287,heterotetramer,"[Q61081, P11499]","cellular component, GO:1990565, HSP90-CDC37 ch...","molecular function, GO:0019887, protein kinase...","biological process, GO:0032435, negative regul..."
6,CPX-16,heterotrimer,"[P19123, P48787, P50752]","cellular component, GO:0030017, sarcomere","molecular function, GO:0005509, calcium ion bi...","biological process, GO:0060048, cardiac muscle..."
9,CPX-4301,heterodimer,"[O08529, O88456]","cellular component, GO:0110158, calpain complex","molecular function, GO:0005509, calcium ion bi...","biological process, GO:0006508, proteolysis"
11,CPX-6084,heterooctamer,"[Q3URS9, Q9CXJ4]","cellular component, GO:0062157, mitochondrial ...","molecular function, GO:0019829, ATPase-coupled...","biological process, GO:0006884, cell volume ho..."
14,CPX-4622,heterodimer,"[P47758, Q9DBG7]","cellular component, GO:0016020, membrane; cell...","molecular function, GO:0003924, GTPase activit...","biological process, GO:0006617, SRP-dependent ..."
16,CPX-5152,heterotetramer,"[P62743, P84091, P17426, Q9DBG3]","cellular component, GO:0009898, cytoplasmic si...","molecular function, GO:0030276, clathrin bindi...","biological process, GO:0072583, clathrin-depen..."
20,CPX-5851,heterotrimer,"[Q8BRK8, O54950, Q6PAM0]","cellular component, GO:0031588, nucleotide-act...","molecular function, GO:0004679, AMP-activated ...","biological process, GO:0031669, cellular respo..."
23,CPX-5861,heterodimer,"[Q64213, P26369]","cellular component, GO:0089701, U2AF; cellular...","molecular function, GO:0030628, pre-mRNA 3'-sp...","biological process, GO:0000398, mRNA splicing,..."
24,CPX-5862,heterotrimer,"[P10630, P63073, Q6NZJ6]","cellular component, GO:0016281, eukaryotic tra...","molecular function, GO:0003743, translation in...","biological process, GO:0006413, translational ..."


In [None]:
# Do df_hetero_complexes_ctrl_filtered.Complex and unique_complexes_df.Complex have all the same complexes?

unique_complexes_df[unique_complexes_df.Complex.isin(df_hetero_complexes_ctrl_filtered.Complex) == False]


Unnamed: 0,Complex,Assembly,Interactors,GO Cellular Component,GO Molecular Function,GO Biological Process
23,CPX-5849,heterotrimer,"[Q91WG5, Q9R078, Q5EG47]","cellular component, GO:0031588, nucleotide-act...","molecular function, GO:0004679, AMP-activated ...","biological process, GO:0031669, cellular respo..."
17,CPX-5141,heterotetramer,"[P35585, P22892, O35643, P61967]","cellular component, GO:0005765, lysosomal memb...","molecular function, GO:0030276, clathrin bindi...","biological process, GO:0016192, vesicle-mediat..."
91,CPX-2962,heterotrimer,"[O88207, Q3U962]","cellular component, GO:0005588, collagen type ...","molecular function, GO:0005201, extracellular ...","biological process, GO:0030199, collagen fibri..."
15,CPX-4629,heterooctamer,"[Q9QZE7, Q62348]","cellular component, GO:1902555, endoribonuclea...","molecular function, GO:0004521, endoribonuclea...","biological process, GO:0030422, production of ..."
73,CPX-3251,heterotrimer,"[Q62432, P97471]","cellular component, GO:0071144, heteromeric SM...","molecular function, GO:0003700, sequence-speci...","biological process, GO:0006351, transcription,..."
61,CPX-3129,heterodimer,"[P05555, P11835]","cellular component, GO:0034688, alphaM-beta2 i...","molecular function, GO:0004888, transmembrane ...","biological process, GO:0007229, integrin-media..."


In [None]:
len(df_hetero_complexes_ctrl_filtered)

35

In [None]:
# Explode the 'Interactors' column to create a new row for each interactor
expanded_df = df_hetero_complexes_ctrl_filtered.explode('Interactors')

ctrl_complexome_turnovers_filtered = pd.merge(
    expanded_df,
    average_data_ctrl,
    left_on='Interactors',
    right_on='Interactors',
    how='left'
)

# Sort the DataFrame to arrange the interactors grouped by complex
ctrl_complexome_turnovers_filtered['Interactor_ID'] = ctrl_complexome_turnovers_filtered.groupby(['Complex']).cumcount() + 1
ctrl_complexome_turnovers_filtered.sort_values(by=['Complex', 'Interactor_ID'], inplace=True)
ctrl_complexome_turnovers_filtered.to_csv('ctrl_turnovers_for_heterocomplexes_shared_with_iso.csv')