Script that collects every .zip archive from my Drive folder and computes all features into a single .csv for later processing.

Notes :
- replaced the NaN-aware functions with their regular NumPy counterparts, since `nanstd` behaves oddly with the dtype (only `np.nanmax` is still used, for the pLDDT maxima)
- the AF input data contains no NaN or inf values

# Set Up

In [None]:
#@title Mount google drive
# Mount Google Drive under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

# Authenticate the Colab user and build a PyDrive client from the
# application-default credentials.
from pydrive.drive import GoogleDrive
from pydrive.auth import GoogleAuth
from google.colab import auth
from oauth2client.client import GoogleCredentials
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
# NOTE: from here on `drive` no longer names the google.colab module but the
# GoogleDrive client instance.
drive = GoogleDrive(gauth)
print("You are logged into Google Drive and are good to go!")

Mounted at /content/drive
You are logged into Google Drive and are good to go!


In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

import os
#from glob import glob
from zipfile import ZipFile
import json

import time

In [None]:
#@title Paths and global vars
# Drive folder that the main run lists to find the .zip archives.
path = "/content/drive/MyDrive/Biotech_Work/Dev_files/ColabFold_runs/IO/output"

# Sequence tables, read relative to the current working directory. The main
# run selects a row by its 'id' column and reads its 'sequence' column.
positive_seqs = pd.read_csv('shaped_positives_reduced.csv')
negative_seqs = pd.read_csv('shaped_negatives_reduced.csv')

# Functions

In [None]:
#@title Functions

#
# both glob.glob and pathlib.glob don't like working with a zip or a list of strings
# > everyone just ends up re-writing stuff, and we don't have a complicated regex anyway
def sort_files(file_list, prefix = ''):
  """Split file names into pdb / scores / custom distogram / representation lists.

  Matching is a plain substring test ('.pdb', 'scores', 'custom_disto',
  'repr'), so a single name can end up in several lists and a name matching
  nothing is dropped. `prefix` is prepended to every returned name and the
  input order is kept inside each list.

  Returns the tuple (pdb_list, score_list, disto_list, repr_list).
  """
  names = list(file_list)

  def matching(marker):
    # names containing `marker`, in their original order
    return [prefix + name for name in names if marker in name]

  return matching('.pdb'), matching('scores'), matching('custom_disto'), matching('repr')


#
# don't want to bother with the bytes or the path-like objects
# > so extract, process, delete
def extract_and_process_scores(file_list, zip_archive, nb_pep):
  """Build the scores feature dict of one complex from its scores .json files.

  Every file is extracted from `zip_archive` into the current working
  directory, parsed, and deleted again (also when parsing fails).

  Args:
    file_list: names of the scores members inside `zip_archive`.
    zip_archive: open archive exposing `extract(name)` (a `ZipFile` in this
      notebook).
    nb_pep: number of leading 'plddt' entries summarised separately as the
      `*_pep_plddt` features.

  Returns:
    dict holding, for every stored feature, the value of the first processed
    file ('rank_1_*', i.e. rank_001 when such a file exists) and the
    median / mean / std over all files ('*_ranks_*') as np.float16, plus
    'nb_ranks', the number of files processed.

  Raises:
    ValueError: if `file_list` is empty.
  """

  storage = {
      'max_pae': [],
      'ptm': [],
      'iptm': [],
      'multimer': [],

      'max_plddt': [],
      'median_plddt': [],
      'mean_plddt': [],
      'std_plddt': [],

      'max_pep_plddt': [],
      'median_pep_plddt': [],
      'mean_pep_plddt': [],
      'std_pep_plddt': [],
  }

  # The 'rank_1_*' features read index 0 of each list, so the rank_001 file
  # has to be processed first. The archive listing order is not guaranteed;
  # a stable sort moves rank_001 to the front and keeps the rest untouched.
  ordered_files = sorted(file_list, key = lambda name: 'rank_001' not in name)
  if not ordered_files:
    raise ValueError('extract_and_process_scores: no scores file to process')

  for file_name in ordered_files:
    new_local_file = zip_archive.extract(file_name)

    try:
      # only reachable now when the archive holds no rank_001 file at all
      if not storage['max_pae'] and 'rank_001' not in new_local_file:
        print('error : rank_001 not first in list', new_local_file)

      with open(new_local_file) as json_file:
        file_dict = json.load(json_file)

      storage['max_pae'].append(file_dict['max_pae'])
      storage['ptm'].append(file_dict['ptm'])
      storage['iptm'].append(file_dict['iptm'])
      storage['multimer'].append(file_dict['iptm']*0.8 + file_dict['ptm']*0.2)

      temp_plddt = file_dict['plddt']
      storage['max_plddt'].append(np.nanmax(temp_plddt))
      storage['median_plddt'].append(np.median(temp_plddt))
      storage['mean_plddt'].append(np.mean(temp_plddt, dtype = np.float64))
      storage['std_plddt'].append(np.std(temp_plddt, dtype = np.float64))

      pep_plddt = temp_plddt[:nb_pep]
      storage['max_pep_plddt'].append(np.nanmax(pep_plddt))
      storage['median_pep_plddt'].append(np.median(pep_plddt))
      storage['mean_pep_plddt'].append(np.mean(pep_plddt, dtype = np.float64))
      storage['std_pep_plddt'].append(np.std(pep_plddt, dtype = np.float64))
    finally:
      # never leave the extracted copy behind, even if the file was malformed
      os.remove(new_local_file)
  #

  feature_dict = {}
  for feature, values in storage.items():
    feature_dict['rank_1_'+feature] = np.float16(values[0])
    feature_dict['median_ranks_'+feature] = np.median(values).astype(np.float16)
    feature_dict['mean_ranks_'+feature] = np.mean(values, dtype = np.float64).astype(np.float16)
    feature_dict['std_ranks_'+feature] = np.std(values, dtype = np.float64).astype(np.float16)
  feature_dict['nb_ranks'] = len(ordered_files)

  return feature_dict


#
# don't want to bother with the bytes or the path-like objects
# > so extract, process, delete
def extract_and_process_disto(file_list, zip_archive, nb_peptide):
  """Build the distogram feature dict of one complex from its custom_disto .npy files.

  Every file is extracted from `zip_archive` into the current working
  directory, loaded, and deleted again (also when loading fails). Element 0
  of each loaded array must be a mapping with the keys 'contact_map',
  'slices_17_to_21' and 'distance_matrix'. All statistics are taken on the
  block `[nb_peptide + 1:, 0:nb_peptide]` of the matrices (presumably the
  receptor-rows x peptide-columns part -- confirm against the file producer).

  Args:
    file_list: names of the custom_disto members inside `zip_archive`.
    zip_archive: open archive exposing `extract(name)` (a `ZipFile` in this
      notebook).
    nb_peptide: size of the leading block that is cropped as columns; rows
      start one past it.

  Returns:
    dict holding, for every stored feature, the value of the first processed
    file ('rank_1_*', i.e. rank_001 when such a file exists) and the
    median / mean / std over all files ('*_ranks_*') as np.float16.

  Raises:
    ValueError: if `file_list` is empty.
  """

  storage = {
      'min_dist': [],
      'median_dist': [],
      'mean_dist': [],
      'std_dist': [],

      'max_16_prob': [],
      'median_16_prob': [],
      'mean_16_prob': [],
      'std_16_prob': [],

      'max_17_prob': [],
      'median_17_prob': [],
      'mean_17_prob': [],
      'std_17_prob': [],

      'max_18_prob': [],
      'median_18_prob': [],
      'mean_18_prob': [],
      'std_18_prob': [],

      'max_19_prob': [],
      'median_19_prob': [],
      'mean_19_prob': [],
      'std_19_prob': [],

      'max_20_prob': [],
      'median_20_prob': [],
      'mean_20_prob': [],
      'std_20_prob': [],

      'max_21_prob': [],
      'median_21_prob': [],
      'mean_21_prob': [],
      'std_21_prob': [],
  }

  def crop(matrix):
    # +1 to go over the ':' row, +0 to stop before the ':' col
    return matrix[nb_peptide +1:, 0:nb_peptide +0]

  def store_prob_stats(slice_nb, cropped):
    # max / median / mean / std of one cropped probability map
    storage[f'max_{slice_nb}_prob'].append(np.max(cropped))
    storage[f'median_{slice_nb}_prob'].append(np.median(cropped))
    storage[f'mean_{slice_nb}_prob'].append(np.mean(cropped, dtype = np.float64))
    storage[f'std_{slice_nb}_prob'].append(np.std(cropped, dtype = np.float64))

  # The 'rank_1_*' features read index 0 of each list, so the rank_001 file
  # has to be processed first. The archive listing order is not guaranteed;
  # a stable sort moves rank_001 to the front and keeps the rest untouched.
  ordered_files = sorted(file_list, key = lambda name: 'rank_001' not in name)
  if not ordered_files:
    raise ValueError('extract_and_process_disto: no custom_disto file to process')

  for file_name in ordered_files:
    new_local_file = zip_archive.extract(file_name)

    try:
      # only reachable now when the archive holds no rank_001 file at all
      if not storage['min_dist'] and 'rank_001' not in new_local_file:
        print('error : rank_001 not first in list', new_local_file)

      # SECURITY NOTE: allow_pickle=True unpickles the file content, which can
      # execute arbitrary code -- only feed archives you produced yourself.
      custom_disto = np.load(new_local_file, allow_pickle = True)
      entry = custom_disto[0]
      slices = entry['slices_17_to_21']

      # 16 features: contact map minus the first two slices
      sum_to_16 = entry['contact_map'] - np.sum(slices[:, :, 0:2], axis = -1)
      store_prob_stats(16, crop(sum_to_16))

      for i in range(5): # 0 to 4 <-> 17 to 21
        sum_to_i = sum_to_16 + np.sum(slices[:, :, 0:i+1], axis = -1)
        store_prob_stats(17 + i, crop(sum_to_i))

      dist_mat = crop(entry['distance_matrix'])

      # checked on the data: no inf / nan in dist_mat
      storage['min_dist'].append(np.min(dist_mat))
      storage['median_dist'].append(np.median(dist_mat))
      storage['mean_dist'].append(np.mean(dist_mat, dtype = np.float64))
      storage['std_dist'].append(np.std(dist_mat, dtype = np.float64))
    finally:
      # never leave the extracted copy behind, even if the file was malformed
      os.remove(new_local_file)
  #

  feature_dict = {}
  for feature, values in storage.items():
    feature_dict['rank_1_'+feature] = np.float16(values[0])
    feature_dict['median_ranks_'+feature] = np.median(values).astype(np.float16)
    feature_dict['mean_ranks_'+feature] = np.mean(values, dtype = np.float64).astype(np.float16)
    feature_dict['std_ranks_'+feature] = np.std(values, dtype = np.float64).astype(np.float16)

  return feature_dict

# Main Run

In [None]:
#@title Main run

# main run
# List the Drive output folder and keep only the .zip archives (one per complex).
all_drive_elems = os.listdir(path)
temp_zips = [elem for elem in all_drive_elems if elem.endswith(".zip")]

# complex id -> feature dict
global_storage = {}

# NOTE: the loop variable is still called `zip` (it shadows the builtin)
# because the sandbox cells further down read it after this loop.
for zip in temp_zips:
  time.sleep(4) # oof Drive <-> Colab connection

  complex_id = zip.split('.')[0]
  # ids of length 8 are looked up in the positive table, all others in the negative one
  is_positive_sample = len(complex_id) == 8

  if is_positive_sample:
    complex_sequence = positive_seqs[positive_seqs['id'] == complex_id]['sequence'].values[0]
  else:
    complex_sequence = negative_seqs[negative_seqs['id'] == complex_id]['sequence'].values[0]
  # length of the part of the sequence that comes before the first ':'
  nb_peptide = len(complex_sequence.split(':')[0])

  print(complex_id)

  # The context manager closes the archive as soon as its features are
  # computed (also on error) instead of leaving that to ZipFile.__del__.
  with ZipFile(os.path.join(path, zip), 'r') as archive:
    files = archive.namelist()
    pdb, scores, disto, repr = sort_files(files) # name paths

    feature_dict = extract_and_process_scores(scores, archive, nb_peptide)
    feature_dict_disto = extract_and_process_disto(disto, archive, nb_peptide)
  feature_dict.update(feature_dict_disto)

  global_storage[complex_id] = feature_dict
#

Exception ignored in: <function ZipFile.__del__ at 0x7ff168f92dd0>
Traceback (most recent call last):
  File "/usr/lib/python3.10/zipfile.py", line 1821, in __del__
  File "/usr/lib/python3.10/zipfile.py", line 1843, in close
  File "/usr/lib/python3.10/zipfile.py", line 1943, in _fpclose
OSError: [Errno 107] Transport endpoint is not connected


6iur_C-6iqj_A
6kmj_C-6i42_A
6qbb_P-6jfa_A
6l7c_S-6i7q_V
6jnf_D-6ifc_A
6i42_B-6jfa_A
6i7q_H-6i42_A
6sat_P-6spb_F
6qmp_A-6tzc_B
6lry_B-6i42_A
6i7q_H-6q68_A
6o23_E-6p8s_A
6vo5_C-6p8s_A
6l7c_S-6q68_A
6l0v_B-6spb_F
6uyo_B-6gc3_A
6e5n_A-6p8s_A
6s6q_C-6jfa_A
6tzc_C-6p8s_A
6qxz_B-6rir_A
6q9f_B-6tzc_B
6v7o_C-6rir_A
6snc_C-6jfa_A
6sof_F-6p8s_A
6t7y_B-6rir_A
6iqj_C-6rir_A
6vdb_H-6rir_A
6q53_B-6tzc_B
6o40_B-6gc3_A
6s1u_I-6kvm_A
6p9x_P-6gc3_A
6uyo_B-6spb_F
6qxz_B-6kvm_A
6o40_B-6q68_A
6sa8_B-6kvm_A
6jlj_X-6i42_A
6ifc_B-6rir_A
6p9x_P-6p8s_A
6i3z_B-6q68_A
6p9x_P-6q68_A
6rm8_C-6p8s_A
6j8y_D-6kvm_A
6vo5_C-6kvm_A
6iur_C-6p8s_A
6e5n_A-6kvm_A
6pau_C-6pj8_A
6iiw_B-6pj8_A
6roy_C-6pj8_A
6jlj_X-6rir_A
6e7i_P-6pj8_A
6roy_C-6pbv_A
6jez_C-6snc_A
6t2d_B-6snc_A
6jlh_B-6rqf_A
6v2h_B-6sen_A
6mf6_C-6snc_A
6lry_B-6rqf_A
6a0h_C-6hqu_A
6ifc_B-6rqf_A
6nq3_D-6sen_A
6q53_B-6kvm_A
6hl6_S-6pbv_A
6s6q_C-6sen_A
6p8s_E-6hqu_A
6p7e_U-6j8o_B
6t7y_B-6tyv_A
6pgq_B-6tyv_A
6ttu_I-6snc_A
6t1y_F-6j8o_B
6pau_C-6itm_A
6t1y_F-6itm_A
6p7e_U

In [None]:
# Turn the {complex_id: feature_dict} mapping into a DataFrame
# (one column per complex id, one row per feature name).
test_df = pd.DataFrame(global_storage)

In [None]:
# Transpose so that each complex becomes a row, then write the table as .csv
# into the current working directory.
test_df.T.to_csv("first_compute_161_features_622_complexes.csv")

# Sandbox

In [None]:
# Sandbox: redo the per-archive lookup of the main loop for the archive name
# currently stored in `zip` (the last one the main loop iterated over).
complex_id = zip.split('.')[0]
# ids of length 8 are looked up in the positive table, all others in the negative one
is_positive_sample = len(complex_id) == 8

if is_positive_sample:
  complex_sequence = positive_seqs[positive_seqs['id'] == complex_id]['sequence'].values[0]
else:
  complex_sequence = negative_seqs[negative_seqs['id'] == complex_id]['sequence'].values[0]
# length of the part of the sequence that comes before the first ':'
nb_peptide = len(complex_sequence.split(':')[0])

print(complex_id)
print(is_positive_sample)
print(complex_sequence)
print(nb_peptide)

In [None]:
# Open the archive named by `zip` and list its members.
# NOTE: the archive is left open on purpose; the following cells extract from it.
archive = ZipFile(path + '/' + zip, 'r')
files = archive.namelist()

In [None]:
# Split the member names by kind. NOTE: `repr` shadows the builtin of the same name.
pdb, scores, disto, repr = sort_files(files)

In [None]:
# Compute the scores features for this single archive.
feature_dict = extract_and_process_scores(scores, archive, nb_peptide)

In [None]:
# Display the names of the computed scores features.
feature_dict.keys() #TODO check if values true, check file load and deletion works well

In [None]:
# Compute the distogram features for this single archive.
feature_dict_disto = extract_and_process_disto(disto, archive, nb_peptide)

In [None]:
# Display the names of the computed distogram features.
feature_dict_disto.keys() #TODO check if values true, check file load and deletion works well

In [None]:
# Display the custom_disto member names found in the archive.
disto

In [None]:
# Extract one hard-coded custom_disto member into the current working directory.
archive.extract('6iur_C-6iqj_A_custom_disto_rank_004_alphafold2_multimer_v3_model_2_seed_000.npy')

In [None]:
# Load the file extracted above; assumes the working directory is /content -- TODO confirm.
# allow_pickle=True is needed because element 0 is accessed as a keyed mapping below.
test = np.load('/content/6iur_C-6iqj_A_custom_disto_rank_004_alphafold2_multimer_v3_model_2_seed_000.npy', allow_pickle = True)

In [None]:
# Display the keys stored in the first element of the loaded array.
test[0].keys()

In [None]:
# Display the shape of the 'slices_17_to_21' array.
test[0]['slices_17_to_21'].shape

In [None]:
# Rebuild the maps used by extract_and_process_disto: the contact map minus the
# first two slices, then that base plus the first i+1 slices for i = 0..4.
sum_to_16 = test[0]['contact_map'] - np.sum(test[0]['slices_17_to_21'][:, :, 0:2], axis = -1)
for i in range(5):
  sum_to_i = sum_to_16 + np.sum(test[0]['slices_17_to_21'][:, :, 0:i+1], axis = -1)
  # optional visual check of each map (disabled):
  # plt.figure(figsize = (10, 10))
  # plt.imshow(sum_to_i, cmap = 'gray')
  # plt.title(f"{16+i}\n{np.sum(sum_to_i - test[0]['contact_map'])}")
  # plt.show()

In [None]:
# Display the first 19 entries of 'bin_edges'.
test[0]['bin_edges'][0:18+1]

bin_edges
```
array([ 2.312,  2.625,  2.938,  3.25 ,  3.562,  3.875,  4.188,  4.5  ,
        4.812,  5.125,  5.438,  5.75 ,  6.062,  6.375,  6.688,  7.   ,
        7.312,  7.625,  7.938,  8.25 ,  8.56 ,  8.875,  9.19 ,  9.5  ,
        9.81 , 10.125, 10.44 , 10.75 , 11.06 , 11.375, 11.69 , 12.   ,
       12.31 , 12.625, 12.94 , 13.25 , 13.56 , 13.875, 14.19 , 14.5  ,
       14.81 , 15.125, 15.44 , 15.75 , 16.06 , 16.38 , 16.69 , 17.   ,
       17.31 , 17.62 , 17.94 , 18.25 , 18.56 , 18.88 , 19.19 , 19.5  ,
       19.81 , 20.12 , 20.44 , 20.75 , 21.06 , 21.38 , 21.69 ],
      dtype=float16)
```

