<a href="https://colab.research.google.com/github/zhuzihan728/COMP0138-Metal-Binding-Site-Prediction/blob/main/colab_scripts/label.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import libs

In [None]:
# Install Biopython into the running kernel's environment (%pip targets the kernel, !pip may not).
# Pinned to the version recorded in this cell's output so re-runs resolve the same release.
%pip install biopython==1.81

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting biopython
  Downloading biopython-1.81-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m31.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: biopython
Successfully installed biopython-1.81


In [None]:
import pandas as pd
import numpy as np
from Bio import SeqIO
import sys
import h5py
import json

# Extract datasets

In [None]:
#!tar -xvf /content/drive/MyDrive/FYP/miniconda -C /root

In [None]:
# Unpack the UniProt dataset archive from Drive into /content.
# NOTE(review): no cell visible in this notebook mounts Drive — presumably it is mounted beforehand; confirm.
!tar -xvf /content/drive/MyDrive/FYP/uniprot_datasets -C /content

ChEBI-IDs_for_metal_binding.tsv
NEG_clustered_rep_seq.fasta
NEG_TRAIN.fasta
POS_TRAIN.fasta
POS_TRAIN_FULL.fasta
POS_TRAIN_FULL.tsv
POS_TRAIN.tsv
filtered_combined.fasta
trimed_combined.fasta


In [None]:
# Unpack the gzip-compressed archive of train/test FASTA splits from Drive into /content (creates /content/data).
!tar -zxvf /content/drive/MyDrive/FYP/data.tar.gz -C /content

./data/
./data/train_neg15.fasta
./data/test_full.fasta
./data/test.fasta
./data/train_neg7.fasta
./data/train_neg9.fasta
./data/train_neg8.fasta
./data/train_neg13.fasta
./data/train_pos.fasta
./data/train_neg11.fasta
./data/train_neg0.fasta
./data/train_neg6.fasta
./data/train_neg2.fasta
./data/train_neg14.fasta
./data/train_neg5.fasta
./data/train_neg3.fasta
./data/train_neg10.fasta
./data/train_neg12.fasta
./data/train_neg1.fasta
./data/train_neg4.fasta


In [None]:
# Concatenate the positive and negative training FASTA files into combined.fasta (counted in the next cell).
!cat POS_TRAIN.fasta NEG_TRAIN.fasta > combined.fasta

In [None]:
# Size of the full data set; the summary helpers below read this as a notebook-level global.
# Records are counted lazily so the parsed sequences are never all held in memory at once.
total_len = sum(1 for _ in SeqIO.parse("combined.fasta", "fasta"))
print("Full data set size: ", total_len)

Full data set size:  195450


In [None]:
# Copy class_encode.json from Drive into the working directory; it is loaded later with retrive_json.
!cp /content/drive/MyDrive/FYP/dicts/class_encode.json class_encode.json

In [None]:
# Copy coalesced_encode.json from Drive into the working directory; it is loaded later with retrive_json.
!cp /content/drive/MyDrive/FYP/dicts/coalesced_encode.json coalesced_encode.json

# Helper functions

In [None]:
def check_metal(seqs, metal, anno, metal_count_df):
  """Return the share of a metal's annotation rows that belong to the given sequences.

  Args:
    seqs: collection of accessions; rows of `anno` whose 'Accession' is in it are counted.
    metal: value to match against the 'ChEBI-ID' column.
    anno: DataFrame with at least 'Accession' and 'ChEBI-ID' columns.
    metal_count_df: DataFrame with 'ChEBI-ID' and 'count' columns; 'count' is the denominator.

  Returns:
    (rows of `metal` within `seqs`) / (reference count of `metal`), or 0.0 when the
    reference count is missing or zero.
  """
  # Count matching rows with boolean masks instead of value_counts().reset_index():
  # the column names produced by that idiom changed in pandas 2.0.
  in_seqs = anno['Accession'].isin(seqs)
  is_metal = anno['ChEBI-ID'] == metal
  cnt = int((in_seqs & is_metal).sum())

  reference = metal_count_df.loc[metal_count_df['ChEBI-ID'] == metal, 'count']
  total = int(reference.iloc[0]) if len(reference) else 0
  if total == 0:
    # Nothing to normalise by; report an empty share instead of dividing by zero.
    return 0.0
  return cnt / total

In [None]:
def check_metal_num(seqs, metal, anno):
  """Count the annotation rows of one metal that belong to the given sequences.

  Args:
    seqs: collection of accessions; rows of `anno` whose 'Accession' is in it are counted.
    metal: value to match against the 'ChEBI-ID' column.
    anno: DataFrame with at least 'Accession' and 'ChEBI-ID' columns.

  Returns:
    The number of matching rows as a plain int (0 when there are none).
  """
  # Boolean masks avoid value_counts().reset_index(), whose column names differ
  # between pandas 1.x and 2.x.
  in_seqs = anno['Accession'].isin(seqs)
  is_metal = anno['ChEBI-ID'] == metal
  return int((in_seqs & is_metal).sum())

In [None]:
def check_metal_specific_residue_proportion(acc_ls, source = 'POS_TRAIN_FULL.tsv', use_trimed=True):
  """Print, per metal, how many annotation rows fall inside `acc_ls` and their share of a reference total.

  Args:
    acc_ls: collection of accessions describing the subset to inspect.
    source: tab-separated annotation file with 'Accession' and 'ChEBI-ID' columns.
    use_trimed: when True the reference total for each metal is the number of rows
      whose accession appears in "trimed_combined.fasta"; otherwise it is the number
      of rows in the whole annotation file.

  Returns:
    None; one line per metal is printed.
  """
  anno = pd.read_csv(source, sep='\t')
  # value_counts fixes both the set of metals and the order in which they are reported.
  total_counts = anno['ChEBI-ID'].value_counts()
  if use_trimed:
    trimmed_acc, _ = fasta2acc_seq_ls("trimed_combined.fasta")
    trimmed_counts = anno.loc[anno['Accession'].isin(trimmed_acc), 'ChEBI-ID'].value_counts()
    # Keep the original metal order; metals absent from the trimmed set get a total of 0.
    total_counts = trimmed_counts.reindex(total_counts.index, fill_value=0)

  # Rows belonging to the inspected subset, counted once for all metals.
  subset_counts = anno.loc[anno['Accession'].isin(acc_ls), 'ChEBI-ID'].value_counts()
  metal_id_name_df = pd.read_csv('ChEBI-IDs_for_metal_binding.tsv', sep='\t')

  for metal, total in total_counts.items():
    name_rows = metal_id_name_df.loc[metal_id_name_df['ChEBI-ID'] == metal, 'Name']
    metal_name = name_rows.iloc[0] if len(name_rows) else 'unknown'
    # Use the exact count rather than rebuilding it from total * fraction, which can round down.
    num = int(subset_counts.get(metal, 0))
    total = int(total)
    per = num / total if total else 0.0
    print(f'{metal:12}| {metal_name:29} | num: {num:6} | %: {per}')

In [None]:
def write_seq_ls2fasta(file_out, ls, source):
  """Copy the records of `source` whose accession is listed in `ls` into `file_out`.

  Args:
    file_out: path of the FASTA file to create (opened in 'w' mode, so it is overwritten).
    ls: collection of accessions to keep.
    source: path of the FASTA file to read; the accession is taken as the second
      '|'-separated field of each record id.

  Returns:
    None; one status line is printed per selected record.
  """
  # A set gives constant-time membership checks instead of scanning `ls` for every record.
  wanted = set(ls)
  with open(file_out, 'w') as f_out:
    for seq_record in SeqIO.parse(source, "fasta"):
      seq_acc = seq_record.id.split('|')[1]
      if seq_acc not in wanted:
        continue
      # SeqIO.write returns the number of records written; anything but 1 is reported.
      written = SeqIO.write(seq_record, f_out, 'fasta')
      if written != 1:
        print('Error while writing sequence: ' + seq_acc)
      else:
        print(f'writing {seq_acc} to train fasta file.')

In [None]:
def fasta2acc_seq_ls(path):
  """Read a FASTA file and return its accessions and sequences as two parallel lists.

  The accession is the second '|'-separated field of each record id; sequences are
  returned as plain strings in file order.
  """
  pairs = [
    (record.id.split('|')[1], str(record.seq))
    for record in SeqIO.parse(path, "fasta")
  ]
  accessions = [accession for accession, _ in pairs]
  sequences = [sequence for _, sequence in pairs]
  return accessions, sequences

In [None]:
def check_pos_neg_proportion(ls):
  """Print and return how a set of accessions splits into positives and negatives.

  An accession counts as positive when it appears in "POS_TRAIN_FULL.fasta"; every
  other entry of `ls` counts as negative. The notebook-level global `total_len`
  is used to report the size of `ls` relative to the full data set.

  Args:
    ls: collection of accessions.

  Returns:
    (total_num, pos_num, neg_num, pos_portion, neg_portion); both portions are 0.0
    when `ls` is empty.
  """
  total_num = len(ls)

  acc, _ = fasta2acc_seq_ls("POS_TRAIN_FULL.fasta")
  pos_num = len(set(acc).intersection(ls))
  neg_num = total_num - pos_num
  # Guard the empty case so an empty selection reports zeros instead of dividing by zero.
  pos_portion = pos_num/total_num if total_num else 0.0
  neg_portion = neg_num/total_num if total_num else 0.0
  print(f'total seq in the set: {total_num}')
  print(f'proportion over full dataset: {total_num/total_len}')
  print(f'pos: {pos_num} %: {pos_portion}')
  print(f'neg: {neg_num} %: {neg_portion}')
  return total_num, pos_num, neg_num, pos_portion, neg_portion

In [None]:
def identity_above_threshold(m8file, thres):
  """Split the queries of a 12-column tab-separated alignment table by sequence identity.

  Args:
    m8file: path of a headerless, tab-separated file with 12 columns per hit.
    thres: identity threshold; a query is "above" when at least one of its hits
      has a sequence identity strictly greater than this value.

  Returns:
    (seq_above_thres, seq_below_thres, proportion): arrays of unique query ids
    above / not above the threshold, and the fraction of queries that are above.
  """
  data = pd.read_csv(m8file, sep="\t", index_col=False, header=None)
  # The query and target end columns get distinct names; giving both the same
  # label would create duplicate column labels.
  data.columns = [
    "query", "target", "sequence identity", "alignment length", "mismatch", "gap opening",
    "query domain start position", "query domain end position",
    "target domain start position", "target domain end position",
    "evalue", "bit score",
  ]

  seq_above_thres = data[data["sequence identity"] > thres]["query"].unique()
  seq_below_thres = data[~data["query"].isin(seq_above_thres)]["query"].unique()
  all_seq = data["query"].unique()
  proportion = len(seq_above_thres) / len(all_seq)
  # Sanity check: the two groups must partition the set of queries (prints True when they do).
  print(len(all_seq) == len(seq_above_thres) + len(seq_below_thres))
  return seq_above_thres, seq_below_thres, proportion

In [None]:
def read_fasta(fasta_path, split_char="|", id_field=1):
    '''
        Read a FASTA file that may contain several sequences.

        split_char and id_field control how the identifier is extracted from each
        header line, e.g. split_char="|" and id_field=1 for SwissProt/UniProt headers.
        "/" and "." in the identifier are replaced by "_".

        Sequence lines are stripped of whitespace and gaps ("-"), upper-cased, and
        the residues U, Z and O are mapped to X.

        Returns a dict mapping identifier -> sequence (empty dict for an empty file).
        Raises ValueError when sequence data appears before the first header line.
    '''

    seqs = dict()
    uniprot_id = None
    with open( fasta_path, 'r' ) as fasta_f:
        for line in fasta_f:
            if line.startswith('>'):
                # get the identifier from the header and create a new entry
                uniprot_id = line.replace('>', '').strip().split(split_char)[id_field]
                # replace tokens that are mis-interpreted when loading h5
                uniprot_id = uniprot_id.replace("/","_").replace(".","_")
                seqs[ uniprot_id ] = ''
                continue

            # remove all whitespace, join sequences spanning multiple lines, drop gaps, upper-case
            seq = ''.join( line.split() ).upper().replace("-","")
            if not seq:
                # blank line: nothing to append
                continue
            if uniprot_id is None:
                raise ValueError(
                    "Sequence data found before the first '>' header in {}".format(fasta_path))
            # map non-standard amino acids to unknown/X
            seq = seq.replace('U','X').replace('Z','X').replace('O','X')
            seqs[ uniprot_id ] += seq

    print("Read {} sequences.".format(len(seqs)))
    if seqs:
        # only show an example when there is at least one record
        example_id = next(iter(seqs))
        print("Example:\n{}\n{}".format(example_id,seqs[example_id]))

    return seqs

In [None]:
def dataset_metal_binding_summary(acc_ls, source = 'POS_TRAIN_FULL.tsv' , coalesce = True):
  """Print and return per-metal protein and residue counts for a set of accessions.

  Args:
    acc_ls: collection of accessions describing the data set to summarise.
    source: tab-separated annotation file with 'Accession' and 'ChEBI-ID' columns.
    coalesce: when True, ChEBI-IDs are folded into 18 classes (several IDs share one
      class index); when False, each of the 29 listed ChEBI-IDs gets its own index.

  Returns:
    (prot_counter, res_counter): two lists of length 29 indexed by class / metal index.
    With coalesce=True only the first 18 entries are populated.

  Relies on the notebook-level global `total_len` and on "POS_TRAIN_FULL.fasta" and
  'ChEBI-IDs_for_metal_binding.tsv' being present in the working directory.
  """
  total_num = len(acc_ls)
  print(f'total seq in the set: {total_num}, {total_num/total_len*100:.{3}}% of full dataset')

  all_pos_acc_ls, _ = fasta2acc_seq_ls("POS_TRAIN_FULL.fasta")
  if coalesce:
    metals = {'CHEBI:29105':0,'CHEBI:18420':1,'CHEBI:49883':2,'CHEBI:29108':3,'CHEBI:29035':4,'CHEBI:60240':5,'CHEBI:24875':6,'CHEBI:190135':7,'CHEBI:23378':8,'CHEBI:29103':9,'CHEBI:49786':10,'CHEBI:29101':11,'CHEBI:29034':12,'CHEBI:30408':13,'CHEBI:29036':14,'CHEBI:29033':15, 'CHEBI:48828':16, 'CHEBI:25213':17, 'CHEBI:21137':13,'CHEBI:49552':8,'CHEBI:48775':5,'CHEBI:21143':13,'CHEBI:47739':13,'CHEBI:16793':5,'CHEBI:177874':13,'CHEBI:60400':13,'CHEBI:49415':17, 'CHEBI:49713':17}
  else:
    metals = {'CHEBI:29105':0,'CHEBI:18420':1,'CHEBI:49883':2,'CHEBI:29108':3,'CHEBI:29035':4,'CHEBI:60240':5,'CHEBI:24875':6,'CHEBI:190135':7,'CHEBI:23378':8,'CHEBI:29103':9,'CHEBI:49786':10,'CHEBI:29101':11,'CHEBI:29034':12,'CHEBI:30408':13,'CHEBI:29036':14,'CHEBI:29033':15,'CHEBI:21137':16,'CHEBI:49552':17,'CHEBI:48775':18,'CHEBI:48828':19,'CHEBI:21143':20,'CHEBI:25213':21,'CHEBI:47739':22,'CHEBI:16793':23,'CHEBI:177874':24,'CHEBI:60400':25,'CHEBI:49415':26,'CHEBI:60504':27,'CHEBI:49713':28}
  anno = pd.read_csv(source, sep='\t')
  metal_id_name_df = pd.read_csv('ChEBI-IDs_for_metal_binding.tsv', sep='\t')
  prot_counter = [0]*29
  res_counter = [0]*29
  pos_acc = set(all_pos_acc_ls).intersection(acc_ls)
  if not total_num == len(pos_acc):
    print(f"#metal-binding protein: {len(pos_acc)} \n#non-binding protein: {total_num-len(pos_acc)} \npos/neg: {len(pos_acc)/(total_num-len(pos_acc))}")

  # Both per-metal tallies are computed once here instead of re-filtering `anno` for every metal.
  # Residue rows are selected by `acc_ls`, protein rows by `pos_acc`.
  res_per_metal = anno.loc[anno['Accession'].isin(acc_ls), 'ChEBI-ID'].value_counts()
  prot_per_metal = anno[anno['Accession'].isin(pos_acc)].groupby('ChEBI-ID')['Accession'].nunique()

  if coalesce:
    for metal, slot in metals.items():
      # NOTE(review): a protein annotated with two ChEBI-IDs of the same class is
      # counted once per ID here, so #p may exceed the number of distinct proteins.
      prot_counter[slot] += int(prot_per_metal.get(metal, 0))
      res_counter[slot] += int(res_per_metal.get(metal, 0))
    # The leading keys of `metals` carry the class indices 0..n_classes-1 in order,
    # so they double as the class representatives for the report.
    n_classes = max(metals.values()) + 1
    for slot, metal in enumerate(list(metals)[:n_classes]):
      metal_name = metal_id_name_df[metal_id_name_df['ChEBI-ID']==metal]['Name'].iloc[0]
      print(f"{metal:13}|{metal_name:30}|#p: {prot_counter[slot]:10}|#residue: {res_counter[slot]:6}")
    return prot_counter, res_counter

  # Denominator for the percentage column: all annotation rows with a ChEBI-ID.
  total_res_num = anno['ChEBI-ID'].value_counts().sum()
  for slot, metal in enumerate(metals):
    metal_name = metal_id_name_df[metal_id_name_df['ChEBI-ID']==metal]['Name'].iloc[0]
    prot_counter[slot] += int(prot_per_metal.get(metal, 0))
    res_counter[slot] += int(res_per_metal.get(metal, 0))
    print(f"{metal:13}|{metal_name:30}|#p: {prot_counter[slot]:10}|#residue: {res_counter[slot]:6}|% over all binding residues: {res_counter[slot]/total_res_num:{5}.{3}}")
  return prot_counter, res_counter

In [None]:
def retrive_json(path):
  """Load the JSON document stored at `path` and return the decoded object."""
  with open(path, 'r') as handle:
    return json.load(handle)

In [None]:
def save_json(data, path):
  """Serialise `data` as JSON into the file at `path`, overwriting any existing file."""
  with open(path, 'w') as out_file:
    json.dump(data, out_file)

# Prepare train set

In [None]:
# Accessions of all records in POS_TRAIN.fasta (the sequences themselves are discarded).
pos_acc, _ = fasta2acc_seq_ls('POS_TRAIN.fasta')

In [None]:
# Accessions and sequences of the positive training split, as two parallel lists.
train_acc, train_seq = fasta2acc_seq_ls('/content/data/train_pos.fasta')

In [None]:
# Same file read as an {accession: cleaned sequence} dict; read_fasta also prints a count and an example.
train_dc = read_fasta('/content/data/train_pos.fasta')

Read 2093 sequences.
Example:
A0A023GPI8
ADTIVAVELDTYPNTDIGDPSYPHIGIDIKSVRSKKTAKWNMQNGKVGTAHIIYNSVGKRLSAVVSYPNGDSATVSYDVDLDNVLPEWVRVGLSATTGLYKETNTILSWSFTSKLKSNSTHETNALHFMFNQFSKDQKDLILQGDATTGRDGNLELTRVSSNGSPQGSSVGRALFYAPVHIWESSAVVASFDATFTFLIKSSDSHPADGIAFFISNIDSSIPSGSTGRLLGLFPDAN


In [None]:
# Print the per-metal summary for the positive training split; the trailing ';' hides the returned counters.
dataset_metal_binding_summary(train_acc);

total seq in the set: 2093, 1.07% of full dataset
CHEBI:29105  |Zn(2+)                        |#p:        710|#residue:   6457
CHEBI:18420  |Mg(2+)                        |#p:        508|#residue:   2423
CHEBI:49883  |[4Fe-4S] cluster              |#p:        110|#residue:    934
CHEBI:29108  |Ca(2+)                        |#p:        399|#residue:   5169
CHEBI:29035  |Mn(2+)                        |#p:        184|#residue:   1191
CHEBI:60240  |a divalent metal cation       |#p:         83|#residue:    810
CHEBI:24875  |Fe cation                     |#p:        163|#residue:   1048
CHEBI:190135 |[2Fe-2S] cluster              |#p:         53|#residue:    381
CHEBI:23378  |Cu cation                     |#p:         82|#residue:    614
CHEBI:29103  |K(+)                          |#p:         22|#residue:    167
CHEBI:49786  |Ni(2+)                        |#p:         15|#residue:     93
CHEBI:29101  |Na(+)                         |#p:         20|#residue:    152
CHEBI:29034  |Fe(3+)      

In [None]:
# Accessions and sequences of the test split, as two parallel lists.
test_acc, test_seq = fasta2acc_seq_ls('/content/data/test.fasta')

In [None]:
# Test split read as an {accession: cleaned sequence} dict; read_fasta also prints a count and an example.
test_dc = read_fasta('/content/data/test.fasta')

Read 3911 sequences.
Example:
A0A0H3KB22
MTYAVKEIFYTLQGEGANAGRPAVFCRFAGCNLWSGREEDRAQAVCRFCDTDFVGTDGENGGKFKDADALVATIAGLWPAGEAHRFVVCTGGEPMLQLDQPLVDALHAAGFGIAIETNGSLPVLESIDWICVSPKADAPLVVTKGNELKVVIPQDNQRLADYAKLDFEYFLVQPMDGPSRDLNTKLAIDWCKRHPQWRLSMQTHKYLNIP


In [None]:
# Print the per-metal summary for the test split; the trailing ';' hides the returned counters.
dataset_metal_binding_summary(test_acc);

total seq in the set: 3911, 2.0% of full dataset
#metal-binding protein: 224 
#non-binding protein: 3687 
pos/neg: 0.06075400054244643
CHEBI:29105  |Zn(2+)                        |#p:         95|#residue:    700
CHEBI:18420  |Mg(2+)                        |#p:         58|#residue:    263
CHEBI:49883  |[4Fe-4S] cluster              |#p:         11|#residue:    101
CHEBI:29108  |Ca(2+)                        |#p:         26|#residue:    560
CHEBI:29035  |Mn(2+)                        |#p:         25|#residue:    131
CHEBI:60240  |a divalent metal cation       |#p:         10|#residue:    107
CHEBI:24875  |Fe cation                     |#p:         12|#residue:     93
CHEBI:190135 |[2Fe-2S] cluster              |#p:          4|#residue:     46
CHEBI:23378  |Cu cation                     |#p:         10|#residue:     53
CHEBI:29103  |K(+)                          |#p:          3|#residue:     18
CHEBI:49786  |Ni(2+)                        |#p:          1|#residue:      4
CHEBI:29101  |Na(+

# Generate encodings

In [None]:
# Lookup table of ChEBI-IDs: the columns used below are 'ChEBI-ID', 'Name' and 'ChEBI-ID Parents'
# (a comma-separated list of parent IDs, or NaN when there is none).
metal_id_name_df = pd.read_csv('ChEBI-IDs_for_metal_binding.tsv', sep='\t')
metal_id_name_df

Unnamed: 0,ChEBI-ID,Name,ChEBI-ID Parents
0,CHEBI:48775,Cd(2+),"CHEBI:60240,CHEBI:25213"
1,CHEBI:29108,Ca(2+),"CHEBI:60240,CHEBI:25213"
2,CHEBI:48828,Co(2+),"CHEBI:60240,CHEBI:25213"
3,CHEBI:49415,Co(3+),CHEBI:25213
4,CHEBI:23378,Cu cation,CHEBI:25213
5,CHEBI:49552,Cu(+),"CHEBI:23378,CHEBI:25213"
6,CHEBI:29036,Cu(2+),"CHEBI:23378,CHEBI:60240,CHEBI:25213"
7,CHEBI:60240,a divalent metal cation,CHEBI:25213
8,CHEBI:190135,[2Fe-2S] cluster,CHEBI:30408
9,CHEBI:24875,Fe cation,CHEBI:25213


In [None]:
# ChEBI-ID -> class index (0-17). The first 18 keys each introduce a new index; the remaining
# keys reuse one of those indices, i.e. they are folded into an existing class.
# The same literal is repeated inside dataset_metal_binding_summary (coalesce=True branch).
metal_encode = {'CHEBI:29105':0,'CHEBI:18420':1,'CHEBI:49883':2,'CHEBI:29108':3,'CHEBI:29035':4,'CHEBI:60240':5,'CHEBI:24875':6,'CHEBI:190135':7,'CHEBI:23378':8,'CHEBI:29103':9,'CHEBI:49786':10,'CHEBI:29101':11,'CHEBI:29034':12,'CHEBI:30408':13,'CHEBI:29036':14,'CHEBI:29033':15, 'CHEBI:48828':16, 'CHEBI:25213':17, 'CHEBI:21137':13,'CHEBI:49552':8,'CHEBI:48775':5,'CHEBI:21143':13,'CHEBI:47739':13,'CHEBI:16793':5,'CHEBI:177874':13,'CHEBI:60400':13,'CHEBI:49415':17, 'CHEBI:49713':17}

In [None]:
# List every encoded ChEBI-ID with a running index, its name and its parent IDs.
for i, m in enumerate(metal_encode):
  matched = metal_id_name_df[metal_id_name_df['ChEBI-ID']==m]
  chebi_id = matched['ChEBI-ID'].iloc[0]
  name = matched['Name'].iloc[0]
  parents = matched['ChEBI-ID Parents'].iloc[0]
  print(i, chebi_id, name, parents)

0 CHEBI:29105 Zn(2+) CHEBI:60240,CHEBI:25213
1 CHEBI:18420 Mg(2+) CHEBI:60240,CHEBI:25213
2 CHEBI:49883 [4Fe-4S] cluster CHEBI:30408
3 CHEBI:29108 Ca(2+) CHEBI:60240,CHEBI:25213
4 CHEBI:29035 Mn(2+) CHEBI:60240,CHEBI:25213
5 CHEBI:60240 a divalent metal cation CHEBI:25213
6 CHEBI:24875 Fe cation CHEBI:25213
7 CHEBI:190135 [2Fe-2S] cluster CHEBI:30408
8 CHEBI:23378 Cu cation CHEBI:25213
9 CHEBI:29103 K(+) CHEBI:25213
10 CHEBI:49786 Ni(2+) CHEBI:60240,CHEBI:25213
11 CHEBI:29101 Na(+) CHEBI:25213
12 CHEBI:29034 Fe(3+) CHEBI:24875,CHEBI:25213
13 CHEBI:30408 iron-sulfur cluster nan
14 CHEBI:29036 Cu(2+) CHEBI:23378,CHEBI:60240,CHEBI:25213
15 CHEBI:29033 Fe(2+) CHEBI:24875,CHEBI:60240,CHEBI:25213
16 CHEBI:48828 Co(2+) CHEBI:60240,CHEBI:25213
17 CHEBI:25213 a metal cation nan
18 CHEBI:21137 [3Fe-4S] cluster CHEBI:30408
19 CHEBI:49552 Cu(+) CHEBI:23378,CHEBI:25213
20 CHEBI:48775 Cd(2+) CHEBI:60240,CHEBI:25213
21 CHEBI:21143 [8Fe-7S] cluster CHEBI:30408
22 CHEBI:47739 [Ni-4Fe-4S] cluster CHEBI:

In [None]:
# For every ChEBI-ID collect the class index of the ID itself plus the class indices of
# all its listed parents, so a residue bound to a specific metal also counts for its parents.
coalesced_encode = {}

for metal, own_index in metal_encode.items():
  is_metal_row = metal_id_name_df['ChEBI-ID'] == metal
  parents = metal_id_name_df[is_metal_row]['ChEBI-ID Parents'].iloc[0]
  indices = [own_index]
  if pd.notna(parents):
    # parents is a comma-separated string of ChEBI-IDs
    for parent_id in parents.split(','):
      indices.append(metal_encode[parent_id])
  # de-duplicate: a parent may map to the same class index as the metal itself
  coalesced_encode[metal] = list(set(indices))

In [None]:
# Display the mapping built in the previous cell for a visual check.
coalesced_encode

{'CHEBI:29105': [0, 17, 5],
 'CHEBI:18420': [1, 5, 17],
 'CHEBI:49883': [2, 13],
 'CHEBI:29108': [17, 3, 5],
 'CHEBI:29035': [17, 4, 5],
 'CHEBI:60240': [17, 5],
 'CHEBI:24875': [17, 6],
 'CHEBI:190135': [13, 7],
 'CHEBI:23378': [8, 17],
 'CHEBI:29103': [9, 17],
 'CHEBI:49786': [17, 10, 5],
 'CHEBI:29101': [17, 11],
 'CHEBI:29034': [17, 12, 6],
 'CHEBI:30408': [13],
 'CHEBI:29036': [8, 17, 5, 14],
 'CHEBI:29033': [17, 5, 6, 15],
 'CHEBI:48828': [16, 17, 5],
 'CHEBI:25213': [17],
 'CHEBI:21137': [13],
 'CHEBI:49552': [8, 17],
 'CHEBI:48775': [17, 5],
 'CHEBI:21143': [13],
 'CHEBI:47739': [13],
 'CHEBI:16793': [17, 5],
 'CHEBI:177874': [13],
 'CHEBI:60400': [13],
 'CHEBI:49415': [17],
 'CHEBI:49713': [17]}

In [None]:
# Build class index -> (ChEBI-ID, name) from the first 18 keys of coalesced_encode,
# printing every key with its position and its list of class indices along the way.
class_encode = {}
for cnt, (chebi_id, enc_ls) in enumerate(coalesced_encode.items()):
  name_matches = metal_id_name_df[metal_id_name_df["ChEBI-ID"] == chebi_id]['Name']
  metal_name = name_matches.iloc[0]
  if cnt < 18:
    class_encode[cnt] = (chebi_id, metal_name)
  print(f"{cnt:2}|{chebi_id:13}|{metal_name:25} {enc_ls}")

 0|CHEBI:29105  |Zn(2+)                    [0, 17, 5]
 1|CHEBI:18420  |Mg(2+)                    [1, 5, 17]
 2|CHEBI:49883  |[4Fe-4S] cluster          [2, 13]
 3|CHEBI:29108  |Ca(2+)                    [17, 3, 5]
 4|CHEBI:29035  |Mn(2+)                    [17, 4, 5]
 5|CHEBI:60240  |a divalent metal cation   [17, 5]
 6|CHEBI:24875  |Fe cation                 [17, 6]
 7|CHEBI:190135 |[2Fe-2S] cluster          [13, 7]
 8|CHEBI:23378  |Cu cation                 [8, 17]
 9|CHEBI:29103  |K(+)                      [9, 17]
10|CHEBI:49786  |Ni(2+)                    [17, 10, 5]
11|CHEBI:29101  |Na(+)                     [17, 11]
12|CHEBI:29034  |Fe(3+)                    [17, 12, 6]
13|CHEBI:30408  |iron-sulfur cluster       [13]
14|CHEBI:29036  |Cu(2+)                    [8, 17, 5, 14]
15|CHEBI:29033  |Fe(2+)                    [17, 5, 6, 15]
16|CHEBI:48828  |Co(2+)                    [16, 17, 5]
17|CHEBI:25213  |a metal cation            [17]
18|CHEBI:21137  |[3Fe-4S] cluster          [13]
1

In [None]:
# Display the class index -> (ChEBI-ID, name) table built in the previous cell.
class_encode

{0: ('CHEBI:29105', 'Zn(2+)'),
 1: ('CHEBI:18420', 'Mg(2+)'),
 2: ('CHEBI:49883', '[4Fe-4S] cluster'),
 3: ('CHEBI:29108', 'Ca(2+)'),
 4: ('CHEBI:29035', 'Mn(2+)'),
 5: ('CHEBI:60240', 'a divalent metal cation'),
 6: ('CHEBI:24875', 'Fe cation'),
 7: ('CHEBI:190135', '[2Fe-2S] cluster'),
 8: ('CHEBI:23378', 'Cu cation'),
 9: ('CHEBI:29103', 'K(+)'),
 10: ('CHEBI:49786', 'Ni(2+)'),
 11: ('CHEBI:29101', 'Na(+)'),
 12: ('CHEBI:29034', 'Fe(3+)'),
 13: ('CHEBI:30408', 'iron-sulfur cluster'),
 14: ('CHEBI:29036', 'Cu(2+)'),
 15: ('CHEBI:29033', 'Fe(2+)'),
 16: ('CHEBI:48828', 'Co(2+)'),
 17: ('CHEBI:25213', 'a metal cation')}

In [None]:
# with open('coalesced_encode.json', 'w') as fp:
#     json.dump(coalesced_encode, fp)

# with open('class_encode.json', 'w') as fp:
#     json.dump(class_encode, fp)

# Retrieve encodings

In [None]:
# Reload the ChEBI-ID -> class-index-list mapping from the JSON file copied from Drive earlier;
# this replaces the in-memory dict built in the "Generate encodings" section.
coalesced_encode = retrive_json('coalesced_encode.json')

In [None]:
# Display the mapping as loaded from JSON.
coalesced_encode

{'CHEBI:29105': [0, 17, 5],
 'CHEBI:18420': [1, 5, 17],
 'CHEBI:49883': [2, 13],
 'CHEBI:29108': [17, 3, 5],
 'CHEBI:29035': [17, 4, 5],
 'CHEBI:60240': [17, 5],
 'CHEBI:24875': [17, 6],
 'CHEBI:190135': [13, 7],
 'CHEBI:23378': [8, 17],
 'CHEBI:29103': [9, 17],
 'CHEBI:49786': [17, 10, 5],
 'CHEBI:29101': [17, 11],
 'CHEBI:29034': [17, 12, 6],
 'CHEBI:30408': [13],
 'CHEBI:29036': [8, 17, 5, 14],
 'CHEBI:29033': [17, 5, 6, 15],
 'CHEBI:48828': [16, 17, 5],
 'CHEBI:25213': [17],
 'CHEBI:21137': [13],
 'CHEBI:49552': [8, 17],
 'CHEBI:48775': [17, 5],
 'CHEBI:21143': [13],
 'CHEBI:47739': [13],
 'CHEBI:16793': [17, 5],
 'CHEBI:177874': [13],
 'CHEBI:60400': [13],
 'CHEBI:49415': [17],
 'CHEBI:49713': [17]}

In [None]:
# Raw JSON form of the class table; its keys are converted to int in the next cell.
class_encode_temp = retrive_json('class_encode.json')

In [None]:
# Rebuild the class table with integer keys (the keys of the loaded dict are passed through int()).
class_encode = {int(key): entry for key, entry in class_encode_temp.items()}

In [None]:
# Display the class table after key conversion.
class_encode

{0: ['CHEBI:29105', 'Zn(2+)'],
 1: ['CHEBI:18420', 'Mg(2+)'],
 2: ['CHEBI:49883', '[4Fe-4S] cluster'],
 3: ['CHEBI:29108', 'Ca(2+)'],
 4: ['CHEBI:29035', 'Mn(2+)'],
 5: ['CHEBI:60240', 'a divalent metal cation'],
 6: ['CHEBI:24875', 'Fe cation'],
 7: ['CHEBI:190135', '[2Fe-2S] cluster'],
 8: ['CHEBI:23378', 'Cu cation'],
 9: ['CHEBI:29103', 'K(+)'],
 10: ['CHEBI:49786', 'Ni(2+)'],
 11: ['CHEBI:29101', 'Na(+)'],
 12: ['CHEBI:29034', 'Fe(3+)'],
 13: ['CHEBI:30408', 'iron-sulfur cluster'],
 14: ['CHEBI:29036', 'Cu(2+)'],
 15: ['CHEBI:29033', 'Fe(2+)'],
 16: ['CHEBI:48828', 'Co(2+)'],
 17: ['CHEBI:25213', 'a metal cation']}

# Label writing

In [None]:
def retrive_label_from_fasta(fa_file, npz_file = None):
  """Build per-residue metal-binding label arrays for every record of a FASTA file.

  Args:
    fa_file: path of the FASTA file to label.
    npz_file: optional output path; when given, the labels are stored with np.savez
      using the accessions as array names.

  Returns:
    A list with one int8 array per record, in file order:
      * shape (18, sequence length) for accessions found in 'POS_TRAIN.fasta', with
        1 at [class, position-1] for every annotated position and every class index
        listed for the annotation's ChEBI-ID in the global `coalesced_encode`;
      * a 1-element zero array as placeholder for all other accessions.

  Reads 'POS_TRAIN.fasta' and 'POS_TRAIN.tsv' from the working directory and relies on
  the notebook-level global `coalesced_encode`. Rows with ChEBI-ID 'CHEBI:60504' are ignored.
  """
  n_classes = 18  # number of rows of each positive label array
  acc_ls, _ = fasta2acc_seq_ls(fa_file)
  acc_dc = read_fasta(fa_file)
  pos_acc, _ = fasta2acc_seq_ls('POS_TRAIN.fasta')
  anno = pd.read_csv('POS_TRAIN.tsv', sep='\t')
  pos_in_ls = set(pos_acc).intersection(acc_ls)
  relevant_anno = anno[(anno['Accession'].isin(pos_in_ls)) & (anno['ChEBI-ID']!='CHEBI:60504')]
  # Split the annotation table once so each accession does not rescan the whole frame.
  anno_by_acc = {acc: rows for acc, rows in relevant_anno.groupby('Accession')}

  def retrive_label(acc):
    # Negatives get the small placeholder; no full-size matrix is allocated for them.
    if acc not in pos_in_ls:
      return np.zeros(1,dtype=np.int8)

    target = np.zeros((n_classes, len(acc_dc[acc])),dtype=np.int8)
    rows = anno_by_acc.get(acc)
    if rows is None:
      # positive accession without any relevant annotation row: all zeros
      return target

    for m, class_indices in coalesced_encode.items():
      pos = list(rows[rows['ChEBI-ID']==m]['Position'])
      ind = [j-1 for j in pos] # position in anno starts at 1, but label ndarray starts at 0
      if not ind:
        continue
      if any(j < 0 for j in ind):
        # a position below 1 would otherwise wrap around to the end of the sequence
        raise IndexError(f'annotation position < 1 for {acc} ({m})')
      for x_ind in class_indices:
        target[x_ind, ind] = 1
    return target

  label_ls = [retrive_label(i) for i in acc_ls]
  if npz_file is not None:
    np.savez(npz_file, **dict(zip(acc_ls, label_ls)))

  return label_ls

# Scripts

In [None]:
# Base names (without extension) of the 16 negative training chunks, train_neg0 .. train_neg15.
fa_files = [f'train_neg{i}' for i in range(16)]

In [None]:
# Add the positive training split. NOTE: append mutates the list, so re-running this cell adds a duplicate.
fa_files.append('train_pos')

In [None]:
# Add the full test set. NOTE: append mutates the list, so re-running this cell adds a duplicate.
fa_files.append('test_full')

In [None]:
# Display the list of files queued for labelling.
fa_files

['train_neg0',
 'train_neg1',
 'train_neg2',
 'train_neg3',
 'train_neg4',
 'train_neg5',
 'train_neg6',
 'train_neg7',
 'train_neg8',
 'train_neg9',
 'train_neg10',
 'train_neg11',
 'train_neg12',
 'train_neg13',
 'train_neg14',
 'train_neg15',
 'train_pos',
 'test_full']

In [None]:
# NOTE(review): this replaces the list built in the cells above, so the loop in the next cell
# labels only test.fasta — confirm whether the override is intentional before a full run.
fa_files = ['test']

In [None]:
# Label every queued FASTA file and store the result as <name>.npz in the working directory.
for f in fa_files:
  retrive_label_from_fasta(f'/content/data/{f}.fasta', npz_file = f'{f}.npz')

Read 3911 sequences.
Example:
A0A0H3KB22
MTYAVKEIFYTLQGEGANAGRPAVFCRFAGCNLWSGREEDRAQAVCRFCDTDFVGTDGENGGKFKDADALVATIAGLWPAGEAHRFVVCTGGEPMLQLDQPLVDALHAAGFGIAIETNGSLPVLESIDWICVSPKADAPLVVTKGNELKVVIPQDNQRLADYAKLDFEYFLVQPMDGPSRDLNTKLAIDWCKRHPQWRLSMQTHKYLNIP


In [None]:
# Inspect the archive written by the labelling loop above. It is read from the working
# directory because the copy to Drive only happens in the last cell of the notebook;
# loading the Drive path here would show a file from an earlier run (or fail on a fresh one).
labels = np.load('test.npz')

In [None]:
# First ten array names stored in the archive (the accessions used as keys by np.savez).
list(labels.keys())[:10]

['A0A0H3KB22',
 'A0A1C7D1B7',
 'A0NLY7',
 'A0Q5Y3',
 'A0QZY0',
 'A3DC27',
 'A4XF23',
 'A5TYT6',
 'A9CK16',
 'B0T0B1']

In [None]:
# Example of the 1-element placeholder that retrive_label stores for accessions not in POS_TRAIN.fasta.
labels['Q9VMG0']

array([0], dtype=int8)

In [None]:
# Example of a full label array; retrive_label allocates these as (18, sequence length).
labels['P08200'].shape

(18, 416)

In [None]:
# For each of the 18 class rows, print the 0-based residue indices that are labelled 1.
for class_row in labels['P08200']:
  print(np.nonzero(class_row == 1))

(array([], dtype=int64),)
(array([306]),)
(array([], dtype=int64),)
(array([], dtype=int64),)
(array([], dtype=int64),)
(array([306]),)
(array([], dtype=int64),)
(array([], dtype=int64),)
(array([], dtype=int64),)
(array([], dtype=int64),)
(array([], dtype=int64),)
(array([], dtype=int64),)
(array([], dtype=int64),)
(array([], dtype=int64),)
(array([], dtype=int64),)
(array([], dtype=int64),)
(array([], dtype=int64),)
(array([306]),)


In [None]:
# Load the annotation table used by retrive_label_from_fasta to cross-check the labels above.
anno_smaller = pd.read_csv('POS_TRAIN.tsv', sep='\t')

In [None]:
# Second annotation row for P08200; its 1-based Position should be one more than the
# 0-based index printed for the corresponding class rows above.
anno_smaller[anno_smaller['Accession'] == 'P08200'].iloc[1]

Accession         P08200
Evidence     ECO:0000269
ChEBI-ID     CHEBI:18420
Position             307
Name: 9333, dtype: object

In [None]:
# Copy every .npz file in the working directory to the label_write folder on Drive.
!cp *.npz /content/drive/MyDrive/FYP/label_write/