<a href="https://colab.research.google.com/github/zhuzihan728/metal-binding-site-prediction/blob/main/test_set_redundancy_reduction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import libs

In [None]:
!pip install biopython

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting biopython
  Downloading biopython-1.80-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m42.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: biopython
Successfully installed biopython-1.80


In [None]:
import pandas as pd
from Bio import SeqIO

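# Extract datasets

The cells below read their inputs from Google Drive, so Drive must already be mounted at `/content/drive` before they are run.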

In [None]:
%env MY_PATH=/content/drive/MyDrive/FYP

env: MY_PATH=/content/drive/MyDrive/FYP


In [None]:
import os

my_path = os.environ['MY_PATH']   
my_path

'/content/drive/MyDrive/FYP'

In [None]:
!tar -xvf /content/drive/MyDrive/FYP/uniprot_datasets

ChEBI-IDs_for_metal_binding.tsv
NEG_clustered_rep_seq.fasta
NEG_TRAIN.fasta
POS_TRAIN.fasta
POS_TRAIN_FULL.fasta
POS_TRAIN_FULL.tsv
POS_TRAIN.tsv
filtered_combined.fasta
trimed_combined.fasta


In [None]:
# Size of the full data set. `total_len` is read as a global by
# check_pos_neg_proportion(), so this cell must run before that helper is called.
# Records are counted lazily instead of being materialised into a list first.
total_len = sum(1 for _ in SeqIO.parse("trimed_combined.fasta", "fasta"))
print("Full data set size: ", total_len)

Full data set size:  177367


In [None]:
!cp /content/drive/MyDrive/FYP/TEST_POS_NEG.fasta TEST_POS_NEG.fasta

In [None]:
!cp /content/drive/MyDrive/FYP/TRAIN_POS_NEG.fasta TRAIN_POS_NEG.fasta

# Install MMSEQS

In [None]:
!tar -xvf /content/drive/MyDrive/FYP/miniconda -C /root

[1;30;43m流式输出内容被截断，只能显示最后 5000 行内容。[0m
miniconda/lib/python3.7/site-packages/conda_env/cli/__pycache__/main_config.cpython-37.pyc
miniconda/lib/python3.7/site-packages/conda_env/cli/main_config.py
miniconda/lib/python3.7/site-packages/conda_env/cli/main.py
miniconda/lib/python3.7/site-packages/conda_env/cli/main_vars.py
miniconda/lib/python3.7/site-packages/conda_env/exceptions.py
miniconda/lib/python3.7/site-packages/conda_env/installers/
miniconda/lib/python3.7/site-packages/conda_env/installers/__init__.py
miniconda/lib/python3.7/site-packages/conda_env/installers/conda.py
miniconda/lib/python3.7/site-packages/conda_env/installers/base.py
miniconda/lib/python3.7/site-packages/conda_env/installers/__pycache__/
miniconda/lib/python3.7/site-packages/conda_env/installers/__pycache__/pip.cpython-37.pyc
miniconda/lib/python3.7/site-packages/conda_env/installers/__pycache__/conda.cpython-37.pyc
miniconda/lib/python3.7/site-packages/conda_env/installers/__pycache__/__init__.cpython-37.pyc

In [None]:
%alias activate $HOME/miniconda/bin/activate

In [None]:
%alias mmseqs $HOME/miniconda/pkgs/mmseqs2-14.7e284-pl5321hf1761c0_0/bin/mmseqs

In [None]:
activate tutorial

In [None]:
mmseqs

MMseqs2 (Many against Many sequence searching) is an open-source software suite for very fast, 
parallelized protein sequence searches and clustering of huge protein sequence data sets.

Please cite: M. Steinegger and J. Soding. MMseqs2 enables sensitive protein sequence searching for the analysis of massive data sets. Nature Biotechnology, doi:10.1038/nbt.3988 (2017).

MMseqs2 Version: 14.7e284
© Martin Steinegger (martin.steinegger@snu.ac.kr)

usage: mmseqs <command> [<args>]

Easy workflows for plain text input/output
  easy-search       	Sensitive homology search
  easy-cluster      	Slower, sensitive clustering
  easy-linclust     	Fast linear time cluster, less sensitive clustering
  easy-taxonomy     	Taxonomic classification
  easy-rbh          	Find reciprocal best hit

Main workflows for database input/output
  search            	Sensitive homology search
  map               	Map nearly identical sequences
  rbh               	Reciprocal best hit search
  linclust          	F

# Helper functions

In [None]:
def check_metal_per(seqs, metal, anno, metal_count_df):
  """Return the fraction of a metal's annotations that belong to the given proteins.

  Args:
    seqs: accessions of the proteins of interest (any iterable accepted by
      ``Series.isin``).
    metal: ChEBI identifier to look up in the 'ChEBI-ID' columns.
    anno: annotation table with 'Accession' and 'ChEBI-ID' columns; each row
      is counted as one annotation.
    metal_count_df: table with 'ChEBI-ID' and 'count' columns holding the
      total number of annotations per metal.

  Returns:
    float: number of rows of ``anno`` for ``metal`` whose accession is in
    ``seqs``, divided by the metal's total in ``metal_count_df``.

  Raises:
    ValueError: if ``metal`` does not have exactly one row in ``metal_count_df``.
  """
  # Count matching rows directly. The previous value_counts().reset_index()
  # approach depended on the resulting column names ('index' / 'ChEBI-ID'),
  # which changed in pandas 2.0, and on the deprecated int(Series) conversion.
  in_subset = anno['Accession'].isin(seqs)
  is_metal = anno['ChEBI-ID'] == metal
  cnt = int((in_subset & is_metal).sum())

  totals = metal_count_df.loc[metal_count_df['ChEBI-ID'] == metal, 'count']
  if len(totals) != 1:
    raise ValueError(f'expected exactly one total count for {metal}, found {len(totals)}')
  per = cnt / int(totals.iloc[0])
  return per

In [None]:
def check_metal_num(seqs, metal, anno):
  """Count the annotations of one metal that belong to the given proteins.

  Args:
    seqs: accessions of the proteins of interest (any iterable accepted by
      ``Series.isin``).
    metal: ChEBI identifier to look up in the 'ChEBI-ID' column.
    anno: annotation table with 'Accession' and 'ChEBI-ID' columns; each row
      is counted as one annotation.

  Returns:
    int: number of rows of ``anno`` whose accession is in ``seqs`` and whose
    'ChEBI-ID' equals ``metal`` (0 when there are none).
  """
  # Count matching rows directly. The previous value_counts().reset_index()
  # approach depended on the resulting column names ('index' / 'ChEBI-ID'),
  # which changed in pandas 2.0, and on the deprecated int(Series) conversion.
  in_subset = anno['Accession'].isin(seqs)
  is_metal = anno['ChEBI-ID'] == metal
  return int((in_subset & is_metal).sum())

In [None]:
def check_metal_specific_residue_proportion(acc_ls, source = 'POS_TRAIN_FULL.tsv'):
  """Print, for every metal in the annotation table, how much of it falls in `acc_ls`.

  One line is printed per distinct 'ChEBI-ID' of ``source`` (most frequent
  first): the identifier, its name, the number of annotation rows whose
  accession is in ``acc_ls``, and that number divided by the metal's total
  number of rows.

  Args:
    acc_ls: accessions of the proteins of interest.
    source: path of a tab-separated annotation table with 'Accession' and
      'ChEBI-ID' columns.

  Reads 'ChEBI-IDs_for_metal_binding.tsv' (columns 'ChEBI-ID' and 'Name') from
  the working directory to resolve metal names. Returns None.
  """
  anno = pd.read_csv(source, sep='\t')
  # Totals over the whole table and counts restricted to the requested
  # accessions are each computed once instead of re-filtering per metal.
  total_per_metal = anno['ChEBI-ID'].value_counts()
  subset_per_metal = anno.loc[anno['Accession'].isin(acc_ls), 'ChEBI-ID'].value_counts()
  metal_id_name_df = pd.read_csv('ChEBI-IDs_for_metal_binding.tsv', sep='\t')
  for metal, total in total_per_metal.items():
    metal_name = metal_id_name_df[metal_id_name_df['ChEBI-ID']==metal]['Name'].iloc[0]
    # Metals absent from the subset do not appear in subset_per_metal at all.
    num = int(subset_per_metal.get(metal, 0))
    total_num = int(total)
    print(f'{metal:12}| {metal_name:29} | num: {int(num):6} | %: {num/total_num}')

In [None]:
def write_seq_ls2fasta(file_out, ls, source):
  """Copy the records of a FASTA file whose accession is listed in `ls`.

  The accession of a record is the second '|'-separated field of its id.
  Records are written in the order in which they appear in ``source``, and
  one progress (or error) line is printed per selected record.

  Args:
    file_out: path of the FASTA file to create (overwritten if it exists).
    ls: collection of accessions to keep.
    source: path of the FASTA file to read records from.
  """
  # Build the lookup set once: testing membership against a list or array for
  # every record makes the loop quadratic for large selections.
  wanted = set(ls)
  with open(file_out, 'w') as f_out:
    for seq_record in SeqIO.parse(source, "fasta"):
      seq_acc = seq_record.id.split('|')[1]
      if seq_acc in wanted:
        r = SeqIO.write(seq_record, f_out, 'fasta')

        if r!=1:
          print('Error while writing sequence: ' + seq_acc)
        else:
          # Name the real destination; this helper also writes test files.
          print(f'writing {seq_acc} to {file_out}.')

In [None]:
def fasta2acc_seq_ls(path):
  """Read a FASTA file into two parallel lists.

  Args:
    path: path of the FASTA file to parse.

  Returns:
    (acc, seq): ``acc`` holds the second '|'-separated field of every record
    id and ``seq`` the matching sequences as plain strings, both in file order.
  """
  pairs = [
    (record.id.split('|')[1], str(record.seq))
    for record in SeqIO.parse(path, "fasta")
  ]
  acc = [accession for accession, _ in pairs]
  seq = [sequence for _, sequence in pairs]
  return acc, seq

In [None]:
def check_pos_neg_proportion(ls):
  """Print and return the positive/negative composition of a set of accessions.

  An accession counts as positive when it occurs in "POS_TRAIN_FULL.fasta";
  every other entry of ``ls`` counts as negative. The share of the full data
  set is computed against the notebook-level global ``total_len``, which must
  be defined before this function is called.

  Args:
    ls: list of accessions to summarise.

  Returns:
    (total_num, pos_num, neg_num, pos_portion, neg_portion); both portions are
    0.0 when ``ls`` is empty.
  """
  total_num = len(ls)

  acc, _ = fasta2acc_seq_ls("POS_TRAIN_FULL.fasta")
  inter = set(acc).intersection(ls)
  pos_num = len(inter)
  neg_num = total_num - pos_num
  # An empty selection (e.g. after an overly strict filter) previously raised
  # ZeroDivisionError here; report zero proportions instead.
  pos_portion = pos_num/total_num if total_num else 0.0
  neg_portion = neg_num/total_num if total_num else 0.0
  print(f'total seq in the set: {total_num}')
  print(f'proportion over full dataset: {total_num/total_len}')
  print(f'pos: {pos_num} %: {pos_portion}')
  print(f'neg: {neg_num} %: {neg_portion}')
  return total_num, pos_num, neg_num, pos_portion, neg_portion

In [None]:
def identity_above_threshold(m8file, thres):
  """Split the query sequences of an alignment table by sequence identity.

  Args:
    m8file: path of a tab-separated, header-less alignment table with 12
      columns (query, target, sequence identity, ...), such as the files
      written by the `mmseqs easy-search` calls in this notebook.
    thres: identity threshold, on the same scale as the table's third column.
      The comparison is strict, so a hit exactly at ``thres`` is not "above".

  Returns:
    (seq_above_thres, seq_below_thres, proportion):
      seq_above_thres: unique queries with at least one hit above ``thres``.
      seq_below_thres: unique queries whose hits are all at or below ``thres``.
      proportion: len(seq_above_thres) divided by the number of unique queries
        in the table (queries without any hit are not in the table and are
        therefore not counted); 0.0 when the table has no rows.

  Also prints the number of unique queries and a consistency check of the split.
  """
  columns = ["query", "target", "sequence identity", "alignment length", "mismatch", "gap opening",
             "query domain start position", "query domain end position",
             "target domain start position", "target domain end position",
             "evalue", "bit score"]
  try:
    data = pd.read_csv(m8file, sep="\t", index_col=False, header=None)
  except pd.errors.EmptyDataError:
    # A search that finds no hit leaves an empty file, which read_csv rejects.
    data = pd.DataFrame(columns=columns)
  # Assigning (rather than passing names=) keeps the length check on the column count.
  data.columns = columns

  seq_above_thres = data[data["sequence identity"] > thres]["query"].unique()
  seq_below_thres = data[~data["query"].isin(seq_above_thres)]["query"].unique()
  all_seq = data["query"].unique()
  print(len(all_seq))
  # Guard against a table without rows.
  proportion = len(seq_above_thres) / len(all_seq) if len(all_seq) else 0.0
  print(len(all_seq) == len(seq_above_thres) + len(seq_below_thres))
  return seq_above_thres, seq_below_thres, proportion

In [None]:
def target_identity_above_threshold(m8file, thres):
  """Split the target sequences of an alignment table by sequence identity.

  Target-side counterpart of identity_above_threshold: the same table is read,
  but sequences are taken from the 'target' column.

  Args:
    m8file: path of a tab-separated, header-less alignment table with 12
      columns (query, target, sequence identity, ...), such as the files
      written by the `mmseqs easy-search` calls in this notebook.
    thres: identity threshold, on the same scale as the table's third column.
      The comparison is strict, so a hit exactly at ``thres`` is not "above".

  Returns:
    (seq_above_thres, seq_below_thres, proportion):
      seq_above_thres: unique targets hit at least once above ``thres``.
      seq_below_thres: unique targets whose hits are all at or below ``thres``.
      proportion: len(seq_above_thres) divided by the number of unique targets
        in the table (targets that were never hit are not in the table and
        are therefore not counted); 0.0 when the table has no rows.

  Also prints the number of unique targets and a consistency check of the split.
  """
  columns = ["query", "target", "sequence identity", "alignment length", "mismatch", "gap opening",
             "query domain start position", "query domain end position",
             "target domain start position", "target domain end position",
             "evalue", "bit score"]
  try:
    data = pd.read_csv(m8file, sep="\t", index_col=False, header=None)
  except pd.errors.EmptyDataError:
    # A search that finds no hit leaves an empty file, which read_csv rejects.
    data = pd.DataFrame(columns=columns)
  # Assigning (rather than passing names=) keeps the length check on the column count.
  data.columns = columns

  seq_above_thres = data[data["sequence identity"] > thres]["target"].unique()
  seq_below_thres = data[~data["target"].isin(seq_above_thres)]["target"].unique()
  all_seq = data["target"].unique()
  print(len(all_seq))
  # Guard against a table without rows.
  proportion = len(seq_above_thres) / len(all_seq) if len(all_seq) else 0.0
  print(len(all_seq) == len(seq_above_thres) + len(seq_below_thres))
  return seq_above_thres, seq_below_thres, proportion

In [None]:
def read_fasta(fasta_path, split_char="|", id_field=1):
    '''
        Reads in a fasta file containing one or more sequences.

        Split_char and id_field control identifier extraction from the header.
        E.g.: set split_char="|" and id_field=1 for SwissProt/UniProt Headers.
        In the extracted identifier "/" and "." are replaced by "_".
        Sequences are joined across lines, stripped of whitespace and gaps
        ("-"), upper-cased, and the residues U, Z and O are mapped to X.
        If an identifier occurs more than once, the last record wins.

        Prints the number of sequences read and, when there is at least one,
        the first identifier with its sequence.

        Returns a dictionary mapping identifier -> sequence (empty for a file
        without records).
        Raises ValueError if sequence data appears before the first header.
    '''

    seqs = dict()
    uniprot_id = None  # no header seen yet
    with open( fasta_path, 'r' ) as fasta_f:
        for line in fasta_f:
            # get uniprot ID from header and create new entry
            if line.startswith('>'):
                uniprot_id = line.replace('>', '').strip().split(split_char)[id_field]
                # replace tokens that are mis-interpreted when loading h5
                uniprot_id = uniprot_id.replace("/","_").replace(".","_")
                seqs[ uniprot_id ] = ''
            else:
                # repl. all white-space chars and join seqs spanning multiple lines, drop gaps and cast to upper-case
                seq= ''.join( line.split() ).upper().replace("-","")
                if not seq:
                    # blank (or gap-only) line: nothing to append
                    continue
                if uniprot_id is None:
                    # previously failed with an unbound-variable error
                    raise ValueError("sequence data found before the first header in " + str(fasta_path))
                # repl. all non-standard AAs and map them to unknown/X
                seq = seq.replace('U','X').replace('Z','X').replace('O','X')
                seqs[ uniprot_id ] += seq
    print("Read {} sequences.".format(len(seqs)))
    if seqs:
        # an empty file has no example to show (previously raised StopIteration)
        example_id=next(iter(seqs))
        print("Example:\n{}\n{}".format(example_id,seqs[example_id]))

    return seqs

In [None]:
def dataset_metal_binding_summary(acc_ls, source = 'POS_TRAIN_FULL.tsv'):
  """Print and return per-metal protein and annotation counts for a set of accessions.

  For each ChEBI identifier in the fixed ``metals`` table below, one line is
  printed with: the metal name, the number of distinct proteins of ``acc_ls``
  (restricted to accessions present in "POS_TRAIN_FULL.fasta") annotated with
  that metal, the number of annotation rows of ``source`` belonging to
  ``acc_ls``, and that number as a fraction of all rows for the metal (nan if
  the metal does not occur in ``source``). The number of accessions not found
  in "POS_TRAIN_FULL.fasta" is printed last.

  Args:
    acc_ls: accessions of the proteins to summarise.
    source: path of a tab-separated annotation table with 'Accession' and
      'ChEBI-ID' columns.

  Reads 'ChEBI-IDs_for_metal_binding.tsv' (columns 'ChEBI-ID' and 'Name') from
  the working directory to resolve metal names.

  Returns:
    (prot_counter, res_counter): two lists with one entry per metal, in the
    order of the ``metals`` table.
  """
  total_num = len(acc_ls)
  print(f'total seq in the set: {total_num}')

  all_pos_acc_ls, _ = fasta2acc_seq_ls("POS_TRAIN_FULL.fasta")
  metals = {'CHEBI:29105':0,'CHEBI:18420':1,'CHEBI:49883':2,'CHEBI:29108':3,'CHEBI:29035':4,'CHEBI:60240':5,'CHEBI:24875':6,'CHEBI:190135':7,'CHEBI:23378':8,'CHEBI:29103':9,'CHEBI:49786':10,'CHEBI:29101':11,'CHEBI:29034':12,'CHEBI:30408':13,'CHEBI:29036':14,'CHEBI:29033':15,'CHEBI:21137':16,'CHEBI:49552':17,'CHEBI:48775':18,'CHEBI:48828':19,'CHEBI:21143':20,'CHEBI:25213':21,'CHEBI:47739':22,'CHEBI:16793':23,'CHEBI:177874':24,'CHEBI:60400':25,'CHEBI:49415':26,'CHEBI:60504':27,'CHEBI:49713':28}
  anno = pd.read_csv(source, sep='\t')
  # Per-metal row counts: over the whole table, and restricted to acc_ls.
  # Computed once here instead of via int(Series) lookups inside the loop.
  total_res_per_metal = anno['ChEBI-ID'].value_counts()
  subset_res_per_metal = anno.loc[anno['Accession'].isin(acc_ls), 'ChEBI-ID'].value_counts()
  metal_id_name_df = pd.read_csv('ChEBI-IDs_for_metal_binding.tsv', sep='\t')
  # Size the counters from the table so the two cannot drift apart.
  prot_counter = [0]*len(metals)
  res_counter = [0]*len(metals)
  pos_acc = set(all_pos_acc_ls).intersection(acc_ls)
  for i, metal in enumerate(metals):
    metal_name = metal_id_name_df[metal_id_name_df['ChEBI-ID']==metal]['Name'].iloc[0]
    temp = anno[anno['ChEBI-ID'] == metal]
    prot_counter[i] += len(temp[temp['Accession'].isin(pos_acc)]['Accession'].unique())
    res_counter[i] += int(subset_res_per_metal.get(metal, 0))
    total_res_num = int(total_res_per_metal.get(metal, 0))
    # A metal absent from the annotation table has no meaningful fraction.
    res_fraction = res_counter[i]/total_res_num if total_res_num else float('nan')
    print(f"{metal:13}|{metal_name:30}|#p: {prot_counter[i]:10}|#residue: {res_counter[i]:6}|%residue/all: {res_fraction:{5}.{3}}")
  print(f"#non-binding protein: {total_num-len(pos_acc)}")
  return prot_counter, res_counter


# Train/test set analysis

In [None]:
train_acc, _ = fasta2acc_seq_ls("TRAIN_POS_NEG.fasta")
test_acc, _ = fasta2acc_seq_ls("TEST_POS_NEG.fasta")
total_acc, _ = fasta2acc_seq_ls("trimed_combined.fasta")

In [None]:
print(f'train set size: {len(train_acc)}')
print(f'test set size: {len(test_acc)}')
print(len(train_acc) + len(test_acc) == len(total_acc))

train set size: 148681
test set size: 28686
True


## Train Set Metal-binding Residues

In [None]:
dataset_metal_binding_summary(train_acc)
pass

total seq in the set: 148681
CHEBI:29105  |Zn(2+)                        |#p:      19474|#residue:  87526|%residue/all: 0.654
CHEBI:18420  |Mg(2+)                        |#p:      23707|#residue:  64704|%residue/all: 0.724
CHEBI:49883  |[4Fe-4S] cluster              |#p:       8105|#residue:  39824|%residue/all: 0.772
CHEBI:29108  |Ca(2+)                        |#p:       3642|#residue:  29597|%residue/all: 0.702
CHEBI:29035  |Mn(2+)                        |#p:       4195|#residue:  17456|%residue/all: 0.778
CHEBI:60240  |a divalent metal cation       |#p:       3289|#residue:  14153|%residue/all: 0.778
CHEBI:24875  |Fe cation                     |#p:       3869|#residue:  13485|%residue/all: 0.773
CHEBI:190135 |[2Fe-2S] cluster              |#p:       1977|#residue:   6866|%residue/all:  0.76
CHEBI:23378  |Cu cation                     |#p:       1051|#residue:   5590|%residue/all: 0.756
CHEBI:29103  |K(+)                          |#p:       1944|#residue:   4964|%residue/all: 0.792
C

## Train set summary

In [None]:
check_pos_neg_proportion(train_acc)

total seq in the set: 148681
proportion over full dataset: 0.8382675469506728
pos: 67609 %: 0.45472521707548375
neg: 81072 %: 0.5452747829245163


(148681, 67609, 81072, 0.45472521707548375, 0.5452747829245163)

## Test Set Metal-binding Residues

In [None]:
dataset_metal_binding_summary(test_acc)
pass

total seq in the set: 28686
CHEBI:29105  |Zn(2+)                        |#p:       5282|#residue:  22031|%residue/all: 0.165
CHEBI:18420  |Mg(2+)                        |#p:       7244|#residue:  16194|%residue/all: 0.181
CHEBI:49883  |[4Fe-4S] cluster              |#p:       2688|#residue:  10033|%residue/all: 0.194
CHEBI:29108  |Ca(2+)                        |#p:        972|#residue:   7413|%residue/all: 0.176
CHEBI:29035  |Mn(2+)                        |#p:        794|#residue:   4427|%residue/all: 0.197
CHEBI:60240  |a divalent metal cation       |#p:       1112|#residue:   3551|%residue/all: 0.195
CHEBI:24875  |Fe cation                     |#p:        891|#residue:   3384|%residue/all: 0.194
CHEBI:190135 |[2Fe-2S] cluster              |#p:        620|#residue:   1729|%residue/all: 0.191
CHEBI:23378  |Cu cation                     |#p:        348|#residue:   1411|%residue/all: 0.191
CHEBI:29103  |K(+)                          |#p:        311|#residue:   1254|%residue/all:   0.2
CH

## Test set summary

In [None]:
check_pos_neg_proportion(test_acc)

total seq in the set: 28686
proportion over full dataset: 0.1617324530493271
pos: 19536 %: 0.6810290734156035
neg: 9150 %: 0.3189709265843966


(28686, 19536, 9150, 0.6810290734156035, 0.3189709265843966)

# Identity Analysis: filter based on identity

test set: keep only proteins with <40% sequence identity to the train set \
train set: filter out highly homologous proteins (keep only the representatives of clusters built at a minimum sequence identity of 80%)

## Filter train set (not applied to the final data, as suggested by David)

In [None]:
mmseqs easy-cluster $MY_PATH/MY_TRAIN_POS_NEG.fasta assembly_clustered tmp --cluster-mode 2 --min-seq-id 0.8

easy-cluster /content/drive/MyDrive/FYP/data_split_cdhit/MY_TRAIN_POS_NEG.fasta assembly_clustered tmp --cluster-mode 2 --min-seq-id 0.8 

MMseqs Version:                     	14.7e284
Substitution matrix                 	aa:blosum62.out,nucl:nucleotide.out
Seed substitution matrix            	aa:VTML80.out,nucl:nucleotide.out
Sensitivity                         	4
k-mer length                        	0
k-score                             	seq:2147483647,prof:2147483647
Alphabet size                       	aa:21,nucl:5
Max sequence length                 	65535
Max results per query               	20
Split database                      	0
Split mode                          	2
Split memory limit                  	0
Coverage threshold                  	0.8
Coverage mode                       	0
Compositional bias                  	1
Compositional bias                  	1
Diagonal scoring                    	true
Exact k-mer matching                	0
Mask residues                       

In [None]:
!cp assembly_clustered_rep_seq.fasta $MY_PATH/assembly_clustered_rep_seq.fasta

In [None]:
# The MMseqs2 *_cluster.tsv file has no header row (representative <TAB> member).
# Without header=None the first row is consumed as column names and the count
# of clustered sequences is one too low.
temp1 = pd.read_csv("assembly_clustered_cluster.tsv", sep='\t', header=None,
                    names=['representative', 'member'])
l1 = len(temp1)

temp2 = SeqIO.parse("assembly_clustered_rep_seq.fasta", "fasta")
l2 = len(list(temp2))
print(f'initial train set size {l1}')
print(f'filtered train set size {l2}')
print(f'percentage of filtered/initial {l2/l1}')


initial train set size 256227
filtered train set size 128442
percentage of filtered/initial 0.5012820662927795


In [None]:
new_train_acc, _ = fasta2acc_seq_ls("assembly_clustered_rep_seq.fasta")

In [None]:
check_metal_specific_residue_proportion(new_train_acc)

CHEBI:29105 | Zn(2+)                        | num:  55340 | %: 0.4135807543701002
CHEBI:18420 | Mg(2+)                        | num:  33109 | %: 0.37028876909656205
CHEBI:49883 | [4Fe-4S] cluster              | num:  17798 | %: 0.34487569515763367
CHEBI:29108 | Ca(2+)                        | num:  17791 | %: 0.4217975769932905
CHEBI:29035 | Mn(2+)                        | num:   8445 | %: 0.3763704429984847
CHEBI:60240 | a divalent metal cation       | num:   6721 | %: 0.369448109058927
CHEBI:24875 | Fe cation                     | num:   7826 | %: 0.44853278312700595
CHEBI:190135| [2Fe-2S] cluster              | num:   3312 | %: 0.3665338645418327
CHEBI:23378 | Cu cation                     | num:   3313 | %: 0.4480054090601758
CHEBI:29103 | K(+)                          | num:   2204 | %: 0.3517395467602937
CHEBI:49786 | Ni(2+)                        | num:   1252 | %: 0.3687776141384389
CHEBI:29101 | Na(+)                         | num:    587 | %: 0.26275738585496866
CHEBI:29034 |

In [None]:
check_pos_neg_proportion(new_train_acc)

total seq in the set: 128442
proportion over full dataset: 0.44749100258860663
pos: 35796 %: 0.27869388517774557
neg: 92646 %: 0.7213061148222544


(128442, 35796, 92646, 0.27869388517774557, 0.7213061148222544)

## filter test set

In [None]:
mmseqs easy-search

usage: mmseqs easy-search <i:queryFastaFile1[.gz|.bz2]> ... <i:queryFastaFileN[.gz|.bz2]>|<i:stdin> <i:targetFastaFile[.gz]>|<i:targetDB> <o:alignmentFile> <tmpDir> [options]
options:                              
 -s FLOAT                      Sensitivity: 1.0 faster; 4.0 fast; 7.5 sensitive [5.700]
 --max-seqs INT                Maximum results per query sequence allowed to pass the prefilter (affects sensitivity) [300]
                             
 --alignment-mode INT          How to compute the alignment:
                               0: automatic
                               1: only score and end_pos
                               2: also start_pos and cov
                               3: also seq.id
                               4: only ungapped alignment [3]
 --alignment-output-mode INT   How to compute the alignment:
                               0: automatic
                               1: only score and end_pos
                               2: also start_pos and co

In [None]:
mmseqs easy-search TEST_POS_NEG.fasta TRAIN_POS_NEG.fasta alnResult.m8 tmp

Create directory tmp
easy-search TEST_POS_NEG.fasta TRAIN_POS_NEG.fasta alnResult.m8 tmp 

MMseqs Version:                        	14.7e284
Substitution matrix                    	aa:blosum62.out,nucl:nucleotide.out
Add backtrace                          	false
Alignment mode                         	3
Alignment mode                         	0
Allow wrapped scoring                  	false
E-value threshold                      	0.001
Seq. id. threshold                     	0
Min alignment length                   	0
Seq. id. mode                          	0
Alternative alignments                 	0
Coverage threshold                     	0
Coverage mode                          	0
Max sequence length                    	65535
Compositional bias                     	1
Compositional bias                     	1
Max reject                             	2147483647
Max accept                             	2147483647
Include identical seq. id.             	false
Preload mode                    

In [None]:
f = "alnResult.m8"


seq_above_thres, seq_below_thres, proportion = identity_above_threshold(f, 0.4)
print(f)
print(f'number of proteins in test set > 40% identity of the trainval set: {len(seq_above_thres)}')
print(f'proportion of proteins whose id > 40%: {proportion}')

24474
True
alnResult.m8
number of proteins in test set > 40% identity of the trainval set: 13793
proportion of proteins whose id > 40%: 0.5635776742665686


In [None]:
filtered_test = list(set(test_acc)-set(seq_above_thres))

In [None]:
dataset_metal_binding_summary(filtered_test) 
pass

total seq in the set: 14893
CHEBI:29105  |Zn(2+)                        |#p:       3018|#residue:  12591|%residue/all: 0.0941
CHEBI:18420  |Mg(2+)                        |#p:       3219|#residue:   8140|%residue/all: 0.091
CHEBI:49883  |[4Fe-4S] cluster              |#p:       1237|#residue:   4524|%residue/all: 0.0877
CHEBI:29108  |Ca(2+)                        |#p:        564|#residue:   3925|%residue/all: 0.0931
CHEBI:29035  |Mn(2+)                        |#p:        179|#residue:    762|%residue/all: 0.034
CHEBI:60240  |a divalent metal cation       |#p:        595|#residue:   1940|%residue/all: 0.107
CHEBI:24875  |Fe cation                     |#p:        440|#residue:   1903|%residue/all: 0.109
CHEBI:190135 |[2Fe-2S] cluster              |#p:        510|#residue:   1315|%residue/all: 0.146
CHEBI:23378  |Cu cation                     |#p:         27|#residue:    112|%residue/all: 0.0151
CHEBI:29103  |K(+)                          |#p:        162|#residue:    649|%residue/all: 0.10

In [None]:
check_pos_neg_proportion(filtered_test)

total seq in the set: 14893
proportion over full dataset: 0.08396714157650521
pos: 9597 %: 0.6443966964345665
neg: 5296 %: 0.3556033035654334


(14893, 9597, 5296, 0.6443966964345665, 0.3556033035654334)

In [None]:
write_seq_ls2fasta("test40.fasta", filtered_test, 'TEST_POS_NEG.fasta')

[1;30;43m流式输出内容被截断，只能显示最后 5000 行内容。[0m
writing P03624 to train fasta file.
writing P39155 to train fasta file.
writing O59718 to train fasta file.
writing Q72DA0 to train fasta file.
writing Q937N9 to train fasta file.
writing P48267 to train fasta file.
writing Q55FP1 to train fasta file.
writing A9HKU2 to train fasta file.
writing P55866 to train fasta file.
writing O49515 to train fasta file.
writing P07670 to train fasta file.
writing P38031 to train fasta file.
writing P38127 to train fasta file.
writing P39075 to train fasta file.
writing P60490 to train fasta file.
writing Q3A554 to train fasta file.
writing Q6YXS0 to train fasta file.
writing Q1MH45 to train fasta file.
writing Q40478 to train fasta file.
writing Q4WZB3 to train fasta file.
writing Q5R3F8 to train fasta file.
writing Q7CQD4 to train fasta file.
writing Q9LJ68 to train fasta file.
writing Q8VZG1 to train fasta file.
writing F4HYF3 to train fasta file.
writing O79547 to train fasta file.
writing P0CT91 to train

## Add redundant data in test to train and re-filter

In [None]:
write_seq_ls2fasta("test_above40.fasta", seq_above_thres, 'TEST_POS_NEG.fasta')

[1;30;43m流式输出内容被截断，只能显示最后 5000 行内容。[0m
writing Q8DEZ8 to train fasta file.
writing Q8DFE6 to train fasta file.
writing Q8DG98 to train fasta file.
writing Q8DGG4 to train fasta file.
writing Q8DGH0 to train fasta file.
writing Q8DHK2 to train fasta file.
writing Q8DHN8 to train fasta file.
writing Q8DHR2 to train fasta file.
writing Q8DI46 to train fasta file.
writing Q8DIA7 to train fasta file.
writing Q8DIB4 to train fasta file.
writing Q8DJ26 to train fasta file.
writing Q8DJB1 to train fasta file.
writing Q8DJB8 to train fasta file.
writing Q8DK30 to train fasta file.
writing Q8DKE4 to train fasta file.
writing Q8DL09 to train fasta file.
writing Q8DNT8 to train fasta file.
writing Q8DPH5 to train fasta file.
writing Q8DPV8 to train fasta file.
writing Q8DQ18 to train fasta file.
writing Q8DQ85 to train fasta file.
writing Q8DQG7 to train fasta file.
writing Q8DQT8 to train fasta file.
writing Q8DSF0 to train fasta file.
writing Q8DSF3 to train fasta file.
writing Q8DSX2 to train

In [None]:
mmseqs easy-search test40.fasta test_above40.fasta alnResult1.m8 tmp

easy-search test40.fasta test_above40.fasta alnResult1.m8 tmp 

MMseqs Version:                        	14.7e284
Substitution matrix                    	aa:blosum62.out,nucl:nucleotide.out
Add backtrace                          	false
Alignment mode                         	3
Alignment mode                         	0
Allow wrapped scoring                  	false
E-value threshold                      	0.001
Seq. id. threshold                     	0
Min alignment length                   	0
Seq. id. mode                          	0
Alternative alignments                 	0
Coverage threshold                     	0
Coverage mode                          	0
Max sequence length                    	65535
Compositional bias                     	1
Compositional bias                     	1
Max reject                             	2147483647
Max accept                             	2147483647
Include identical seq. id.             	false
Preload mode                           	0
Pseudo count a   

In [None]:
f = "alnResult1.m8"

# alnResult1.m8 comes from searching test40.fasta (queries: the kept test set)
# against test_above40.fasta (targets: the test proteins removed earlier), so
# the sequences reported here are removed test proteins, compared with the
# filtered test set rather than with the train set.
seq_above_thres1, seq_below_thres1, proportion1 = target_identity_above_threshold(f, 0.4)
print(f)
print(f'number of removed test proteins > 40% identity to the filtered test set: {len(seq_above_thres1)}')
print(f'proportion of aligned removed test proteins whose id > 40%: {proportion1}')

11284
True
alnResult1.m8
number of proteins in test set > 40% identity of the trainval set: 7955
proportion of proteins whose id > 40%: 0.7049805033676001


In [None]:
seq_below_thres1 = list(set(seq_above_thres)-set(seq_above_thres1))

In [None]:
len(seq_below_thres1)

5838

In [None]:
full_pos_acc, _ = fasta2acc_seq_ls('POS_TRAIN_FULL.fasta')

In [None]:
pos_seq_below_thres = list(set(seq_below_thres1).intersection(full_pos_acc))

In [None]:
len(pos_seq_below_thres)

4021

In [None]:

print(len(train_acc))
# Only append accessions that are not in the list yet, so that re-running this
# cell does not add the same proteins to the train set a second time.
already_in_train = set(train_acc)
train_acc.extend(acc for acc in pos_seq_below_thres if acc not in already_in_train)
print(len(train_acc))

148681
152702


In [None]:
write_seq_ls2fasta("train40.fasta", train_acc, 'trimed_combined.fasta')

[1;30;43m流式输出内容被截断，只能显示最后 5000 行内容。[0m
writing A8MHJ4 to train fasta file.
writing A9FGS9 to train fasta file.
writing B1LM67 to train fasta file.
writing B1YJL9 to train fasta file.
writing B2FP04 to train fasta file.
writing B2UNS0 to train fasta file.
writing B6J0W1 to train fasta file.
writing B7K8N3 to train fasta file.
writing B8F6S2 to train fasta file.
writing C5BHX0 to train fasta file.
writing E8MF10 to train fasta file.
writing E9EM69 to train fasta file.
writing F2Z5Z6 to train fasta file.
writing F4JGP4 to train fasta file.
writing F7XKY8 to train fasta file.
writing O07776 to train fasta file.
writing O14576 to train fasta file.
writing O46098 to train fasta file.
writing O46685 to train fasta file.
writing O65333 to train fasta file.
writing O65355 to train fasta file.
writing O67520 to train fasta file.
writing O74020 to train fasta file.
writing O74372 to train fasta file.
writing O95371 to train fasta file.
writing O95379 to train fasta file.
writing P08884 to train

In [None]:
!cp train40.fasta $MY_PATH/train40.fasta

## Check non-redundancy of the filtered test set

In [None]:
mmseqs easy-search test40.fasta train40.fasta alnResult2.m8 tmp

easy-search test40.fasta train40.fasta alnResult2.m8 tmp 

MMseqs Version:                        	14.7e284
Substitution matrix                    	aa:blosum62.out,nucl:nucleotide.out
Add backtrace                          	false
Alignment mode                         	3
Alignment mode                         	0
Allow wrapped scoring                  	false
E-value threshold                      	0.001
Seq. id. threshold                     	0
Min alignment length                   	0
Seq. id. mode                          	0
Alternative alignments                 	0
Coverage threshold                     	0
Coverage mode                          	0
Max sequence length                    	65535
Compositional bias                     	1
Compositional bias                     	1
Max reject                             	2147483647
Max accept                             	2147483647
Include identical seq. id.             	false
Preload mode                           	0
Pseudo count a        

In [None]:
f = "alnResult2.m8"


seq_above_thres, seq_below_thres, proportion = identity_above_threshold(f, 0.4)
print(f)
print(f'number of proteins in test set > 40% identity of the trainval set: {len(seq_above_thres)}')
print(f'proportion of proteins whose id > 40%: {proportion}')

10689
True
alnResult2.m8
number of proteins in test set > 40% identity of the trainval set: 28
proportion of proteins whose id > 40%: 0.0026195153896529143


In [None]:
new_filtered_test = list(set(filtered_test)-set(seq_above_thres))

In [None]:
len(new_filtered_test)

14865

In [None]:
dataset_metal_binding_summary(new_filtered_test)
pass

total seq in the set: 14865
CHEBI:29105  |Zn(2+)                        |#p:       3018|#residue:  12591|%residue/all: 0.0941
CHEBI:18420  |Mg(2+)                        |#p:       3217|#residue:   8136|%residue/all: 0.091
CHEBI:49883  |[4Fe-4S] cluster              |#p:       1230|#residue:   4503|%residue/all: 0.0873
CHEBI:29108  |Ca(2+)                        |#p:        564|#residue:   3925|%residue/all: 0.0931
CHEBI:29035  |Mn(2+)                        |#p:        179|#residue:    762|%residue/all: 0.034
CHEBI:60240  |a divalent metal cation       |#p:        576|#residue:   1877|%residue/all: 0.103
CHEBI:24875  |Fe cation                     |#p:        440|#residue:   1903|%residue/all: 0.109
CHEBI:190135 |[2Fe-2S] cluster              |#p:        510|#residue:   1315|%residue/all: 0.146
CHEBI:23378  |Cu cation                     |#p:         27|#residue:    112|%residue/all: 0.0151
CHEBI:29103  |K(+)                          |#p:        162|#residue:    649|%residue/all: 0.10

In [None]:
write_seq_ls2fasta("new_test40.fasta", new_filtered_test, 'test40.fasta')

[1;30;43m流式输出内容被截断，只能显示最后 5000 行内容。[0m
writing P03624 to train fasta file.
writing P39155 to train fasta file.
writing O59718 to train fasta file.
writing Q72DA0 to train fasta file.
writing Q937N9 to train fasta file.
writing P48267 to train fasta file.
writing Q55FP1 to train fasta file.
writing A9HKU2 to train fasta file.
writing P55866 to train fasta file.
writing O49515 to train fasta file.
writing P07670 to train fasta file.
writing P38031 to train fasta file.
writing P38127 to train fasta file.
writing P39075 to train fasta file.
writing P60490 to train fasta file.
writing Q3A554 to train fasta file.
writing Q6YXS0 to train fasta file.
writing Q1MH45 to train fasta file.
writing Q40478 to train fasta file.
writing Q4WZB3 to train fasta file.
writing Q5R3F8 to train fasta file.
writing Q7CQD4 to train fasta file.
writing Q9LJ68 to train fasta file.
writing Q8VZG1 to train fasta file.
writing F4HYF3 to train fasta file.
writing O79547 to train fasta file.
writing P0CT91 to train

In [None]:
!cp new_test40.fasta $MY_PATH/new_test40.fasta

In [None]:
mmseqs easy-search new_test40.fasta train40.fasta alnResult2.m8 tmp

[33malnResult2.m8 exists and will be overwritten
[39measy-search new_test40.fasta train40.fasta alnResult2.m8 tmp 

MMseqs Version:                        	14.7e284
Substitution matrix                    	aa:blosum62.out,nucl:nucleotide.out
Add backtrace                          	false
Alignment mode                         	3
Alignment mode                         	0
Allow wrapped scoring                  	false
E-value threshold                      	0.001
Seq. id. threshold                     	0
Min alignment length                   	0
Seq. id. mode                          	0
Alternative alignments                 	0
Coverage threshold                     	0
Coverage mode                          	0
Max sequence length                    	65535
Compositional bias                     	1
Compositional bias                     	1
Max reject                             	2147483647
Max accept                             	2147483647
Include identical seq. id.             	false
Prelo

In [None]:
# Alignment table written by the preceding `mmseqs easy-search` cell.
f = "alnResult2.m8"

# 0.4 is the cutoff passed to identity_above_threshold (defined earlier in the
# notebook); the messages below report it as 40% identity.
seq_above_thres, seq_below_thres, proportion = identity_above_threshold(f, 0.4)

# One print call, one item per line: file name, count above the cutoff, proportion.
print(
    f,
    f'number of proteins in test set > 40% identity of the trainval set: {len(seq_above_thres)}',
    f'proportion of proteins whose id > 40%: {proportion}',
    sep="\n",
)

10661
True
alnResult2.m8
number of proteins in test set > 40% identity of the trainval set: 0
proportion of proteins whose id > 40%: 0.0


independent test set: /content/drive/MyDrive/FYP/new_test40.fasta \
train set: /content/drive/MyDrive/FYP/train40.fasta

# CD-HIT: another tool for non-redundant (nr) dataset generation (not used)

In [None]:
!wget https://ftp.ncbi.nlm.nih.gov/blast/executables/LATEST/ncbi-blast-2.13.0+-x64-linux.tar.gz

--2023-01-10 03:37:33--  https://ftp.ncbi.nlm.nih.gov/blast/executables/LATEST/ncbi-blast-2.13.0+-x64-linux.tar.gz
Resolving ftp.ncbi.nlm.nih.gov (ftp.ncbi.nlm.nih.gov)... 165.112.9.228, 165.112.9.229, 2607:f220:41e:250::13, ...
Connecting to ftp.ncbi.nlm.nih.gov (ftp.ncbi.nlm.nih.gov)|165.112.9.228|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 233822517 (223M) [application/x-gzip]
Saving to: ‘ncbi-blast-2.13.0+-x64-linux.tar.gz’


2023-01-10 03:37:50 (14.6 MB/s) - ‘ncbi-blast-2.13.0+-x64-linux.tar.gz’ saved [233822517/233822517]



In [None]:
# Unpack the BLAST+ archive into the current directory.
# Flags: z = gzip, x = extract, v = list each file, p = keep permissions, f = archive path.
!tar zxvpf /content/ncbi-blast-2.13.0+-x64-linux.tar.gz

./ncbi-blast-2.13.0+/
./ncbi-blast-2.13.0+/ncbi_package_info
./ncbi-blast-2.13.0+/doc/
./ncbi-blast-2.13.0+/doc/README.txt
./ncbi-blast-2.13.0+/doc/BLAST_PRIVACY
./ncbi-blast-2.13.0+/ChangeLog
./ncbi-blast-2.13.0+/README
./ncbi-blast-2.13.0+/LICENSE
./ncbi-blast-2.13.0+/bin/
./ncbi-blast-2.13.0+/bin/blastn
./ncbi-blast-2.13.0+/bin/dustmasker
./ncbi-blast-2.13.0+/bin/blast_vdb_cmd
./ncbi-blast-2.13.0+/bin/windowmasker
./ncbi-blast-2.13.0+/bin/blastn_vdb
./ncbi-blast-2.13.0+/bin/makeblastdb
./ncbi-blast-2.13.0+/bin/makeprofiledb
./ncbi-blast-2.13.0+/bin/blastp
./ncbi-blast-2.13.0+/bin/psiblast
./ncbi-blast-2.13.0+/bin/blastx
./ncbi-blast-2.13.0+/bin/deltablast
./ncbi-blast-2.13.0+/bin/cleanup-blastdb-volumes.py
./ncbi-blast-2.13.0+/bin/get_species_taxids.sh
./ncbi-blast-2.13.0+/bin/blastdbcmd
./ncbi-blast-2.13.0+/bin/legacy_blast.pl
./ncbi-blast-2.13.0+/bin/blastdbcheck
./ncbi-blast-2.13.0+/bin/tblastn
./ncbi-blast-2.13.0+/bin/tblastn_vdb
./ncbi-blast-2.13.0+/bin/blastdb_aliastool
./ncbi

In [None]:
# Put the BLAST+ binaries on PATH for the rest of the notebook.
# `!export ...` runs in a throwaway subshell, so the variable is lost as soon as
# the command returns. Updating os.environ changes the kernel's own environment,
# which every later `!` command inherits.
BLAST_BIN = "/content/ncbi-blast-2.13.0+/bin"
path_entries = [p for p in os.environ.get("PATH", "").split(os.pathsep) if p]
# Guard against appending the same directory again when the cell is re-run.
if BLAST_BIN not in path_entries:
    os.environ["PATH"] = os.pathsep.join(path_entries + [BLAST_BIN])

In [None]:
# Download the CD-HIT v4.8.1 release tarball from GitHub into the current directory.
!wget https://github.com/weizhongli/cdhit/releases/download/V4.8.1/cd-hit-v4.8.1-2019-0228.tar.gz

--2023-01-10 03:43:02--  https://github.com/weizhongli/cdhit/releases/download/V4.8.1/cd-hit-v4.8.1-2019-0228.tar.gz
Resolving github.com (github.com)... 20.27.177.113
Connecting to github.com (github.com)|20.27.177.113|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://objects.githubusercontent.com/github-production-release-asset-2e65be/35050301/216f6a00-3b6b-11e9-9fec-85005717b86a?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIAIWNJYAX4CSVEH53A%2F20230110%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20230110T034302Z&X-Amz-Expires=300&X-Amz-Signature=edab2201255eb8a78c57bb3b1a296af4a8631abbc6b9c2a2cacd65aa46ed6bc4&X-Amz-SignedHeaders=host&actor_id=0&key_id=0&repo_id=35050301&response-content-disposition=attachment%3B%20filename%3Dcd-hit-v4.8.1-2019-0228.tar.gz&response-content-type=application%2Foctet-stream [following]
--2023-01-10 03:43:02--  https://objects.githubusercontent.com/github-production-release-asset-2e65be/35050301/216f6a00-3b6b-11

In [None]:
# Unpack the CD-HIT tarball into the current directory; --gunzip filters it
# through gzip and v lists each extracted file.
!tar xvf /content/cd-hit-v4.8.1-2019-0228.tar.gz --gunzip

cd-hit-v4.8.1-2019-0228/
cd-hit-v4.8.1-2019-0228/cd-hit-2d-para.pl
cd-hit-v4.8.1-2019-0228/cd-hit-auxtools/
cd-hit-v4.8.1-2019-0228/cd-hit-auxtools/bioSequence.cxx
cd-hit-v4.8.1-2019-0228/cd-hit-auxtools/bioSequence.hxx
cd-hit-v4.8.1-2019-0228/cd-hit-auxtools/cd-hit-dup-PE-out.pl
cd-hit-v4.8.1-2019-0228/cd-hit-auxtools/cdhit-dup.cxx
cd-hit-v4.8.1-2019-0228/cd-hit-auxtools/cdhit-lap.cxx
cd-hit-v4.8.1-2019-0228/cd-hit-auxtools/Makefile
cd-hit-v4.8.1-2019-0228/cd-hit-auxtools/mintlib/
cd-hit-v4.8.1-2019-0228/cd-hit-auxtools/mintlib/minArray.hxx
cd-hit-v4.8.1-2019-0228/cd-hit-auxtools/mintlib/minBase.hxx
cd-hit-v4.8.1-2019-0228/cd-hit-auxtools/mintlib/minMap.cxx
cd-hit-v4.8.1-2019-0228/cd-hit-auxtools/mintlib/minMap.hxx
cd-hit-v4.8.1-2019-0228/cd-hit-auxtools/mintlib/minString.cxx
cd-hit-v4.8.1-2019-0228/cd-hit-auxtools/mintlib/minString.hxx
cd-hit-v4.8.1-2019-0228/cd-hit-auxtools/mintlib/minUtility.hxx
cd-hit-v4.8.1-2019-0228/cd-hit-auxtools/read-linker.cxx
cd-hit-v4.8.1-2019-0228/cd-hit-

In [None]:
# Switch the notebook's working directory to the unpacked CD-HIT source tree.
%cd /content/cd-hit-v4.8.1-2019-0228

/content/cd-hit-v4.8.1-2019-0228


In [None]:
# Run make in the current directory (the CD-HIT source tree entered by the
# preceding %cd cell).
!make

g++  -fopenmp -DWITH_ZLIB -O2  cdhit-common.c++ -c
g++  -fopenmp -DWITH_ZLIB -O2  cdhit-utility.c++ -c
g++  -fopenmp -DWITH_ZLIB -O2  cdhit.c++ -c
g++  -fopenmp -DWITH_ZLIB -O2  cdhit.o cdhit-common.o cdhit-utility.o -lz -o cd-hit
g++  -fopenmp -DWITH_ZLIB -O2  cdhit-est.c++ -c
g++  -fopenmp -DWITH_ZLIB -O2  cdhit-est.o cdhit-common.o cdhit-utility.o -lz -o cd-hit-est
g++  -fopenmp -DWITH_ZLIB -O2  cdhit-2d.c++ -c
g++  -fopenmp -DWITH_ZLIB -O2  cdhit-2d.o cdhit-common.o cdhit-utility.o -lz -o cd-hit-2d
g++  -fopenmp -DWITH_ZLIB -O2  cdhit-est-2d.c++ -c
g++  -fopenmp -DWITH_ZLIB -O2  cdhit-est-2d.o cdhit-common.o cdhit-utility.o -lz -o cd-hit-est-2d
g++  -fopenmp -DWITH_ZLIB -O2  cdhit-div.c++ -c
g++  -fopenmp -DWITH_ZLIB -O2  cdhit-div.o cdhit-common.o cdhit-utility.o -lz -o cd-hit-div
g++  -fopenmp -DWITH_ZLIB -O2  cdhit-454.c++ -c
g++  -fopenmp -DWITH_ZLIB -O2  cdhit-454.o cdhit-common.o cdhit-utility.o -lz -o cd-hit-454


In [None]:
%cd cd-hit-auxtools

/content/cd-hit-v4.8.1-2019-0228/cd-hit-auxtools


In [None]:
# Run make in the current directory (cd-hit-auxtools, entered by the preceding
# %cd cell).
!make

g++ -c -Wall -Wno-unused -I. -Imintlib -DUNIX -O2 -o mintlib/minString.o mintlib/minString.cxx
g++ -c -Wall -Wno-unused -I. -Imintlib -DUNIX -O2 -o mintlib/minMap.o mintlib/minMap.cxx
g++ -c -Wall -Wno-unused -I. -Imintlib -DUNIX -O2 -o bioSequence.o bioSequence.cxx
[01m[KbioSequence.cxx:[m[K In member function ‘[01m[Kint Bio::SequenceList::RemoveEmptySequences()[m[K’:
 [01;35m[K}[m[K
 [01;35m[K^[m[K
g++ -c -Wall -Wno-unused -I. -Imintlib -DUNIX -O2 -o cdhit-dup.o cdhit-dup.cxx
[01m[Kcdhit-dup.cxx:[m[K In function ‘[01m[Kvoid PrintChimeric(FILE*, Bio::Sequence*, Bio::Sequence*, Bio::Sequence*, int, int)[m[K’:
  A.Insert( "|X|", IX [01;35m[K)[m[K;
                      [01;35m[K^[m[K
  Q.Insert( "|X|", IX [01;35m[K)[m[K;
                      [01;35m[K^[m[K
  B.Insert( "|X|", IX [01;35m[K)[m[K;
                      [01;35m[K^[m[K
[01m[Kcdhit-dup.cxx:[m[K In function ‘[01m[Kvoid PrintChimeric2(FILE*, Bio::Sequence*, Bio::Sequence*, B

In [None]:
# Return the notebook's working directory to /content.
%cd /content

/content
