# Check sequence similarity between new datasets and CASP7-10

In [1]:
import pandas as pd
import numpy as np

## Scop-based dataset

In [2]:
scop_final_df = pd.read_csv('../../../../score/scop_cl_equal_globular100_identity95_coverage60/scop_cl_equal_globular100_identity95_coverage60_final.csv', index_col=0)
scop_final_df

Unnamed: 0,model,GDT_TS,GDT_HA,target,template,seq_len,identity,positive,coverage,identity(-misres),...,SF-PDBID,SF-PDBREG,SF-UNIID,SF-UNIREG,TP,CL,CF,SF,FA,Class
0,1AYZ_A_1JAS_A_1_1,0.8366,0.6340,1AYZ_A,1JAS_A_1,153.0,104.0,128.0,150.0,104.0,...,1AYZ,A:2-154,P06104,2-154,1,1000003,2000386,3000570,4001046,alpha + beta
1,1AYZ_A_1JAS_A_1_3,0.8301,0.6291,1AYZ_A,1JAS_A_1,153.0,104.0,128.0,150.0,104.0,...,1AYZ,A:2-154,P06104,2-154,1,1000003,2000386,3000570,4001046,alpha + beta
2,1AYZ_A_1JAS_A_1_4,0.8317,0.6307,1AYZ_A,1JAS_A_1,153.0,104.0,128.0,150.0,104.0,...,1AYZ,A:2-154,P06104,2-154,1,1000003,2000386,3000570,4001046,alpha + beta
3,1AYZ_A_1JAS_A_1_5,0.8301,0.6275,1AYZ_A,1JAS_A_1,153.0,104.0,128.0,150.0,104.0,...,1AYZ,A:2-154,P06104,2-154,1,1000003,2000386,3000570,4001046,alpha + beta
4,1AYZ_A_1Q34_A_1_1,0.9314,0.7680,1AYZ_A,1Q34_A_1,153.0,99.0,127.0,148.0,99.0,...,1AYZ,A:2-154,P06104,2-154,1,1000003,2000386,3000570,4001046,alpha + beta
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14609,6SPF_T_6WQN_F_2_1,0.7686,0.6197,6SPF_T,6WQN_F_2,94.0,31.0,46.0,84.0,29.0,...,6SPF,T:1-94,E2RXT1,1-94,1,1000003,2000195,3000210,4000943,alpha + beta
14610,6SPF_T_6WQN_F_2_5,0.8032,0.6436,6SPF_T,6WQN_F_2,94.0,31.0,46.0,84.0,29.0,...,6SPF,T:1-94,E2RXT1,1-94,1,1000003,2000195,3000210,4000943,alpha + beta
14611,6SPF_T_6ZJ3_Lk_3_2,0.6223,0.5266,6SPF_T,6ZJ3_Lk_3,94.0,23.0,34.0,77.0,23.0,...,6SPF,T:1-94,E2RXT1,1-94,1,1000003,2000195,3000210,4000943,alpha + beta
14612,6SPF_T_6ZJ3_Lk_3_3,0.6250,0.5239,6SPF_T,6ZJ3_Lk_3,94.0,23.0,34.0,77.0,23.0,...,6SPF,T:1-94,E2RXT1,1-94,1,1000003,2000195,3000210,4000943,alpha + beta


In [3]:
scop_target_info = scop_final_df[['target', 'seq_len']].groupby('target').head(1).astype({'seq_len': 'int'})
scop_target_info

Unnamed: 0,target,seq_len
0,1AYZ_A,153
150,1AZ5_A,99
300,1BQU_A,115
450,1DE4_A,178
600,1EA9_C,382
...,...,...
13864,6K9F_B,87
14014,6KNA_A,76
14164,6KWQ_A,462
14314,6O8W_g,154


In [5]:
# query, subject, % identity, alignment length, mismatches, gap opens, q. start, q. end, s. start, s. end, evalue, bit score
columns = ['query', 'subject', 'identity', 'alignment length', 'mismatches', 'gap opens', 'q.start', 'q.end', 's.start', 's.end', 'evalue', 'bit score']
scop_df = pd.read_csv('scop_overlap_strict.csv', header=None)
scop_df = scop_df.set_axis(columns, axis=1)
scop_df = pd.merge(scop_df, scop_target_info, left_on='query', right_on='target', how='inner')
scop_df = scop_df.sort_values(['query', 'identity']).drop('target', axis=1)
scop_df['identity_seqlen'] = (scop_df['alignment length'] - scop_df['mismatches'] - scop_df['gap opens']) / scop_df['seq_len']
scop_df

Unnamed: 0,query,subject,identity,alignment length,mismatches,gap opens,q.start,q.end,s.start,s.end,evalue,bit score,seq_len,identity_seqlen
1,1AYZ_A,T0677.fasta,18.310,71,56,1,65,135,48,116,3.10,18.9,153,0.091503
0,1AYZ_A,T0534.fasta,30.909,55,33,2,100,149,23,77,0.39,21.9,153,0.130719
4,1AYZ_A,T0521.fasta,36.364,11,7,0,68,78,36,46,3.30,18.9,153,0.026144
2,1AYZ_A,T0341.fasta,37.500,24,15,0,116,139,122,145,3.30,18.9,153,0.058824
3,1AYZ_A,T0341.fasta,38.462,26,12,1,36,57,105,130,8.00,17.7,153,0.084967
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
541,6SPF_T,T0361.fasta,23.171,82,54,2,3,84,14,86,6.70,16.9,94,0.276596
542,6SPF_T,T0640.fasta,31.818,44,21,2,26,60,132,175,9.60,16.5,94,0.223404
539,6SPF_T,T0526.fasta,35.000,20,13,0,75,94,169,188,5.20,17.3,94,0.074468
540,6SPF_T,T0526.fasta,35.000,20,13,0,74,93,59,78,8.70,16.9,94,0.074468


In [6]:
scop_df.query('identity_seqlen > 0.3')

Unnamed: 0,query,subject,identity,alignment length,mismatches,gap opens,q.start,q.end,s.start,s.end,evalue,bit score,seq_len,identity_seqlen
127,1RT8_A,T0521.fasta,24.742,97,56,3,6,99,30,112,6.7,17.3,102,0.372549
145,1UTI_A,T0318.fasta,29.73,37,12,1,35,57,367,403,0.28,20.0,57,0.421053
308,2VKE_A,T0744.fasta,25.175,143,71,6,13,136,114,239,1.0,20.8,162,0.407407
378,3RHH_C,T0715.fasta,24.46,417,258,13,42,435,6,388,1.31e-15,69.7,476,0.306723
407,4G12_A,T0454.fasta,40.476,42,25,0,2,43,1,42,1.02e-06,34.3,44,0.386364
474,5CZJ_B,T0376.fasta,89.286,308,33,0,2,309,3,310,0.0,556.0,319,0.862069
488,5DE0_C,T0694.fasta,21.488,121,58,5,2,106,164,263,10.0,17.3,135,0.42963
510,6CD6_A,T0456.fasta,31.707,287,162,7,3,287,9,263,1.33e-41,136.0,287,0.41115


In [7]:
scop_df.query('identity_seqlen > 0.3')[['query', 'subject', 'evalue', 'identity_seqlen', 'seq_len', 'alignment length', 'mismatches']]

Unnamed: 0,query,subject,evalue,identity_seqlen,seq_len,alignment length,mismatches
127,1RT8_A,T0521.fasta,6.7,0.372549,102,97,56
145,1UTI_A,T0318.fasta,0.28,0.421053,57,37,12
308,2VKE_A,T0744.fasta,1.0,0.407407,162,143,71
378,3RHH_C,T0715.fasta,1.31e-15,0.306723,476,417,258
407,4G12_A,T0454.fasta,1.02e-06,0.386364,44,42,25
474,5CZJ_B,T0376.fasta,0.0,0.862069,319,308,33
488,5DE0_C,T0694.fasta,10.0,0.42963,135,121,58
510,6CD6_A,T0456.fasta,1.33e-41,0.41115,287,287,162


## Pisces-based dataset

In [8]:
pisces_final_df = pd.read_csv('../../../../score/pisces_multidomain100_identity95_coverage60/pisces_multidomain100_identity95_coverage60_final.csv', index_col=0)
pisces_final_df

Unnamed: 0,model,GDT_TS,GDT_HA,target,template,seq_len,identity,positive,coverage,identity(-misres),...,Exptl.,resolution,R-factor,FreeRvalue,PDB_ID,Chain,Domain_num,1,2,3
0,1BF2_A_2E8Y_A_3_1,0.4207,0.3027,1BF2_A,2E8Y_A_3,750,147,237,729,147,...,XRAY,2.00,0.16,0.21,1BF2,A,3,107.0,323.0,344.0
1,1BF2_A_2E8Y_A_3_2,0.4130,0.2897,1BF2_A,2E8Y_A_3,750,147,237,729,147,...,XRAY,2.00,0.16,0.21,1BF2,A,3,107.0,323.0,344.0
2,1BF2_A_2E8Y_A_3_3,0.4113,0.2923,1BF2_A,2E8Y_A_3,750,147,237,729,147,...,XRAY,2.00,0.16,0.21,1BF2,A,3,107.0,323.0,344.0
3,1BF2_A_2E8Y_A_3_4,0.4167,0.2967,1BF2_A,2E8Y_A_3,750,147,237,729,147,...,XRAY,2.00,0.16,0.21,1BF2,A,3,107.0,323.0,344.0
4,1BF2_A_2E8Y_A_3_5,0.4077,0.2883,1BF2_A,2E8Y_A_3,750,147,237,729,147,...,XRAY,2.00,0.16,0.21,1BF2,A,3,107.0,323.0,344.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17037,6RI6_A_6KLG_A_3_4,0.6305,0.4674,6RI6_A,6KLG_A_3,498,138,211,474,138,...,XRAY,0.93,0.11,0.12,6RI6,A,3,96.0,222.0,241.0
17038,6RI6_A_6VOW_A_3_1,0.4985,0.3519,6RI6_A,6VOW_A_3,498,113,173,458,108,...,XRAY,0.93,0.11,0.12,6RI6,A,3,96.0,222.0,241.0
17039,6RI6_A_6VOW_A_3_2,0.5040,0.3589,6RI6_A,6VOW_A_3,498,113,173,458,108,...,XRAY,0.93,0.11,0.12,6RI6,A,3,96.0,222.0,241.0
17040,6RI6_A_6VOW_A_3_3,0.5120,0.3660,6RI6_A,6VOW_A_3,498,113,173,458,108,...,XRAY,0.93,0.11,0.12,6RI6,A,3,96.0,222.0,241.0


In [9]:
pisces_target_info = pisces_final_df[['target', 'seq_len']].groupby('target').head(1).astype({'seq_len': 'int'})
pisces_target_info['target'] = [tar[: 4].lower() + tar[4:] for tar in pisces_target_info['target']]
pisces_target_info

Unnamed: 0,target,seq_len
0,1bf2_A,750
105,1fhu_A,320
237,1gu7_A,364
334,1hm9_B,468
606,1k5n_A,276
...,...,...
16336,6d0a_A,431
16486,6fme_B,506
16632,6g7n_A,318
16782,6hoa_A,228


In [10]:
pis_df = pd.read_csv('pisces_overlap_strict.csv', header=None)
pis_df = pis_df.set_axis(columns, axis=1)
pis_df = pd.merge(pis_df, pisces_target_info, left_on='query', right_on='target', how='inner')
pis_df['identity_seqlen'] = (pis_df['alignment length'] - pis_df['mismatches'] - pis_df['gap opens']) / pis_df['seq_len']
pis_df

Unnamed: 0,query,subject,identity,alignment length,mismatches,gap opens,q.start,q.end,s.start,s.end,evalue,bit score,target,seq_len,identity_seqlen
0,1bf2_A,T0456.fasta,24.324,74,43,3,565,636,190,252,0.11,26.2,1bf2_A,750,0.037333
1,1bf2_A,T0379.fasta,22.619,84,54,4,99,172,79,161,1.20,22.7,1bf2_A,750,0.034667
2,1bf2_A,T0376.fasta,29.032,62,36,3,157,217,99,153,1.50,22.7,1bf2_A,750,0.030667
3,1bf2_A,T0751.fasta,37.143,35,20,1,106,138,26,60,2.70,21.2,1bf2_A,750,0.018667
4,1bf2_A,T0447.fasta,21.212,66,48,1,263,328,365,426,2.90,21.9,1bf2_A,750,0.022667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
692,6ri6_A,T0732.fasta,30.769,39,24,2,425,462,199,235,2.90,21.2,6ri6_A,498,0.026104
693,6ri6_A,T0447.fasta,50.000,18,9,0,130,147,522,539,3.10,21.2,6ri6_A,498,0.018072
694,6ri6_A,T0671.fasta,28.090,89,46,6,317,390,117,202,4.60,20.4,6ri6_A,498,0.074297
695,6ri6_A,T0715.fasta,39.286,28,16,1,396,423,134,160,4.70,20.4,6ri6_A,498,0.022088


In [11]:
pis_df.query('identity_seqlen > 0.3')

Unnamed: 0,query,subject,identity,alignment length,mismatches,gap opens,q.start,q.end,s.start,s.end,evalue,bit score,target,seq_len,identity_seqlen
58,1x8b_A,T0292.fasta,26.071,280,159,9,12,284,5,243,1.71e-19,77.4,1x8b_A,289,0.387543
59,1x8b_A,T0456.fasta,25.461,271,165,10,19,280,16,258,8.22e-16,67.4,1x8b_A,289,0.33218
358,4c3s_A,T0715.fasta,100.0,435,0,0,11,445,1,435,0.0,894.0,4c3s_A,445,0.977528


In [12]:
pis_df.query('identity_seqlen > 0.3')[['query', 'subject', 'evalue', 'identity_seqlen', 'seq_len', 'alignment length', 'mismatches']]

Unnamed: 0,query,subject,evalue,identity_seqlen,seq_len,alignment length,mismatches
58,1x8b_A,T0292.fasta,1.71e-19,0.387543,289,280,159
59,1x8b_A,T0456.fasta,8.22e-16,0.33218,289,271,165
358,4c3s_A,T0715.fasta,0.0,0.977528,445,435,0


In [13]:
scop_df.query('evalue < 1.0e-4')[['query', 'subject', 'evalue']]

Unnamed: 0,query,subject,evalue
27,1EXR_A,T0521.fasta,5.84e-07
301,2V90_C,T0359.fasta,5.01e-06
302,2V90_C,T0488.fasta,4.13e-05
378,3RHH_C,T0715.fasta,1.31e-15
407,4G12_A,T0454.fasta,1.02e-06
434,4NBU_B,T0640.fasta,1.1699999999999999e-20
450,4PNE_B,T0704.fasta,3.71e-05
474,5CZJ_B,T0376.fasta,0.0
511,6CD6_A,T0292.fasta,1.9e-16
510,6CD6_A,T0456.fasta,1.33e-41


In [14]:
pis_df.query('evalue < 1.0e-4')[['query', 'subject', 'evalue']]

Unnamed: 0,query,subject,evalue
58,1x8b_A,T0292.fasta,1.71e-19
59,1x8b_A,T0456.fasta,8.22e-16
114,2ibd_A,T0454.fasta,6.62e-09
122,2izr_A,T0456.fasta,4.33e-07
358,4c3s_A,T0715.fasta,0.0
