# Sort SCOP superfamilies by number of entries

In [1]:
import pandas as pd

## Read the SCOP classification text file

In [2]:
# Location of the SCOP classification dump that the rest of the notebook analyses.
scop_cla_path = '../../../scop/20210330/scop-cla-latest.txt'

# Column names for the 11 whitespace-separated fields of each record.
header = ['FA-DOMID', 'FA-PDBID', 'FA-PDBREG' ,'FA-UNIID', 'FA-UNIREG', 'SF-DOMID', 'SF-PDBID', 'SF-PDBREG', 'SF-UNIID', 'SF-UNIREG', 'SCOPCLA']

# `sep=r'\s+'` is the documented equivalent of `delim_whitespace=True`, which is
# deprecated since pandas 2.2. The first 6 lines of the file are skipped
# (presumably a comment preamble -- confirm against the downloaded file).
scop_cla_row_df = pd.read_csv(scop_cla_path, header=None, sep=r'\s+', skiprows=6)

# Assigning the names afterwards raises if the file does not have exactly
# len(header) columns, which doubles as a cheap format check.
scop_cla_row_df.columns = header

def parseSCOPCLA(line):
    """Extract the values from a comma-separated list of ``KEY=VALUE`` fields.

    Args:
        line: String such as ``'TP=1,CL=1000003'``.

    Returns:
        List with the text found after the first ``=`` of every field (and
        before a second ``=``, if any), in the order the fields appear.

    Raises:
        IndexError: If a field contains no ``=``.
    """
    values = []
    for field in line.split(','):
        pieces = field.split('=')
        values.append(pieces[1])
    return values

# Split every SCOPCLA string into its five classification identifiers.
cla_list = list(map(parseSCOPCLA, scop_cla_row_df['SCOPCLA']))
SCOPCLA_df = pd.DataFrame(cla_list, columns=['TP', 'CL', 'CF', 'SF', 'FA'])

# Replace the packed SCOPCLA column with the five separate identifier columns.
# The identifiers stay strings, as produced by parseSCOPCLA.
scop_cla_df = pd.concat(
    [scop_cla_row_df.drop(columns=['SCOPCLA']), SCOPCLA_df],
    axis=1,
)

In [3]:
# Show the parsed table: the original FA-*/SF-* columns followed by the
# TP, CL, CF, SF and FA identifiers split out of SCOPCLA.
scop_cla_df

Unnamed: 0,FA-DOMID,FA-PDBID,FA-PDBREG,FA-UNIID,FA-UNIREG,SF-DOMID,SF-PDBID,SF-PDBREG,SF-UNIID,SF-UNIREG,TP,CL,CF,SF,FA
0,8045703,3H8D,C:1143-1264,Q64331,1143-1264,8091604,3H8D,C:1143-1264,Q64331,1143-1264,1,1000003,2001470,3002524,4004627
1,8094330,6J56,A:1158-1282,Q9UM54,1167-1291,8094331,6J56,A:1158-1282,Q9UM54,1167-1291,1,1000003,2001470,3002524,4004627
2,8017835,3FKQ,A:1-116,D0VX10,1-116,8017836,3FKQ,A:1-116,D0VX10,1-116,1,1000002,2000016,3001156,4003986
3,8021315,1XHF,A:2-122,P0A9Q1,2-122,8033695,1XHF,A:2-122,P0A9Q1,2-122,1,1000002,2000016,3001156,4003632
4,8021787,1Y7P,B:79-215,O28869,79-215,8034167,1Y7P,B:79-215,O28869,79-215,1,1000002,2000016,3001156,4002481
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34725,8022870,1SCJ,B:307-377,P04189,36-106,8035250,1SCJ,B:307-377,P04189,36-106,1,1000003,2000014,3001155,4001246
34726,8027500,1T1E,A:12-188,Q8RR56,12-188,8039879,1T1E,A:12-188,Q8RR56,12-188,1,1000003,2000014,3001155,4001246
34727,8027593,1JQG,A:4P-100P,O97389,18-109,8039972,1JQG,A:4P-100P,O97389,18-109,1,1000003,2000014,3001155,4001250
34728,8028119,1KN6,A:4-76,P63239,31-103,8040498,1KN6,A:4-76,P63239,31-103,1,1000003,2000014,3001155,4002630


In [4]:
# Write the parsed table next to the source file, swapping the extension for .csv.
scop_cla_stem = scop_cla_path.rsplit('.', 1)[0]
scop_cla_csv_path = f'{scop_cla_stem}.csv'
scop_cla_df.to_csv(path_or_buf=scop_cla_csv_path)

## Sort superfamilies by number of entries

In [5]:
# Number of entries per superfamily (SF). value_counts sorts by count in
# descending order by default, so the largest superfamilies come first.
# Despite the name, the result is a pandas Series indexed by SF, not a list.
sorted_sf_list = scop_cla_df.value_counts('SF')

In [6]:
# Show the per-superfamily entry counts, largest first.
sorted_sf_list

SF
3000038    498
3000313    437
3000066    407
3000034    395
3000118    335
          ... 
3001982      1
3001140      1
3001984      1
3001985      1
3000409      1
Length: 2681, dtype: int64

## Aggregate Protein type and Class
### Protein type
* 1 Globular proteins
* 2 Membrane proteins
* 3 Fibrous proteins
* 4 Non-globular/Intrinsically unstructured proteins
### Protein Class
* 1000000 All alpha proteins
* 1000001 All beta proteins
* 1000002 Alpha and beta proteins (a/b)
* 1000003 Alpha and beta proteins (a+b)
* 1000004 Small proteins

### Check the correspondence between classes and superfamilies

In [7]:
# Some superfamilies contain entries assigned to more than one class.
# Among the 200 largest superfamilies, report each one whose entries are not
# confined to exactly one protein type (TP) and exactly one class (CL).
for sf in sorted_sf_list.index[:200]:
    sf_df = scop_cla_df.query('SF == @sf')
    tp_count = sf_df.value_counts('TP')
    cl_count = sf_df.value_counts('CL')
    if len(tp_count) == 1 and len(cl_count) == 1:
        # Homogeneous superfamily: nothing to report.
        continue
    # One item per line followed by a blank separator line.
    print(sf, tp_count, cl_count, '', sep='\n')

3000066
TP
1    407
dtype: int64
CL
1000003    406
1000000      1
dtype: int64

3000154
TP
1    79
dtype: int64
CL
1000003    78
1000000     1
dtype: int64

3001848
TP
1    60
dtype: int64
CL
1000001    59
1000003     1
dtype: int64

3000161
TP
1    53
dtype: int64
CL
1000003    34
1000001    19
dtype: int64

3000119
TP
1    51
dtype: int64
CL
1000000    49
1000003     2
dtype: int64



### Aggregate superfamilies
If the entries of a superfamily span more than one class, the superfamily is assigned the class that has the most entries.

In [8]:
def _summarize_superfamily(sf_entries):
    """Summarise the rows of one superfamily.

    Returns a Series holding, in order: the most frequent TP, the most
    frequent CL, the number of distinct CL values, and the number of rows.
    value_counts lists the most frequent value first.
    """
    tp_counts = sf_entries.value_counts('TP')
    cl_counts = sf_entries.value_counts('CL')
    return pd.Series([tp_counts.index[0], cl_counts.index[0], len(cl_counts), len(sf_entries)])


# One summary row per superfamily (SF becomes the index).
sf_agg_df = scop_cla_df.groupby('SF').apply(_summarize_superfamily)
sf_agg_df.columns = ['TP', 'CL', 'len_CL', 'len_SF']

In [9]:
# Show the per-superfamily summary: majority TP and CL, number of distinct
# classes (len_CL) and number of entries (len_SF).
sf_agg_df

Unnamed: 0_level_0,TP,CL,len_CL,len_SF
SF,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
3000001,1,1000000,1,253
3000002,1,1000001,1,36
3000003,1,1000002,1,9
3000006,1,1000003,1,33
3000011,1,1000002,1,2
...,...,...,...,...
3002732,3,1000000,1,1
3002733,2,1000000,1,7
3002734,1,1000001,1,2
3002735,4,1000004,1,1


In [10]:
# Class distribution among the `num_target` largest superfamilies of
# protein type "1" (globular proteins).
num_target = 100
(
    sf_agg_df
    .query('TP == "1"')
    .sort_values(by='len_SF', ascending=False)
    .head(num_target)
    .value_counts('CL')
)

CL
1000002    37
1000003    24
1000001    21
1000000    14
1000004     4
dtype: int64

In [11]:
# Same class distribution as before, now for the 110 largest superfamilies
# of protein type "1" (globular proteins).
num_target = 110
(
    sf_agg_df
    .query('TP == "1"')
    .sort_values(by='len_SF', ascending=False)
    .head(num_target)
    .value_counts('CL')
)

CL
1000002    40
1000003    24
1000001    23
1000000    16
1000004     7
dtype: int64

In [12]:
# Among the `num_target` largest superfamilies of protein type "1", list
# those whose entries span more than one class. Reuses the `num_target`
# value set in the preceding cell.
(
    sf_agg_df
    .query('TP == "1"')
    .sort_values(by='len_SF', ascending=False)
    .iloc[:num_target]
    .query('len_CL != 1')
)

Unnamed: 0_level_0,TP,CL,len_CL,len_SF
SF,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
3000066,1,1000003,2,407
3000154,1,1000003,2,79


In [13]:
# All superfamilies of protein type "1" (globular proteins), largest first.
select_superfamily_globular_df = (
    sf_agg_df
    .query('TP == "1"')
    .sort_values(by='len_SF', ascending=False)
)
select_superfamily_globular_df

Unnamed: 0_level_0,TP,CL,len_CL,len_SF
SF,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
3000038,1,1000002,1,498
3000313,1,1000002,1,437
3000066,1,1000003,2,407
3000034,1,1000003,1,395
3000118,1,1000002,1,335
...,...,...,...,...
3001432,1,1000000,1,1
3000391,1,1000001,1,1
3002082,1,1000001,1,1
3001430,1,1000001,1,1


In [14]:
# Persist the size-ordered list of globular superfamilies; the SF index is
# written as the first CSV column.
save_path = '../../../scop/20210330/SF_globular_list.csv'
select_superfamily_globular_df.to_csv(path_or_buf=save_path)

In [15]:
# Class distribution of the 100 largest globular superfamilies once class
# "1000004" (small proteins) has been excluded.
(
    select_superfamily_globular_df
    .query('CL != "1000004"')
    .head(100)
    .value_counts('CL')
)

CL
1000002    39
1000003    24
1000001    23
1000000    14
dtype: int64

## Select an equal number of superfamilies from each protein class

In [16]:
# Take the 25 largest superfamilies from every class except "1000004"
# (small proteins) and stack them into one frame with a fresh 0..n-1 index.
cl_equal_df = (
    select_superfamily_globular_df
    .reset_index()
    .query('CL != "1000004"')
    .groupby('CL')
    .apply(lambda class_df: class_df.sort_values(by='len_SF', ascending=False).head(25))
    .reset_index(drop=True)
)
cl_equal_df

Unnamed: 0,SF,TP,CL,len_CL,len_SF
0,3000554,1,1000000,1,297
1,3000001,1,1000000,1,253
2,3001983,1,1000000,1,186
3,3001658,1,1000000,1,158
4,3001061,1,1000000,1,154
...,...,...,...,...,...
95,3000738,1,1000003,1,75
96,3000739,1,1000003,1,74
97,3000210,1,1000003,1,72
98,3000197,1,1000003,1,71


In [17]:
# Sanity check: number of selected superfamilies per class.
cl_equal_df.value_counts('CL')

CL
1000000    25
1000001    25
1000002    25
1000003    25
dtype: int64

In [18]:
# The 20 smallest superfamilies in the balanced selection (ascending len_SF).
cl_equal_df.sort_values(by='len_SF', ascending=True).head(20)

Unnamed: 0,SF,TP,CL,len_CL,len_SF
24,3001284,1,1000000,1,45
23,3000264,1,1000000,1,48
22,3000799,1,1000000,1,49
21,3000119,1,1000000,2,51
20,3000305,1,1000000,1,52
19,3001717,1,1000000,1,55
18,3001843,1,1000000,1,58
17,3000069,1,1000000,1,59
49,3001848,1,1000001,2,60
48,3000965,1,1000001,1,62


### Minority-class entries in superfamilies that span multiple classes

In [19]:
# Selected superfamilies whose entries span more than one class.
cl_equal_df.loc[cl_equal_df['len_CL'] != 1]

Unnamed: 0,SF,TP,CL,len_CL,len_SF
21,3000119,1,1000000,2,51
49,3001848,1,1000001,2,60
75,3000066,1,1000003,2,407
94,3000154,1,1000003,2,79


In [20]:
# `display` belongs to the public IPython.display API; importing it from
# IPython.core.display is deprecated.
from IPython.display import display

# For every selected superfamily whose entries span more than one class,
# show the entries that fall outside the majority class.
for sf in cl_equal_df.query('len_CL != 1')['SF']:
    sf_df = scop_cla_df.query('SF == @sf')
    # value_counts sorts by descending count: the first key is the majority
    # class and every remaining key is a minority class.
    sf_counts = sf_df.value_counts('CL')
    # Keep all minority classes, not just the second-ranked one, so that a
    # superfamily spanning three or more classes is reported completely.
    # For two-class superfamilies the output is the same as before.
    minor_cls = list(sf_counts.keys()[1:])
    display(sf_df.query('CL in @minor_cls')[['SF', 'SF-DOMID']])

Unnamed: 0,SF,SF-DOMID
11154,3000119,8003968
11184,3000119,8056853


Unnamed: 0,SF,SF-DOMID
22155,3001848,8092965


Unnamed: 0,SF,SF-DOMID
7163,3000066,8091280


Unnamed: 0,SF,SF-DOMID
13032,3000154,8004057


## Select small proteins

In [21]:
# The 30 largest globular superfamilies of class "1000004" (small proteins).
(
    select_superfamily_globular_df
    .query('CL == "1000004"')
    .sort_values(by='len_SF', ascending=False)
    .head(30)
)

Unnamed: 0_level_0,TP,CL,len_CL,len_SF
SF,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
3000068,1,1000004,1,192
3000309,1,1000004,1,128
3001077,1,1000004,1,107
3001320,1,1000004,1,99
3000255,1,1000004,1,68
3000160,1,1000004,1,67
3000412,1,1000004,1,67
3000545,1,1000004,1,64
3000846,1,1000004,1,48
3001459,1,1000004,1,40
