In [1]:
import pandas as pd
import numpy as np
from scipy import stats
from read_pdb_file import read_pdb

In [2]:
## Sanity check: parse one protein structure file and dump what read_pdb returns
## (three coordinate lists and one atom-type list), one list per output line.
X_list, Y_list, Z_list, atomtype_list = read_pdb("./training_data/0002_pro_cg.pdb")
print(X_list, Y_list, Z_list, atomtype_list, sep="\n")

[27.591, 28.872, 28.03, 28.461, 28.642, 27.518, 32.013, 32.923, 30.395, 29.329, 26.166, 28.603, 27.386, 31.743, 32.582, 33.524, 33.921, 33.575, 30.75, 29.726, 29.321, 29.116, 32.326, 33.79, 31.228, 30.844, 29.219, 29.632, 32.635, 26.811, 27.093, 26.15, 26.699, 25.049, 22.502, 21.883, 21.239, 20.715, 19.378, 15.82, 15.384, 16.385, 17.547, 18.691, 14.756, 13.256, 15.984, 15.665, 19.582, 20.174, 19.226, 18.038, 19.402, 18.466, 22.747, 23.981, 23.285, 23.187, 21.822, 21.856, 24.805, 24.402, 27.923, 27.934, 30.782, 28.094, 27.088, 27.657, 26.47, 30.429, 30.352, 32.778, 32.7, 36.089, 32.27, 31.1, 31.588, 30.102, 33.144, 31.561, 32.108, 31.672, 32.965, 29.706, 29.578, 26.246, 25.17, 25.51, 25.108, 24.399, 24.174, 24.956, 20.839, 20.646, 19.618, 20.323, 20.765, 22.217, 18.987, 19.349, 15.779, 15.708, 16.64, 15.939, 17.124, 17.483, 16.506, 17.74, 13.544, 13.516, 11.224, 11.253, 12.376, 13.836, 11.604, 12.162, 8.154, 7.973, 7.366, 7.729, 8.201, 9.484, 11.58, 5.169, 4.187, 0.008, 4.669, 3.865, 5.

In [3]:
## Sanity check: parse the matching ligand structure file and dump what read_pdb
## returns (three coordinate lists and one atom-type list), one list per output line.
X_list, Y_list, Z_list, atomtype_list = read_pdb("./training_data/0002_lig_cg.pdb")
print(X_list, Y_list, Z_list, atomtype_list, sep="\n")

[12.491, 10.724, 6.649, 12.411, 14.561]
[18.459, 18.646, 18.186, 17.288, 14.848]
[43.185, 39.281, 35.357, 43.934, 43.97]
['h', 'h', 'h', 'h', 'h']


In [4]:
## Read every protein structure file (numbered 0001..3000) and keep its coordinates.
## meta_pro[k] holds [X_list, Y_list, Z_list] for file number k+1; the atom types
## returned by read_pdb are not stored.
meta_pro = []
for i in range(1, 3001):
    # zfill pads to four digits without ever dropping digits; slicing the last four
    # characters of "0000"+str(i) would silently wrap for indices >= 10000.
    file_number = str(i).zfill(4)
    file_name = "{}_{}_cg.pdb".format(file_number, "pro")
    X_list, Y_list, Z_list, atomtype_list = read_pdb("./training_data/{}".format(file_name))
    meta_pro.append([X_list, Y_list, Z_list])

In [5]:
## Read every ligand structure file (numbered 0001..3000) and keep its coordinates.
## meta_lig[k] holds [X_list, Y_list, Z_list] for file number k+1; the atom types
## returned by read_pdb are not stored.
meta_lig = []
for i in range(1, 3001):
    # zfill pads to four digits without ever dropping digits; slicing the last four
    # characters of "0000"+str(i) would silently wrap for indices >= 10000.
    file_number = str(i).zfill(4)
    file_name = "{}_{}_cg.pdb".format(file_number, "lig")
    X_list, Y_list, Z_list, atomtype_list = read_pdb("./training_data/{}".format(file_name))
    meta_lig.append([X_list, Y_list, Z_list])

In [6]:
## Number of atoms per protein, taken as the length of each structure's x-coordinate list.
## The array is built in a single pass: np.append returns a fresh copy on every call,
## so appending inside a loop is quadratic in the number of structures.
## dtype=float keeps the values as float64, so the describe() output is unchanged.
len_pro = np.array([len(coords[0]) for coords in meta_pro], dtype=float)
stats.describe(len_pro)

DescribeResult(nobs=3000, minmax=(38.0, 14644.0), mean=1295.8136666666667, variance=1789880.0529642103, skewness=4.20036543134986, kurtosis=26.4620407523816)

In [7]:
## Number of atoms per ligand, taken as the length of each structure's x-coordinate list.
## The array is built in a single pass: np.append returns a fresh copy on every call,
## so appending inside a loop is quadratic in the number of structures.
## dtype=float keeps the values as float64, so the describe() output is unchanged.
len_lig = np.array([len(coords[0]) for coords in meta_lig], dtype=float)
stats.describe(len_lig)

DescribeResult(nobs=3000, minmax=(1.0, 24.0), mean=4.59, variance=4.0212404134711575, skewness=1.9312449247643564, kurtosis=9.099097270068455)

In [8]:
## Coordinate extent of each protein: the minimum and maximum x, y and z over its atoms.
## All rows are computed first and the DataFrame is constructed once; enlarging a frame
## one .loc row at a time re-allocates repeatedly and starts from object-dtype columns.
## Rows are labelled 1..N, so row k corresponds to meta_pro[k-1].
## NOTE: min()/max() raise ValueError if a structure has an empty coordinate list.
size_pro = pd.DataFrame(
    [
        [min(xs), max(xs), min(ys), max(ys), min(zs), max(zs)]
        for xs, ys, zs in meta_pro
    ],
    columns=['min_x', 'max_x',
             'min_y', 'max_y',
             'min_z', 'max_z'],
    index=pd.RangeIndex(1, len(meta_pro) + 1),
)
size_pro

Unnamed: 0,min_x,max_x,min_y,max_y,min_z,max_z
1,-5.261,60.674,8.412,55.901,-5.582,38.354
2,-10.362,39.273,-6.703,38.873,24.180,81.848
3,-2.184,53.127,-10.030,36.556,4.549,41.588
4,-13.479,65.705,-3.792,86.794,88.406,148.018
5,-5.925,35.052,-10.469,42.279,-7.977,62.900
6,-31.523,25.637,-20.214,23.185,8.962,71.330
7,-11.067,29.758,-64.424,0.260,-15.202,37.795
8,20.091,57.700,-18.652,46.844,-27.526,10.647
9,-12.901,40.316,7.596,56.596,-14.881,24.604
10,-21.169,22.523,-34.356,4.268,-40.699,8.647


In [9]:
## Summary statistics (count, mean, std, min, quartiles, max) of each extent column
## in size_pro, i.e. how the per-protein min/max coordinates are distributed.
size_pro.describe()

Unnamed: 0,min_x,max_x,min_y,max_y,min_z,max_z
count,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0
mean,-10.035343,52.773649,-8.472856,53.930833,-5.555268,60.817726
std,31.91159,36.561386,34.066018,38.498137,34.389585,40.301701
min,-244.401,-160.384,-229.648,-110.569,-177.028,-63.484
25%,-25.69425,31.566,-26.057,29.778,-21.24525,37.10325
50%,-10.8135,45.809,-10.9765,46.6735,-8.6255,50.8755
75%,2.22075,68.87775,7.6665,68.4485,5.98925,77.89825
max,209.681,310.935,285.075,432.956,341.849,435.107


In [10]:
## Coordinate extent of each ligand: the minimum and maximum x, y and z over its atoms.
## All rows are computed first and the DataFrame is constructed once; enlarging a frame
## one .loc row at a time re-allocates repeatedly and starts from object-dtype columns.
## Rows are labelled 1..N, so row k corresponds to meta_lig[k-1].
## NOTE: min()/max() raise ValueError if a structure has an empty coordinate list.
size_lig = pd.DataFrame(
    [
        [min(xs), max(xs), min(ys), max(ys), min(zs), max(zs)]
        for xs, ys, zs in meta_lig
    ],
    columns=['min_x', 'max_x',
             'min_y', 'max_y',
             'min_z', 'max_z'],
    index=pd.RangeIndex(1, len(meta_lig) + 1),
)
size_lig

Unnamed: 0,min_x,max_x,min_y,max_y,min_z,max_z
1,32.552,37.468,34.280,37.915,18.924,27.112
2,6.649,14.561,14.848,18.646,35.357,43.970
3,3.308,14.336,1.305,10.315,18.220,26.728
4,22.340,31.901,52.620,66.448,111.713,116.154
5,15.397,23.845,4.221,12.772,30.752,36.852
6,1.938,13.848,5.646,10.223,41.733,46.654
7,6.130,12.832,-35.320,-30.998,9.837,10.391
8,36.354,41.209,8.522,12.820,-12.969,-11.109
9,2.418,4.908,30.252,34.022,5.039,8.738
10,-0.970,1.813,-1.806,0.811,-10.935,-3.304


In [11]:
## Summary statistics (count, mean, std, min, quartiles, max) of each extent column
## in size_lig, i.e. how the per-ligand min/max coordinates are distributed.
size_lig.describe()

Unnamed: 0,min_x,max_x,min_y,max_y,min_z,max_z
count,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0
mean,18.333524,24.304946,19.14891,24.797984,23.625621,29.446306
std,33.392967,33.190767,35.603648,35.631966,35.78896,35.987657
min,-187.322,-185.045,-140.466,-132.349,-100.571,-97.593
25%,-0.57675,5.332,-0.93025,4.48225,3.84925,9.554
50%,13.7625,19.932,13.5515,18.9615,17.218,23.2485
75%,31.54175,36.79,34.218,39.9285,35.359,41.5505
max,248.504,253.608,379.675,384.916,414.336,424.654


In [79]:
## END