In [93]:
import Bio
import os
import csv
import random
import numpy as np
import pandas as pd
from Bio.PDB.PDBParser import PDBParser
from glob import glob

### Split files into those with and without missing residues

In [94]:
# Parser reused by the cells below. With PERMISSIVE=True, Bio.PDB issues warnings
# for problems it finds while building a structure instead of raising.
parser = PDBParser(PERMISSIVE=True)

# Every .pdb file in the sample folder. glob() returns paths in an arbitrary,
# platform-dependent order, so sort them to keep the lists built below
# in the same order on every run.
pdb_files = sorted(glob('C:/Users/Henry Lee/Desktop/Direct/project/PDB_data/SamplePDB_3000/*.pdb'))

res_missing = []    # structure ids whose header reports missing residues
res_nomissing = []  # structure ids whose header reports no missing residues
reslist = []        # the has_missing_residues flag of every parsed file, in file order

for filename in pdb_files:
    base = os.path.basename(filename)         # file name with extension, without the folder
    structure_id = os.path.splitext(base)[0]  # drop the extension; used as the structure id
    data = parser.get_structure(structure_id, filename)

    # The parsed header carries a flag telling whether the file declares
    # missing residues; route the structure id to the matching list.
    keywords = data.header["has_missing_residues"]
    reslist.append(keywords)
    if keywords:
        res_missing.append(structure_id)
    else:
        res_nomissing.append(structure_id)











































































































































































































































































































































































In [39]:
# Summarise how the parsed files split between the two classes.
n_missing = len(res_missing)
n_nomissing = len(res_nomissing)
n_classified = n_missing + n_nomissing

# Guard the ratio: when no file was classified (for example the glob pattern
# matched nothing) the division would raise ZeroDivisionError; report nan instead.
missing_ratio = n_missing / n_classified if n_classified else float('nan')

print('Total number of samples:', len(reslist))
print('Samples have missing residues:', n_missing)
print('Samples without missing residues::', n_nomissing)
print('There are {:.2%} samples have missing residue'.format(missing_ratio))

Total number of samples: 2970
Samples have missing residues: 2422
Samples without missing residues:: 548
There are 81.55% samples have missing residue


### Build a dataset with the same number of samples with and without missing residues to avoid class imbalance

In [47]:
# Draw the same number of structure ids from each class so the resulting
# dataset is balanced.
N_PER_CLASS = 500   # samples taken from each class
SAMPLE_SEED = 42    # fixed seed so re-running the cell draws the same samples

# A dedicated generator keeps the draw independent of whatever else has used
# the global `random` state. The draw is repeatable only if the input lists
# arrive in the same order each run.
# random.Random.sample raises ValueError if a list holds fewer than N_PER_CLASS ids.
_sampler = random.Random(SAMPLE_SEED)

sample_missing = _sampler.sample(res_missing, N_PER_CLASS)
sample_nomissing = _sampler.sample(res_nomissing, N_PER_CLASS)

In [50]:
# One flat list of structure ids: the sampled ids with missing residues first,
# followed by the sampled ids without. A new list is built; neither input is modified.
Samples_pdb = [*sample_missing, *sample_nomissing]

In [76]:
# Report how many structure ids ended up in the combined sample list.
print(f'Samples_pdb has {len(Samples_pdb)} files')

Samples_pdb has 1000 files


In [73]:
# Store the chosen structure ids as a one-column table ("Protein") and write
# it to disk without the DataFrame index.
samples_csv_path = 'C:/Users/Henry Lee/Desktop/Direct/project/PDB_data/Samples_1000.csv'
Samples_1000 = pd.DataFrame({'Protein': Samples_pdb})
Samples_1000.to_csv(samples_csv_path, index=False)

### Start getting structures from samples we chose

### Extract residues and headers (name, head, structure_method, resolution, compound, has_missing_residues)

In [75]:
# Rebind pdb_files to the paths of the sampled structures only, in the same
# order as Samples_pdb.
pdb_files = [
    'C:/Users/Henry Lee/Desktop/Direct/project/PDB_data/SamplePDB_3000/' + sample_id + '.pdb'
    for sample_id in Samples_pdb
]
print('pdb_files has {} files'.format(len(pdb_files)))

pdb_files has 1000 files


In [85]:
# For every sampled structure collect (a) the sequence of its residue names and
# (b) a handful of header fields.
#
# Results:
#   res_data - one column per structure (named by structure id), one row per
#              residue position; shorter structures are padded with NaN.
#   headers  - one row per structure: the "Protein" column of Samples_1000
#              followed by the header fields gathered below.
residue_columns = []  # one named Series per structure, concatenated once after the loop
name_list = []
head_list = []
structure_method_list = []
resolution_list = []
has_missing_residues_list = []
compound_list = []

for filename in pdb_files:
    base = os.path.basename(filename)         # file name with extension, without the folder
    structure_id = os.path.splitext(base)[0]  # drop the extension; used as the structure id
    data = parser.get_structure(structure_id, filename)

    # Keep only residues whose hetero-flag (first field of residue.id) is blank,
    # which skips hetero residues and waters.
    # NOTE(review): residues of every model in the structure are collected, so a
    # multi-model entry presumably contributes its sequence once per model - confirm
    # whether only the first model is wanted.
    res_list = [
        residue.get_resname()
        for residue in data.get_residues()
        if residue.id[0] == ' '
    ]
    residue_columns.append(pd.Series(res_list, dtype=object, name=structure_id))

    # Header fields of interest.
    header = data.header
    name_list.append(header["name"])
    head_list.append(header["head"])
    structure_method_list.append(header["structure_method"])
    resolution_list.append(header["resolution"])
    has_missing_residues_list.append(header["has_missing_residues"])
    compound_list.append(header["compound"])

# Concatenate all residue columns in a single call: growing a DataFrame with
# pd.concat inside the loop re-copies every earlier column on each iteration.
# pd.concat rejects an empty list, so fall back to an empty frame.
res_data = pd.concat(residue_columns, axis=1) if residue_columns else pd.DataFrame()

# Header columns in export order. compound_list is gathered above but is not
# part of the exported header table.
header_fields = pd.DataFrame({
    "name": name_list,
    "head": head_list,
    "structure_method": structure_method_list,
    "resolution": resolution_list,
    "has_missing_residues": has_missing_residues_list,
})

# Rows line up by position: pdb_files was built from the same id list, in the
# same order, as Samples_1000.
headers = pd.concat([Samples_1000, header_fields], axis=1)






















































































































In [86]:
# Preview the first rows of the residue table (one column per sampled structure).
res_data.head()

Unnamed: 0,2y39,2o73,3d5m,1gey,4y79,3gem,2z91,2jfk,3ueo,3qun,...,2qqt,7fdr,2x82,5uez,2hi2,1gzc,4msw,4bal,4wji,1ufo
0,GLY,ASP,SER,THR,ILE,SER,GLN,GLN,GLY,SER,...,SER,ALA,PRO,SER,THR,VAL,GLN,ALA,ALA,ARG
1,ASP,ILE,MET,ILE,VAL,ALA,LEU,SER,LEU,GLY,...,TRP,PHE,VAL,HIS,LEU,GLU,VAL,THR,GLN,VAL
2,LEU,ASN,SER,THR,GLY,PRO,LEU,MET,PHE,LEU,...,GLU,VAL,GLN,MET,ILE,THR,GLN,PHE,GLN,ARG
3,HIS,VAL,TYR,ASP,GLY,ILE,GLU,ARG,SER,VAL,...,VAL,VAL,HIS,GLU,GLU,ILE,LEU,GLU,PHE,THR
4,GLU,VAL,THR,LEU,GLN,LEU,SER,LEU,GLN,PRO,...,GLY,THR,VAL,GLN,LEU,SER,GLN,ILE,GLN,GLU


In [91]:
# Preview the first rows of the combined header table.
headers.head()

Unnamed: 0,Protein,name,head,structure_method,resolution,has_missing_residues
0,2y39,ni-bound form of cupriavidus metallidurans ch...,metal binding protein,x-ray diffraction,1.41,True
1,2o73,structure of ohcu decarboxylase in complex wi...,lyase,x-ray diffraction,1.8,True
2,3d5m,crystal structure of hcv ns5b polymerase with...,transferase,x-ray diffraction,2.2,True
3,1gey,crystal structure of histidinol-phosphate ami...,transferase,x-ray diffraction,2.3,True
4,4y79,factor xa complex with gtc000406,hydrolase,x-ray diffraction,2.1,True


In [92]:
# Write the residue table and the header table to CSV, leaving out the
# DataFrame index in both files.
for table, csv_path in (
    (res_data, 'C:/Users/Henry Lee/Desktop/Direct/project/PDB_data/res_data_1000.csv'),
    (headers, 'C:/Users/Henry Lee/Desktop/Direct/project/PDB_data/headers_1000.csv'),
):
    table.to_csv(csv_path, index=False)

In [1]:
# Show the "missing_residues" entry of the header of whichever structure is
# currently bound to `data`.
# `data` is only assigned inside the parsing loops of earlier cells, so this
# cell raises NameError when it is run on a fresh kernel before those cells.
data.header["missing_residues"]

NameError: name 'data' is not defined