# `pp5` demo notebook


In [1]:
import sys
# Add the parent directory to the import search path (presumably where the
# `pp5` package lives relative to this notebook -- verify for your checkout).
sys.path.append("..")

import pandas as pd
# None removes pandas' limit on the number of rows shown when a frame is displayed.
pd.set_option('display.max_rows', None)

In [2]:
from pp5.protein import ProteinRecord, ProteinGroup

## `ProteinRecord`

This class scrapes all the data for a single protein chain, identified below by a PDB ID and chain (`2WUR:A`).

In [3]:
# Create a record from a PDB ID and chain, written as '<pdb id>:<chain>'.
prec = ProteinRecord.from_pdb('2WUR:A')

[2020-03-28 15:51:13,938          pp5.protein]    INFO >> P42212: Initializing protein record...
[2020-03-28 15:51:13,950          pp5.protein]    INFO >> P42212: PDB XREF = 2WUR:A (res=0.90Å, len=236)
[2020-03-28 15:51:13,951          pp5.protein]    INFO >> (P42212, 2WUR:A): GREEN FLUORESCENT PROTEIN, org=AEQUOREA VICTORIA (6100), expr=ESCHERICHIA COLI (511693), res=0.90Å, entity_id=1
[2020-03-28 15:51:13,952 pp5.external_dbs.pdb]    INFO >> Loading PDB file /Users/aviv/dev/phd/proteins/data/pdb/2wur.cif...
[2020-03-28 15:51:15,360          pp5.protein]    INFO >> (P42212, 2WUR:A): ENA ID = ENA|CAA65278|CAA65278.1
[2020-03-28 15:51:15,361          pp5.protein]    INFO >> (P42212, 2WUR:A): Translated DNA to PDB alignment (norm_score=8.36, num=1)
--KGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKFICTTGKLPVPWPTLVTTLXXXVQCFSRYPDHMKRHDFFKSAMPEGYVQERTIFFKDDGNYKTRAEVKFEGDTLVNRIELKGIDFKEDGNILGHKLEYNYNSHNVYIMADKQKNGIKVNFKTRHNIEDGSVQLADHYQQNTPIGDGPVLLPDNHYLSTQSALSKDPNEKRDHMVLLEFVTAAGITHG------
--||

A `ProteinRecord` can be indexed or sliced to get information about specific residues.

In [4]:
# Length of the record (presumably its number of residues -- confirm).
print(f'len={len(prec)}')

# Slicing displays the selected residues; here, the first ten.
prec[0:10]

len=230


(K3    [AAA][-] (ɸ=nan°,ψ=-166.5±67.3°,ω=nan°) b=0.46,
 G4    [GGA][G] (ɸ=-54.1±64.0°,ψ=-40.0±43.7°,ω=180.0±51.8°) b=0.20,
 E5    [GAA][G] (ɸ=-58.0±41.1°,ψ=-32.6±45.6°,ω=-175.7±28.8°) b=0.18,
 E6    [GAA][G] (ɸ=-66.6±44.5°,ψ=-18.1±40.8°,ω=-176.8±31.0°) b=0.16,
 L7    [CTT][G] (ɸ=-73.7±31.9°,ψ=-7.6±31.6°,ω=175.7±26.8°) b=0.12,
 F8    [TTC][G] (ɸ=-114.4±35.7°,ψ=12.3±46.1°,ω=-173.1±26.6°) b=0.10,
 T9    [ACT][S] (ɸ=-69.7±44.5°,ψ=-19.6±54.4°,ω=-179.7±28.7°) b=0.12,
 G10   [GGA][S] (ɸ=-123.2±35.2°,ψ=-166.5±24.8°,ω=-179.2±31.3°) b=0.11,
 V11   [GTT][-] (ɸ=-79.9±31.1°,ψ=127.9±32.0°,ω=-179.2±24.2°) b=0.10,
 V12   [GTC][E] (ɸ=-118.7±42.2°,ψ=129.7±33.7°,ω=-176.0±32.7°) b=0.08)

`ProteinRecord` also exposes other useful attributes, such as its database identifiers, its protein and DNA sequences, and its PDB metadata.

In [5]:
# Identifiers attached to this record: the PDB ID (with chain) and the ENA ID;
# `unp_id` is presumably the UniProt accession -- confirm.
prec.unp_id, prec.pdb_id, prec.ena_id

('P42212', '2WUR:A', 'ENA|CAA65278|CAA65278.1')

In [6]:
# Amino-acid sequence of the record, displayed as a `Seq` object.
prec.protein_seq

Seq('KGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKFICTTGKLPVP...THG')

In [7]:
# DNA sequence associated with the record, displayed as a `Seq` object.
prec.dna_seq

Seq('ATGGGTAAGGGAGAGGAACTTTTCACTGGAGTTGTCCCAATCTTGGTTGAGCTC...TAA')

In [8]:
# Metadata of the PDB entry (title, organisms, resolution, ligands, ...),
# displayed as a `PDBMetadata` object.
prec.pdb_meta

PDBMetadata(pdb_id='2WUR', title='Atomic resolution structure of GFP measured on a rotating anode', description='GREEN FLUORESCENT PROTEIN', src_org='AEQUOREA VICTORIA', src_org_id=6100, host_org='ESCHERICHIA COLI', host_org_id=511693, resolution=0.9, resolution_low=6.0, r_free=0.174, r_work=None, space_group='P 21 21 21', ligands='IPA,GYS,EOH', cg_ph=8.0, cg_temp=None, chain_entities={'A': 1}, entity_sequence={1: 'MSKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKFICTTGKLPVPWPTLVTTLSYGVQCFSRYPDHMKRHDFFKSAMPEGYVQERTIFFKDDGNYKTRAEVKFEGDTLVNRIELKGIDFKEDGNILGHKLEYNYNSHNVYIMADKQKNGIKVNFKTRHNIEDGSVQLADHYQQNTPIGDGPVLLPDNHYLSTQSALSKDPNEKRDHMVLLEFVTAAGITHGMDELYN'})

A `ProteinRecord` can be converted into a `DataFrame` containing all residue info, one row per residue.

In [9]:
# Tabular view of the record: one row per residue.
prec.to_dataframe()

Unnamed: 0,res_id,name,codon,codon_score,codon_opts,bfactor,secondary,phi,psi,omega,phi_std,psi_std,omega_std
0,3,K,AAA,0.8,AAG/AAA,0.460758,-,,-166.496951,,,67.342567,
1,4,G,GGA,0.8,GGA/GGT,0.202938,G,-54.128596,-39.977391,179.971024,63.984889,43.730279,51.804231
2,5,E,GAA,0.8,GAG/GAA,0.176383,G,-58.03795,-32.616022,-175.704262,41.079524,45.592354,28.795041
3,6,E,GAA,1.0,GAA,0.155697,G,-66.611128,-18.085576,-176.840148,44.531199,40.841774,31.048227
4,7,L,CTT,0.8,CTT/TTA,0.118208,G,-73.691639,-7.57273,175.653639,31.916723,31.616308,26.825675
5,8,F,TTC,1.0,TTC,0.103348,G,-114.414332,12.283753,-173.092909,35.741351,46.145673,26.579978
6,9,T,ACT,1.0,ACT,0.117828,S,-69.715195,-19.558807,-179.689955,44.537511,54.415012,28.662523
7,10,G,GGA,0.8,GGA/GGT,0.114831,S,-123.22814,-166.540146,-179.152422,35.218754,24.756746,31.323565
8,11,V,GTT,1.0,GTT,0.096128,-,-79.873452,127.888303,-179.208002,31.060029,32.035548,24.155972
9,12,V,GTC,1.0,GTC,0.07692,E,-118.728453,129.682649,-175.958572,42.194604,33.683237,32.675075


You can also write this information to CSV.

In [10]:
# Writes the record to a CSV file; the displayed value is the path of the written file.
prec.to_csv()

[2020-03-28 15:51:56,138          pp5.protein]    INFO >> Wrote (P42212, 2WUR:A) to /Users/aviv/dev/phd/proteins/out/prec/2WUR_A.csv


PosixPath('/Users/aviv/dev/phd/proteins/out/prec/2WUR_A.csv')

## `ProteinGroup`

This class constructs a protein group given a reference structure, and performs per-residue alignment.

In [11]:
# Build a group around a reference chain. See the `from_pdb_ref` documentation
# for all options (it also supports the args of __init__).
pgroup = ProteinGroup.from_pdb_ref(
    '1nkd:a',
    resolution_query=2.5,
    blast_identity_cutoff=0.,
    context_len=1,
)
print(pgroup)

[2020-03-28 15:51:58,661 pp5.external_dbs.pdb]    INFO >> Executing PDB query: (Expression system contains Escherichia Coli) AND (Resolution between 0.0 and 2.5) AND (Sequence Search (Structure:Chain = 1NKD:A, Expectation Value = 1.0, Sequence Identity = 0, Search Tool = blast, Mask Low Complexity=yes))
[2020-03-28 15:52:04,137          pp5.protein]    INFO >> Initializing ProteinGroup for 1NKD:A with 14 query results...
[2020-03-28 15:52:04,139          pp5.protein]    INFO >> Loaded cached ProteinRecord: /Users/aviv/dev/phd/proteins/data/prec/1NKD_A.prec
[2020-03-28 15:52:04,140         pp5.parallel]    INFO >> Starting global pool with 8 processes
[2020-03-28 15:52:07,750         pp5.parallel]    INFO >> Moved 0 data files from /var/folders/cy/jr9f_bmd7h1dnjxr3pvrw_p80000gn/T/pp5_data/_global into /Users/aviv/dev/phd/proteins/data
ProteinGroup 1NKD:A, #structures=9 #matches=427


In [12]:
# Per-structure metadata: one row for each structure in the group.
pgroup.to_struct_dataframe()

Unnamed: 0,unp_id,pdb_id,resolution,struct_rmse,n_stars,seq_len,description,src_org,src_org_id,host_org,host_org_id,ligands,space_group,r_free,r_work,cg_ph,cg_temp,ref_group
0,P03051,1NKD:A,1.09,0.0,59,65,ROP,Escherichia coli,562,Escherichia coli,562,,C 1 2 1,0.134,,,,True
1,P03051,3K79:A,1.96,0.371052,52,63,Regulatory protein rop,Escherichia coli,562,Escherichia coli,562,,C 1 2 1,0.25,0.163,,298.0,True
2,P03051,4DO2:A,1.401,0.39151,54,70,Regulatory protein rop,Escherichia coli,562,Escherichia coli,562,,P 1 21 1,0.1877,0.1587,6.4,291.0,True
3,P03051,1GTO:A,1.82,0.419637,50,62,ROP,Escherichia coli,562,Escherichia coli BL21(DE3),469008,,I 4 2 2,0.28,0.224,,,True
4,P03051,2IJI:A,2.3,0.459617,52,63,Regulatory protein rop,Escherichia coli,562,Escherichia coli BL21(DE3),469008,,P 31 2 1,0.29163,0.23435,5.5,,True
5,P03051,2IJH:A,1.8,0.475776,50,63,Regulatory protein rop,Escherichia coli,562,Escherichia coli BL21(DE3),469008,,P 21 21 2,0.23038,0.19181,5.5,273.0,True
6,P03051,2IJJ:A,1.9,0.504891,51,63,Regulatory protein rop,Escherichia coli,562,Escherichia coli BL21(DE3),469008,,P 21 21 2,0.26257,0.20323,5.5,273.0,True
7,P03051,2IJK:A,1.55,0.507994,53,63,Regulatory protein rop,Escherichia coli,562,Escherichia coli BL21(DE3),469008,,C 1 2 1,0.21616,0.16942,6.0,,True
8,P03051,2GHY:A,2.5,0.536335,53,63,Regulatory protein rop,Escherichia coli,562,Escherichia coli,562,,C 1 2 1,0.263,0.179,4.6,295.0,True


In [13]:
# Per-residue alignment: rows are indexed by reference residue index (`ref_idx`)
# and by the PDB ID of the matched structure (`query_pdb_id`).
pgroup.to_residue_dataframe()

Unnamed: 0_level_0,Unnamed: 1_level_0,resolution,unp_id,type,res_id,name,codon,codon_score,codon_opts,bfactor,secondary,ang_dist,phi,psi,phi_std,psi_std
ref_idx,query_pdb_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
1,1NKD:A,1.09,P03051,REFERENCE,2,T,ACC,0.5,ACC/ACT,0.189428,-,0.0,-73.427455,164.862176,55.030156,40.083436
1,4DO2:A,1.401,P03051,VARIANT,2,T,ACC,0.5,ACC/ACT,0.204795,-,3.566872,-74.659132,161.514707,56.474552,41.018859
2,1NKD:A,1.09,P03051,REFERENCE,3,K,AAA,1.0,AAA,0.165998,H,0.0,-64.316011,-37.66441,53.438435,52.70498
2,2GHY:A,2.5,P03051,VARIANT,3,K,AAA,1.0,AAA,0.288301,T,140.299618,75.761936,-45.548017,74.531643,62.690472
2,2IJI:A,2.3,P03051,VARIANT,3,K,AAA,1.0,AAA,0.623294,H,23.513411,-81.280517,-21.382947,95.858726,97.501828
2,2IJJ:A,1.9,P03051,VARIANT,3,K,AAA,1.0,AAA,0.469581,H,3.886475,-60.949045,-35.723219,88.04639,84.526417
2,2IJK:A,1.55,P03051,VARIANT,3,K,AAA,1.0,AAA,0.197365,H,6.255178,-62.888585,-31.574278,55.489676,54.980155
2,3K79:A,1.96,P03051,VARIANT,3,K,AAA,1.0,AAA,0.3534,H,12.416543,-76.540757,-39.838376,63.174817,68.446089
2,4DO2:A,1.401,P03051,VARIANT,3,K,AAA,1.0,AAA,0.167096,H,2.404612,-62.279609,-36.385653,53.32747,51.910602
3,1NKD:A,1.09,P03051,REFERENCE,4,Q,CAG,1.0,CAG,0.151391,H,0.0,-65.504321,-38.24163,48.504364,49.020934


You can also write both tables to CSV; one file is written for the per-structure data and one for the per-residue data.

In [14]:
# Writes the group to CSV; the displayed value lists the paths of the written files.
pgroup.to_csv()

[2020-03-28 15:53:22,800          pp5.protein]    INFO >> Wrote ProteinGroup 1NKD:A, #structures=9 #matches=427 to /Users/aviv/dev/phd/proteins/out/pgroup/1NKD_A-struct.csv
[2020-03-28 15:53:22,819          pp5.protein]    INFO >> Wrote ProteinGroup 1NKD:A, #structures=9 #matches=427 to /Users/aviv/dev/phd/proteins/out/pgroup/1NKD_A-residue.csv


[PosixPath('/Users/aviv/dev/phd/proteins/out/pgroup/1NKD_A-struct.csv'),
 PosixPath('/Users/aviv/dev/phd/proteins/out/pgroup/1NKD_A-residue.csv')]