# `pp5` demo notebook


In [1]:
import sys

# Add the parent directory to the import path (presumably so the local `pp5`
# package can be imported from this notebook's folder -- confirm layout).
# The guard keeps the cell idempotent: re-running it will not pile up
# duplicate ".." entries in sys.path.
if ".." not in sys.path:
    sys.path.append("..")

import pandas as pd

# Do not truncate DataFrame displays by row count.
pd.set_option('display.max_rows', None)

In [2]:
from pp5.prec import ProteinRecord
from pp5.pgroup import ProteinGroup

In [3]:
from pp5.external_dbs import ena, pdb, unp

## `ProteinRecord`

This class collects all the available data about a single protein chain.

In [4]:
# Build a record from a PDB identifier with a chain suffix (structure 2WUR, chain A).
prec = ProteinRecord.from_pdb('2WUR:A')

[2021-04-01 11:46:57,693             pp5.prec]    INFO >> P42212: Initializing protein record...
[2021-04-01 11:46:57,705             pp5.prec]    INFO >> P42212: PDB XREF = 2WUR:A (res=0.90Å, len=236)
[2021-04-01 11:46:57,706             pp5.prec]    INFO >> (P42212, 2WUR:A): GREEN FLUORESCENT PROTEIN, org=AEQUOREA VICTORIA (6100), expr=ESCHERICHIA COLI (511693), res=0.90Å, entity_id=1
[2021-04-01 11:46:57,707 pp5.external_dbs.pdb]    INFO >> Loading PDB file /Users/aviv/dev/phd/proteins/data/pdb/2wur.cif...
[2021-04-01 11:46:59,029             pp5.prec]    INFO >> (P42212, 2WUR:A): PDB to UNP sequence alignment score=1914.0
[2021-04-01 11:46:59,043             pp5.prec]    INFO >> (P42212, 2WUR:A): ENA ID = ENA|CAA65278|CAA65278.1
[2021-04-01 11:46:59,045             pp5.prec]    INFO >> (P42212, 2WUR:A): Translated DNA to PDB alignment (norm_score=8.36, num=1)
--KGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKFICTTGKLPVPWPTLVTTLXXXVQCFSRYPDHMKRHDFFKSAMPEGYVQERTIFFKDDGNYKTRAEVKFEGDTLVNRIE

A `ProteinRecord` can be indexed or sliced to get information about specific residues.

In [5]:
# Report how many residues the record holds.
print('len={}'.format(len(prec)))

# Slice out the first ten residue entries; as the cell's last expression
# the result is displayed.
prec[0:10]

len=230


(K3    [AAA][-] (ɸ=nan°,ψ=-166.5±67.3°,ω=nan°) b=36.38, unp_idx=2,
 G4    [GGA][G] (ɸ=-54.1±64.0°,ψ=-40.0±43.7°,ω=180.0±51.8°) b=16.02, unp_idx=3,
 E5    [GAA][G] (ɸ=-58.0±41.1°,ψ=-32.6±45.6°,ω=-175.7±28.8°) b=13.93, unp_idx=4,
 E6    [GAA][G] (ɸ=-66.6±44.5°,ψ=-18.1±40.8°,ω=-176.8±31.0°) b=12.29, unp_idx=5,
 L7    [CTT][G] (ɸ=-73.7±31.9°,ψ=-7.6±31.6°,ω=175.7±26.8°) b=9.33, unp_idx=6,
 F8    [TTC][G] (ɸ=-114.4±35.7°,ψ=12.3±46.1°,ω=-173.1±26.6°) b=8.16, unp_idx=7,
 T9    [ACT][S] (ɸ=-69.7±44.5°,ψ=-19.6±54.4°,ω=-179.7±28.7°) b=9.30, unp_idx=8,
 G10   [GGA][S] (ɸ=-123.2±35.2°,ψ=-166.5±24.8°,ω=-179.2±31.3°) b=9.07, unp_idx=9,
 V11   [GTT][-] (ɸ=-79.9±31.1°,ψ=127.9±32.0°,ω=-179.2±24.2°) b=7.59, unp_idx=10,
 V12   [GTC][E] (ɸ=-118.7±42.2°,ψ=129.7±33.7°,ω=-176.0±32.7°) b=6.07, unp_idx=11)

There are other useful things exposed by `ProteinRecord`.

In [6]:
# Identifiers of this record, in order: unp, pdb, ena.
(
    prec.unp_id,
    prec.pdb_id,
    prec.ena_id,
)

('P42212', '2WUR:A', 'ENA|CAA65278|CAA65278.1')

In [7]:
# The amino-acid sequence is exposed as a SeqRecord; print its residues
# followed by the record length.
seq_rec = prec.protein_seq
print("{!s}, len={}".format(seq_rec.seq, len(seq_rec)))

KGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKFICTTGKLPVPWPTLVTTLXXXVQCFSRYPDHMKRHDFFKSAMPEGYVQERTIFFKDDGNYKTRAEVKFEGDTLVNRIELKGIDFKEDGNILGHKLEYNYNSHNVYIMADKQKNGIKVNFKTRHNIEDGSVQLADHYQQNTPIGDGPVLLPDNHYLSTQSALSKDPNEKRDHMVLLEFVTAAGITHG, len=230


In [8]:
# Codons of the record, printed comma-separated together with their count.
seq_codons = prec.codons
print("{}, len={}".format(', '.join(seq_codons), len(seq_codons)))

AAA, GGA, GAA, GAA, CTT, TTC, ACT, GGA, GTT, GTC, CCA, ATT, CTT, GTT, GAA, TTA, GAT, GGT, GAT, GTT, AAT, GGG, CAC, AAA, TTT, TCT, GTC, AGT, GGA, GAG, GGT, GAA, GGT, GAT, GCA, ACA, TAC, GGA, AAA, CTT, ACC, CTT, AAA, TTT, ATT, TGC, ACT, ACT, GGA, AAA, CTA, CCT, GTT, CCA, TGG, CCA, ACA, CTT, GTC, ACT, ACT, ---, ---, ---, ---, GTT, CAA, TGC, TTT, TCA, AGA, TAC, CCA, GAT, CAT, ATG, AAA, CGT, CAT, GAC, TTT, TTC, AAG, AGT, GCC, ATG, CCC, GAA, GGT, TAT, GTA, CAG, GAA, AGA, ACT, ATA, TTT, TTC, AAA, GAT, GAC, GGG, AAC, TAC, AAG, ACA, CGT, GCT, GAA, GTC, AAG, TTT, GAA, GGT, GAT, ACC, CTT, GTT, AAT, AGA, ATC, GAG, TTA, AAA, GGT, ATT, GAT, TTT, AAA, GAA, GAT, GGA, AAC, ATT, CTT, GGA, CAC, AAA, TTG, GAA, TAC, AAC, TAT, AAC, TCA, CAC, AAT, GTA, TAC, ATC, ATG, GCA, GAC, AAA, CAA, AAG, AAT, GGA, ATC, AAA, GTT, AAC, TTC, AAA, ---, AGA, CAC, AAC, ATT, GAA, GAT, GGA, AGC, GTT, CAA, CTA, GCA, GAC, CAT, TAT, CAA, CAA, AAT, ACT, CCA, ATT, GGC, GAT, GGC, CCT, GTC, CTT, TTA, CCA, GAC, AAC, CAT, TAC, CTG, TCC, 

In [9]:
# Metadata about the structure (title, organisms, resolution, ligands, ...),
# pretty-printed as a plain dict.
# NOTE(review): this import is local to the cell; consider moving it to the
# imports cell at the top of the notebook.
from pprint import pprint

meta = prec.pdb_meta
pprint(meta.as_dict(), indent=2)

{ 'cg_ph': 8.0,
  'cg_temp': None,
  'chain_entities': {'A': 1},
  'description': 'GREEN FLUORESCENT PROTEIN',
  'entity_sequence': { 1: 'MSKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKFICTTGKLPVPWPTLVTTLSYGVQCFSRYPDHMKRHDFFKSAMPEGYVQERTIFFKDDGNYKTRAEVKFEGDTLVNRIELKGIDFKEDGNILGHKLEYNYNSHNVYIMADKQKNGIKVNFKTRHNIEDGSVQLADHYQQNTPIGDGPVLLPDNHYLSTQSALSKDPNEKRDHMVLLEFVTAAGITHGMDELYN'},
  'host_org': 'ESCHERICHIA COLI',
  'host_org_id': 511693,
  'ligands': 'GYS,EOH,IPA',
  'pdb_id': '2WUR',
  'r_free': 0.174,
  'r_work': None,
  'resolution': 0.9,
  'resolution_low': 6.0,
  'space_group': 'P 21 21 21',
  'src_org': 'AEQUOREA VICTORIA',
  'src_org_id': 6100,
  'title': 'Atomic resolution structure of GFP measured on a rotating anode'}


`ProteinRecord`s can be converted into a `DataFrame` with all residue info.

In [10]:
# Tabular view of the record: one row per residue (codon, B-factor,
# secondary structure, phi/psi angles and their std).
prec.to_dataframe()

Unnamed: 0,res_id,name,unp_idx,codon,codon_score,codon_opts,bfactor,secondary,phi,psi,phi_std,psi_std
0,3,K,2.0,AAA,0.8,AAG/AAA,36.380001,-,,-166.496951,,67.342567
1,4,G,3.0,GGA,0.8,GGA/GGT,16.023333,G,-54.128596,-39.977391,63.984889,43.730279
2,5,E,4.0,GAA,0.8,GAG/GAA,13.926667,G,-58.03795,-32.616022,41.079524,45.592354
3,6,E,5.0,GAA,1.0,GAA,12.293334,G,-66.611128,-18.085576,44.531199,40.841774
4,7,L,6.0,CTT,0.8,CTT/TTA,9.333333,G,-73.691639,-7.57273,31.916723,31.616308
5,8,F,7.0,TTC,1.0,TTC,8.16,G,-114.414332,12.283753,35.741351,46.145673
6,9,T,8.0,ACT,1.0,ACT,9.303333,S,-69.715195,-19.558807,44.537511,54.415012
7,10,G,9.0,GGA,0.8,GGA/GGT,9.066667,S,-123.22814,-166.540146,35.218754,24.756746
8,11,V,10.0,GTT,1.0,GTT,7.59,-,-79.873452,127.888303,31.060029,32.035548
9,12,V,11.0,GTC,1.0,GTC,6.073333,E,-118.728453,129.682649,42.194604,33.683237


You can also write this information to CSV.

In [11]:
# Write the record to a CSV file; the path of the written file is returned.
prec.to_csv()

[2021-04-01 11:46:59,534             pp5.prec]    INFO >> Wrote (P42212, 2WUR:A) to /Users/aviv/dev/phd/proteins/out/prec/2WUR_A.csv


PosixPath('/Users/aviv/dev/phd/proteins/out/prec/2WUR_A.csv')

## `ProteinGroup`

This class constructs a protein group given a reference structure, and performs per-residue alignment.

In [12]:
# Build a group around a reference chain. `from_pdb_ref` takes more options
# than shown here (it also accepts the arguments of __init__) -- see its
# documentation.
pgroup = ProteinGroup.from_pdb_ref(
    '1nkd:a',
    resolution_cutoff=2.5,
    blast_identity_cutoff=0.0,
    context_len=1,
)
print(pgroup)

[2021-04-01 11:46:59,580 pp5.external_dbs.pdb_api]    INFO >> Executing PDB query: (('X-Ray Resolution'  LESS_OR_EQUAL '2.5') AND ('Method'  EXACT_MATCH 'X-RAY DIFFRACTION')) AND ('Expression System'  CONTAINS_PHRASE 'Escherichia Coli')
[2021-04-01 11:47:03,098           pp5.pgroup]    INFO >> Got 105306 query initial results, running BLAST for sequence alignment to reference 1NKD:A...
[2021-04-01 11:47:05,377           pp5.pgroup]    INFO >> Initializing ProteinGroup for 1NKD:A with 11 query structures: ['2IJJ:A', '2IJK:A', '4DO2:A', '1GTO:A', '2IJI:A', '2IJH:A', '3K79:A', '1B6Q:A', '1NKD:A', '1QX8:A', '1F4M:A']
[2021-04-01 11:47:05,431             pp5.prec]    INFO >> P03051: Initializing protein record...
[2021-04-01 11:47:05,433             pp5.prec]    INFO >> P03051: PDB XREF = 1NKD:A (res=1.09Å, len=62)
[2021-04-01 11:47:05,433             pp5.prec]    INFO >> (P03051, 1NKD:A): ROP, org=Escherichia coli (562), expr=Escherichia coli (562), res=1.09Å, entity_id=1
[2021-04-01 11:47

In [13]:
# Per-structure metadata: one row per structure in the group
# (IDs, resolution, organisms, space group, R-factors, ...).
pgroup.to_struct_dataframe()

Unnamed: 0,unp_id,pdb_id,resolution,struct_rmse,n_stars,seq_len,description,src_org,src_org_id,host_org,host_org_id,ligands,space_group,r_free,r_work,cg_ph,cg_temp,ref_group
0,P03051,1NKD:A,1.09,0.0,59,65,ROP,Escherichia coli,562,Escherichia coli,562,,C 1 2 1,0.134,,,,True
1,P03051,4DO2:A,1.401,0.337985,53,70,Regulatory protein rop,Escherichia coli,562,Escherichia coli,562,,P 1 21 1,0.1877,0.1587,6.4,291.0,True


In [14]:
# Per-residue alignment: rows are indexed by the reference residue index
# (`ref_idx`) and the query structure (`query_pdb_id`).
pgroup.to_residue_dataframe()

Unnamed: 0_level_0,Unnamed: 1_level_0,unp_id,idx,type,ang_dist,context,res_id,name,unp_idx,codon,codon_score,codon_opts,bfactor,secondary,phi,psi,phi_std,psi_std
ref_idx,query_pdb_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
1,1NKD:A,P03051,1,REFERENCE,0.0,0,2,T,1.0,ACC,0.5,ACC/ACT,14.956667,-,-73.427455,164.862176,55.030156,40.083436
1,4DO2:A,P03051,1,VARIANT,3.566872,0,2,T,1.0,ACC,0.5,ACC/ACT,16.17,-,-74.659132,161.514707,56.474552,41.018859
2,1NKD:A,P03051,2,REFERENCE,0.0,1,3,K,2.0,AAA,1.0,AAA,13.106667,H,-64.316011,-37.66441,53.438435,52.70498
2,4DO2:A,P03051,2,VARIANT,2.404612,1,3,K,2.0,AAA,1.0,AAA,13.193334,H,-62.279609,-36.385653,53.32747,51.910602
3,1NKD:A,P03051,3,REFERENCE,0.0,2,4,Q,3.0,CAG,1.0,CAG,11.953333,H,-65.504321,-38.24163,48.504364,49.020934
3,4DO2:A,P03051,3,VARIANT,3.816857,2,4,Q,3.0,CAG,1.0,CAG,11.423333,H,-68.607149,-36.018827,45.094469,48.831444
4,1NKD:A,P03051,4,REFERENCE,0.0,3,5,E,4.0,GAA,1.0,GAA,11.956667,H,-63.58747,-39.946621,48.658688,49.727834
4,4DO2:A,P03051,4,VARIANT,6.060081,3,5,E,4.0,GAA,1.0,GAA,12.453333,H,-64.16758,-45.978873,48.743084,50.055131
5,1NKD:A,P03051,5,REFERENCE,0.0,4,6,K,5.0,AAA,1.0,AAA,12.66,H,-63.387963,-42.099578,50.231578,51.993457
5,4DO2:A,P03051,5,VARIANT,4.646885,4,6,K,5.0,AAA,1.0,AAA,13.553333,H,-58.912625,-43.350536,51.69317,50.824951


Residues can also be grouped by codon at each unique UniProt position. All of these tables can then be written to CSV.

In [15]:
# Match-groups for each codon at each unique unp_id:unp_idx.
# Rows are indexed by (ref_idx, unp_id, unp_idx, codon).
pgroup.to_groups_dataframe()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,type,name,secondary,group_size,phi,psi,phi_std,psi_std,ang_dist,norm_factor,...,contexts,prev_phis,curr_phis,next_phis,prev_psis,curr_psis,next_psis,codon_opts,prev_codons,next_codons
ref_idx,unp_id,unp_idx,codon,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1
1,P03051,1,ACC,VARIANT,T,-,2,-74.043294,163.188451,0.615809,1.673863,0.0,1.783546,...,0;0,nan;nan,-73.427;-74.659,-64.316;-62.280,155.578;161.629,164.862;161.515,-37.664;-36.386,ACT,---;---,AAA;AAA
2,P03051,2,AAA,VARIANT,K,H,2,-63.297808,-37.025031,1.018206,0.639503,0.0,1.202376,...,1;1,-73.427;-74.659,-64.316;-62.280,-65.504;-68.607,164.862;161.515,-37.664;-36.386,-38.242;-36.019,,ACC;ACC,CAG;CAG
3,P03051,3,CAG,VARIANT,Q,H,2,-67.055731,-37.13023,1.551512,1.11139,0.0,1.908501,...,2;2,-64.316;-62.280,-65.504;-68.607,-63.587;-64.168,-37.664;-36.386,-38.242;-36.019,-39.947;-45.979,,AAA;AAA,GAA;GAA
4,P03051,4,GAA,VARIANT,E,H,2,-63.877521,-42.962744,0.290067,3.016863,0.0,3.030776,...,3;3,-65.504;-68.607,-63.587;-64.168,-63.388;-58.913,-38.242;-36.019,-39.947;-45.979,-42.100;-43.351,,CAG;CAG,AAA;AAA
5,P03051,5,AAA,VARIANT,K,H,2,-61.150294,-42.725056,2.237844,0.625269,0.0,2.323554,...,4;4,-63.587;-64.168,-63.388;-58.913,-56.870;-63.920,-39.947;-45.979,-42.100;-43.351,-46.662;-41.208,,GAA;GAA,ACC;ACC
6,P03051,6,ACC,VARIANT,T,H,2,-60.39465,-43.935404,3.526258,2.727652,0.0,4.458092,...,5;5,-63.388;-58.913,-56.870;-63.920,-63.587;-68.118,-42.100;-43.351,-46.662;-41.208,-43.348;-41.645,,AAA;AAA,GCC;GCC
7,P03051,7,GCC,VARIANT,A,H,2,-65.852832,-42.496361,2.265931,0.851354,0.0,2.420588,...,6;6,-56.870;-63.920,-63.587;-68.118,-62.706;-61.103,-46.662;-41.208,-43.348;-41.645,-45.309;-40.125,,ACC;ACC,CTT;CTT
8,P03051,8,CTT,VARIANT,L,H,2,-61.904115,-42.716932,0.801632,2.592679,0.0,2.713779,...,7;7,-63.587;-68.118,-62.706;-61.103,-60.486;-67.047,-43.348;-41.645,-45.309;-40.125,-41.400;-37.644,,GCC;GCC,AAC;AAC
9,P03051,9,AAC,VARIANT,N,H,2,-63.766899,-39.522149,3.281338,1.878221,0.0,3.780859,...,8;8,-62.706;-61.103,-60.486;-67.047,-66.022;-69.403,-45.309;-40.125,-41.400;-37.644,-39.746;-40.403,,CTT;CTT,ATG;ATG
10,P03051,10,ATG,VARIANT,M,H,2,-67.71246,-40.074653,1.690272,0.32865,0.0,1.721926,...,9;9,-60.486;-67.047,-66.022;-69.403,-62.543;-63.079,-41.400;-37.644,-39.746;-40.403,-40.350;-39.567,,AAC;AAC,GCC;GCC


In [16]:
# Write the group's tables to CSV files; the returned dict maps each table
# name to the path of the file that was written.
pgroup.to_csv()

[2021-04-01 11:47:13,318           pp5.pgroup]    INFO >> Wrote ProteinGroup 1NKD:A to ['/Users/aviv/dev/phd/proteins/out/pgroup/1NKD_A-structs.csv', '/Users/aviv/dev/phd/proteins/out/pgroup/1NKD_A-residues.csv', '/Users/aviv/dev/phd/proteins/out/pgroup/1NKD_A-groups.csv', '/Users/aviv/dev/phd/proteins/out/pgroup/1NKD_A-pairwise.csv', '/Users/aviv/dev/phd/proteins/out/pgroup/1NKD_A-pointwise.csv']


{'structs': PosixPath('/Users/aviv/dev/phd/proteins/out/pgroup/1NKD_A-structs.csv'),
 'residues': PosixPath('/Users/aviv/dev/phd/proteins/out/pgroup/1NKD_A-residues.csv'),
 'groups': PosixPath('/Users/aviv/dev/phd/proteins/out/pgroup/1NKD_A-groups.csv'),
 'pairwise': PosixPath('/Users/aviv/dev/phd/proteins/out/pgroup/1NKD_A-pairwise.csv'),
 'pointwise': PosixPath('/Users/aviv/dev/phd/proteins/out/pgroup/1NKD_A-pointwise.csv')}