# Examples: Processing PDB Files

In [1]:
%load_ext autoreload
%autoreload 2

from IPython.display import display
from pathlib import Path

import numpy as np
import pandas as pd
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100)

from opencadd.api.web import rcsb
from opencadd.io import pdb
from opencadd.io.pdb import _records


In [None]:
# Sort the matches: directory listing order is filesystem-dependent, and later
# cells address parsers by position (e.g. parsers[0], parsers[59]), so a
# stable order is needed for those indices to be reproducible.
pdb_paths = sorted(Path("./pdbs").glob("*.pdb"))

In [None]:
# One parser per PDB file, capped at the first 100 paths. `parse=False` is
# passed through to `from_filepath` (presumably defers record parsing until a
# `record_*` attribute is accessed — TODO confirm).
parsers = [pdb.parser.from_filepath(pdb_path, parse=False) for pdb_path in pdb_paths[:100]]

In [None]:
def print_compare(record_name, begin=0, end=100):
    """Print the raw lines of a record next to its parsed representation.

    For every parser in the global ``parsers[begin:end]`` that contains the
    requested record, this prints the parser's index and PDB ID, a column
    ruler, the raw record lines, and then displays the parser's
    ``record_<record_name>`` attribute.

    Parameters
    ----------
    record_name : str
        Name of the record. It is upper-cased to look up the record object on
        ``_records`` and used as given to build the ``record_<record_name>``
        attribute name.
    begin : int, default 0
        Start of the slice taken from ``parsers``.
    end : int, default 100
        End (exclusive) of the slice taken from ``parsers``.

    Raises
    ------
    AttributeError
        If ``_records`` has no attribute named ``record_name.upper()``.
    """
    # The record object does not depend on the parser, so resolve it once.
    # `_records` is the name bound by the notebook's import cell.
    record = getattr(_records, record_name.upper())
    # Digits 0-9 repeated 8 times: an 80-character ruler for reading columns.
    ruler = "0123456789" * 8
    for index, parser in enumerate(parsers[begin:end], start=begin):
        if not parser.has_record(record):
            continue
        print(f"\nIndex: {index}; PDB-ID: {parser.record_header['pdb_id']}\n")
        print("", ruler)
        print(parser.record_lines(record, False), "\n")
        display(getattr(parser, f"record_{record_name}"))
        print("\n", "-" * 84)
        

### HEADER

In [None]:
print_compare("header")

### OBSLTE

In [None]:
# Query the RCSB holdings for entries with status "removed" and inspect the
# OBSLTE record of the first ten of them.
obslte_pdb_ids = rcsb.data_holdings(status="removed")

for obslte_pdb_id in obslte_pdb_ids[:10]:
    p = pdb.parser.from_pdb_id(obslte_pdb_id)
    # Parsed record first, then its raw lines for comparison.
    print(p.record_obslte)
    # `_records` is the name bound by the import cell; a bare `records` is
    # not defined anywhere in this notebook.
    print(p.record_lines(_records.OBSLTE, False), "\n\n")

### TITLE

In [None]:
print_compare("title")

### CAVEAT

In [None]:
print_compare("caveat")

### COMPND

In [None]:
print_compare("compnd")

In [None]:
# Sanity check over all loaded parsers: print the raw COMPND lines of every
# entry whose `engineered` column holds anything other than "YES"/"NO"
# (missing values are ignored).
for i, p in enumerate(parsers):
    try:
        if not p.record_compnd.engineered.dropna().isin(["YES", "NO"]).all():
            print(p.record_lines(_records.COMPND, False))
    except AttributeError:
        # Deliberately skipped; presumably the parsed COMPND data has no
        # `engineered` attribute for this entry — TODO confirm.
        pass
    except Exception as exc:
        # Any other failure: identify the offending entry, report the error
        # and show the raw lines that triggered it.
        print(i)
        print(p.record_header["pdb_id"])
        print(f"{type(exc).__name__}: {exc}")
        print(p.record_lines(_records.COMPND, False))

### SOURCE 

In [None]:
print_compare("source")

### KEYWDS

In [None]:
print_compare("keywds")

### EXPDTA

In [None]:
print_compare("expdta")

### NUMMDL

In [None]:
print_compare("nummdl")

### MDLTYP

In [None]:
print_compare("mdltyp")

### AUTHOR

In [None]:
print_compare("author")

### REVDAT

In [None]:
print_compare("revdat")

### SPRSDE

In [None]:
print_compare("sprsde")

### JRNL

In [None]:
parsers[0].record_jrnl

In [None]:
# Raw JRNL lines of the first parser, for comparison with its parsed
# `record_jrnl`. Uses `_records`, the name bound by the import cell.
parsers[0].record_lines(_records.JRNL, False)

### DBREF

In [None]:
print_compare("dbref")

### SEQADV

In [None]:
print_compare("seqadv")

### SEQRES

In [None]:
print_compare("seqres")

### MODRES

In [None]:
print_compare("modres")

### HET

In [None]:
print_compare("het")

### HETNAM

In [None]:
print_compare("hetnam")

### HETSYN

In [None]:
print_compare("hetsyn")

### FORMUL

In [None]:
print_compare("formul")

In [None]:
# Sanity check over all loaded parsers: display the parsed FORMUL table of
# every entry where any `count_rest` value is non-zero.
for i, p in enumerate(parsers):
    try:
        if p.has_record(_records.FORMUL) and np.any(p.record_formul.count_rest != 0):
            display(p.record_formul)
    except Exception as exc:
        # `Exception` rather than a bare `except`, so that interrupting the
        # kernel (KeyboardInterrupt) still stops the loop.
        print(i)
        print(f"{type(exc).__name__}: {exc}")
        print(p.record_lines(_records.FORMUL, False))

In [None]:
# NOTE(review): `parsers` is built from `pdb_paths[:100]`, so index 5233
# raises IndexError unless that cap is lifted where `parsers` is created.
parsers[5233].record_formul

In [None]:
# Scratch check of how re.split treats an empty "()" group: the run of
# parentheses acts as one separator, so this evaluates to ['2', ''].
import re
re.split(r'[()]+', "2()")

### HELIX

In [None]:
print_compare("helix")

### SHEET

In [None]:
print_compare("sheet")

### SSBOND

In [None]:
print_compare("ssbond")

### LINK

In [None]:
print_compare("link")

### CISPEP

In [None]:
print_compare("cispep")

### SITE

In [None]:
print_compare("site")

### CRYST1

In [None]:
print_compare("cryst1")

### ORIGX

In [None]:
print_compare("origx2")

In [None]:
# Print the `records_origx` attribute of every loaded parser.
for p in parsers:
    print(p.records_origx)

### SCALE

In [None]:
# Print the `records_scale` attribute of every loaded parser.
for p in parsers:
    print(p.records_scale)

In [None]:
print_compare("scale1")

### MTRIX

In [None]:
# Display the `records_mtrix` attribute of every loaded parser.
for p in parsers:
    display(p.records_mtrix)

In [None]:
parsers[-5].records_mtrix.loc[1,"transformation_matrix"]

In [None]:
print_compare("mtrix1")

### ATOM/HETATM

In [None]:
# For every loaded parser, print its PDB ID and display its
# `records_atom_hetatm` attribute.
for p in parsers:
    print(p.record_header["pdb_id"])
    display(p.records_atom_hetatm)
    

In [None]:
print_compare("anisou")

In [None]:
# Take character columns 78:80 of the raw ANISOU lines of parser 59 and view
# them as 2-character strings (presumably the charge field — TODO confirm),
# then show every value that is neither blank nor "1-".
# `_records` is the name bound by the import cell.
c = parsers[59].record_lines(_records.ANISOU)[:, 78:80].view(dtype=(str, 2))
c[(c!="  ")&(c!="1-")]

### TER

In [None]:
print_compare("ter")

### CONECT

In [None]:
print_compare("conect")

### MASTER

In [None]:
print_compare("master")

### REMARK

In [None]:
# Take the first parser's `record_remark` and print the "content" entry
# stored under index label 800, joined with newlines.
r = parsers[0].record_remark
print("\n".join(r.loc[800, "content"]))

In [None]:
r.index.unique()

In [None]:
"REMARK 2 RESOLUTION. NOT APPLICABLE. "[23:30]

In [None]:
parsers[0].record_remark4