Several of the structures have structures that were determined with NMR. These structures are normally monomers and not so good for determining what their actual state is - should be filtered out 

In [39]:
# imports 
import pandas as pd
from Bio import SeqIO
import re
import requests
from collections import Counter
from tqdm import tqdm

In [14]:
df = pd.read_csv("../files/phrog_represenative_pdb_seqres_minseqid0.3_c0.7.m8", sep="\t", header=None)
df.columns = [
    "query", "target", "pident", "alnlen", "mismatch", "gapopen",
    "qstart", "qend", "tstart", "tend", "evalue", "bitscore"
]

# Load query and target lengths from FASTA files
def load_lengths(fasta_file):
    return {record.id: len(record.seq) for record in SeqIO.parse(fasta_file, "fasta")}

query_lengths = load_lengths("../files/nonsingleton_representative_sequences.fasta") # PHROG representative sequences
target_lengths = load_lengths("../files/pdb_seqres.cleaned.txt") # Sequences in the protein databank 

# Map lengths to dataframe
df["qlen"] = df["query"].map(query_lengths)
df["tlen"] = df["target"].map(target_lengths)

# Compute coverage
df["query_coverage"] = (df["qend"] - df["qstart"] + 1) / df["qlen"]
df["target_coverage"] = (df["tend"] - df["tstart"] + 1) / df["tlen"]
df['pdb'] = [re.split('_',i)[0] for i in df['target']]

In [9]:
# code for determining the oligomeric state 

In [40]:
pdb_ids = list(set(df['pdb'].to_list()))

def get_experimental_method(pdb_id):
    url = f"https://data.rcsb.org/rest/v1/core/entry/{pdb_id.lower()}"
    response = requests.get(url)
    if response.status_code == 200:
        data = response.json()
        method = data.get("exptl", [{}])[0].get("method", "Not available")
        return method
    else:
        return f"Error {response.status_code}"

# Fetch methods
results = []
for pdb_id in tqdm(pdb_ids, desc="Fetching experimental methods"):
    method = get_experimental_method(pdb_id)
    results.append({'PDB_ID': pdb_id.upper(), 'Experimental_Method': method})

# Convert to DataFrame
method_df = pd.DataFrame(results)
print(method_df)

# Optional: Save to CSV
method_df.to_csv("pdb_experimental_methods.csv", index=False)


Fetching experimental methods:   0%|          | 5/22234 [00:03<3:58:45,  1.55it/s]


KeyboardInterrupt: 

In [38]:
pdb_ids

['1o03',
 '3tab',
 '1awd',
 '4ani',
 '2zb2',
 '6xc1',
 '7ct6',
 '8g9b',
 '3cla',
 '3zmo',
 '8qmt',
 '2ro5',
 '3tms',
 '3jqk',
 '4a7s',
 '1l85',
 '5xr2',
 '2vpx',
 '7a8d',
 '8dh5',
 '1dg5',
 '3ia5',
 '5fu0',
 '6lgm',
 '5aqu',
 '9g2y',
 '1vvj',
 '2c2a',
 '6d15',
 '3pyx',
 '2dlb',
 '6wu9',
 '3g3w',
 '6mtc',
 '7doo',
 '4s2i',
 '9l9p',
 '4elv',
 '4ktq',
 '5ijx',
 '1kqa',
 '5ou2',
 '1hl5',
 '8jlt',
 '8bc2',
 '7fq1',
 '7tq1',
 '7pld',
 '8uqp',
 '8h40',
 '2kmg',
 '6g7m',
 '1c9d',
 '2e6b',
 '2a5w',
 '3m0i',
 '7ngh',
 '8xqu',
 '3dnv',
 '8iwv',
 '1geq',
 '8yo3',
 '7bni',
 '8umt',
 '8e7u',
 '5rp3',
 '1kfj',
 '6dd7',
 '8bav',
 '8idd',
 '8wdb',
 '5mvr',
 '8i1v',
 '4rnp',
 '8qdd',
 '8gmt',
 '8pcg',
 '3it9',
 '3ga6',
 '1m6s',
 '4lfc',
 '7cr6',
 '4r54',
 '5bt0',
 '2fzd',
 '3hta',
 '4kh3',
 '1f3w',
 '9iil',
 '4yed',
 '6igz',
 '5jwt',
 '5ndv',
 '3sa2',
 '6zjj',
 '4go2',
 '5h6l',
 '2eii',
 '5cdm',
 '1mj0',
 '3slt',
 '4ckl',
 '6lue',
 '8w8l',
 '7l0k',
 '4rmf',
 '5v79',
 '5oxj',
 '5j4b',
 '2a69',
 '2gyy',
 

In [19]:
# then once this has finished - use this df to filter out structures that were only determined using NMR 

[{'PDB_ID': '1O03', 'Experimental_Method': 'X-RAY DIFFRACTION'},
 {'PDB_ID': '3TAB', 'Experimental_Method': 'X-RAY DIFFRACTION'},
 {'PDB_ID': '1AWD', 'Experimental_Method': 'X-RAY DIFFRACTION'},
 {'PDB_ID': '4ANI', 'Experimental_Method': 'X-RAY DIFFRACTION'},
 {'PDB_ID': '2ZB2', 'Experimental_Method': 'X-RAY DIFFRACTION'},
 {'PDB_ID': '6XC1', 'Experimental_Method': 'X-RAY DIFFRACTION'},
 {'PDB_ID': '7CT6', 'Experimental_Method': 'X-RAY DIFFRACTION'},
 {'PDB_ID': '8G9B', 'Experimental_Method': 'ELECTRON MICROSCOPY'},
 {'PDB_ID': '3CLA', 'Experimental_Method': 'X-RAY DIFFRACTION'},
 {'PDB_ID': '3ZMO', 'Experimental_Method': 'X-RAY DIFFRACTION'},
 {'PDB_ID': '8QMT', 'Experimental_Method': 'X-RAY DIFFRACTION'},
 {'PDB_ID': '2RO5', 'Experimental_Method': 'SOLUTION NMR'},
 {'PDB_ID': '3TMS', 'Experimental_Method': 'X-RAY DIFFRACTION'},
 {'PDB_ID': '3JQK', 'Experimental_Method': 'X-RAY DIFFRACTION'},
 {'PDB_ID': '4A7S', 'Experimental_Method': 'X-RAY DIFFRACTION'},
 {'PDB_ID': '1L85', 'Experim