# Analysis of Protein Structure Prediction Results

## Colab Setup

1. Check the runtime instance
2. Choose whether to use Google Drive (persistence) or temp `/content`
3. Install required Python packages
4. Get course files/data into the runtime
5. Mount Google Drive for persistent storage


## 1) Check the runtime


In [3]:
import sys, platform, os, textwrap
print("Python:", sys.version.split()[0])
print("Platform:", platform.platform())
print("Working dir:", os.getcwd())


Python: 3.12.12
Platform: Linux-6.6.105+-x86_64-with-glibc2.35
Working dir: /content


In [4]:
!arch

x86_64


## 2) Choose whether to use Google Drive (persistence)

- If you want your edits to persist across sessions, use **Drive**.
- If you're just running a quick exercise, you can skip Drive and use the temporary Colab filesystem (`/content`).


In [5]:
USE_DRIVE = True  # set False if you want to skip Drive mounting

if USE_DRIVE:
    from google.colab import drive
    drive.mount('/content/drive')
    print("Drive mounted at /content/drive")
else:
    print("Skipping Drive mount.")


Mounted at /content/drive
Drive mounted at /content/drive


In [7]:
import os
from pathlib import Path

# Change this folder name once; everything else uses it.
COURSE_DIR_NAME = "structbio_course"  # you can rename for your course

if "drive" in str(Path("/content/drive")) and os.path.exists("/content/drive") and USE_DRIVE:
    ROOT = Path("/content/drive/MyDrive") / COURSE_DIR_NAME
else:
    ROOT = Path("/content") / COURSE_DIR_NAME

ROOT.mkdir(parents=True, exist_ok=True)

DATA_DIR = ROOT / "data"
(DATA_DIR).mkdir(exist_ok=True)

OUTPUTS_DIR = ROOT / "outputs"
(OUTPUTS_DIR).mkdir(exist_ok=True)

print("ROOT:", ROOT)
print("data:", DATA_DIR)
print("outputs:", OUTPUTS_DIR)


ROOT: /content/drive/MyDrive/structbio_course
data: /content/drive/MyDrive/structbio_course/data
outputs: /content/drive/MyDrive/structbio_course/outputs


## 2) Get a copy of the AF2, AF3 and ESMFold results that we will analyze

... and unzip the archive


In [17]:
import os, pathlib, urllib.request

RAW_URL = "https://raw.githubusercontent.com/vvoelz/chem5412-spring2026/main/data/assignment01.zip"
zip_path = pathlib.Path(DATA_DIR) / "assignment01.zip"

urllib.request.urlretrieve(RAW_URL, zip_path)
print("Saved to:", zip_path)



Saved to: /content/drive/MyDrive/structbio_course/data/assignment01.zip


In [34]:
dest_path = pathlib.Path(DATA_DIR) / "assignment01"

!unzip {zip_path}
!mv {dest_path} {DATA_DIR}

Archive:  /content/drive/MyDrive/structbio_course/data/assignment01.zip
replace assignment01/ESMFold_demo/ESMFold.ipynb? [y]es, [n]o, [A]ll, [N]one, [r]ename: A
  inflating: assignment01/ESMFold_demo/ESMFold.ipynb  
  inflating: assignment01/ESMFold_demo/T1027_CASP14_VAV_3b382/ptm0.444_r3_default.pdb  
  inflating: assignment01/ESMFold_demo/T1027_CASP14_VAV_3b382/ptm0.444_r3_default.pae.txt  
  inflating: assignment01/ESMFold_demo/T1027_CASP14_VAV_3b382/ptm0.444_r3_default.png  
 extracting: assignment01/ESMFold_demo/README.md  
 extracting: assignment01/ESMFold_demo/T1027_CASP14_VAV_3b382.zip  
 extracting: assignment01/AF3_demo/README.md  
  inflating: assignment01/AF3_demo/fold_vav_2026_01_31_15_42/fold_vav_2026_01_31_15_42_summary_confidences_1.json  
  inflating: assignment01/AF3_demo/fold_vav_2026_01_31_15_42/msas/fold_vav_2026_01_31_15_42_paired_msa_chains_a.a3m  
  inflating: assignment01/AF3_demo/fold_vav_2026_01_31_15_42/msas/fold_vav_2026_01_31_15_42_unpaired_msa_chains_a.a3

In [31]:
A!pwd

/content


## 3) Get the experimental PDB file

We'll need the following packages:

In [26]:
!pip -q install biopython

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/3.2 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.3/3.2 MB[0m [31m8.4 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━[0m [32m1.8/3.2 MB[0m [31m26.4 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.2/3.2 MB[0m [31m26.7 MB/s[0m eta [36m0:00:00[0m
[?25h

In [28]:
from Bio.PDB import PDBParser, PDBIO, Select
import requests
from pathlib import Path

# ---- paths ----
OUT = pathlib.Path(DATA_DIR) / "assignment01"

!ls {OUT}

pdb_id = "7D2O"
full_pdb = OUT / f"{pdb_id}_full.pdb"
model1_pdb = OUT / f"{pdb_id}_model1.pdb"

# ---- download ----
url = f"https://files.rcsb.org/download/{pdb_id}.pdb"
resp = requests.get(url, timeout=30)
resp.raise_for_status()
full_pdb.write_text(resp.text)

print("Downloaded:", full_pdb)

# ---- select only MODEL 1 ----
class ModelSelect(Select):
    def accept_model(self, model):
        return model.id == 0   # MODEL 1 → id == 0 in Biopython

parser = PDBParser(QUIET=True)
structure = parser.get_structure(pdb_id, str(full_pdb))

io = PDBIO()
io.set_structure(structure)
io.save(str(model1_pdb), select=ModelSelect())

print("Saved MODEL 1 only to:", model1_pdb)



ls: cannot access '/content/drive/MyDrive/structbio_course/data/assignment01': No such file or directory


FileNotFoundError: [Errno 2] No such file or directory: '/content/drive/MyDrive/structbio_course/data/assignment01/7D2O_full.pdb'

## 3) Install Python packages

Colab already includes numpy/scipy/matplotlib/pandas, but we will install extras.
This may take 1–3 minutes.


In [None]:
# Keep this lightweight; add packages as your course needs.
# Examples: biopython, mdtraj, nglview, py3Dmol, prody
!pip -q install biopython mdtraj py3Dmol

# Optional: if your repo includes a requirements file:
# !pip -q install -r "{REPO_DIR}/environment/colab_requirements.txt"


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.2/3.2 MB[0m [31m49.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.0/8.0 MB[0m [31m41.4 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import Bio
import mdtraj as md

print("numpy:", np.__version__)
print("biopython:", Bio.__version__)
print("mdtraj:", md.__version__)


numpy: 2.0.2
biopython: 1.86
mdtraj: 1.11.0


In [None]:
from google.colab import drive
drive.mount('/content/drive')

## 4) Quick structure visualization check

We’ll use simple viewers in Colab:
- `py3Dmol` (browser-based, easy)
- (optional) `nglview` (more featureful, sometimes finicky)

Below: fetch a PDB and render it.


In [None]:
import requests, textwrap
import py3Dmol

PDB_ID = "1CRN"  # crambin (small test structure)
url = f"https://files.rcsb.org/download/{PDB_ID}.pdb"
pdb_txt = requests.get(url).text

view = py3Dmol.view(width=600, height=450)
view.addModel(pdb_txt, "pdb")
# view.setStyle({"cartoon": {}})
view.setStyle({"stick": {}})
view.zoomTo()
view.show()


## 5) Saving your work and submitting

**Saving**
- If you opened from a link, click: `File → Save a copy in Drive` (recommended)
- Or download: `File → Download → .ipynb`

**Submitting**
- You will submit either:
  - a `.ipynb` file (preferred), or
  - a PDF export of the notebook, depending on the assignment.

**If you get stuck**
- Restart runtime: `Runtime → Restart runtime`
- Re-run the install cell(s)
- Post the error message + what cell it came from


## Next steps
Open the first exercise notebook from the course site and repeat the same workflow:
1. Install packages (if needed)
2. Pull data from the repo (or download as instructed)
3. Run analysis
4. Save a copy to Drive


In [None]:
import os, sys, glob
import json

import numpy as np
from matplotlib import pyplot as plt

VERBOSE = True


### Load in the predicted pLDDT values from AF2

AF2_dir = '../AF2_predictions'
json_files = glob.glob( os.path.join(AF2_dir,'af168_scores_rank_00?_alphafold2_ptm_model_?_seed_000.json'))
json_files.sort()   # sort in place

n = len(json_files)
af2_labels = [f'AF2 model{i}' for i in range(n)]
print('json_files', json_files)

all_data  = []

# Open and read the JSON file
for json_file in json_files:
    with open(json_file, 'r') as file:
        data = json.load(file)

    if VERBOSE:
        print(data["plddt"])
    all_data.append( data["plddt"] )   # these are per-residue c

all_data = np.array(all_data).transpose()
print('all_data.shape', all_data.shape)

### Alternatively, we could read in the pLDDT values from the `best_model.pdb' B-factors
if (0):
    # NOTE: these pLDDT values are per-atom, not per-residue, so we need to have a mapping from
    #       atomindex to residue index
    import subprocess
    cmd = f'cat {os.path.join(AF2_dir,"best_model.pdb")} | grep ATOM'
    result = subprocess.run(cmd, shell=True, capture_output=True, text=True)
    print('result', result)

    lines = result.stdout.split('\n')
    chain_of_atomnum, resnum_of_atomnum = {}, {}
    atomnums, chains, resnums = [], [], []
    for line in lines:
        try:
            fields = line.split()   # ATOM   3244  CD2 HIS B 209
            atomnum, chain, resnum = int(fields[1]), fields[4], int(fields[5])
            chain_of_atomnum[atomnum] = chain
            resnum_of_atomnum[atomnum] = resnum
            atomnums.append(atomnum)
            chains.append(chain)
            resnums.append(resnum)
        except:
            pass

### Load the ACTUAL lddt from a csv  (obtained from https://swissmodel.expasy.org/assess )
import pandas as pd
actual_lddt_csv = 'SuORj1_01_lddt.csv'
actual = pd.read_csv(actual_lddt_csv)
print(actual["lddt"])
print(actual["mdl_res_no"])


plt.figure(figsize=(10.,4.))
plt.plot(actual["mdl_res_no"], all_data/100.0, '.', label=af2_labels, ms=1)    # divide by 100 to convert from percent
plt.plot(actual['mdl_res_no'], actual["lddt"], label='actual LDDT')
plt.xlabel('model residue number')
plt.ylabel('pLDDT / LDDT')
plt.legend(loc='best')
plt.tight_layout()
# plt.show()

outpng = 'AF2_monomer_pLDDT_vs_actual.png'
plt.savefig(outpng)
print('Wrote:', outpng)
