In [1]:
import datetime
from mdcrow.utils import _make_llm

In [2]:
# Default task prompt; the "Parameters" cell that follows reassigns `prompt`.
prompt = "Write me a code to run simulation of fibronectin."
# Model identifier handed to `_make_llm` when the client is built further down.
model = "gpt-4o-2024-08-06"

In [3]:
# Parameters
# Reassigns `prompt`, replacing the default from the previous cell; this is the
# text later sent to the model as the "human" message.
prompt = "Simulate deoxygenated hemoglobin (1A3N) and oxygenated hemoglobin (6BB5). Plot the PCA of both trajectories."


In [4]:
# Build the chat model through mdcrow's private `_make_llm` helper, using the
# `model` name chosen above and a temperature of 0.1.
llm = _make_llm(model, temp=0.1, streaming=True)

# System instructions sent ahead of the user's task. The adjacent string
# literals are concatenated into a single string, which is why each fragment
# ends with a trailing space.
# NOTE(review): "RSCB" below looks like a typo for "RCSB"; it is left as-is
# because this exact text is the prompt that produced the recorded output.
system_prompt = (
    "You are an expert molecular dynamics scientist, and your "
    "task is to respond to the question or "
    "solve the problem in its entirety to the best of your ability. "
    "If any part of the task requires you to perform an action that "
    "you are not capable of completing, please write a runnable "
    "Python script for that step and move on. For literature papers, "
    "use and process papers from the `paper_collection` folder. "
    "For .pdb files, download them from the RSCB website using `requests`. "
    "To preprocess PDB files, you will use PDBFixer. "
    "To get information about proteins, retrieve data from the UniProt database. "
    "For anything related to simulations, you will use OpenMM, "
    "and for anything related to analyses, you will use MDTraj. "
    "At the end, combine any scripts into one script. "
)
# (role, content) pairs, in the order they are handed to `llm.invoke` below.
messages = [
    ("system", system_prompt),
    ("human", prompt),
]

# Record when this run started and which model/temperature it used, so the
# saved notebook output identifies the experiment.
now = datetime.datetime.now()
date, time = now.strftime("%Y-%m-%d"), now.strftime("%H:%M:%S")
print("date:", date)
print("time:", time)
print("LLM: ", llm.model_name, "\nTemperature: ", llm.temperature)

date: 2024-10-16
time: 20:45:41
LLM:  gpt-4o-2024-08-06 
Temperature:  0.1


In [5]:
# Send the system + human messages to the model and keep the reply object.
ai_msg = llm.invoke(messages)
# Print the reply text. The recorded output shows a token-per-line stream
# before this printed copy (presumably a side effect of `streaming=True`).
print(ai_msg.content)

To

 simulate

 de

oxygen

ated

 hem

oglobin

 (

P

DB

 ID

:

 

1

A

3

N

)

 and

 oxygen

ated

 hem

oglobin

 (

P

DB

 ID

:

 

6

BB

5

),

 and

 then

 perform

 a

 Principal

 Component

 Analysis

 (

P

CA

)

 on

 the

 trajectories

,

 we

 will

 follow

 these

 steps

:



1

.

 Download

 the

 P

DB

 files

 for

 both

 hem

oglobin

 structures

.


2

.

 Pre

process

 the

 P

DB

 files

 using

 P

DB

Fix

er

 to

 ensure

 they

 are

 suitable

 for

 simulation

.


3

.

 Set

 up

 and

 run

 molecular

 dynamics

 simulations

 using

 Open

MM

.


4

.

 Analyze

 the

 trajectories

 using

 MDT

raj

 to

 perform

 PCA

.


5

.

 Plot

 the

 PCA

 results

.



Below

 is

 the

 complete

 Python

 script

 to

 perform

 these

 tasks

:



```

python




import

 os




import

 requests




from

 pdb

fix

er

 import

 P

DB

Fix

er




from

 open

mm

.app

 import

 *


from

 open

mm

 import

 *


from

 open

mm

.unit

 import

 *


import

 md

traj

 as

 md




import

 numpy

 as

 np




import

 matplotlib

.pyplot

 as

 plt





def

 download

_p

db

(p

db

_id

):


   

 url

 =

 f

"https

://

files

.rc

sb

.org

/download

/{

p

db

_id

}.

p

db

"


   

 response

 =

 requests

.get

(url

)


   

 with

 open

(f

"{

p

db

_id

}.

p

db

",

 "

wb

")

 as

 file

:


       

 file

.write

(response

.content

)



def

 preprocess

_p

db

(p

db

_id

):


   

 fixer

 =

 P

DB

Fix

er

(filename

=f

"{

p

db

_id

}.

p

db

")


   

 fixer

.find

Missing

Resid

ues

()


   

 fixer

.find

Missing

Atoms

()


   

 fixer

.add

Missing

Atoms

()


   

 fixer

.add

Missing

Hyd

rog

ens

()


   

 P

DB

File

.write

File

(f

ixer

.top

ology

,

 fixer

.positions

,

 open

(f

"{

p

db

_id

}_

fixed

.p

db

",

 "

w

"))



def

 run

_sim

ulation

(p

db

_id

,

 simulation

_time

_ns

=

10

):


   

 pdb

 =

 P

DB

File

(f

"{

p

db

_id

}_

fixed

.p

db

")


   

 force

field

 =

 Force

Field

('

amber

14

-all

.xml

',

 '

amber

14

/t

ip

3

p

.xml

')


   

 system

 =

 force

field

.create

System

(p

db

.top

ology

,

 non

bond

ed

Method

=

PM

E

,

 non

bond

ed

Cut

off

=

1

*

nan

ometer

,

 constraints

=

HB

onds

)


   

 integr

ator

 =

 Lange

vin

Integrator

(

300

*

kel

vin

,

 

1

/p

ic

ose

cond

,

 

0

.

002

*

pic

oseconds

)


   

 simulation

 =

 Simulation

(p

db

.top

ology

,

 system

,

 integr

ator

)


   

 simulation

.context

.set

Positions

(p

db

.positions

)


   

 simulation

.min

imize

Energy

()


   

 simulation

.report

ers

.append

(D

CD

Reporter

(f

'{

p

db

_id

}_

trajectory

.d

cd

',

 

100

0

))


   

 simulation

.report

ers

.append

(State

Data

Reporter

(stdout

,

 

100

0

,

 step

=True

,

 potential

Energy

=True

,

 temperature

=True

))


   

 simulation

.step

(int

(sim

ulation

_time

_ns

 *

 

500

000

))

 

 #

 

500

,

000

 steps

 for

 

1

 ns





def

 perform

_p

ca

(p

db

_id

):


   

 traj

 =

 md

.load

_d

cd

(f

'{

p

db

_id

}_

trajectory

.d

cd

',

 top

=f

'{

p

db

_id

}_

fixed

.p

db

')


   

 traj

.super

pose

(tr

aj

,

 

0

)


   

 atom

_indices

 =

 traj

.top

ology

.select

('

protein

 and

 name

 CA

')


   

 traj

_re

duced

 =

 traj

.atom

_slice

(atom

_indices

)


   

 mean

_structure

 =

 np

.mean

(tr

aj

_re

duced

.xyz

,

 axis

=

0

)


   

 centered

_data

 =

 traj

_re

duced

.xyz

 -

 mean

_structure




   

 covariance

_matrix

 =

 np

.c

ov

(center

ed

_data

.reshape

(center

ed

_data

.shape

[

0

],

 -

1

).

T

)


   

 eigen

values

,

 eigen

vectors

 =

 np

.linalg

.e

igh

(c

ov

ariance

_matrix

)


   

 idx

 =

 np

.args

ort

(e

igen

values

)[

::-

1

]


   

 eigen

vectors

 =

 eigen

vectors

[:,

 idx

]


   

 p

ca

_projection

 =

 np

.dot

(center

ed

_data

.reshape

(center

ed

_data

.shape

[

0

],

 -

1

),

 eigen

vectors

[:,

 :

2

])


   

 return

 p

ca

_projection





def

 plot

_p

ca

(p

ca

1

,

 p

ca

2

):


   

 plt

.figure

(figsize

=(

10

,

 

5

))


   

 plt

.scatter

(p

ca

1

[:,

 

0

],

 p

ca

1

[:,

 

1

],

 label

='

De

oxygen

ated

 Hem

oglobin

 (

1

A

3

N

)',

 alpha

=

0

.

5

)


   

 plt

.scatter

(p

ca

2

[:,

 

0

],

 p

ca

2

[:,

 

1

],

 label

='

O

xygen

ated

 Hem

oglobin

 (

6

BB

5

)',

 alpha

=

0

.

5

)


   

 plt

.xlabel

('

PC

1

')


   

 plt

.ylabel

('

PC

2

')


   

 plt

.title

('

P

CA

 of

 Hem

oglobin

 Tra

ject

ories

')


   

 plt

.legend

()


   

 plt

.show

()



#

 Main

 script




p

db

_ids

 =

 ['

1

A

3

N

',

 '

6

BB

5

']


for

 pdb

_id

 in

 pdb

_ids

:


   

 download

_p

db

(p

db

_id

)


   

 preprocess

_p

db

(p

db

_id

)


   

 run

_sim

ulation

(p

db

_id

)



p

ca

_

1

A

3

N

 =

 perform

_p

ca

('

1

A

3

N

')


p

ca

_

6

BB

5

 =

 perform

_p

ca

('

6

BB

5

')


plot

_p

ca

(p

ca

_

1

A

3

N

,

 p

ca

_

6

BB

5

)


``

`



This

 script

 will

 download

 the

 P

DB

 files

,

 preprocess

 them

,

 run

 simulations

,

 perform

 PCA

 on

 the

 trajectories

,

 and

 plot

 the

 PCA

 results

.

 Make

 sure

 you

 have

 the

 necessary

 Python

 packages

 installed

 (`

requests

`,

 `

p

db

fix

er

`,

 `

open

mm

`,

 `

md

traj

`,

 `

numpy

`,

 `

mat

plotlib

`)

 before

 running

 the

 script

.

To simulate deoxygenated hemoglobin (PDB ID: 1A3N) and oxygenated hemoglobin (PDB ID: 6BB5), and then perform a Principal Component Analysis (PCA) on the trajectories, we will follow these steps:

1. Download the PDB files for both hemoglobin structures.
2. Preprocess the PDB files using PDBFixer to ensure they are suitable for simulation.
3. Set up and run molecular dynamics simulations using OpenMM.
4. Analyze the trajectories using MDTraj to perform PCA.
5. Plot the PCA results.

Below is the complete Python script to perform these tasks:

```python
import os
import requests
from pdbfixer import PDBFixer
from openmm.app import *
from openmm import *
from openmm.unit import *
import mdtraj as md
import numpy as np
import matplotlib.pyplot as plt

def download_pdb(pdb_id):
    url = f"https://files.rcsb.org/download/{pdb_id}.pdb"
    response = requests.get(url)
    with open(f"{pdb_id}.pdb", "wb") as file:
        file.write(response.content)

def preprocess_pdb(pdb_id):
    fixer = 

In [6]:
# Record when the model call finished; compare with the earlier banner to see
# how long the request took.
now = datetime.datetime.now()
date, time = now.strftime("%Y-%m-%d"), now.strftime("%H:%M:%S")
print("date:", date)
print("time:", time)

date: 2024-10-16
time: 20:46:06


In [1]:
# TEST THE CODE 

import requests
from pdbfixer import PDBFixer
from openmm.app import *
from openmm import *
from openmm.unit import *
import mdtraj as md
import numpy as np
import matplotlib.pyplot as plt

def download_pdb(pdb_id: str, timeout: float = 30.0) -> None:
    """Download ``{pdb_id}.pdb`` from RCSB into the current directory.

    Args:
        pdb_id: PDB identifier, e.g. ``"1A3N"``; used both in the URL and as
            the output file stem.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status
            (for example an unknown PDB id).
        requests.RequestException: On connection problems or timeout.
    """
    url = f"https://files.rcsb.org/download/{pdb_id}.pdb"
    response = requests.get(url, timeout=timeout)
    # Fail here rather than writing an HTML error page to disk as a ".pdb"
    # file, which would only surface later as a confusing parse error.
    response.raise_for_status()
    with open(f"{pdb_id}.pdb", "wb") as file:
        file.write(response.content)

def preprocess_pdb(pdb_id: str, *, remove_heterogens: bool = False) -> None:
    """Repair ``{pdb_id}.pdb`` with PDBFixer and write ``{pdb_id}_fixed.pdb``.

    Missing residues are located, missing heavy atoms and hydrogens are added,
    and the result is written next to the input file.

    Args:
        pdb_id: PDB identifier; ``{pdb_id}.pdb`` must already exist in the
            current directory.
        remove_heterogens: When true, strip non-water heterogens before
            repairing the structure. The recorded run in this file failed in
            ``createSystem`` with "No template found for residue 575 (HEM)",
            i.e. the force field used by ``run_simulation`` has no parameters
            for the heme group. Removing heterogens avoids that error but
            also removes the heme from the model, so it is opt-in and the
            default keeps the original behaviour.
    """
    fixer = PDBFixer(filename=f"{pdb_id}.pdb")
    fixer.findMissingResidues()
    if remove_heterogens:
        # keepWater=True: only ligands/cofactors are dropped, waters stay.
        fixer.removeHeterogens(keepWater=True)
    fixer.findMissingAtoms()
    fixer.addMissingAtoms()
    fixer.addMissingHydrogens()
    # Use a context manager so the output file is flushed and closed before
    # run_simulation / MDTraj read it back (the original leaked the handle).
    with open(f"{pdb_id}_fixed.pdb", "w") as fixed_file:
        PDBFile.writeFile(fixer.topology, fixer.positions, fixed_file)

def run_simulation(pdb_id, simulation_time_ns=10):
    """Minimize ``{pdb_id}_fixed.pdb`` and run Langevin dynamics on it.

    Builds an Amber14/TIP3P system with PME electrostatics and H-bond
    constraints, minimizes the energy, then integrates at 300 K with a 2 fs
    step. Coordinates go to ``{pdb_id}_trajectory.dcd`` and step, potential
    energy and temperature are printed to standard output, both every 1000
    steps.

    Args:
        pdb_id: PDB identifier; ``{pdb_id}_fixed.pdb`` must exist.
        simulation_time_ns: Length of the production run in nanoseconds.

    Raises:
        ValueError: From ``createSystem`` when the force field has no template
            for a residue in the topology (the recorded run in this file hit
            this for the HEM residue).
    """
    # No import in this cell binds a bare `stdout`, so the original
    # `StateDataReporter(stdout, ...)` would raise NameError once system
    # creation succeeds. Import it locally to keep the block self-contained.
    import sys

    steps_per_ns = 500000  # 1 ns / 0.002 ps per step

    pdb = PDBFile(f"{pdb_id}_fixed.pdb")
    forcefield = ForceField('amber14-all.xml', 'amber14/tip3p.xml')
    system = forcefield.createSystem(pdb.topology, nonbondedMethod=PME, nonbondedCutoff=1*nanometer, constraints=HBonds)
    integrator = LangevinIntegrator(300*kelvin, 1/picosecond, 0.002*picoseconds)
    simulation = Simulation(pdb.topology, system, integrator)
    simulation.context.setPositions(pdb.positions)
    simulation.minimizeEnergy()
    # Reporters are attached after minimization so only dynamics is recorded.
    simulation.reporters.append(DCDReporter(f'{pdb_id}_trajectory.dcd', 1000))
    simulation.reporters.append(StateDataReporter(sys.stdout, 1000, step=True, potentialEnergy=True, temperature=True))
    simulation.step(int(simulation_time_ns * steps_per_ns))

def perform_pca(pdb_id):
    """Project a trajectory's C-alpha motion onto its top two principal components.

    Loads ``{pdb_id}_trajectory.dcd`` with ``{pdb_id}_fixed.pdb`` as topology,
    superposes every frame onto frame 0, keeps only protein C-alpha atoms,
    and diagonalizes the covariance of the flattened, mean-centered
    coordinates.

    Args:
        pdb_id: PDB identifier used to locate the trajectory and topology.

    Returns:
        Array with one row per frame and two columns (PC1, PC2). The principal
        axes are computed from this trajectory alone.
    """
    trajectory = md.load_dcd(f'{pdb_id}_trajectory.dcd', top=f'{pdb_id}_fixed.pdb')
    trajectory.superpose(trajectory, 0)

    ca_only = trajectory.atom_slice(trajectory.topology.select('protein and name CA'))
    coords = ca_only.xyz
    # Center on the mean over the first (frame) axis, then flatten each frame
    # into a single row of coordinates.
    deviations = coords - np.mean(coords, axis=0)
    flat = deviations.reshape(deviations.shape[0], -1)

    # np.cov treats rows as variables, hence the transpose.
    evals, evecs = np.linalg.eigh(np.cov(flat.T))
    # eigh returns ascending eigenvalues; reorder so the largest come first.
    descending = np.argsort(evals)[::-1]
    leading_two = evecs[:, descending][:, :2]
    return np.dot(flat, leading_two)

def plot_pca(pca1, pca2):
    """Scatter two PCA projections on a shared PC1/PC2 figure and show it.

    Args:
        pca1: 2-column projection plotted as deoxygenated hemoglobin (1A3N).
        pca2: 2-column projection plotted as oxygenated hemoglobin (6BB5).
    """
    series = (
        (pca1, 'Deoxygenated Hemoglobin (1A3N)'),
        (pca2, 'Oxygenated Hemoglobin (6BB5)'),
    )

    plt.figure(figsize=(10, 5))
    for projection, legend_text in series:
        plt.scatter(projection[:, 0], projection[:, 1], label=legend_text, alpha=0.5)

    plt.xlabel('PC1')
    plt.ylabel('PC2')
    plt.title('PCA of Hemoglobin Trajectories')
    plt.legend()
    plt.show()

# Main script
# Run download -> repair -> simulate for each structure, then compute and plot
# the per-structure PCA projections together.
pdb_ids = ['1A3N', '6BB5']
for pdb_id in pdb_ids:
    for stage in (download_pdb, preprocess_pdb, run_simulation):
        stage(pdb_id)

pca_1A3N, pca_6BB5 = (perform_pca(code) for code in ('1A3N', '6BB5'))
plot_pca(pca_1A3N, pca_6BB5)

ValueError: No template found for residue 575 (HEM).  This might mean your input topology is missing some atoms or bonds, or possibly that you are using the wrong force field.  For more information, see https://github.com/openmm/openmm/wiki/Frequently-Asked-Questions#template

In [1]:
!ls

1A3N_fixed.pdb	1A3N.pdb  exp_14.ipynb
