In [1]:
import datetime
from mdcrow.utils import _make_llm

In [2]:
prompt = "Write me a code to run simulation of fibronectin."
model = "gpt-4o-2024-08-06"

In [3]:
# Parameters
prompt = "Download the PDB files for 8PFK and 8PFQ. Then, compare the secondary structures of the two proteins, including the number of atoms, secondary structures, number of chains, etc. "


In [4]:
llm = _make_llm(model, temp=0.1, streaming=True)

system_prompt = (
    "You are an expert molecular dynamics scientist, and your "
    "task is to respond to the question or "
    "solve the problem in its entirety to the best of your ability. "
    "If any part of the task requires you to perform an action that "
    "you are not capable of completing, please write a runnable "
    "Python script for that step and move on. For literature papers, "
    "use and process papers from the `paper_collection` folder. "
    "For .pdb files, download them from the RSCB website using `requests`. "
    "To preprocess PDB files, you will use PDBFixer. "
    "To get information about proteins, retrieve data from the UniProt database. "
    "For anything related to simulations, you will use OpenMM, "
    "and for anything related to analyses, you will use MDTraj. "
    "At the end, combine any scripts into one script. "
)
messages = [
    ("system", system_prompt),
    ("human", prompt),
]

now = datetime.datetime.now()
date = now.strftime("%Y-%m-%d")
print("date:",date)
time = now.strftime("%H:%M:%S")
print("time:",time)
print("LLM: ",llm.model_name,"\nTemperature: ",llm.temperature)

date: 2024-10-16
time: 20:44:32
LLM:  gpt-4o-2024-08-06 
Temperature:  0.1


In [5]:
ai_msg = llm.invoke(messages)
print(ai_msg.content)

To

 compare

 the

 secondary

 structures

 of

 the

 two

 proteins

 

8

PF

K

 and

 

8

PF

Q

,

 we

 will

 follow

 these

 steps

:



1

.

 Download

 the

 P

DB

 files

 for

 

8

PF

K

 and

 

8

PF

Q

.


2

.

 Pre

process

 the

 P

DB

 files

 using

 P

DB

Fix

er

 to

 ensure

 they

 are

 suitable

 for

 analysis

.


3

.

 Analyze

 the

 secondary

 structures

 using

 MDT

raj

.


4

.

 Compare

 the

 number

 of

 atoms

,

 secondary

 structures

,

 and

 number

 of

 chains

.



Let's

 start

 by

 writing

 a

 Python

 script

 to

 perform

 these

 tasks

.



```

python




import

 requests




from

 pdb

fix

er

 import

 P

DB

Fix

er




from

 open

mm

.app

 import

 P

DB

File




import

 md

traj

 as

 md





def

 download

_p

db

(p

db

_id

):


   

 url

 =

 f

"https

://

files

.rc

sb

.org

/download

/{

p

db

_id

}.

p

db

"


   

 response

 =

 requests

.get

(url

)


   

 if

 response

.status

_code

 ==

 

200

:


       

 with

 open

(f

"{

p

db

_id

}.

p

db

",

 "

wb

")

 as

 file

:


           

 file

.write

(response

.content

)


   

 else

:


       

 raise

 Exception

(f

"

Failed

 to

 download

 P

DB

 file

 for

 {

p

db

_id

}")



def

 preprocess

_p

db

(p

db

_id

):


   

 fixer

 =

 P

DB

Fix

er

(filename

=f

"{

p

db

_id

}.

p

db

")


   

 fixer

.find

Missing

Resid

ues

()


   

 fixer

.find

Missing

Atoms

()


   

 fixer

.add

Missing

Atoms

()


   

 fixer

.add

Missing

Hyd

rog

ens

()


   

 with

 open

(f

"{

p

db

_id

}_

fixed

.p

db

",

 "

w

")

 as

 file

:


       

 P

DB

File

.write

File

(f

ixer

.top

ology

,

 fixer

.positions

,

 file

)



def

 analyze

_structure

(p

db

_id

):


   

 traj

 =

 md

.load

_p

db

(f

"{

p

db

_id

}_

fixed

.p

db

")


   

 num

_atoms

 =

 traj

.n

_atoms




   

 num

_ch

ains

 =

 traj

.top

ology

.n

_ch

ains




   

 secondary

_structure

 =

 md

.compute

_d

ssp

(tr

aj

)


   

 return

 num

_atoms

,

 num

_ch

ains

,

 secondary

_structure





def

 main

():


   

 pdb

_ids

 =

 ["

8

PF

K

",

 "

8

PF

Q

"]


   

 for

 pdb

_id

 in

 pdb

_ids

:


       

 print

(f

"

Processing

 {

p

db

_id

}

...")


       

 download

_p

db

(p

db

_id

)


       

 preprocess

_p

db

(p

db

_id

)


    


   

 results

 =

 {}


   

 for

 pdb

_id

 in

 pdb

_ids

:


       

 num

_atoms

,

 num

_ch

ains

,

 secondary

_structure

 =

 analyze

_structure

(p

db

_id

)


       

 results

[p

db

_id

]

 =

 {


           

 "

num

_atoms

":

 num

_atoms

,


           

 "

num

_ch

ains

":

 num

_ch

ains

,


           

 "

secondary

_structure

":

 secondary

_structure




       

 }


    


   

 for

 pdb

_id

,

 data

 in

 results

.items

():


       

 print

(f

"\

n

Results

 for

 {

p

db

_id

}:

")


       

 print

(f

"

Number

 of

 atoms

:

 {

data

['

num

_atoms

']

}")


       

 print

(f

"

Number

 of

 chains

:

 {

data

['

num

_ch

ains

']

}")


       

 print

(f

"

Secondary

 structure

 (

first

 

10

 residues

):

 {

data

['

secondary

_structure

'][

0

][:

10

]}

")



if

 __

name

__

 ==

 "__

main

__":


   

 main

()


``

`



This

 script

 will

 download

 the

 P

DB

 files

 for

 

8

PF

K

 and

 

8

PF

Q

,

 preprocess

 them

,

 and

 then

 analyze

 their

 secondary

 structures

 using

 MDT

raj

.

 The

 results

 will

 include

 the

 number

 of

 atoms

,

 number

 of

 chains

,

 and

 a

 brief

 look

 at

 the

 secondary

 structure

 for

 the

 first

 

10

 residues

.

 You

 can

 run

 this

 script

 in

 a

 Python

 environment

 with

 the

 necessary

 packages

 installed

.

To compare the secondary structures of the two proteins 8PFK and 8PFQ, we will follow these steps:

1. Download the PDB files for 8PFK and 8PFQ.
2. Preprocess the PDB files using PDBFixer to ensure they are suitable for analysis.
3. Analyze the secondary structures using MDTraj.
4. Compare the number of atoms, secondary structures, and number of chains.

Let's start by writing a Python script to perform these tasks.

```python
import requests
from pdbfixer import PDBFixer
from openmm.app import PDBFile
import mdtraj as md

def download_pdb(pdb_id):
    url = f"https://files.rcsb.org/download/{pdb_id}.pdb"
    response = requests.get(url)
    if response.status_code == 200:
        with open(f"{pdb_id}.pdb", "wb") as file:
            file.write(response.content)
    else:
        raise Exception(f"Failed to download PDB file for {pdb_id}")

def preprocess_pdb(pdb_id):
    fixer = PDBFixer(filename=f"{pdb_id}.pdb")
    fixer.findMissingResidues()
    fixer.findMissingAtoms()
    fixer.a

In [6]:
now = datetime.datetime.now()
date = now.strftime("%Y-%m-%d")
print("date:",date)
time = now.strftime("%H:%M:%S")
print("time:",time)

date: 2024-10-16
time: 20:44:40


In [1]:
# TEST THE CODE

import requests
from pdbfixer import PDBFixer
from openmm.app import PDBFile
import mdtraj as md

def download_pdb(pdb_id):
    url = f"https://files.rcsb.org/download/{pdb_id}.pdb"
    response = requests.get(url)
    if response.status_code == 200:
        with open(f"{pdb_id}.pdb", "wb") as file:
            file.write(response.content)
    else:
        raise Exception(f"Failed to download PDB file for {pdb_id}")

def preprocess_pdb(pdb_id):
    fixer = PDBFixer(filename=f"{pdb_id}.pdb")
    fixer.findMissingResidues()
    fixer.findMissingAtoms()
    fixer.addMissingAtoms()
    fixer.addMissingHydrogens()
    with open(f"{pdb_id}_fixed.pdb", "w") as file:
        PDBFile.writeFile(fixer.topology, fixer.positions, file)

def analyze_structure(pdb_id):
    traj = md.load_pdb(f"{pdb_id}_fixed.pdb")
    num_atoms = traj.n_atoms
    num_chains = traj.topology.n_chains
    secondary_structure = md.compute_dssp(traj)
    return num_atoms, num_chains, secondary_structure

def main():
    pdb_ids = ["8PFK", "8PFQ"]
    for pdb_id in pdb_ids:
        print(f"Processing {pdb_id}...")
        download_pdb(pdb_id)
        preprocess_pdb(pdb_id)
    
    results = {}
    for pdb_id in pdb_ids:
        num_atoms, num_chains, secondary_structure = analyze_structure(pdb_id)
        results[pdb_id] = {
            "num_atoms": num_atoms,
            "num_chains": num_chains,
            "secondary_structure": secondary_structure
        }
    
    for pdb_id, data in results.items():
        print(f"\nResults for {pdb_id}:")
        print(f"Number of atoms: {data['num_atoms']}")
        print(f"Number of chains: {data['num_chains']}")
        print(f"Secondary structure (first 10 residues): {data['secondary_structure'][0][:10]}")

if __name__ == "__main__":
    main()

Processing 8PFK...
Processing 8PFQ...

Results for 8PFK:
Number of atoms: 677
Number of chains: 2
Secondary structure (first 10 residues): ['NA' 'NA' 'NA' 'NA' 'NA' 'NA' 'NA' 'NA' 'NA' 'NA']

Results for 8PFQ:
Number of atoms: 1284
Number of chains: 6
Secondary structure (first 10 residues): ['NA' 'NA' 'NA' 'NA' 'NA' 'NA' 'NA' 'NA' 'NA' 'NA']


In [7]:
# check DSSP objects
traj = md.load("8PFK_fixed.pdb")
secondary_structure = md.compute_dssp(traj,simplified=True)[0] # last frame
print("Number of residues in total: ",traj.n_atoms)
print("Number of chains: ",traj.n_chains)
print("Number of residues in sheets: ",len([i for i in secondary_structure if i == 'E']))
print("Number of residues in helices: ",len([i for i in secondary_structure if i == 'H']))
print("Number of residues in coils: ",len([i for i in secondary_structure if i == 'C']))
print(secondary_structure[:10])

Number of residues in total:  677
Number of chains:  2
Number of residues in sheets:  0
Number of residues in helices:  0
Number of residues in coils:  0
['NA' 'NA' 'NA' 'NA' 'NA' 'NA' 'NA' 'NA' 'NA' 'NA']


In [8]:
import mdtraj as md
traj = md.load("8PFQ_fixed.pdb")
secondary_structure = md.compute_dssp(traj,simplified=True)[0]
print("Number of residues in total: ",traj.n_atoms)
print("Number of chains: ",traj.n_chains)
print("Number of residues in sheets: ",len([i for i in secondary_structure if i == 'E']))
print("Number of residues in helices: ",len([i for i in secondary_structure if i == 'H']))
print("Number of residues in coils: ",len([i for i in secondary_structure if i == 'C']))
print(secondary_structure[:10])

Number of residues in total:  1284
Number of chains:  6
Number of residues in sheets:  0
Number of residues in helices:  0
Number of residues in coils:  0
['NA' 'NA' 'NA' 'NA' 'NA' 'NA' 'NA' 'NA' 'NA' 'NA']


In [6]:
# look at raw files
import mdtraj as md
traj = md.load("8PFK.pdb")
secondary_structure = md.compute_dssp(traj,simplified=True)[0]
print("Number of residues in total: ",traj.n_atoms)
print("Number of chains: ",traj.n_chains)
print("Number of residues in sheets: ",len([i for i in secondary_structure if i == 'E']))
print("Number of residues in helices: ",len([i for i in secondary_structure if i == 'H']))
print("Number of residues in coils: ",len([i for i in secondary_structure if i == 'C']))
print(secondary_structure[:10])

Number of residues in total:  487
Number of chains:  2
Number of residues in sheets:  0
Number of residues in helices:  0
Number of residues in coils:  0
['NA' 'NA' 'NA' 'NA' 'NA' 'NA' 'NA' 'NA' 'NA' 'NA']


In [9]:
# look at raw files
import mdtraj as md
traj = md.load("8PFQ.pdb")
secondary_structure = md.compute_dssp(traj,simplified=True)[0]
print("Number of residues in total: ",traj.n_atoms)
print("Number of chains: ",traj.n_chains)
print("Number of residues in sheets: ",len([i for i in secondary_structure if i == 'E']))
print("Number of residues in helices: ",len([i for i in secondary_structure if i == 'H']))
print("Number of residues in coils: ",len([i for i in secondary_structure if i == 'C']))
print(secondary_structure[:10])

Number of residues in total:  950
Number of chains:  6
Number of residues in sheets:  0
Number of residues in helices:  0
Number of residues in coils:  0
['NA' 'NA' 'NA' 'NA' 'NA' 'NA' 'NA' 'NA' 'NA' 'NA']


In [1]:
!ls

8PFK_fixed.pdb	8PFK.pdb  8PFQ_fixed.pdb  8PFQ.pdb  exp_11.ipynb
