# Notebook to extract frames from metadynamics simulations using reduced bias info contained in the colvar file.

In [None]:
#This variant of the notebook is meant to be used together with CMIP calculations: frames are extracted into
#separate directories, one per dz interval.

## Importing

In [None]:
import numpy as np
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
import os
from pathlib import Path


from matplotlib.pyplot import figure

import pytraj as pt

## Functions

In [3]:
def select_colvar_bylargest_rbias(colvar_df,system,dvalues,nframes):
    """Select the COLVAR rows with the LARGEST rbias inside a (d1.z, d2.z) window.

    Parameters
    ----------
    colvar_df : pandas.DataFrame
        Table with at least the columns 'system', 'd1.z', 'd2.z' and 'rbias'.
    system : str
        Value of the 'system' column to keep.
    dvalues : sequence of 4 numbers
        Window bounds ordered as [d2.z max, d2.z min, d1.z max, d1.z min].
        All four bounds are inclusive.
    nframes : int
        Maximum number of rows to return.

    Returns
    -------
    pandas.DataFrame
        At most nframes rows of colvar_df (original index labels kept),
        sorted by decreasing rbias. Empty if nothing falls in the window.
    """
    d2z_max, d2z_min = dvalues[0], dvalues[1]
    d1z_max, d1z_min = dvalues[2], dvalues[3]

    in_window = (
        (colvar_df['system'] == system)
        & (colvar_df['d2.z'] <= d2z_max) & (colvar_df['d2.z'] >= d2z_min)
        & (colvar_df['d1.z'] <= d1z_max) & (colvar_df['d1.z'] >= d1z_min)
    )
    colvar_dz = colvar_df.loc[in_window]

    # A stable sort keeps rows with identical rbias in their original order,
    # so the selection is reproducible from run to run.
    colvar_largest_rbias = colvar_dz.sort_values(by=['rbias'], ascending=False, kind='mergesort')[:nframes]
    return colvar_largest_rbias

In [4]:
def select_alldata_bytime(colvar_sel,alldata_df,sys):
    """Map previously selected COLVAR rows onto rows of the trajectory table.

    A row of alldata_df matches a COLVAR row when 'system', 'chain', 'walker'
    and 'time (ps)' are all equal. Going through alldata_df (built from the
    trajectories themselves) rather than straight from COLVAR guards against
    extracting the wrong frame.

    Parameters
    ----------
    colvar_sel : pandas.DataFrame
        Selected COLVAR rows; needs the four matching columns plus 'rbias'.
    alldata_df : pandas.DataFrame
        Trajectory-derived table with the four matching columns.
    sys : str
        Unused; kept so existing calls keep working.

    Returns
    -------
    (list, list)
        sel_frames: index labels of the matching alldata_df rows.
        sel_rbias: the rbias of the COLVAR row that produced each entry of
        sel_frames, so both lists always have the same length and
        sel_rbias[n] belongs to sel_frames[n].
    """
    sel_frames=[]
    sel_rbias=[]
    for _,row in colvar_sel.iterrows():
        matches=alldata_df.index[
            (alldata_df['system']==row['system']) & (alldata_df['chain']==row['chain'])
            & (alldata_df['walker']==row['walker']) & (alldata_df['time (ps)']==row['time (ps)'])
        ].to_list()
        sel_frames.extend(matches)
        # One rbias per matched frame: a COLVAR row with no match (or several)
        # must not shift the rbias values of the following frames.
        sel_rbias.extend([row['rbias']]*len(matches))

    return(sel_frames, sel_rbias)

In [41]:
def write_pdb(alldata_df, sel_frames, sel_frames_rbias, trajspath, strip_mask, align_mask):
    """Extract the selected trajectory frames and write them as '<n>.pdb' in the current directory.

    For every entry of sel_frames the matching row of alldata_df provides the
    system, walker, chain and frame number ('time' column). The frame is loaded
    with pytraj from '<trajspath>/<system>_mw<walker>_ch<chain>.xtc' using the
    topology '<trajspath>/<system>_mon.pdb', stripped with strip_mask, and all
    frames after the first are aligned onto the first one with align_mask.

    Parameters
    ----------
    alldata_df : pandas.DataFrame
        Table with 'system', 'walker', 'chain' and 'time' columns.
    sel_frames : list of int
        Row positions in alldata_df (used with .iloc).
    sel_frames_rbias : list
        rbias for each entry of sel_frames; only used for the progress message.
    trajspath : str
        Directory holding the topology and trajectory files.
    strip_mask, align_mask : str
        Atom masks handed to pytraj for stripping and alignment.

    Notes
    -----
    Existing '<n>.pdb' files are overwritten.
    """
    ref=None
    for n,fr in enumerate(sel_frames):
        # Read the scalars directly instead of parsing Series.to_string(), whose
        # output can carry padding that would end up inside the file names.
        system=alldata_df['system'].iloc[fr]
        walker=alldata_df['walker'].iloc[fr]
        chain=alldata_df['chain'].iloc[fr]
        # The 'time' column is used as the 0-based frame number in the trajectory.
        traj_frame_from0=int(float(alldata_df['time'].iloc[fr]))
        topname='{trajspath}/{system}_mon.pdb'.format(trajspath=trajspath,system=system)
        trajname='{trajspath}/{system}_mw{walker}_ch{chain}.xtc'.format(trajspath=trajspath,
                                                                        system=system, walker=walker,
                                                                        chain=chain)
        print('extracting frame with rbias '+str(sel_frames_rbias[n]))
        current_structure=pt.load(trajname,top=topname,frame_indices=[traj_frame_from0])
        stripped_current=pt.strip(current_structure, strip_mask)
        if ref is None:
            # The first extracted frame is the alignment reference for the rest.
            ref=stripped_current.copy()
            aligned_structure=ref.copy()
        else:
            aligned_structure=pt.align(stripped_current,ref=ref,mask=align_mask, ref_mask=align_mask)
        outpdb='{n}.pdb'.format(n=n)
        pt.write_trajectory(outpdb,aligned_structure,format='pdb',overwrite=True)


In [30]:
def write_file(alldata_df, sel_frames, sel_frames_rbias,outfile):
    """Write a summary table of the selected frames to outfile.

    The file starts with a fixed header line, followed by one comma-separated
    line per selected frame: PDB index (position in sel_frames), walker, chain,
    rbias, d1.z, d2.z. Any existing file is overwritten.

    Parameters
    ----------
    alldata_df : pandas.DataFrame
        Table with 'walker', 'chain', 'd1.z' and 'd2.z' columns.
    sel_frames : list of int
        Row positions in alldata_df (used with .iloc).
    sel_frames_rbias : list
        rbias for each entry of sel_frames (same order).
    outfile : str or path-like
        Destination file.
    """
    def _cell(column, fr):
        # Same text pandas prints for the value, without any padding.
        return alldata_df[column].iloc[fr:fr+1].to_string(index=False).strip()

    # Open the file once; the context manager closes it even on errors.
    with open(outfile,'w') as out:
        out.write('PDB index, walker  chain  rbias  d1.z   d2.z\n')
        for n,fr in enumerate(sel_frames):
            out.write('{PDB_index},{walker},{chain},{rbias},{d1z},{d2z}\n'.format(
                PDB_index=n,walker=_cell('walker',fr), chain=_cell('chain',fr),
                rbias=sel_frames_rbias[n],d1z=_cell('d1.z',fr),d2z=_cell('d2.z',fr)))
    
        

## Defining global variables

In [55]:
# Labels for the four simulated systems.
# NOTE(review): `titles` is not in the same order as `names` ('mut Gext0' comes second
# here, 'mut_Glu0' second there) - confirm before using both lists side by side.
titles = [
    'wild Gext0',
    'mut Gext0',
    'mut Gext-',
    'wild Gext-',
]
# Short directory labels of the COLVAR runs.
titles2 = ['WP', 'WD', 'MP', 'MD']
# System names used in the dz analysis files and in the 'system' column.
names = [
    'wt_Glu0',
    'mut_Glu0',
    'wt_Glu-',
    'mut_Glu-',
]
# Walker ids 0..7 and the two chains.
walkers = np.arange(8)
chains = list('AB')

## Loading COLVAR data for rbias
I use the COLVAR files so that I can later select frames with the largest, smallest, or random reduced bias.

In [8]:
# Build one table with time, d1.z, d2.z and rbias for every system, chain and walker.
path='/orozco/projects/E-Dent/MILOSZ/meta/phase2/colvars'
plumed_files=[]

# COLVAR directory label -> system name stored in the 'system' column.
system_by_name={'WP':'wt_Glu0','WD':'wt_Glu-','MP':'mut_Glu0','MD':'mut_Glu-'}
# COLVAR columns read as (time, d1.z, d2.z, rbias) for each chain.
# An unknown chain now raises KeyError instead of silently reusing the previous table.
colvar_cols_by_chain={'A':[0,3,8,25],'B':[0,13,18,29]}

data=[]
for index,name in enumerate(titles2):
    data.append([])
    for chain in chains:
        for walker in walkers:
            # Both chains of a walker are read from the same COLVAR file.
            ftemp='{path}/{name}/{w}/COLVAR'.format(path=path,name=name,w=walker)
            dtemp=pd.read_csv(ftemp,delimiter=" ",comment='#',skipinitialspace=True,
                              usecols=colvar_cols_by_chain[chain],names=['time','d1.z','d2.z','rbias'])
            dtemp['chain']=chain
            dtemp['name']=name
            dtemp['walker']=walker
            # Integer time so rows can be matched exactly against the dz tables.
            dtemp['time (ps)']=np.round(dtemp['time']).astype(int)
            dtemp['system']=system_by_name[name]
            data[index].append(dtemp)

# For each system, concatenate all its chain/walker tables; the order is that of titles2.
colvar_sys=[pd.concat(per_system) for per_system in data]

# Now concatenate the four systems into a single table with a fresh 0..N-1 index.
colvar_allconc=pd.concat(colvar_sys,ignore_index=True)
# Keep only the rows whose time is a multiple of 500 ps.
colvar=colvar_allconc.loc[colvar_allconc['time (ps)']%500==0]

In [9]:
# Sanity check: 'time (ps)' of every 25th row among the first 100 rows.
colvar_allconc['time (ps)'].iloc[0:100:25].tolist()

[0, 500, 1000, 1500]

## Loading dz data for index
I calculated dz values on my trajectories to easily map each selected COLVAR entry to the index of the corresponding trajectory frame.

In [10]:
# Build one table of per-frame d1.z / d2.z values for every system, chain and walker.
dz_allconc=[]
# Charge label stored in the 'charge' column, keyed by system name.
charge_by_system={'wt_Glu0':'0','wt_Glu-':'-1','mut_Glu0':'0','mut_Glu-':'-1'}

data=[]
for index,name in enumerate(names):
    data.append([])
    for chain in chains:
        for walker in walkers:
            ftemp='/orozco/projects/E-Dent/VERONICA/DIMER_LARGER/mw_metad/analysis_phase2/dz/{system}/{system}.mw{w}.{ch}.analysis.dat'.format(system=name,w=walker,ch=chain)
            dtemp=pd.read_csv(ftemp,delimiter=" ",skipinitialspace=True,usecols=[0,1,2],names=['time','d1.z','d2.z'],skiprows=1)
            dtemp['ctrl']=name
            dtemp['charge']=charge_by_system[name]
            dtemp['chain']=chain
            dtemp['system']=name
            dtemp['walker']=walker
            # 'time' is multiplied by 500 to get ps; round before casting so a value
            # such as 2999.9999 becomes 3000 rather than being truncated to 2999.
            dtemp['time (ps)']=np.round(dtemp['time']*500).astype(int)
            data[index].append(dtemp)

# One table per system, then a single table with a fresh 0..N-1 index, so that
# index labels and row positions coincide.
data_sys=[pd.concat(per_system) for per_system in data]
alldata=pd.concat(data_sys,ignore_index=True)

In [57]:
# Preview the first rows of the combined dz table.
alldata.head(n=5)

Unnamed: 0,time,d1.z,d2.z,ctrl,charge,chain,system,walker,time (ps)
0,0.0,1.722141,12.372143,wt_Glu0,0,A,wt_Glu0,0,0
1,1.0,-2.002858,12.02714,wt_Glu0,0,A,wt_Glu0,0,500
2,2.0,-4.301784,11.988217,wt_Glu0,0,A,wt_Glu0,0,1000
3,3.0,-4.937858,10.472145,wt_Glu0,0,A,wt_Glu0,0,1500
4,4.0,-4.530357,10.499642,wt_Glu0,0,A,wt_Glu0,0,2000


In [22]:
current_path=Path.cwd()
# Path.cwd() returns a Path object, which never compares equal to a plain string,
# so the expected directory must be wrapped in Path() for the check to succeed.
if (current_path==Path('/orozco/projects/E-Dent/VERONICA/DIMER_LARGER/mw_metad/colvars_pka')):
    print('ok')

In [23]:
# Show the working directory recorded above.
print(str(current_path))

/orozco/projects/E-Dent/VERONICA/DIMER_LARGER/mw_metad/colvars_pka


## Select and write frames for each phase-space bin

In [54]:
### change here ####
nframes=2
dmin=0
dmax=3
dx=1.5
align_mask=':1-510@CA,C,N,O'
####################

sys='mut_Glu0'
# Strip mask for each supported system.
strip_masks={
    'wt_Glu0':'!((:1-675)|(@62678-62681))',
    'mut_Glu0':'!((:1-675)|(@63049-63052))',
}
if sys not in strip_masks:
    # Fail loudly instead of reusing a strip_mask left over from a previous run.
    raise ValueError('no strip_mask defined for system {sys!r}'.format(sys=sys))
strip_mask=strip_masks[sys]

# Keep only the COLVAR rows whose time is a multiple of 500 ps.
colvar=colvar_allconc.loc[colvar_allconc['time (ps)']%500==0]
trajspath="/orozco/projects/E-Dent/VERONICA/DIMER_LARGER/mw_metad/trajs_mon_wat/phase2"

#always start from here
os.chdir('/orozco/projects/E-Dent/VERONICA/DIMER_LARGER/mw_metad/colvars_pka')

#if sys dir does not exist, create it!
path=Path(sys)
path.mkdir(parents=True, exist_ok=True)
os.chdir(path)
# Absolute path of the system directory, so every bin can return to it safely.
sys_dir=Path.cwd()

# Number of bins of width dx in [dmin, dmax]; the small tolerance stops a float
# ratio such as 2.9999999999999996 from being truncated to 2.
nbins=int((dmax-dmin)/dx+1e-9)
bin_starts=np.linspace(dmin, dmax-dx, nbins)

for i in bin_starts:        # i = lower edge of the d1.z bin
    for j in bin_starts:    # j = lower edge of the d2.z bin
        #it can be chainA or chainB
        colvar_largest_rbias=select_colvar_bylargest_rbias(colvar,sys,[j+dx,j,i+dx,i],nframes)
        sel_frames, sel_frames_rbias=select_alldata_bytime(colvar_largest_rbias,alldata,sys)
        outdir='frames'+'_'+str(i)+'_'+str(j) #i=dz1, j=dz2
        outfile='frames_{dz1}_{dz2}.dat'.format(dz1=i,dz2=j)
        #if bin dir doesn't exist, create it!
        path=Path(outdir)
        path.mkdir(parents=True, exist_ok=True)
        # write_pdb writes into the current directory, so work from inside the bin
        # directory and always come back, even if a frame fails to load.
        os.chdir(path)
        try:
            write_file(alldata,sel_frames,sel_frames_rbias,outfile)
            write_pdb(alldata,sel_frames,sel_frames_rbias,trajspath,strip_mask,align_mask)
        finally:
            os.chdir(sys_dir)


extracting frame with rbias -19.818764
extracting frame with rbias -19.984807
extracting frame with rbias -14.799626
extracting frame with rbias -15.581884
extracting frame with rbias -23.276037
extracting frame with rbias -23.924541
extracting frame with rbias -40.143647
extracting frame with rbias -41.436974
