# For processing HBond xvg files

In [1]:
# Define a function to read xvg files

import pandas as pd

def read_xvg(file_path):
    """Read the data rows of an xvg file into a DataFrame of strings.

    Lines whose first non-blank character is ``#`` or ``@`` are skipped, and
    so are blank lines, so every row of the result corresponds to exactly one
    whitespace-separated data line of the file.

    Parameters
    ----------
    file_path : str or path-like
        Path of the xvg file to read.

    Returns
    -------
    pandas.DataFrame
        One row per data line and one column per whitespace-separated field.
        Values are kept as strings; numeric conversion is left to the caller.
    """
    data = []

    with open(file_path, 'r') as f:
        for line in f:
            stripped = line.strip()
            # A blank line would otherwise be stored as an empty list and turn
            # into an all-None row, shifting the position of every later row.
            if not stripped or stripped.startswith(('#', '@')):
                continue
            data.append(stripped.split())

    # Convert the list of lists into a Pandas DataFrame
    return pd.DataFrame(data)

In [8]:
import os
import pandas as pd
import fnmatch # looking for a specific string in a file name
import re 
import numpy as np

## Load every hydrogen-bond xvg file of the data directory into a dictionary

xvgpath = 'HBonds300823/'
directory = xvgpath

# Names in `directory` matching the shell-style pattern '*hbond.xvg'
# (fnmatch example: "hello*.js" matches names that start with hello and end with .js)
hbond = fnmatch.filter(os.listdir(directory), '*hbond.xvg')

# Map file name -> DataFrame returned by read_xvg. os.listdir yields bare
# names, so each one is joined with the directory before it is opened.
d_hbond = {
    file_name: read_xvg(os.path.join(directory, file_name))
    for file_name in hbond
}
    
    
## Discard the leading rows of every file and average what is left

avg = []

# d_hbond maps each file name to the DataFrame read from that file
for condition, frames in d_hbond.items():

    # remove the rows labelled 0 through 500, then turn the remaining
    # string cells into numbers
    kept = frames.drop(range(0, 501)).apply(pd.to_numeric)

    # column-wise mean; column 1 is the one that gets reported
    column_means = kept.mean(axis=0)

    # remember the file name together with its column-1 mean
    avg.append([condition, column_means[1]])

# one row per file
df_avg = pd.DataFrame(avg)

# name the two columns
df_avg.columns = ['Condition', 'Number of hydrogen bonds in the last 50ns']

# use the file name as the row index
df_avg = df_avg.set_index('Condition')
    
# Load the condition names (54 expected, going by the workbook name): take the
# 'Condition' column of the workbook, read with its first column as the index,
# and keep it as a plain Python list
run54 = list(
    pd.read_excel("54ConditionName120923.xlsx", index_col=0)['Condition']
)


## Average the per-file means over the replicas of each condition

value_column = 'Number of hydrogen bonds in the last 50ns'

hbond_avg = []

for condition in run54:

    # A file belongs to a condition when the condition name occurs literally
    # in its file name (the index of df_avg). A plain substring test is used
    # instead of re.search so that characters such as '.', '+' or '(' in a
    # condition name are not interpreted as regular-expression syntax.
    # NOTE(review): substring matching also selects files of any longer
    # condition name that contains this one - confirm the naming scheme
    # rules that out.
    replica_values = [
        value
        for file_name, value in df_avg[value_column].items()
        if condition in file_name
    ]

    # Mean over however many files matched (six replicas are expected per the
    # original notes; the count is not checked). A condition without any
    # matching file is recorded as NaN explicitly rather than through the
    # RuntimeWarning that np.mean raises on an empty list.
    avg = np.mean(replica_values) if replica_values else np.nan

    # keep the condition name together with its mean number of hydrogen bonds
    hbond_avg.append([condition, avg])

# Convert the list to a dataframe; passing the column names here also works
# when the list is empty.
df_hbond_avg = pd.DataFrame(hbond_avg, columns=['Condition', value_column])

# save to an excel file
#df_hbond_avg.to_excel("AvgHbondsNumberLast50ns.xlsx")