## Manipulate a CSV file using pandas to create a timeseries for each trajectory/residue pair

In [68]:
import pandas as pd

# Absolute path to the gathered CSV for the WT system (presumably wild-type -- confirm).
# The file name suggests it holds RMSD and salt-bridge data for PROJ17311 RUN3.
# NOTE(review): this is a hard-coded cluster path; the notebook only runs where it is mounted.
data_WT = '/data/chodera/glassw/covid_vir/clustering_output/pooling/proj17311/RUN3/longest_1000_clones/PROJ17311_gathered_strided1clones_rbd_ace2_data_RMSD_SALTBRIDGES_WT.csv'
# Paths for the other systems (RUN0 N439K, RUN1 K417V, RUN2 double) are kept for reference but disabled.
# data_N439K = '/data/chodera/glassw/covid_vir/clustering_output/pooling/proj17311/RUN0/longest_1000_clones/PROJ17311_gathered_strided1clones_rbd_ace2_data_RMSD_SALTBRIDGES_N439K.csv'
# data_K417V = '/data/chodera/glassw/covid_vir/clustering_output/pooling/proj17311/RUN1/longest_1000_clones/PROJ17311_gathered_strided1clones_rbd_ace2_data_RMSD_SALTBRIDGES_K417V.csv'
# data_double = '/data/chodera/glassw/covid_vir/clustering_output/pooling/proj17311/RUN2/longest_1000_clones/PROJ17311_gathered_strided1clones_rbd_ace2_data_RMSD_SALTBRIDGES_double.csv'

# Read the WT table into a DataFrame; the remaining cells only operate on df_WT.
df_WT = pd.read_csv(data_WT)
# df_N439K = pd.read_csv(data_N439K)
# df_K417V = pd.read_csv(data_K417V)
# df_double = pd.read_csv(data_double)

In [69]:
# Drop the duplicate column 'Unnamed: 0'.
# errors='ignore' keeps this cell idempotent: re-running it (or loading a CSV that
# does not contain the column) is a no-op instead of raising KeyError.
df_WT = df_WT.drop(columns=['Unnamed: 0'], errors='ignore')

In [71]:
# Split 'index' column into separate columns for RUN, CLONE, GEN, FRAME.
# The vectorised .str.split(expand=True) returns a frame with integer column labels
# (0, 1, 2, ...), matching what the per-row apply(pd.Series) produced, without
# building one Series object per row.
df_names = df_WT['index'].str.split('/', expand=True) # Split strings with /
df_names_gen = df_names[2].str.split('_', expand=True) # Split strings with _
df_names_gen = df_names_gen.rename(columns={0:2, 1:3}) # Rename columns so there aren't duplicate names
df_names_all = pd.concat([df_names.iloc[:, :2], df_names_gen], axis=1) # Merge all columns from index

In [72]:
# Merge all columns into one dataframe: the split-out name columns followed by
# every data column of df_WT.
# 'index' is removed by name rather than by position (columns[1:]), so this cell
# raises KeyError if it is re-run or if an earlier cell was skipped, instead of
# silently discarding whichever column happens to be first.
df_WT = pd.concat([df_names_all, df_WT.drop(columns=['index'])], axis=1)

In [78]:
# Give the four positional columns (labels 0-3) their descriptive names
df_WT = df_WT.rename(
    columns=dict(enumerate(['RUN', 'CLONE', 'GEN', 'FRAME']))
)

In [79]:
# Display the reshaped frame to check the new RUN, CLONE, GEN and FRAME columns.
df_WT

Unnamed: 0,RUN,CLONE,GEN,FRAME,d30_res417_mindist,e329_res439_mindist,e484_k31_mindist,e35_k31_mindist,e35_q493_mindist,q493_k31_mindist,k353_g496bb_mindist,d38_y449_dist_mindist,q42_y449_dist_mindist,k353bb_g502bb_dist_mindist
0,RUN3,CLONE90,results15,0,4.165311,12.436439,8.087312,4.591140,2.956972,6.363913,2.739529,2.628121,7.141880,2.907815
1,RUN3,CLONE921,results11,0,4.453142,7.399364,3.055828,6.303565,3.640519,2.698202,2.884891,4.155452,8.115247,2.882045
2,RUN3,CLONE1278,results15,0,2.669323,7.707051,6.748406,2.866097,2.845933,2.892717,2.823538,6.592561,5.447143,3.006423
3,RUN3,CLONE1109,results16,0,3.160269,13.217012,5.451661,2.694839,2.785712,3.186666,2.802255,7.641176,11.515643,3.086524
4,RUN3,CLONE739,results14,0,2.638952,8.415782,5.600727,5.401315,2.711570,4.662431,2.821253,5.334280,6.254385,3.170771
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27512,RUN3,CLONE92,results16,0,2.940364,11.033089,4.536322,5.155936,2.960145,5.213446,2.808574,2.713929,3.526672,3.016897
27513,RUN3,CLONE1940,results18,5,2.860862,10.678180,6.027110,2.784712,2.755525,2.966498,4.493479,5.091852,7.202576,2.951714
27514,RUN3,CLONE92,results16,5,2.784937,8.153837,8.886942,2.679869,3.678442,2.923084,2.746068,2.709133,3.032016,2.859286
27515,RUN3,CLONE1329,results18,0,2.708019,7.699651,7.616856,2.918454,2.845758,2.727394,5.615990,5.333676,3.759479,3.001877


In [89]:
# List the column labels; the timeseries cell below slices them positionally with columns[4:].
df_WT.columns

Index(['RUN', 'CLONE', 'GEN', 'FRAME', 'd30_res417_mindist',
       'e329_res439_mindist', 'e484_k31_mindist', 'e35_k31_mindist',
       'e35_q493_mindist', 'q493_k31_mindist', 'k353_g496bb_mindist',
       'd38_y449_dist_mindist', 'q42_y449_dist_mindist',
       'k353bb_g502bb_dist_mindist'],
      dtype='object')

In [91]:
# Create nested dict of timeseries data
# Key: clone (int), Value: dict of Key: residue-pair, Value: distances for each frame
N_CLONES = 2000  # clone ids 0..N_CLONES-1 always get an entry (empty lists when a clone has no rows)
residue_pairs = list(df_WT.columns[4:])  # every column after RUN, CLONE, GEN, FRAME

# GEN (e.g. 'results15') and FRAME (e.g. '5') are strings produced by splitting the
# 'index' column, so sorting on them directly is lexicographic ('results10' sorts
# before 'results9', '10' before '5'). Sort on their integer values instead so each
# timeseries is in chronological order. Values that contain no number raise here
# rather than being ordered arbitrarily.
df_WT_sorted = (
    df_WT.assign(
        _gen_number=df_WT['GEN'].astype(str).str.extract(r'(\d+)', expand=False).astype(int),
        _frame_number=pd.to_numeric(df_WT['FRAME']),
    )
    .sort_values(by=['_gen_number', '_frame_number'])
)

# One sort + one groupby replaces a full-frame filter and sort per (clone, residue-pair);
# groupby keeps the sorted row order inside every group.
rows_by_clone = {name: rows for name, rows in df_WT_sorted.groupby('CLONE', sort=False)}

d_timeseries = {}
for clone in range(N_CLONES):
    clone_rows = rows_by_clone.get(f'CLONE{clone}')
    d_timeseries[clone] = {
        residue_pair: [] if clone_rows is None else list(clone_rows[residue_pair])
        for residue_pair in residue_pairs
    }
      

In [100]:
import pickle

# Serialise the nested timeseries dict to a pickle file in the current working directory.
with open("17311_RUN3_longest1000_timeseries.pickle", 'wb') as pickle_file:
    pickle.dump(d_timeseries, pickle_file)