# Group Results by Name

> This notebook merges the CSV files containing photon and burst data 
> for the repeats of each sample.
>
> Before running this notebook, 
> you should run the notebook [Batch run notebook](Batch%20run%20notebook.ipynb) 
> to execute the burst search on each individual file and to export 
> the CSV files for photon and burst data.

In [1]:
from pathlib import Path
import pandas as pd
import json
from collections import defaultdict
import numpy as np

# Collect the data files

Folder of Photon-HDF5 files:

In [7]:
# Folder holding the Photon-HDF5 measurement files (path relative to this notebook).
data_folder = Path('../Relevant BH measurements/lacCONS Cy3B Atto647N RNAP/photonHDF5/minus8TA_minus5NTD/') 
# Fail early, with the offending path in the message, if the folder is missing.
assert data_folder.is_dir(), f'Folder not found: "{str(data_folder)}"'

Folder of CSV files containing burst data:

In [12]:
# Folder of the CSV files: a `results/<measurement>` folder next to the
# measurement folder, where `<measurement>` is the last component of
# `data_folder` (so the name is not duplicated as a string literal).
results_folder = data_folder.parent / 'results' / data_folder.name
results_folder

PosixPath('../Relevant BH measurements/lacCONS Cy3B Atto647N RNAP/photonHDF5/results/minus8TA_minus5NTD')

Collect filenames of Photon-HDF5 files with matching CSV files (to generate the CSV files you need to run the burst search for each file using the "Batch" notebook):

In [13]:
# File-name endings (before `.csv`) of the per-file burst tables:
# one for the FRET population and one for the D-only population.
fret_suffix, donly_suffix = '_FRET_bursts', '_Donly_bursts'

In [14]:
# List of HDF5 files corresponding to existing CSV files.
# The HDF5 files live in `data_folder`, so build the path from it directly
# instead of reconstructing it from the parents of the CSV path (which only
# works for one specific folder layout). Merged outputs of a previous run
# are skipped. The list is sorted so that the row order of `df_all`
# (and therefore the merge order of the repeats) is deterministic.
filelist = sorted(
    data_folder / (f.stem[:-len(fret_suffix)] + '.hdf5')
    for f in results_folder.glob(f'*{fret_suffix}.csv')
    if 'merge' not in f.name)
[f.name for f in filelist]

[]

Now we create a table of files. The index will be the file name, the column "name" is a label used to identify the sample and "repeat" is the repeat number for each sample.
We also add a few more columns that will be filled later.

In [15]:
# Table with one row per measurement file. Only `fname` is known at this
# point; the remaining columns start empty and are filled by later cells.
table_columns = ['fname', 'name', 'repeat', 'num_bursts_fret', 'num_bursts_donly',
                 'size', 'duration']
df_all = pd.DataFrame(columns=table_columns, index=range(len(filelist)))
df_all['fname'] = [h5_file.stem for h5_file in filelist]

Customize below if you want to extract a sample "name" 
according to a different pattern:

In [7]:
# Sample label: the file stem with the common measurement prefix removed.
# `regex=False` makes this a literal replacement on every pandas version.
# NOTE(review): only the prefix is stripped here, so a trailing repeat number
# (e.g. `..._1`) stays in `name` — confirm this is the intended pattern,
# because files are merged by identical `name`.
df_all['name'] = df_all['fname'].str.replace('minus8ta_minus5ntd_', '', regex=False)
# Repeat number: the last `_`-separated token of the file stem.
df_all['repeat'] = df_all['fname'].str.split('_').str.get(-1).astype('int')
# NOTE: this cell is not idempotent; after `set_index` the `fname` column is
# gone, so re-running it requires re-running the cell that creates `df_all`.
df_all = df_all.set_index('fname')
df_all

Unnamed: 0_level_0,name,repeat,num_bursts_fret,num_bursts_donly,size,duration
fname,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
dsdna_d17_1,d17,1,,,,
dsdna_d17_2,d17,2,,,,
dsdna_d17_3,d17,3,,,,
dsdna_d7+d17_50_50_1,d7+d17,1,,,,
dsdna_d7+d17_50_50_2,d7+d17,2,,,,
dsdna_d7+d17_50_50_3,d7+d17,3,,,,
dsdna_d7_1,d7,1,,,,
dsdna_d7_2,d7,2,,,,
dsdna_d7_3,d7,3,,,,


Now we fill all the other columns:

In [8]:
# Fill the per-file statistics (same reversed iteration order as before).
for h5_file in reversed(filelist):
    key = h5_file.stem
    # Size of the Photon-HDF5 file on disk, in MiB
    df_all.loc[key, 'size'] = h5_file.stat().st_size / 1024 / 1024

    # FRET bursts: count and time span between the first and last burst start
    fret_bursts = pd.read_csv(Path(results_folder, key + f'{fret_suffix}.csv'),
                              index_col=0)
    df_all.loc[key, 'num_bursts_fret'] = fret_bursts.shape[0]
    burst_starts = fret_bursts.t_start
    df_all.loc[key, 'duration'] = np.round(burst_starts.iloc[-1] - burst_starts.iloc[0])

    # D-only bursts: count only
    donly_bursts = pd.read_csv(Path(results_folder, key + f'{donly_suffix}.csv'),
                               index_col=0)
    df_all.loc[key, 'num_bursts_donly'] = donly_bursts.shape[0]
df_all

Unnamed: 0_level_0,name,repeat,num_bursts_fret,num_bursts_donly,size,duration
fname,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
dsdna_d17_1,d17,1,1689,2234,14.3285,1368
dsdna_d17_2,d17,2,1419,2190,12.9113,1245
dsdna_d17_3,d17,3,509,767,4.71971,444
dsdna_d7+d17_50_50_1,d7+d17,1,5981,6062,34.6641,1885
dsdna_d7+d17_50_50_2,d7+d17,2,4772,5967,29.8074,1590
dsdna_d7+d17_50_50_3,d7+d17,3,4218,6386,29.4961,1555
dsdna_d7_1,d7,1,575,2807,11.4511,788
dsdna_d7_2,d7,2,585,3162,12.7444,879
dsdna_d7_3,d7,3,322,1875,7.50941,511


# Concatenate photon data

All measurements with the same "name" will be merged into a single file.

We start by merging the photon data (timestamps, nanotimes, etc.):

In [9]:
# File-name endings (before `.csv`) of the per-file burst-photon tables:
# one for the FRET population and one for the D-only population.
fret_suffixp, donly_suffixp = '_FRET_burst_photons', '_Donly_burst_photons'

In [10]:
def _read_json_header(path):
    """Return the JSON metadata stored on the first line of the file `path`."""
    # Read only the first line: the rest of the file is the (possibly large)
    # CSV table and does not need to be loaded to get the header.
    with open(path, mode='rt') as fh:
        return json.loads(fh.readline())


def merge_by_concentration(suffix, df_all, folder=None):
    """Merge the burst-photon CSV files of all the repeats of each sample.

    For every row of `df_all` the file `<folder>/<fname><suffix>.csv` is read
    (first line: JSON metadata, then a CSV table indexed by `burst` and `ph`).
    Files sharing the same `name` are concatenated in order of `repeat`,
    after shifting burst ids and timestamps so that both keep increasing
    across repeats.

    Arguments:
        suffix (str): file-name ending (before `.csv`) of the files to merge.
        df_all (DataFrame): table indexed by file name with columns `name`
            and `repeat`. It is modified in place: the columns
            `donly_lifetime`, `timestamp_unit` and `nanotime_unit` are filled
            with the values found in each file header.
        folder (path-like or None): folder containing the CSV files. If None
            (default), the notebook-level `results_folder` is used.

    Returns:
        dict mapping each sample name to the merged DataFrame, indexed by
        (`burst`, `ph`) and with an additional `repeat` column.

    Raises:
        AssertionError: if timestamps in a file are not sorted, if the
            time units differ between files, or if a consistency check
            on the merged data fails.
    """
    if folder is None:
        folder = results_folder

    # Materialize the groups first: `df_all` gains columns in the loop below
    # and should not be modified while the groupby is being iterated.
    groups = [(key, list(grp.index))
              for key, grp in df_all.groupby(['name', 'repeat'])]

    # `burstph_dict1`: key is the sample name, values are lists of DataFrame
    # (one per repeat, in order of repeat)
    burstph_dict1 = defaultdict(list)
    for (name, repeat), fnames in groups:
        for fname in fnames:
            burstph_fname = Path(folder, fname + f'{suffix}.csv')
            meta = _read_json_header(burstph_fname)
            df_all.loc[fname, 'donly_lifetime'] = meta['donly_lifetime']
            df_all.loc[fname, 'timestamp_unit'] = meta['timestamp_unit']
            df_all.loc[fname, 'nanotime_unit'] = meta['nanotime_unit']
            dx = pd.read_csv(burstph_fname, skiprows=1, index_col=[0, 1])
            assert (np.diff(dx['timestamp'].values) >= 0).all()
            dx['repeat'] = np.uint8(repeat)
            burstph_dict1[name].append(dx)
    # All the files must share the same time units
    assert np.allclose(df_all.timestamp_unit, df_all.timestamp_unit.mean())
    assert np.allclose(df_all.nanotime_unit, df_all.nanotime_unit.mean())

    # `burstph_dict`: like `burstph_dict1` but with burst ids and timestamps
    # that keep increasing across the repeats of a given sample
    burstph_dict = defaultdict(list)
    for name, burstph_list in burstph_dict1.items():
        burst_offset = 0
        ts_offset = 0
        for df in burstph_list:
            df2 = df.reset_index('burst')
            num_bursts = len(df2['burst'].unique())
            df2['burst'] = df2['burst'] + burst_offset
            burst_offset += num_bursts
            df2['timestamp'] = df2['timestamp'] + ts_offset
            # The shifted timestamps already include the previous offset, so
            # the new offset is simply the last shifted value. (Accumulating
            # it with `+=` would count the earlier repeats twice.)
            ts_offset = df2['timestamp'].values[-1]
            burstph_dict[name].append(
                df2.set_index('burst', append=True)
                   .reorder_levels(['burst', 'ph']))

    # Test consistency of burstph_dict and burstph_dict1
    for name, burstph_list in burstph_dict.items():
        prev_burst = -1
        for df1, df2 in zip(burstph_dict1[name], burstph_list):
            df1 = df1.reset_index()
            df2 = df2.reset_index()
            # Burst ids are sorted, contiguous and continue the previous repeat
            assert df2.burst.iloc[0] == df2.burst.min()
            assert df2.burst.iloc[-1] == df2.burst.max()
            assert (np.diff(df2.burst.unique()) == 1).all()
            assert df2.burst.iloc[0] == prev_burst + 1
            prev_burst = df2.burst.iloc[-1]

            assert (df1.burst == df2.burst - df2.burst.min()).all()
            # Timestamps differ only by a constant per-file shift
            ts_shift = df2.timestamp - df1.timestamp
            assert (ts_shift == ts_shift.iloc[0]).all()
            # Every other column must be unchanged (`equals` treats NaNs
            # in the same position as equal)
            for c in df1.columns:
                if c in ('burst', 'timestamp'):
                    continue
                assert df1[c].equals(df2[c])

    # Create a dict of DataFrame merging by sample name
    burstph_dict_merge = {n: pd.concat(df_list)
                          for n, df_list in burstph_dict.items()}

    # Test consistency of `burstph_dict_merge` and `burstph_dict`
    for n, df in burstph_dict_merge.items():
        assert df.shape[0] == sum(x.shape[0] for x in burstph_dict[n])
        num_bursts = len(df.reset_index().burst.unique())
        assert num_bursts == sum(len(x.reset_index().burst.unique())
                                 for x in burstph_dict[n])
        df2 = df.reset_index()
        assert df2.burst.iloc[0] == df2.burst.min()
        assert df2.burst.iloc[-1] == df2.burst.max()
        assert (np.diff(df2.burst.unique()) == 1).all()
    return burstph_dict_merge

## Merge photon data FRET

In [11]:
# Select the FRET burst-photon files; the following cells read `suffix`.
suffix = fret_suffixp
# Second `_`-separated token of the suffix (shown as the cell output).
suffix.split('_')[1]

'FRET'

In [12]:
# Merge the photon data of all repeats for the current `suffix`;
# this call also adds the header metadata columns to `df_all`.
burstph_dict_merge = merge_by_concentration(suffix, df_all)

## Save photon data FRET

In [13]:
# One row per sample name (in order of first appearance in `df_all`),
# holding the header metadata averaged over the repeats of that sample.
df_merge = pd.DataFrame(index=pd.Index(df_all['name'].unique(), name='name'))
by_sample = df_all.groupby('name')
df_merge['donly_lifetime'] = by_sample['donly_lifetime'].mean()
# Rounded to 9 decimals, as in the saved file headers
df_merge['timestamp_unit'] = np.round(by_sample['timestamp_unit'].mean(), 9)
df_merge['nanotime_unit'] = by_sample['nanotime_unit'].mean()
df_merge

Unnamed: 0_level_0,donly_lifetime,timestamp_unit,nanotime_unit
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
d17,3.905157,5e-08,1.467823e-11
d7+d17,3.939998,5e-08,1.467823e-11
d7,3.992275,5e-08,1.467823e-11


In [14]:
# The merged timestamps must never decrease; print the positions of any
# backward step before asserting, so a failure can be located.
for name in df_merge.index:
    ts_steps = burstph_dict_merge[name].timestamp.diff()
    print(f'Name {name}:', np.where(ts_steps < 0))
    # The first difference is NaN by construction, so it is skipped
    assert (ts_steps[1:] >= 0).all()

Name d17: (array([], dtype=int64),)
Name d7+d17: (array([], dtype=int64),)
Name d7: (array([], dtype=int64),)


In [15]:
# Save one merged photon file per sample: a JSON metadata line followed
# by the CSV table.
for name, row in df_merge.iterrows():
    burstph_fname = Path(results_folder, name + f'_merge{suffix}.csv')
    print(f' - Saving file "{burstph_fname.name}"')
    meta = dict(
        description=f"Merged photon data for {suffix.split('_')[1]} bursts for '{name}' measurement.",
        timestamp_unit=row.timestamp_unit,
        nanotime_unit=row.nanotime_unit,
        donly_lifetime=row.donly_lifetime)

    # `newline=''` is what pandas asks for when `to_csv` receives an open
    # file object: it prevents a second newline translation (and thus blank
    # lines between rows) on platforms where the line separator is not '\n'.
    with open(burstph_fname, mode='wt', newline='') as f:
        json.dump(meta, f)
        f.write('\n')
        burstph_dict_merge[name].to_csv(f)

 - Saving file "d17_merge_FRET_burst_photons.csv"
 - Saving file "d7+d17_merge_FRET_burst_photons.csv"
 - Saving file "d7_merge_FRET_burst_photons.csv"


## Merge photon data D-only

In [16]:
# Select the D-only burst-photon files; the following cells read `suffix`.
suffix = donly_suffixp
# Second `_`-separated token of the suffix (shown as the cell output).
suffix.split('_')[1]

'Donly'

In [17]:
# Display the currently selected suffix.
suffix

'_Donly_burst_photons'

In [18]:
# Merge the photon data of all repeats for the current (D-only) `suffix`.
burstph_dict_mergeD = merge_by_concentration(suffix, df_all)

## Save photon data D-only

In [19]:
# Save one merged D-only photon file per sample: a JSON metadata line
# followed by the CSV table.
for name, row in df_merge.iterrows():
    burstph_fname = Path(results_folder, name + f'_merge{suffix}.csv')
    print(f' - Saving file "{burstph_fname.name}"')
    meta = dict(
        description=f"Merged photon data for {suffix.split('_')[1]} bursts for '{name}' measurement.",
        timestamp_unit=row.timestamp_unit,
        nanotime_unit=row.nanotime_unit,
        donly_lifetime=row.donly_lifetime)

    # `newline=''` is what pandas asks for when `to_csv` receives an open
    # file object: it prevents a second newline translation (and thus blank
    # lines between rows) on platforms where the line separator is not '\n'.
    with open(burstph_fname, mode='wt', newline='') as f:
        json.dump(meta, f)
        f.write('\n')
        burstph_dict_mergeD[name].to_csv(f)

 - Saving file "d17_merge_Donly_burst_photons.csv"
 - Saving file "d7+d17_merge_Donly_burst_photons.csv"
 - Saving file "d7_merge_Donly_burst_photons.csv"


## Read the photon data

Load the merged photon data for all the samples
using only the variable `results_folder`:

In [20]:
# Select the FRET burst-photon files for the reload below.
suffix = fret_suffixp
suffix

'_FRET_burst_photons'

In [21]:
# Reload every merged FRET photon file found in `results_folder`.
burstph_dict_merge2 = {}
merge_tail = f'_merge{suffix}'
for fname in Path(results_folder).glob(f'*{merge_tail}.csv'):
    print(f'- Loading {fname}')
    # Strip exactly the tail that was appended when saving, so that sample
    # names containing '_' are recovered unchanged.
    name = fname.stem[:-len(merge_tail)]
    # The metadata is on the first line: read just that line instead of
    # loading the whole file a second time.
    with open(fname, mode='rt') as f:
        meta = json.loads(f.readline())
    photons = pd.read_csv(fname, skiprows=1, index_col=[0, 1])
    # Kept as a plain attribute, as before (pandas may warn about it).
    photons.meta = meta
    burstph_dict_merge2[name] = photons

- Loading ../Relevant BH measurements/dsDNA/results/d17_merge_FRET_burst_photons.csv


  import sys


- Loading ../Relevant BH measurements/dsDNA/results/d7+d17_merge_FRET_burst_photons.csv


  mask |= (ar1 == a)


- Loading ../Relevant BH measurements/dsDNA/results/d7_merge_FRET_burst_photons.csv


In [22]:
# Round-trip test: what was reloaded from disk must match, element by
# element, what was merged in memory.
for name, in_memory in burstph_dict_merge.items():
    reloaded = burstph_dict_merge2[name]
    assert (in_memory == reloaded).all().all()

In [23]:
# Select the D-only burst-photon files for the reload below.
suffix = donly_suffixp
suffix

'_Donly_burst_photons'

In [24]:
# Reload every merged D-only photon file found in `results_folder`.
burstph_dict_mergeD2 = {}
merge_tail = f'_merge{suffix}'
for fname in Path(results_folder).glob(f'*{merge_tail}.csv'):
    print(f'- Loading {fname}')
    # Strip exactly the tail that was appended when saving, so that sample
    # names containing '_' are recovered unchanged.
    name = fname.stem[:-len(merge_tail)]
    # The metadata is on the first line: read just that line instead of
    # loading the whole file a second time.
    with open(fname, mode='rt') as f:
        meta = json.loads(f.readline())
    photons = pd.read_csv(fname, skiprows=1, index_col=[0, 1])
    # Kept as a plain attribute, as before (pandas may warn about it).
    photons.meta = meta
    burstph_dict_mergeD2[name] = photons

- Loading ../Relevant BH measurements/dsDNA/results/d17_merge_Donly_burst_photons.csv


  import sys


- Loading ../Relevant BH measurements/dsDNA/results/d7+d17_merge_Donly_burst_photons.csv
- Loading ../Relevant BH measurements/dsDNA/results/d7_merge_Donly_burst_photons.csv


In [25]:
# Round-trip test for the D-only data. Iterate over the D-only dict itself
# (not the FRET one), so the check does not depend on the two dicts
# happening to share the same keys.
for name in burstph_dict_mergeD:
    assert (burstph_dict_mergeD[name] == burstph_dict_mergeD2[name]).all().all()

# Concatenate burst data

## Merge FRET bursts

In [26]:
# Select the FRET burst tables; the following cells read `suffix`.
suffix = fret_suffix
suffix

'_FRET_bursts'

In [27]:
# `burst_dict`: key is the sample name, values are lists of DataFrame
# (one burst table per file, in the row order of `df_all`)
burst_dict = defaultdict(list)
for fname, row in df_all.iterrows():
    csv_path = Path(results_folder, fname + f'{suffix}.csv')
    # Tag every burst with the repeat it comes from
    bursts = pd.read_csv(csv_path, index_col=0).assign(repeat=np.uint8(row['repeat']))
    burst_dict[row['name']].append(bursts)

In [28]:
# Show the sample names found.
burst_dict.keys()

dict_keys(['d17', 'd7+d17', 'd7'])

In [29]:
# Concatenate the burst tables of each sample, renumbering the bursts
# from 0 and naming the new index `burst`.
burst_dict_merge = {
    sample: pd.concat(tables, ignore_index=True).rename_axis('burst')
    for sample, tables in burst_dict.items()
}

In [30]:
# Show the sample names of the merged burst tables.
burst_dict_merge.keys()

dict_keys(['d17', 'd7+d17', 'd7'])

In [31]:
# Test consistency of burstph_dict_merge and burst_dict_merge
for n in burstph_dict_merge:
    ph = burstph_dict_merge[n]
    bu = burst_dict_merge[n]
    ph_timestamps = ph.groupby('burst')['timestamp']
    # Number of photons in each burst
    assert (ph_timestamps.count() == bu.size_raw).all()

    # Burst width in ms. Use the timestamp unit read from the file headers
    # (stored in `df_all` by `merge_by_concentration`) instead of a
    # hard-coded 50 ns, so the check stays valid for other setups.
    ts_unit = df_all.loc[df_all['name'] == n, 'timestamp_unit'].iloc[0]
    width = (ph_timestamps.max() - ph_timestamps.min()) * ts_unit * 1e3
    assert np.allclose(width, bu.width_ms)

    # The raw size is the sum of the corrected counts and the backgrounds
    size = bu.nd + bu.na + bu.nda + bu.naa + bu.bg_dd + bu.bg_ad + bu.bg_da + bu.bg_aa
    assert np.allclose(size, bu.size_raw)

## Save FRET burst data

In [32]:
# Select the FRET burst tables for the save loop below.
suffix = fret_suffix
suffix

'_FRET_bursts'

In [33]:
# Write one merged FRET burst table per sample.
for name in df_merge.index:
    out_path = Path(results_folder, name + f'_merge{suffix}.csv')
    print(f' - Saving file "{out_path.name}"')
    burst_dict_merge[name].to_csv(out_path)

 - Saving file "d17_merge_FRET_bursts.csv"
 - Saving file "d7+d17_merge_FRET_bursts.csv"
 - Saving file "d7_merge_FRET_bursts.csv"


## Merge D-only bursts

In [34]:
# Select the D-only burst tables; the following cells read `suffix`.
suffix = donly_suffix
suffix

'_Donly_bursts'

In [35]:
# `burst_dict`: key is the sample name, values are lists of DataFrame
# (one D-only burst table per file, in the row order of `df_all`).
# Note: this rebinds the `burst_dict` name used earlier for the FRET bursts.
burst_dict = defaultdict(list)
for fname, row in df_all.iterrows():
    csv_path = Path(results_folder, fname + f'{suffix}.csv')
    # Tag every burst with the repeat it comes from
    bursts = pd.read_csv(csv_path, index_col=0).assign(repeat=np.uint8(row['repeat']))
    burst_dict[row['name']].append(bursts)

In [36]:
# Concatenate the D-only burst tables of each sample, renumbering the
# bursts from 0 and naming the new index `burst`.
burst_dict_mergeD = {
    sample: pd.concat(tables, ignore_index=True).rename_axis('burst')
    for sample, tables in burst_dict.items()
}

In [37]:
# Test consistency of burstph_dict_mergeD and burst_dict_mergeD
for n in burstph_dict_mergeD:
    ph = burstph_dict_mergeD[n]
    bu = burst_dict_mergeD[n]
    ph_timestamps = ph.groupby('burst')['timestamp']
    # Number of photons in each burst
    assert (ph_timestamps.count() == bu.size_raw).all()

    # Burst width in ms. Use the timestamp unit read from the file headers
    # (stored in `df_all` by `merge_by_concentration`) instead of a
    # hard-coded 50 ns, so the check stays valid for other setups.
    ts_unit = df_all.loc[df_all['name'] == n, 'timestamp_unit'].iloc[0]
    width = (ph_timestamps.max() - ph_timestamps.min()) * ts_unit * 1e3
    assert np.allclose(width, bu.width_ms)

    # The raw size is the sum of the corrected counts and the backgrounds
    size = bu.nd + bu.na + bu.nda + bu.naa + bu.bg_dd + bu.bg_ad + bu.bg_da + bu.bg_aa
    assert np.allclose(size, bu.size_raw)

## Save Donly burst data

In [38]:
# Select the D-only burst tables for the save loop below.
suffix = donly_suffix
suffix

'_Donly_bursts'

In [39]:
# Write one merged D-only burst table per sample.
for name in df_merge.index:
    out_path = Path(results_folder, name + f'_merge{suffix}.csv')
    print(f' - Saving file "{out_path.name}"')
    burst_dict_mergeD[name].to_csv(out_path)

 - Saving file "d17_merge_Donly_bursts.csv"
 - Saving file "d7+d17_merge_Donly_bursts.csv"
 - Saving file "d7_merge_Donly_bursts.csv"
