In [52]:
"""Conversion of flow cytometry data into conditional output distributions
   and response function with quintiles.

Author: Swarnavo Sarkar
Email: swarnavo.sarkar@nist.gov

If you are using SEMIL and any of the pre and postprocessing code, please cite 
'Mutual Information Landscapes as a Performance Metric for Biochemical Reaction Networks'
"""


import glob #filenames and pathnames utility
import os   #operating sytem utility
import numpy as np
import pandas as pd
import pickle
import random as rand
import sys
import math

In [53]:
# Directory where the cytometry dataframes are located
data_directory = '/Users/sns9/Research/IMS_project/FeedbackExpDec18/WTA'

output_directory = '/Users/sns9/Research/IMS_project/FeedbackExpDec18'

os.chdir(data_directory)

In [54]:
# Subsamples (with replacements) for obtaining output distributions
# will be used for correcting for finite-sampling
data_fractions = [1,2,5,10]
n_samples = list(range(1,6))

# Number of bins to bin cytometry data
n_bins = 100

# Quintiles for error bounds to the response curve
q_up, q_low = 95.0, 5.0

In [55]:
# Labels for plate and experimental replicate
plate_label = ['B','E']
rep_label = 'lacX'
tag = 'lacXWT_ml'
filter_string = 'lacX-'
conc_separator = '-'
plate_separator = '_'
data_fractions = [1,2,5,10]
n_samples = list(range(1,6))

In [56]:
coli_files = glob.glob('*.frame_pkl')

filenames = [file.rsplit('.',1)[0] for file in coli_files]

coli_frame = [ pickle.load(open(file, 'rb')) for file in coli_files ]

singlet_data = [frame.loc[frame['is_singlet']] for frame in coli_frame]

In [57]:
data_covered = []

location_string = {}
wt_string = {}

means = {}
quints = {}
percents = {}

In [58]:
index_set = None
data_size = 0

conclist = []
datas = {}

In [59]:
fl_channel = 'BL1-A-MEF'
glob_min = 1000000
glob_max = 0

for i, singlet in zip(range(len(singlet_data)), singlet_data):
    index_set = None
    for j in range(1):
        label, plate_no = filenames[i].split(plate_separator)
        this_plate = plate_no[0]

        if (plate_label[0] in plate_no or plate_label[1] in plate_no) and rep_label in label: # or plate_label[1] in plate_no
            #print(label)

            conc_v = float(label.lstrip(filter_string))#conc_separator)[1])
            if conc_v!=0.0:
                expo = math.log(conc_v)/math.log(2.0)
                if abs(expo-int(expo))<1e-16:
                    conc_value = str(conc_v)
                else:
                    conc_value = str(conc_v*1000)

                if conc_value not in data_covered:
                    data_covered.append(conc_value)
                    conclist.append(float(conc_value))

                    datas[conc_value] = singlet[fl_channel]
                    #print(len(singlet[fl_channel]))

                    glob_max = max(glob_max,max(singlet[fl_channel]))
                    glob_min = min(glob_min,min(singlet[fl_channel]))

In [60]:
conclist.sort()
print(conclist)

print(glob_max,glob_min)

[1.0, 2.0, 4.0, 8.0, 16.0, 32.0, 64.0, 128.0, 256.0, 512.0, 1024.0, 2048.0, 4096.0, 8192.0, 16384.0, 32768.0, 65536.0]
67470.46121929903 -555.8822283365369


In [61]:
bin_edge = np.linspace(0.0, glob_max-glob_min,n_bins)
bin_locs = np.zeros(shape=(bin_edge.shape[0]-1))

for k in range(0,len(bin_edge)-1):
    bin_locs[k] = 0.5*(bin_edge[k]+bin_edge[k+1])
    if k==0:
        locstring = str(0.5*(bin_edge[k]+bin_edge[k+1]))
    else:
        locstring += ','+str(0.5*(bin_edge[k]+bin_edge[k+1]))

dir_tag = plate_label[0]+plate_label[1]

os.chdir(output_directory)
print(os.getcwd())

/Users/sns9/Research/IMS_project/FeedbackExpDec18


In [62]:
# Write values of input at which the flow cytometry data was measured

try:
    os.mkdir(dir_tag+tag)
except OSError:
    pass
    
os.chdir(dir_tag+tag)

f = open('samples.txt','w')

for c in conclist:
    #if conclist.index(c)==0:
    print(c,file=f)

f.close()

In [63]:
response_set = np.zeros(shape=(len(conclist),4))

for i in range(0,len(conclist)):
    cs = str(conclist[i])
    darray = datas[cs].values

    darray = darray - glob_min

    pc = np.percentile(darray,[q_low,q_up])
    
    response_set[i,0] = conclist[i]
    response_set[i,1] = np.mean(darray)
    response_set[i,2] = q_up - response_set[i,1]
    response_set[i,3] = response_set[i,1] - q_low
    

np.savetxt('response.csv',response_set,delimiter=',',header='i,g,+,-',comments='')

In [64]:
conditional_distributions = np.zeros(shape=(len(conclist)+1,bin_locs.shape[0]))

conditional_distributions[0,:] = bin_locs

for i in range(0,len(conclist)):
    cs = str(conclist[i])
    darray = datas[cs].values
    darray_list = list(darray)

    hist, b_edges = np.histogram(darray_list,bin_edge)

    total_wt = np.sum(hist)
    
    conditional_distributions[i+1,:] = hist/total_wt

np.savetxt('expressions.csv',conditional_distributions,delimiter=',')

In [65]:
for df in data_fractions:
    for k in n_samples:
        if df>=1:
            df_str = str(int(df))
        else:
            df_s = str(df).split('.')
            df_str = df_s[0]+'p'+df_s[1]
            
        file_name = 'expressions'+df_str+'_'+str(k)+'.csv'
            
        conditional_distributions = np.zeros(shape=(len(conclist)+1,bin_locs.shape[0]))
        
        conditional_distributions[0,:] = bin_locs

        for i in range(0,len(conclist)):
            cs = str(c)
            darray = datas[cs].values
            darray_list = list(darray)
            sample_size = int(len(darray_list)/df)

            d_sampled = rand.choices(darray_list,k=sample_size)

            hist, b_edges = np.histogram(np.array(d_sampled),bin_edge)

            total_wt = np.sum(hist)
            
            conditional_distributions[i+1,:] = hist/total_wt
                       
        np.savetxt(file_name,conditional_distributions,delimiter=',')

SyntaxError: invalid syntax (<ipython-input-65-94f319484703>, line 15)