In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import rpy2
from rpy2.robjects import numpy2ri
from rpy2.robjects import pandas2ri
numpy2ri.activate()
pandas2ri.activate()
from sklearn.ensemble import RandomForestClassifier
import puff_lib
import pims
import os
import time

# Classifier V3.0: A New Hope??

We have a whole new set of cells. Let's sample puffs from each of those cells to build a training dataset. Ideally, I will have sampled randomly and have an equal number of cells from each condition. In reality, here's what condition_checker tells me:

```
condition_checker.py ~/UM\ Drive/OPP/OPP_scramble.log 0IYVHRNA.tif 19CC75ZU.tif 2TZWB6CN.tif 3W70AV4V.tif 3WE2GUYC.tif 5FWR8G3N.tif 5RTVNBXU.tif 6SG3YVW7.tif 7C4SV01F.tif 7XFW332I.tif 7XGJBQQ3.tif 7YFIHF8P.tif 8C92MS0M.tif 9VEA7WGA.tif
2 B2 cells, 6 MOR cells, 6 TfR Cells
```

So that's not...GREAT. But it's a start for a classifier. So, how do we do this?

## Sample Cells

Let's evaluate each of the cells that have been scored so far and find out the frequency of puffs and total number of puffs in those cells:

In [2]:
basedir = '/Volumes/Coatamer/Users/weinberz/UM Drive/OPP'
files = os.listdir(basedir)
tifs = [tif for tif in files if 'tif' in tif]
xmls = [xml for xml in files if 'xml' in xml]
movie_info = {}
for tif in tifs:
    movie_name = tif.split('.')[0]
    matches = [xml for xml in xmls if movie_name in xml]
    if matches:
        movie_info[movie_name] = {'tif' : tif,
                                  'xml' : matches[0]}

In [94]:
for movie in movie_info:
    events = pd.read_csv('data/'+movie+'.tif_events.csv',
                         usecols = [1,2,3,4], 
                         names = ['frame','x','y','particle'],
                         header=0)
#                          dtype = {'frame': np.int16, 'x' : np.int8, 'y' : np.int8, 'particle' : np.int32})
    marker_locs = puff_lib.import_xml_data('/Volumes/Coatamer/Users/weinberz/UM Drive/OPP/' + movie_info[movie]['xml'])
    puff_ids = np.array([puff_lib.filter_df(events, m, 5) for m in marker_locs])
    print('%d puffs in %d events (%.4f puff rate)' % (len(np.unique(puff_ids[puff_ids>0])),
                                                      len(np.unique(events['particle'])),
                                                      (len(np.unique(puff_ids[puff_ids>0]))/len(np.unique(events['particle'])))))

38 puffs in 28089 events (0.0014 puff rate)
124 puffs in 44733 events (0.0028 puff rate)
61 puffs in 22501 events (0.0027 puff rate)
64 puffs in 39856 events (0.0016 puff rate)
493 puffs in 37640 events (0.0131 puff rate)
107 puffs in 32159 events (0.0033 puff rate)
39 puffs in 33356 events (0.0012 puff rate)
519 puffs in 38728 events (0.0134 puff rate)
241 puffs in 28328 events (0.0085 puff rate)
265 puffs in 33864 events (0.0078 puff rate)
74 puffs in 28401 events (0.0026 puff rate)
16 puffs in 16951 events (0.0009 puff rate)
642 puffs in 46392 events (0.0138 puff rate)
201 puffs in 27119 events (0.0074 puff rate)


That's a large range :/. Let's go through each, pull enough events to contain at least 16 puffs (the minimum of the above), while maintaining the overall distribution of puffs-to-non-puffs in each movie.

In [3]:
np.random.seed(237)

movie = list(movie_info.keys())[6]
movie_file = movie_info[movie]['tif']

events = pd.read_csv('data/'+movie+'.tif_events.csv',
                     usecols = [1,2,3,4], 
                     names = ['frame','x','y','particle'],
                     header=0)
marker_locs = puff_lib.import_xml_data('/Volumes/Coatamer/Users/weinberz/UM Drive/OPP/' + movie_info[movie]['xml'])

puff_ids = np.unique([puff_lib.filter_df(events, m, 5) for m in marker_locs])
puff_ids = puff_ids[puff_ids>0]
nonpuff_ids = np.array([idx for idx in np.unique(events['particle']) if idx not in puff_ids])
num_puffs = len(np.unique(puff_ids))
num_events = len(np.unique(events['particle']))
puff_freq = num_puffs/num_events

if num_puffs <= 16:
    events_to_sample = events
else:
    puff_ids_to_sample = np.random.choice(puff_ids,16,replace=False)
    nonpuff_ids_to_sample = np.random.choice(nonpuff_ids,int(16/puff_freq), replace=False)
    sample_ids = np.hstack([puff_ids_to_sample, nonpuff_ids_to_sample])
    events_to_sample = events.loc[events['particle'].isin(sample_ids)]

In [5]:
frames = pims.TiffStack(basedir + os.sep + movie_file)
start = time.time()
intensities = puff_lib.intensity_grid(frames, events, delta=4)
scores = puff_lib.get_pc_scores(intensities)


1446.1493620872498


In [15]:
all_events = pd.read_csv('6SG3YVW7.tif_events.csv',usecols=[1,2,3,4], names=['frame','x','y','particle'], header=0)

In [26]:
num_pad=5
delta=4
f = pims.TiffStack('/Volumes/Coatamer/Users/weinberz/UM Drive/OPP/6SG3YVW7.tif')
marker_locs = puff_lib.import_xml_data('/Volumes/Coatamer/Users/weinberz/UM Drive/OPP/6SG3YVW7-ZYW-1.xml')
puff_ids = np.array([puff_lib.filter_df(all_events, m, 5) for m in marker_locs])
all_events['puff'] = all_events['particle'].isin(puff_ids)
all_events = puff_lib.pad_frames(all_events, np.unique(all_events['particle']), num_pad)
all_intensities = puff_lib.intensity_grid(f, all_events, delta)

In [42]:
all_events['puff']= all_events['particle'].isin(puff_ids).values.astype('int')

In [51]:
all_intensities['frame'] = pd.to_numeric(all_intensities['frame'],downcast='integer')
all_intensities['x'] = pd.to_numeric(all_intensities['x'],downcast='integer')
all_intensities['y'] = pd.to_numeric(all_intensities['y'],downcast='integer')
all_intensities['particle'] = pd.to_numeric(all_intensities['particle'],downcast='integer')
all_intensities['intensity'] = pd.to_numeric(all_intensities['intensity'],downcast='integer')
all_intensities.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 46567143 entries, 0 to 46567142
Data columns (total 5 columns):
frame        int16
x            int8
y            int8
particle     int32
intensity    int16
dtypes: int16(2), int32(1), int8(2)
memory usage: 444.1 MB


In [61]:
puff_intensities = all_intensities.loc[all_intensities['particle']==34]
puff_event = all_events.loc[all_events['particle'] == 34]

puff_scores = pandas2ri.ri2py(puff_lib.get_pc_scores(all_intensities.loc[all_intensities['particle']==34]))

In [66]:
puff_scores.head()

Unnamed: 0,frame,particle,s1,s2,s3,smooth1,smooth2,smooth3
0,0,34,-27.364928,-3.000417,0.3713,-27.343296,-2.977732,0.383188
1,1,34,-27.317731,-2.941639,0.3942,-27.330931,-2.970389,0.390719
2,2,34,-27.32672,-2.987848,0.405369,-27.321762,-2.968481,0.395706
3,3,34,-27.293761,-2.954721,0.387049,-27.324485,-2.967554,0.396249
4,4,34,-27.355328,-2.977831,0.407638,-27.340394,-2.965305,0.393953


In [64]:
pd.concat([puff_event.set_index(['frame','particle']),
           puff_scores.set_index(['frame','particle'])], sort=False, axis=1).reset_index()

Unnamed: 0,frame,particle,x,y,puff,s1,s2,s3,smooth1,smooth2,smooth3
0,0,34,120.0,86.0,1,-27.364928,-3.000417,0.371300,-27.343296,-2.977732,0.383188
1,1,34,120.0,86.0,1,-27.317731,-2.941639,0.394200,-27.330931,-2.970389,0.390719
2,2,34,121.0,86.0,1,-27.326720,-2.987848,0.405369,-27.321762,-2.968481,0.395706
3,3,34,121.0,86.0,1,-27.293761,-2.954721,0.387049,-27.324485,-2.967554,0.396249
4,4,34,121.0,86.0,1,-27.355328,-2.977831,0.407638,-27.340394,-2.965305,0.393953
5,5,34,120.0,86.0,1,-27.365564,-2.951040,0.381280,-27.352986,-2.962289,0.389209
6,6,34,120.0,86.0,1,-27.352179,-2.968034,0.378627,-27.351516,-2.960731,0.388945
7,7,34,120.0,86.0,1,-27.355502,-2.949758,0.400087,-27.339599,-2.961710,0.395960
8,8,34,121.0,86.0,1,-27.292170,-2.977541,0.408458,-27.330761,-2.963475,0.401639
9,9,34,120.0,86.0,1,-27.346474,-2.955720,0.405803,-27.338862,-2.961611,0.398740


In [71]:
len(np.unique(all_events['particle']))

38728