# Processing rat data - draft 2

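First, import the coding file (`treatment_index.csv`). Its `time` and `date` columns are combined into a single `start_time` timestamp while the file is read.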

In [1]:
import numpy as np
import pandas as pd

# CSV file holding the treatment coding table.
coding_file = 'treatment_index.csv'

# Column dtypes handed to pandas.read_csv. 'date' and 'time' are kept as text
# so that date_parser can combine them into a single timestamp.
dtypes = dict(
    date='str',
    rat='int',
    cage='int',
    time='str',
    duration='int',
    drug='str',
    dose='float',
)

# Merge the 'time' and 'date' columns (in that order) into one 'start_time' column.
parse_dates = dict(start_time=['time', 'date'])

def date_parser(time, date):
    """Combine a clock time and a calendar date into one datetime.

    Parameters
    ----------
    time : str
        Time of day written as hour and minute digits with no separator,
        e.g. '1848'. Leading zeros that were dropped from the text
        ('945' for 09:45, '5' for 00:05) are restored by left-padding to
        four digits.
    date : str
        Date formatted as dd/mm/yy, e.g. '26/02/15'.

    Returns
    -------
    datetime.datetime
        The parsed timestamp.

    Raises
    ------
    ValueError
        If the padded time followed by the date does not match
        '%H%M%d/%m/%y'.
    """
    # Imported here so the function does not depend on the `pd.datetime`
    # alias, which was only a re-export of this class and no longer exists
    # in newer pandas releases.
    from datetime import datetime

    # zfill covers every short form, not only the three-character case.
    return datetime.strptime(time.zfill(4) + date, '%H%M%d/%m/%y')

# Read the coding table. read_csv applies `dtypes` to the raw columns and uses
# `date_parser` to build the 'start_time' column described by `parse_dates`.
coding = pd.read_csv(
    coding_file,
    dtype=dtypes,
    parse_dates=parse_dates,
    date_parser=date_parser,
)

# Check that 'start_time' was parsed to a datetime dtype, then show the table.
print(coding['start_time'].dtype)
print(coding)

datetime64[ns]
             start_time  rat  cage  duration    drug   dose
0   2015-02-26 18:48:00    3     3         8    Ex-4    1.0
1   2015-02-26 18:57:00   12    15         8    Ex-4    1.0
2   2015-03-03 18:55:00    9     9         8    Ex-4    1.0
3   2015-03-03 18:56:00   11    11         8    Ex-4    1.0
4   2015-03-04 18:45:00    5     5         8    Ex-4    1.0
5   2015-03-04 18:45:00    7     7         8    Ex-4    1.0
6   2015-03-04 18:45:00    2    13         8    Ex-4    1.0
7   2015-02-26 18:49:00    4     4         8   GLP-1   30.0
8   2015-02-26 18:49:00    5     5         8   GLP-1  100.0
9   2015-02-26 18:51:00    7     7         8   GLP-1  100.0
10  2015-02-26 18:51:00    8     8         8   GLP-1   30.0
11  2015-02-26 18:52:00    9     9         8   GLP-1  300.0
12  2015-02-26 18:53:00   10    10         8   GLP-1   30.0
13  2015-02-26 18:54:00   11    11         8   GLP-1  300.0
14  2015-02-26 18:55:00    2    13         8   GLP-1  100.0
15  2015-03-03 18:51:00  

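Now import the bout data: every `CSV` file in `data_to_process/` is parsed and the results are concatenated into a single DataFrame.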

In [2]:
import CLAMS_parsers as parser
import os
import matplotlib.pyplot as plt
import seaborn as sns

data_path = 'data_to_process/'

# os.listdir returns names in arbitrary order. The row order of the
# concatenated frame matters later on (adjacent rows are compared, and
# filter_data looks up "index + 1"), so sort to make every run identical.
file_list = sorted(os.listdir(data_path))

column_labels = ['start', 'stop', 'amount']

dfs = []
for data_file in file_list:
    if not data_file.endswith('CSV'):
        continue

    # The cage number is the last two characters of the middle filename
    # field, e.g. '2014-10-27.B0105.CSV' -> 5.
    cage_id = int(data_file.split('.')[1][-2:])

    # `bouts` (rather than `data`) so the loop does not reuse the name of the
    # final concatenated frame.
    num_id, subject_id, mass, bouts = parser.parse_bouts(os.path.join(data_path, data_file))

    this_data = pd.DataFrame(bouts, columns=column_labels)
    this_data['cage_id'] = cage_id
    this_data['filename'] = data_file

    dfs.append(this_data)

if not dfs:
    # pd.concat would fail on an empty list with a much less helpful message.
    raise ValueError("No CSV files found in " + data_path)

data = pd.concat(dfs, ignore_index=True)

# Bout length in seconds, as a float. Dividing by a one-second timedelta
# gives the same numbers for whole-second timestamps on every pandas version,
# whereas the result dtype of .astype('timedelta64[s]') depends on the version.
data['duration'] = (data['stop'] - data['start']) / np.timedelta64(1, 's')
data['rate'] = data['amount'] / data['duration']
print(data)

# Error flag filled in by the cleaning cells below: 0 = keep, 1 = drop.
data['dropme'] = 0

                    start                stop  amount  cage_id  \
0     2014-10-27 12:41:09 2014-10-27 12:41:37    0.21        5   
1     2014-10-28 09:07:06 2014-10-28 09:07:45    0.04        5   
2     2014-10-28 09:08:05 2014-10-28 09:08:51    0.17        5   
3     2014-10-28 09:09:09 2014-10-28 09:10:36    0.52        5   
4     2014-10-28 09:11:05 2014-10-28 09:11:26    0.18        5   
5     2014-10-28 09:11:49 2014-10-28 09:12:24    0.26        5   
6     2014-10-28 09:15:16 2014-10-28 09:15:24    0.04        5   
7     2014-10-28 09:18:15 2014-10-28 09:18:30    0.11        5   
8     2014-10-28 09:18:49 2014-10-28 09:19:00    0.10        5   
9     2014-10-28 09:19:29 2014-10-28 09:19:32    0.02        5   
10    2014-10-28 09:23:55 2014-10-28 09:24:08    0.09        5   
11    2014-10-28 09:24:33 2014-10-28 09:24:57    0.14        5   
12    2014-10-28 09:38:29 2014-10-28 09:38:58    0.06        5   
13    2014-10-28 09:39:19 2014-10-28 09:39:27    0.04        5   
14    2014

## Define selection function

We'll use this to slice out data for further processing

In [3]:
def filter_data(df, cage_id, start, stop):
    """Select the index labels of one cage's rows inside a time window.

    A row qualifies when its 'cage_id' equals `cage_id`, its 'start' is at or
    after `start`, and its 'stop' is at or before `stop`. The label that
    follows the last qualifying one (last label + 1) is appended as well,
    provided that row belongs to the same cage.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with 'start', 'stop' and 'cage_id' columns. Labels are assumed
        to be integers, since 1 is added to the last selected label.
    cage_id : cage identifier to match against df['cage_id'].
    start, stop : inclusive bounds compared with df['start'] and df['stop'].

    Returns
    -------
    pandas.Index of the selected labels, or the list [-1] when no row
    qualifies.
    """
    in_cage = df.index[(df['cage_id'] == cage_id).values]
    in_window = df.index[((df['start'] >= start) & (df['stop'] <= stop)).values]

    selected = in_window.intersection(in_cage)
    if not len(selected):
        # Sentinel checked by the caller.
        return [-1]

    # Candidate extra row: the label directly after the last selected one.
    # The final intersection throws it away again when that label is missing
    # or belongs to a different cage.
    following = pd.Index([selected[-1] + 1])
    return selected.union(following).intersection(in_cage)

## Remove data based on thresholds

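Flag outliers using hard thresholds. Flagged rows are marked with `dropme = 1` rather than deleted here; they are dropped later, experiment by experiment.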

In [4]:
# Hard limits; a row breaking any one of them is flagged in 'dropme'.
amt_max = 4       # largest accepted 'amount'
dur_max = 1000    # longest accepted 'duration' (seconds)
rate_max = 0.02   # largest accepted 'rate' (amount per second)
dur_min = 4       # shortest accepted 'duration' (seconds)

# One boolean mask per rule, combined into a single label-based assignment.
# This replaces four DataFrame.set_value calls: set_value is a scalar setter
# (later removed from pandas) and was being handed whole arrays of labels.
too_much = data['amount'] > amt_max
too_long = data['duration'] > dur_max
too_short = data['duration'] < dur_min
too_fast = data['rate'] > rate_max

data.loc[too_much | too_long | too_short | too_fast, 'dropme'] = 1

# Show a preview instead of the whole frame.
data.head(10)

Unnamed: 0,start,stop,amount,cage_id,filename,duration,rate,dropme
0,2014-10-27 12:41:09,2014-10-27 12:41:37,0.21,5,2014-10-27.B0105.CSV,28.0,0.007500,0
1,2014-10-28 09:07:06,2014-10-28 09:07:45,0.04,5,2014-10-27.B0105.CSV,39.0,0.001026,0
2,2014-10-28 09:08:05,2014-10-28 09:08:51,0.17,5,2014-10-27.B0105.CSV,46.0,0.003696,0
3,2014-10-28 09:09:09,2014-10-28 09:10:36,0.52,5,2014-10-27.B0105.CSV,87.0,0.005977,0
4,2014-10-28 09:11:05,2014-10-28 09:11:26,0.18,5,2014-10-27.B0105.CSV,21.0,0.008571,0
5,2014-10-28 09:11:49,2014-10-28 09:12:24,0.26,5,2014-10-27.B0105.CSV,35.0,0.007429,0
6,2014-10-28 09:15:16,2014-10-28 09:15:24,0.04,5,2014-10-27.B0105.CSV,8.0,0.005000,0
7,2014-10-28 09:18:15,2014-10-28 09:18:30,0.11,5,2014-10-27.B0105.CSV,15.0,0.007333,0
8,2014-10-28 09:18:49,2014-10-28 09:19:00,0.10,5,2014-10-27.B0105.CSV,11.0,0.009091,0
9,2014-10-28 09:19:29,2014-10-28 09:19:32,0.02,5,2014-10-27.B0105.CSV,3.0,0.006667,1


## Remove cancelling and negative readings

In [5]:
## Cancelling readings - spurious errors on scales that immediately cancel
# Take the following reading from within the same file only. A plain
# data.shift(-1) would also pair the last row of one file with the first row
# of the next file, i.e. readings that do not come from the same recording.
data['next_amount'] = data.groupby('filename')['amount'].shift(-1)
dup_ix = data.index[(data['next_amount'] == -1 * data['amount']).values]

# Flag both halves of each cancelling pair. Labels are consecutive integers
# (the frame was concatenated with ignore_index=True), so the partner of row i
# is row i + 1.
drop_mask = dup_ix.union(dup_ix + 1)
data.loc[drop_mask, 'dropme'] = 1

## Negative readings - scale value is negative due to rat standing on it etc.
data.loc[data['rate'] < 0, 'dropme'] = 1

# Show a preview instead of the whole frame.
data.head(10)

Unnamed: 0,start,stop,amount,cage_id,filename,duration,rate,dropme,next_amount
0,2014-10-27 12:41:09,2014-10-27 12:41:37,0.21,5,2014-10-27.B0105.CSV,28.0,0.007500,0,0.04
1,2014-10-28 09:07:06,2014-10-28 09:07:45,0.04,5,2014-10-27.B0105.CSV,39.0,0.001026,0,0.17
2,2014-10-28 09:08:05,2014-10-28 09:08:51,0.17,5,2014-10-27.B0105.CSV,46.0,0.003696,0,0.52
3,2014-10-28 09:09:09,2014-10-28 09:10:36,0.52,5,2014-10-27.B0105.CSV,87.0,0.005977,0,0.18
4,2014-10-28 09:11:05,2014-10-28 09:11:26,0.18,5,2014-10-27.B0105.CSV,21.0,0.008571,0,0.26
5,2014-10-28 09:11:49,2014-10-28 09:12:24,0.26,5,2014-10-27.B0105.CSV,35.0,0.007429,0,0.04
6,2014-10-28 09:15:16,2014-10-28 09:15:24,0.04,5,2014-10-27.B0105.CSV,8.0,0.005000,0,0.11
7,2014-10-28 09:18:15,2014-10-28 09:18:30,0.11,5,2014-10-27.B0105.CSV,15.0,0.007333,0,0.10
8,2014-10-28 09:18:49,2014-10-28 09:19:00,0.10,5,2014-10-27.B0105.CSV,11.0,0.009091,0,0.02
9,2014-10-28 09:19:29,2014-10-28 09:19:32,0.02,5,2014-10-27.B0105.CSV,3.0,0.006667,1,0.09


## Iterate over datasets

In [6]:
import bouts_from_data as bfd
reload(bfd)
import os

# Experiments left with fewer bouts than this after cleaning are skipped.
min_bout_count = 5

# Experiments with more flagged rows than this are skipped entirely.
err_thresh = 10

# Column names for the rows returned by bfd.get_events.
# NOTE(review): the recorded run of this cell stopped with
# "AssertionError: 9 columns passed, passed data had 10 columns", so this list
# is one name short of what bfd.get_events returns. The missing name and its
# position cannot be determined from this notebook -- confirm against
# bouts_from_data before trusting the export.
cols = ['f_length', 'g_start', 'rate', 'p_length', 'g_end_feeding',
        'period', 't_from_start', 'exp_cat', 'exp_id']

# Subset of `cols` written to each output file.
export_cols = ['f_length', 'g_start', 'rate', 'p_length', 'g_end_feeding']


def _report(*parts):
    """Print the parts separated by single spaces (valid on Python 2 and 3)."""
    print(' '.join(str(part) for part in parts))


def dark_light(row):
    """Return 'L' when the bout starts from 06:00 up to 17:59, else 'D'."""
    return 'L' if 6 <= row['start'].hour < 18 else 'D'


def fast_rec(row):
    """Return 'R' for bouts starting less than 20 h after 'exp_start', else 'N'."""
    # the long duration ones start at ~9am
    if row['exp_start'] + pd.Timedelta(hours=20) > row['start']:
        return 'R'
    return 'N'


def make_exp_cat(row):
    """Build the category label: drug_dose_fasting_period."""
    return '_'.join([row['drug'], str(row['dose']), row['fasting'], row['period']])


def make_exp_id(row):
    """Build the experiment id: the category label plus cage id and source filename."""
    return '_'.join([make_exp_cat(row), str(row['cage_id']), row['filename']])


for row in coding.itertuples():
    # row[0] is the index; the remaining fields follow the column order of `coding`.
    start, rat, cage, duration, drug, dose = row[1:7]
    exp_ix = filter_data(data, cage, start, start + pd.Timedelta(hours=duration))

    ## Skip if no data (filter_data returns [-1] when nothing matches)
    if exp_ix[0] == -1:
        _report("Missing data for: ", start, rat, cage, duration)
        continue

    exp_data = data.loc[exp_ix]  # fetch the indexed data

    tot_errs = exp_data['dropme'].sum()

    ## Skip if too many errors
    if tot_errs > err_thresh:
        _report("Too many errors for: ", start, rat, cage, duration, tot_errs)
        continue

    ## Drop individual errors
    exp_data = exp_data.drop(exp_data[exp_data['dropme'] == 1].index)

    ## Skip if too few bouts remain. This is checked before any column is
    ## derived, so the row-wise apply calls below never see an empty frame.
    if len(exp_data) < min_bout_count:
        _report("Skipping ", start, rat, cage, duration, drug, dose)
        continue

    ## Code with day/night and fasting state
    exp_data['exp_start'] = start
    exp_data['period'] = exp_data.apply(dark_light, axis=1)
    exp_data['fasting'] = exp_data.apply(fast_rec, axis=1)

    ## Code with category and id
    exp_data['drug'] = drug
    exp_data['dose'] = dose
    exp_data['exp_cat'] = exp_data.apply(make_exp_cat, axis=1)
    exp_data['exp_id'] = exp_data.apply(make_exp_id, axis=1)

    ## Now process and export
    # .values returns the same array as the older as_matrix().
    bout_data = exp_data[['start', 'stop', 'amount', 'exp_cat', 'exp_id']].values
    output = bfd.get_events(bout_data)
    exp_df = pd.DataFrame(output, columns=cols)

    for cat in exp_df['exp_cat'].unique():
        x = exp_df[exp_df['exp_cat'] == cat]
        exp_ids = x['exp_id'].unique()

        if len(exp_ids) > 1:
            # More than one id for a single category: there is no single
            # filename to write to, so skip this category instead of passing
            # an array of ids to np.savetxt.
            print("ERROR")
            continue

        exp_id = exp_ids[0]
        exportme = x[export_cols].values

        path = os.path.join('all_data', cat)
        if not os.path.isdir(path):
            os.makedirs(path)

        np.savetxt(os.path.join(path, exp_id), exportme, delimiter='\t')
    
    

AssertionError: 9 columns passed, passed data had 10 columns