In [71]:
import sys, os
import pandas as pd
import numpy as np

from statsmodels.stats import proportion

import matplotlib.pyplot as plt

%matplotlib inline

In [22]:
def getExpRunData(fname):
    tokens = fname.split('_')
    # (Experiment, Run)
    return (tokens[0], tokens[1])

def getAllCsvFileNames(directory='.', csv_suffix='misinc_data.csv'):
    csv_fnames = [f for f in os.listdir(directory) if os.path.isfile(f) and f.endswith(csv_suffix)]
    return csv_fnames

def simpleMisincStats(df, letterorder=['C', 'A', 'T', 'G'], cutoff=1):
    # This probably could get rewritten more elegantly...
    # Make output Df's
    labels = [l+'->!'+l for l in letterorder]
    misinc_df = pd.DataFrame(data=np.zeros([len(df), len(labels)]), columns=labels)
    lb_df = pd.DataFrame(data=np.zeros([len(df), len(labels)]), columns=labels)
    ub_df = pd.DataFrame(data=np.zeros([len(df), len(labels)]), columns=labels)
    
    m = df[[c for c in df.columns if not c == 'sequence']].values
    
    for i in range(len(df)):
        for nt_idx in range(len(letterorder)):
            total_n = np.sum(m[i, nt_idx*len(letterorder):nt_idx*len(letterorder)+len(letterorder)])
            misinc_n = total_n - m[i, nt_idx*len(letterorder)+nt_idx]
            if total_n >= cutoff:
                (lb,ub) = proportion.proportion_confint(misinc_n, total_n, method='jeffrey')
                misinc_df.iloc[i][nt_idx] = misinc_n / float(total_n)
                lb_df.iloc[i][nt_idx] = lb
                ub_df.iloc[i][nt_idx] = ub
    return misinc_df, lb_df, ub_df

In [23]:
os.chdir('./Data/2015_09_03_AMD/')

WindowsError: [Error 3] The system cannot find the path specified: './Data/2015_09_03_AMD/'

In [25]:
csv_fnames = getAllCsvFileNames(directory='.')

In [26]:
csv_fnames

['exp1_run1_misinc_data.csv',
 'exp1_run6_misinc_data.csv',
 'exp2_run2_misinc_data.csv',
 'exp2_run6_misinc_data.csv',
 'exp3_run3_misinc_data.csv',
 'exp3_run6_misinc_data.csv',
 'exp4_run4_misinc_data.csv',
 'exp4_run6_misinc_data.csv',
 'exp5_run5_misinc_data.csv',
 'exp5_run6_misinc_data.csv']

In [75]:
foo = {}

for fname in csv_fnames:
    foo[fname] = pd.read_csv(fname, index_col=0)
    
bar = foo[csv_fnames[0]]
print csv_fnames[0]

exp1_run1_misinc_data.csv


In [86]:
def simpleMisincStats(df, cutoff=1):
    data = df[[c for c in df.columns if not c == 'sequence']]
    total_n = data.sum(axis=1)
    correct_n = data[[n+'->'+n for n in ['A', 'C', 'T', 'G']]]
    misinc_n = total_n - correct_n.sum(axis=1)
    
    rate = misinc_n / total_n
    lb, ub = proportion.proportion_confint(misinc_n, total_n, method='jeffrey')
    
    simp_df = pd.DataFrame()
    simp_df['rate'] = rate
    simp_df['lb'] = lb
    simp_df['ub'] = ub
    simp_df['n'] = total_n
    simp_df['sequence'] = df.sequence
    
    return simp_df

In [87]:
myDf = simpleMisincStats(bar)

In [74]:
myDf

Unnamed: 0,rate,lb,ub,n,sequence
0,,0.001541,0.998459,0,A
1,,0.001541,0.998459,0,C
2,,0.001541,0.998459,0,A
3,,0.001541,0.998459,0,C
4,,0.001541,0.998459,0,T
5,,0.001541,0.998459,0,G
6,,0.001541,0.998459,0,A
7,,0.001541,0.998459,0,C
8,,0.001541,0.998459,0,G
9,,0.001541,0.998459,0,A


In [33]:
def genCollapsedSimpleDf(df):
    out_df = pd.DataFrame(columns=['mean', 'lb', 'ub', 'sequence'])
    (mean_df, lb_df, ub_df) = simpleMisincStats(df)
    out_df['mean'] = mean_df.apply(np.max, axis=1)
    out_df['lb'] = lb_df.apply(np.max, axis=1)
    out_df['ub'] = ub_df.apply(np.max, axis=1)
    out_df['sequence'] = df['sequence']
    
    return out_df
    

In [34]:
genCollapsedSimpleDf(bar)

Unnamed: 0,mean,lb,ub,sequence
0,0.000000,0.000000,0.000000,A
1,0.000000,0.000000,0.000000,C
2,0.000000,0.000000,0.000000,A
3,0.000000,0.000000,0.000000,C
4,0.000000,0.000000,0.000000,T
5,0.000000,0.000000,0.000000,G
6,0.000000,0.000000,0.000000,A
7,0.000000,0.000000,0.000000,C
8,0.000000,0.000000,0.000000,G
9,0.000000,0.000000,0.000000,A
