In [1]:
from functools import partial
import random

import numpy as np
import tables
import pandas as pd

In [2]:
# Relative path of the input HDF5 file; it is opened read-only into `store` in a later cell.
fname = '../raw/total-3L.h5'

In [3]:
def unbound_range(start=0):
    """Yield ``start``, ``start + 1``, ``start + 2``, ... without ever stopping.

    This is an endless counterpart to ``range``: the generator has no upper
    bound, so the consumer is responsible for breaking out of the iteration.
    """
    counter = start
    while 1:
        yield counter
        counter += 1

In [4]:
#filtering order matters for performance...

def accessibility_filter(recs):
    """Lazily pass through only the records whose 'accessibility' entry is truthy."""
    yield from (record for record in recs if record['accessibility'])


def subsample_filter(recs, cut):
    """Randomly thin the record stream.

    One draw from ``random.random()`` is made per incoming record, and the
    record is passed through only when that draw is strictly below ``cut``.
    """
    yield from (record for record in recs if random.random() < cut)


#start filter vs end filter
def start_filter(recs, start):
    """Drop every record whose 'pos' is below ``start``; pass the rest through.

    The whole stream is scanned: nothing here exploits ordering of the input,
    so this is a plain (not especially efficient) linear filter.
    """
    for record in recs:
        if not (record['pos'] < start):
            yield record

            
def end_filter(recs, end):
    """Yield records until the first one whose 'pos' exceeds ``end``.

    Iteration stops at the first record with ``pos > end``; later records are
    never consumed. This relies on the input being ordered by position.

    Parameters
    ----------
    recs : iterable of mapping
        Records exposing a 'pos' entry, ordered by increasing position.
    end : number
        Largest position (inclusive) that is still yielded.
    """
    for rec in recs:
        if rec['pos'] > end:
            # Finish with a plain return: raising StopIteration inside a
            # generator is turned into RuntimeError by PEP 479 (Python 3.7+).
            return
        yield rec
            

def biallelic_filter(recs):
    """Pass through records whose 'alt' entry holds exactly one non-empty allele.

    An allele slot counts as empty when it equals ``b''``.
    """
    for record in recs:
        filled_slots = sum(1 for allele in record['alt'] if allele != b'')
        if filled_slots == 1:
            yield record

In [5]:
def get_all_data(store):
    """Stream one dict per row, combining five parallel nodes of ``store``.

    For each of the '/3L/...' nodes below a row iterator is obtained through
    ``store.get_node(path).iterrows()``; the iterators are advanced in
    lockstep and each step is yielded as a dict with the keys 'genotype',
    'called', 'accessibility', 'pos' and 'alt'. Iteration ends as soon as
    any of the row iterators is exhausted.
    """
    def rows(path):
        return store.get_node(path).iterrows()

    genotype_rows = rows('/3L/calldata/genotype')
    called_rows = rows('/3L/calldata/is_called')
    accessible_rows = rows('/3L/variants/Accessible')
    pos_rows = rows('/3L/variants/POS')
    alt_rows = rows('/3L/variants/ALT')
    # A lazy zip is needed here; on Python 2 that would be itertools.izip.
    combined = zip(genotype_rows, accessible_rows, pos_rows, alt_rows, called_rows)
    for row in combined:
        gt, acc, position, alternates, is_called = row
        yield {
            'genotype': gt,
            'called': is_called,
            'accessibility': acc,
            'pos': position,
            'alt': alternates,
        }


In [6]:
# Pre-bind the filter parameters so each stage only needs the record stream.
# Together the first two keep records with 10,000,000 <= pos <= 30,000,000
# (start_filter drops pos < start, end_filter stops at the first pos > end).
start_function = partial(start_filter, start=10000000)
end_function = partial(end_filter, end=30000000)
# Each record is kept when a fresh random.random() draw is below 0.1.
subsample_function = partial(subsample_filter, cut=0.1)

In [7]:
# Python 3 only: `yield from` (PEP 380) needs Python 3.3+; originally noted as "python 3.5!".
def filter_subsample(recs):
    """Filter chain with the random subsample applied first.

    Stage order: subsample -> accessibility -> start -> end -> biallelic.
    """
    sampled = subsample_function(recs)
    accessible = accessibility_filter(sampled)
    in_window = end_function(start_function(accessible))
    yield from biallelic_filter(in_window)

def filter_subsample_slow(recs):
    """Same filters as ``filter_subsample`` but with the biallelic test first.

    Stage order: biallelic -> subsample -> accessibility -> start -> end.
    """
    biallelic = biallelic_filter(recs)
    sampled = subsample_function(biallelic)
    accessible = accessibility_filter(sampled)
    yield from end_function(start_function(accessible))
    
def filter_no_subsample(recs):
    """Filter chain without any random subsampling.

    Stage order: biallelic -> accessibility -> start -> end.
    """
    biallelic = biallelic_filter(recs)
    accessible = accessibility_filter(biallelic)
    yield from end_function(start_function(accessible))

In [8]:
# Open the input file read-only; this handle is reused by the cells below.
store = tables.open_file(fname, 'r')

In [9]:
# Count the records that survive filter_subsample (one full pass over the data)
# and time that pass. The count is reused later as the size passed to create_hdf5.
%time subsample_size = sum(1 for x in filter_subsample(get_all_data(store)))
print(subsample_size)

CPU times: user 3min 52s, sys: 1.76 s, total: 3min 54s
Wall time: 3min 54s
397437


In [10]:
# Same filters with biallelic_filter applied before subsampling, for timing comparison.
# NOTE(review): no random seed is set in the visible cells, so this count is not
# expected to match the previous cell exactly.
%time print(sum(1 for x in filter_subsample_slow(get_all_data(store))))

396245
CPU times: user 4min 4s, sys: 1.67 s, total: 4min 6s
Wall time: 4min 6s


In [11]:
# Count (and time) the records that pass every filter when no subsampling is applied.
%time sample_size = sum(1 for x in filter_no_subsample(get_all_data(store)))
print(sample_size)

CPU times: user 4min 27s, sys: 1.62 s, total: 4min 29s
Wall time: 4min 29s
3964746


In [16]:
def create_hdf5(name, samples, recs, size):
    """Write boolean-encoded genotypes and the sample ids to a new HDF5 file.

    The file is created with zlib compression (level 5). It receives an
    extendable 'genotypes' boolean array, one row per record, and a 'samples'
    array holding ``samples``.

    Parameters
    ----------
    name : str
        Path of the file to create (opened with mode 'w').
    samples : sequence
        Sample ids; ``len(samples)`` fixes the second dimension of the
        'genotypes' array.
    recs : iterable of mapping
        Records exposing a 'genotype' entry. Each entry is presumably an
        array of shape (len(samples), 2) -- TODO confirm against the input.
    size : int
        Expected number of records, passed to PyTables as ``expectedrows``.
    """
    # The context manager closes the file even when `recs` or `append` raises;
    # previously the handle leaked on any error.
    with tables.open_file(name, mode='w',
                          filters=tables.Filters(complib='zlib', complevel=5)) as w:
        genotypes = tables.EArray(w.root, 'genotypes', tables.BoolAtom(),
                                  expectedrows=size, shape=(0, len(samples), 2))
        for rec in recs:
            old_genotypes = np.asarray(rec['genotype'])
            # True where a call equals the smallest value found in this record.
            # NOTE(review): if missing calls are coded with a negative value they
            # would become the "True" allele here -- confirm this is intended.
            is_min_allele = old_genotypes == old_genotypes.min()
            # Add a leading axis so the record is appended as a single row.
            genotypes.append(is_min_allele[np.newaxis])
        w.create_array(w.root, 'samples', samples, 'Sample ids')

In [17]:
# Read the contents of the '/3L/samples' node; passed to create_hdf5 as the sample ids.
samples = store.get_node('/3L/samples').read()

In [18]:
# Write the filtered, subsampled records. filter_subsample draws new random numbers
# here, so the row count may differ from subsample_size (used only as a size hint).
create_hdf5('subsample.h5', samples, filter_subsample(get_all_data(store)), subsample_size)

In [None]:
# NOTE(review): this writes the unfiltered get_all_data stream and passes
# subsample_size as the expected-rows hint -- confirm whether
# filter_no_subsample(...) and sample_size were intended instead.
create_hdf5('all.h5', samples, get_all_data(store), subsample_size)

In [None]:
# DataFrame.from_csv was removed in pandas 1.0. read_csv with index_col=0 and
# parse_dates=True reproduces its defaults (first column as index, dates parsed).
meta = pd.read_csv('../raw/samples.all.txt', sep='\t', index_col=0, parse_dates=True)
# Open the file written by create_hdf5 through pandas and display the store summary.
pandas_sub_store = pd.HDFStore('subsample.h5')
pandas_sub_store