In [1]:
import os, math, itertools
import numpy as np

In [2]:
from generate_test_sample import generate_sample
from sample_reader import SampleReader

In [3]:
samples_fv = SampleReader().read('/Users/andrewbzikadze/chihua/datasets/AbVitro/flu_time_course/FV/', ['25'])
samples_gmc = SampleReader().read('/Users/andrewbzikadze/chihua/datasets/AbVitro/flu_time_course/GMC/', ['8'])
samples_ido = SampleReader().read('/Users/andrewbzikadze/chihua/datasets/AbVitro/flu_time_course/IDO/')
samples_age = SampleReader().read('/Users/andrewbzikadze/chihua/datasets/age/')
samples_paired = SampleReader().read('/Users/andrewbzikadze/chihua/datasets/AbVitro/paired/')

In [4]:
from shm_kmer_model_estimator import ShmKmerModelEstimator

In [9]:
estimator = ShmKmerModelEstimator()
est_model = estimator.estimate_models_of_one_type((samples_age,
                                                   samples_ido,
                                                   samples_fv,
                                                   samples_gmc,
                                                   samples_paired
                                                  ),
                                                  chains=['IGH'])

NoKNeighbours: IGH
Trivial: IGH


In [10]:
est_model['NoKNeighbours']['IGH']

Unnamed: 0,beta_shape1,beta_shape2,dir_shape1,dir_shape2,dir_shape3,success_optim_beta,success_optim_dir
AAAAA,7.146083,97.292401,7.391904,22.786171,3.108382,1.0,1.0
AAAAC,5.632682,87.909398,4.517482,12.852754,2.116969,1.0,1.0
AAAAG,12.202138,276.022835,5.445092,14.850338,1.756242,1.0,1.0
AAAAT,2.184276,19.908167,0.978943,1.999556,0.978943,1.0,0.0
AAACA,5.759037,106.243574,4.398597,9.349050,1.289642,1.0,1.0
AAACC,7.042889,111.368976,6.212425,15.620687,4.015276,1.0,1.0
AAACG,0.139305,2.665921,0.845697,1.771517,0.382787,1.0,0.0
AAACT,3.828654,56.912397,5.438131,6.513621,4.081127,1.0,1.0
AAAGA,1.513937,84.662514,0.974197,1.999334,0.974197,1.0,0.0
AAAGC,6.512415,135.272392,4.276333,11.903764,2.483350,1.0,1.0


In [17]:
est_model['NoKNeighbours']['IGH'].to_csv('/Users/andrewbzikadze/chihua/datasets/shm_model_no_kneighbours.csv', na_rep='NaN')
est_model['Trivial']['IGH'].to_csv('/Users/andrewbzikadze/chihua/datasets/shm_model_trivial.csv', na_rep='NaN')

In [11]:
def apply_to_datasets(datasets, func):
    result = dict.fromkeys(datasets)
    for strategy in datasets:
        result[strategy] = dict.fromkeys(datasets[strategy])
        for chain_type in datasets[strategy]:
            result[strategy][chain_type] = \
            func(datasets[strategy][chain_type])
    return result

In [12]:
apply_to_datasets(est_model, (lambda x: x.isnull().any(axis=1).mean()))

{'NoKNeighbours': {'IGH': 0.01171875}, 'Trivial': {'IGH': 0.0068359375}}

In [13]:
apply_to_datasets(est_model, (lambda x: x['success_optim_beta'].mean()))

{'NoKNeighbours': {'IGH': 0.8505859375}, 'Trivial': {'IGH': 0.9228515625}}

In [14]:
apply_to_datasets(est_model, lambda x: x['success_optim_dir'].mean())

{'NoKNeighbours': {'IGH': 0.89453125}, 'Trivial': {'IGH': 0.94921875}}

In [31]:
def sd_beta(x):
    a, b = x['beta_shape1'], x['beta_shape2']
    return np.sqrt(a * b / ((a + b)**2 * (a + b + 1)))

In [32]:
apply_to_datasets(est_model, sd_beta)

{'NoKNeighbours': {'IGH': AAAAA    0.017486
  AAAAC    0.018470
  AAAAG    0.014571
  AAAAT    0.038216
  AAACA    0.025841
  AAACC    0.019296
  AAACG    0.233854
  AAACT    0.011597
  AAAGA    0.003977
  AAAGC    0.011693
  AAAGG    0.011516
  AAAGT    0.012435
  AAATA    0.011563
  AAATC    0.012519
  AAATG    0.011079
  AAATT    0.001226
  AACAA    0.055360
  AACAC    0.016209
  AACAG    0.015431
  AACAT    0.023711
  AACCA    0.019547
  AACCC    0.013522
  AACCG    0.021619
  AACCT    0.247436
  AACGA    0.000000
  AACGC    0.010702
  AACGG         NaN
  AACGT    0.234575
  AACTA    0.029949
  AACTC    0.040355
             ...   
  TTGAG    0.005885
  TTGAT    0.021919
  TTGCA    0.008965
  TTGCC    0.000000
  TTGCG         NaN
  TTGCT    0.306186
  TTGGA    0.006131
  TTGGC    0.006341
  TTGGG    0.010758
  TTGGT    0.011646
  TTGTA    0.044723
  TTGTC    0.000362
  TTGTG    0.021424
  TTGTT    0.123477
  TTTAA    0.000000
  TTTAC    0.015433
  TTTAG    0.020614
  TTTAT    0.027

In [45]:
def sd_dir(x):
    alpha = x.ix[:, np.arange(2, 5)]
    print(x)
    alpha_sum = np.sum(alpha)
    denominator = alpha_sum * np.sqrt(alpha_sum + 1)
    return np.sqrt(alpha * (alpha_sum - alpha)) / denominator
    # a, b, c = x['dir_shape1'], x['dir_shape2'], x['dir_shape1']
    # a0 = a + b = c
    # denominator = a0 * np.sqrt(a0 + 1)
    # return np.sqrt(

In [46]:
apply_to_datasets(est_model, sd_dir)

       beta_shape1    beta_shape2    dir_shape1    dir_shape2    dir_shape3  \
AAAAA    13.867292     174.737317     11.119210     13.526186      7.515945   
AAAAC     7.281846     147.875912      5.137314     20.252963      2.521210   
AAAAG    13.404396     191.874566      6.158950     10.127302      3.419595   
AAAAT    20.166262     281.123462      7.432446     34.663248      3.154815   
AAACA    18.551463     312.912307     25.150872     23.809888      7.129203   
AAACC     4.989137      76.464260      3.808253      7.335002      1.019364   
AAACG     7.933607     102.632138      5.953848     11.233465      2.778540   
AAACT     9.424777     141.579633     13.130946     13.119252      3.678593   
AAAGA    10.473991     129.512994      8.901366     18.279832      6.741694   
AAAGC    23.080843     394.969763     11.928094     22.853410      3.407771   
AAAGG     8.238353     203.135713      2.074368      0.922025      0.703436   
AAAGT     7.532897     114.392074      5.967661     

{'NoKNeighbours': {'IGH':        dir_shape1  dir_shape2  dir_shape3
  AAAAA    0.000030    0.000046    0.000020
  AAAAC    0.000026    0.000030    0.000018
  AAAAG    0.000025    0.000040    0.000019
  AAAAT    0.000028    0.000073    0.000022
  AAACA    0.000009    0.000012    0.000009
  AAACC    0.000022    0.000032    0.000021
  AAACG    0.000007    0.000011    0.000007
  AAACT    0.000036    0.000037    0.000032
  AAAGA    0.000008    0.000011    0.000008
  AAAGC    0.000032    0.000047    0.000024
  AAAGG    0.000028    0.000039    0.000018
  AAAGT    0.000009    0.000012    0.000009
  AAATA    0.000021    0.000047    0.000025
  AAATC    0.000009    0.000012    0.000009
  AAATG    0.000024    0.000023    0.000030
  AAATT    0.000009    0.000012    0.000009
  AACAA    0.000012    0.000014    0.000027
  AACAC    0.000041    0.000033    0.000078
  AACAG    0.000040    0.000029    0.000074
  AACAT    0.000023    0.000017    0.000054
  AACCA    0.000026    0.000034    0.000074
  AACCC 