# Analysis of Variance of Read Counts
To support evaluation of classifiers, we need to know the variance (standard deviations) of read counts that are normalized in several ways: by library size, by gene size, and w.r.t. time 0. Two variances are of interest. The first is the variability of replications for the same gene and time. The second is the variability of mean values of replicas for a gene.

In [1]:
import init
from common import constants as cn
from common.data_provider import DataProvider
from common_python.plots import util_plots

import os
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

%matplotlib inline

## Data

In [5]:
provider = DataProvider()
provider.do()

In [23]:
# The data used in this analysis is a list of dataframe "matrix" that are the log2 values of the
# libary and gene adjusted readcounts divided by time 0.
# Each data frame has columns of genes and rows of time.
dfs = provider.dfs_adjusted_read_count_wrt0
matrices = [df.T[1:] for df in dfs]
for matrix in matrices:
    matrix = matrix.applymap(lambda v: 0 if v == 0 else np.log(v))

In [21]:
matrices[0].head()

Unnamed: 0,Rv0001,Rv0005,Rv0006,Rv0009,Rv0010c,Rv0011c,Rv0013,Rv0014c,Rv0016c,Rv0020c,...,Rvnt36,Rvnt37,Rvnt38,Rvnt39,Rvnt40,Rvnt41,Rvnt42,Rvnt43,Rvnt44,Rvnt45
1,1.072119,1.637825,1.628194,1.959713,1.378715,0.996944,1.666453,1.638025,1.375098,1.475387,...,0.49628,0.792937,0.427036,0.455354,0.848166,1.448165,0.437719,0.745297,0.655512,0.857393
2,1.301797,1.730923,1.519583,2.390467,1.160466,1.152987,1.498363,1.626492,1.405057,1.659761,...,0.973684,0.467172,0.407508,0.562189,0.713544,2.241541,0.516432,0.714582,0.992488,0.689073
3,1.25519,2.040822,1.936701,1.762375,1.171577,1.538041,1.400058,1.49066,1.295392,1.548468,...,0.337012,0.369778,0.547845,0.328393,0.899654,0.99483,0.529903,0.118391,1.138566,0.28345
4,1.353351,1.984227,1.92885,2.045719,1.136251,1.200222,1.161079,1.642649,1.405477,1.562457,...,0.545159,0.380754,0.223361,0.322582,0.586064,1.232037,0.174814,0.055071,0.454141,0.120963
5,1.274508,1.989241,1.580781,2.008867,1.267012,1.412982,1.467291,1.68456,1.281612,1.857274,...,0.409178,0.202661,0.143987,0.260133,0.714756,1.393947,0.137482,0.255278,0.573114,0.167321


In [24]:
len(matrices[0].columns)

2582

## Variance Across times

In [29]:
mean_mat = np.mean(matrices)
std_mat = np.std(matrices)

ValueError: cannot copy sequence with size 25 to array axis with dimension 2582

In [27]:
avg_mat.head()

Unnamed: 0,Rv0001,Rv0005,Rv0006,Rv0009,Rv0010c,Rv0011c,Rv0013,Rv0014c,Rv0016c,Rv0020c,...,Rvnt36,Rvnt37,Rvnt38,Rvnt39,Rvnt40,Rvnt41,Rvnt42,Rvnt43,Rvnt44,Rvnt45
1,1.016997,1.349613,1.34886,1.713399,1.28911,1.010606,1.461666,1.269363,1.301147,1.310925,...,1.184997,1.573115,3.003892,1.221309,0.893812,0.829988,1.30278,1.066189,1.099224,2.613137
2,1.318838,1.657061,1.472357,2.103642,1.250123,1.265521,1.334447,1.361027,1.344718,1.510973,...,1.086868,1.075029,1.854107,0.920354,0.832404,1.425029,0.848703,0.607561,1.185813,1.050341
3,1.19442,1.617192,1.763478,1.632163,1.171438,1.319795,1.277632,1.318328,1.331028,1.451523,...,0.821168,1.492505,1.860031,0.499389,0.781539,1.060353,1.007913,0.315309,1.315067,0.834655
4,1.155473,1.635137,1.709586,1.780096,1.146901,1.095613,1.205745,1.406766,1.407687,1.592445,...,0.628807,1.138256,1.674123,0.636425,0.633919,1.171631,0.7652,0.29652,0.956015,0.718857
5,1.139123,1.607045,1.417559,1.53428,1.093589,1.140771,1.106032,1.234154,1.212447,1.53105,...,1.140393,1.791139,2.72328,1.178717,0.938784,1.247793,1.132774,1.225949,1.090984,1.072698
