In [1]:
%matplotlib inline
import pandas as pd
import random as rnd
from scipy import stats
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.ticker import FuncFormatter, MaxNLocator
pd.set_option('display.max_rows', 2000)

# Data Loading and count check.
This portion of the code is intended to load the datafiles into dataframes to better handling and join. Also, will present the verification of the results from Table 2 presented in the [paper](https://www.scienceopen.com/document/review?3&id=7a235c56-a1da-498f-98ef-6fa16c8fd2f0&review=9870bf08-c06e-403a-85ba-774e27cdb520). 

In the following lines of code we have dropped the columns not relevant for the analysis, the final outcome are dataframes for each data sample and a merged dataframe with all the samples. 

In [2]:
#Colony dataframes
bishayee_colony = pd.read_excel("Bishayee Colony Counts 10.27.97-3.8.01.xlsx", skiprows=2,na_values=" ")
outside3_colony = pd.read_excel("Outside Lab 3.Colony Counts.2.4.10-5.21.12.xlsx", skiprows=1)
others_colony = pd.read_excel("Other Investigators in Lab.Colony Counts.4.23.92-11.27.02.xlsx", skiprows=1)
    #Clean up of the colony data
bishayee_colony["Inv"] = "Z"
bishayee_colony.drop('ISOTOPE', axis=1, inplace=True)
bishayee_colony2 = bishayee_colony.iloc[:,[0,1,6,2,3,4,5]]
bishayee_colony2.columns
bishayee_colony2.rename(columns={'Bate # B0/B00':'Batch'},inplace = True)
bishayee_colony2.drop('Batch', axis=1, inplace=True)
bishayee_colony2.drop("Date",axis =1,inplace=True)
others_colony2 = others_colony.rename(columns={'Bates # B00 or B0':'Batch'})
others_colony2.drop('Batch', axis=1, inplace=True)
others_colony2.drop("Date",axis =1,inplace=True)
outside3_colony["Inv"] = "O"
outside3_colony2=outside3_colony.iloc[:,[0,5,1,2,3,4]]
outside3_colony2.rename(columns={'date':'Date','c1':'col1','c2':'col2','c3':'col3'},inplace = True)
outside3_colony2.drop("Date",axis =1,inplace=True)
df_total = [bishayee_colony2,others_colony2,outside3_colony2]
merged_colony_data = pd.concat(df_total)    
#Coulter dataframes
bishayee_coulter = pd.read_excel("Bishayee Coulter Counts.10.20.97-7.16.01.xlsx", skiprows=1)
others_coulter = pd.read_excel("Other Investigators in Lab.Coulter Counts.4.15.92-5.21.05.xlsx", skiprows=1)
    #This data file had no header, introduced manually
outside1_coulter = pd.read_excel("Outside Lab 1.Coulter Counts.6.7.91-4.9.99.xlsx")
outside1_coulter.columns = ["Experiment", "C1", "C2", "C3","Average","Date"]
outside2_coulter = pd.read_excel("Outside Lab 2.Coulter Counts.6.6.08-7.7.08.xlsx", skiprows=1)
    #Clean up of the coulter data 
bishayee_coulter2 = bishayee_coulter.drop('Bates', axis=1)
bishayee_coulter2["Inv"] = "Z"
bishayee_coulter2 = bishayee_coulter2.iloc[:,[0,5,1,2,3,4]]
bishayee_coulter2.rename(columns={'Count 1':'col1','Count 2':'col2','Count 3':'col3'},inplace=True)
bishayee_coulter2.drop("Date",axis =1,inplace=True)
outside1_coulter2 = outside1_coulter.drop('Experiment', axis = 1)
outside1_coulter2["Inv"] = "O1"
outside1_coulter2 = outside1_coulter2.iloc[:,[4,5,0,1,2,3]]
outside1_coulter2.rename(columns={'C1':'col1','C2':'col2','C3':'col3'},inplace=True)
outside1_coulter2.drop("Date",axis =1,inplace=True)
outside2_coulter2 = outside2_coulter.rename(columns={'Count 1':"col1",'Count 2':"col2",'Count 3':"col3"})
outside2_coulter2["Inv"] = "O2"
outside2_coulter2 = outside2_coulter2.iloc[:,[0,5,1,2,3,4]]
outside2_coulter2.drop("Date",axis =1,inplace=True)
others_coulter2 = others_coulter.drop('Bates No.', axis = 1)
others_coulter2.rename(columns={'Coul 1':'col1','Coul 2':'col2','Coul 3':'col3','Investigator':'Inv'},\
                       inplace=True)
others_coulter2 = others_coulter2.iloc[:,[0,5,1,2,3,4]]
others_coulter2.drop("Date",axis =1,inplace=True)
df_total_coulter = [bishayee_coulter2,others_coulter2,outside1_coulter2,outside2_coulter2]
merged_coulter_data = pd.concat(df_total_coulter)

The following step is to verify the magnitudes of the data in the sets for the coulter and the colony. From the description of the experimental set-up, it seems that the count of the cells using the coulter and the colony count are actually individual experiments with different methods.

Completeness test of the combined colony data

In [3]:
print 'Total number of colony data triplets  ' + str(merged_colony_data.index.size)
print 'Total number of suspected colony data triplets  ' + str(bishayee_colony2.index.size)
print 'Total number of others colony data triplets  ' + str(others_colony2.index.size)
print 'Total number of outside lab 3 colony data triplets  ' + str(outside3_colony2.index.size)

Total number of colony data triplets  2033
Total number of suspected colony data triplets  1362
Total number of others colony data triplets  621
Total number of outside lab 3 colony data triplets  50


The raw total number of samples doesn't coincide with the numbers in the [paper](https://www.scienceopen.com/document/review?3&id=7a235c56-a1da-498f-98ef-6fa16c8fd2f0&review=9870bf08-c06e-403a-85ba-774e27cdb520). Furthermore, the labeling of the outside labs is not consistent. It looks like the authors labeled the outside colony counts at lab1 when indeed is lab3. 

The following step is to eliminate incomplete triples and drop the na rows and check again against the authors' table. The designation (AD) is used to differentiate the data after drop.

In [4]:
merged_colony_data.replace(r'\s*',np.nan, regex=True)
merged_colony_data=merged_colony_data.dropna()
merged_colony_data.reset_index(drop=True)
print 'Total number of colony data triplets (AD) ' + str(merged_colony_data.index.size) 
bishayee_colony2.replace(r'\s*',np.nan, regex=True)
bishayee_colony2 = bishayee_colony2.dropna()
bishayee_colony2.reset_index(drop=True)
print 'Total number of suspected colony data triplets (AD) ' + str(bishayee_colony2.index.size)
others_colony2.replace(r'\s*',np.nan, regex=True)
others_colony2 = others_colony2.dropna()
others_colony2.reset_index(drop=True)
print 'Total number of others colony data triplets (AD) ' + str(others_colony2.index.size)
outside3_colony2.replace(r'\s*',np.nan, regex=True)
outside3_colony2 = outside3_colony2.dropna()
outside3_colony2.reset_index(drop=True)
print 'Total number of outside lab 3 colony data triplets  ' + str(outside3_colony2.index.size)

Total number of colony data triplets (AD) 2008
Total number of suspected colony data triplets (AD) 1361
Total number of others colony data triplets (AD) 597
Total number of outside lab 3 colony data triplets  50


The same data clean up and count of the the triplets is performed with the coulter data

In [5]:
print 'Total number of coulter data triplets  ' + str(merged_coulter_data.index.size)
print 'Total number of suspected coulter data triplets  ' + str(bishayee_coulter2.index.size)
print 'Total number of others coulter data triplets  ' + str(others_coulter2.index.size)
print 'Total number of outside lab1 data triplets  ' + str(outside1_coulter2.index.size)
print 'Total number of outside lab2 data triplets  ' + str(outside2_coulter2.index.size)

Total number of coulter data triplets  2965
Total number of suspected coulter data triplets  1729
Total number of others coulter data triplets  1007
Total number of outside lab1 data triplets  109
Total number of outside lab2 data triplets  120


In [6]:
merged_coulter_data.replace(r'\s*',np.nan, regex=True)
merged_coulter_data = merged_coulter_data.dropna()
merged_coulter_data.reset_index(drop=True)
print 'Total number of coulter data triplets (AD) ' + str(merged_coulter_data.index.size)
bishayee_coulter2.replace(r'\s*',np.nan, regex=True)
bishayee_coulter2 = bishayee_coulter2.dropna()
bishayee_coulter2.reset_index(drop=True)
print 'Total number of suspected coulter data triplets (AD) ' + str(bishayee_coulter2.index.size)
others_coulter2.replace(r'\s*',np.nan, regex=True)
others_coulter2 = others_coulter2.dropna()
others_coulter2.reset_index(drop=True)
print 'Total number of others coulter data triplets (AD) ' + str(others_coulter2.index.size)
outside1_coulter2.replace(r'\s*',np.nan, regex=True)
outside1_coulter2 = outside1_coulter2.dropna()
outside1_coulter2.reset_index(drop=True)
print 'Total number of outside lab1 data triplets (AD) ' + str(outside1_coulter2.index.size)
outside2_coulter2.replace(r'\s*',np.nan, regex=True)
outside2_coulter2 = outside2_coulter2.dropna()
outside2_coulter2.reset_index(drop=True)
print 'Total number of outside lab2 data triplets (AD) ' + str(outside2_coulter2.index.size)

Total number of coulter data triplets (AD) 2873
Total number of suspected coulter data triplets (AD) 1727
Total number of others coulter data triplets (AD) 929
Total number of outside lab1 data triplets (AD) 97
Total number of outside lab2 data triplets (AD) 120


The analysis of the complete triples yields a discrepancy when compared to Table 2. The results of the count are consistent for "total" in the fourth column. The RTS (suspected) colony counts, and the outside lab3 (labeled in the [paper](https://www.scienceopen.com/document/review?3&id=7a235c56-a1da-498f-98ef-6fa16c8fd2f0&review=9870bf08-c06e-403a-85ba-774e27cdb520) as lab1) are equal. The only discrepancy is the count for the other's colony count; however, the discrepancy is very small i.e., 8 triplets. 

In the case of the coulter counts there is no discrepancy with respect to the No. complete, the others, lab2 and lab1 (labeled in the [paper](https://www.scienceopen.com/document/review?3&id=7a235c56-a1da-498f-98ef-6fa16c8fd2f0&review=9870bf08-c06e-403a-85ba-774e27cdb520) as lab3) are consistent with Table 2. The discrepancy appears in the RTS (suspected) sample, but again it is a comparatively small differnece i.e., 11. 

It is worth nothing that the authors dropped triplets with gap $\ge$ 2, this is not completely explained in the [paper](https://www.scienceopen.com/document/review?3&id=7a235c56-a1da-498f-98ef-6fa16c8fd2f0&review=9870bf08-c06e-403a-85ba-774e27cdb520) and it is only mentioned in a footnote in Table 2. If there are issues with the gaps, they are expected to show up in the ratio of the distances between the samples in the forthcoming analysis.  

# Calculation of the distance ratios

A function is used to take a dataframe and add columns with the values of the distance ratios between the samples, even though the [paper](https://www.scienceopen.com/document/review?3&id=7a235c56-a1da-498f-98ef-6fa16c8fd2f0&review=9870bf08-c06e-403a-85ba-774e27cdb520) only mentions a single ratio we tested the distances from the mid value in both directions. The proceedure for the calculations is as follows: 

$$
\begin{align}
\text{sort} (\hat{t})\\
\\
LowerRatio = \frac{\hat{t}(2) - \hat{t}(1)}{\hat{t}(3) - \hat{t}(1)}\\
\\
UpperRatio = \frac{\hat{t}(3) - \hat{t}(2)}{\hat{t}(3) - \hat{t}(1)}
\end{align}
$$

In [12]:
def DistanceRatioCalculation(data_frame):
    '''
    Calculates the ratio of the distance between a triplet sample.
    $$
    \begin{align}
    \text{sort} (\hat{t})\\
    \\
    LowerRatio = \frac{\hat{t}(2) - \hat{t}(1)}{\hat{t}(3) - \hat{t}(1)}\\
    \\
    UpperRatio = \frac{\hat{t}(3) - \hat{t}(2)}{\hat{t}(3) - \hat{t}(1)}
    \end{align}
    $$
    '''
    if not isinstance(data_frame, pd.DataFrame):
        print 'Wrong data type, dataframe expected'
    else:
        data_frame["LowerRatio"] = 0
        data_frame["UpperRatio"] = 0
        for i in range(0,len(data_frame)):
            temp_array = np.ones(3)
            for k in range(0,3):
                temp_array[k] = data_frame.iloc[i,(1+k)]
            temp_array = np.sort(temp_array)
            temp_l = (temp_array[1]-temp_array[0])/(temp_array[2]-temp_array[0])
            temp_u = (temp_array[2]-temp_array[1])/(temp_array[2]-temp_array[0])
            data_frame.iloc[i,5] = temp_l
            data_frame.iloc[i,6] = temp_u
            if (data_frame.iloc[i,5]) == 0 and (data_frame.iloc[i,6]) != 1:
                print "data issues"
    return data_frame

In [13]:
merged_colony_data = DistanceRatioCalculation(merged_colony_data)
merged_colony_data = merged_colony_data.dropna()
print 'Total number of colony data triplets (AD) ' + str(merged_colony_data.index.size) 
bishayee_colony2 = DistanceRatioCalculation(bishayee_colony2)
bishayee_colony2 = bishayee_colony2.dropna()
print 'Total number of suspected colony data triplets (AD) ' + str(bishayee_colony2.index.size)
others_colony2 = DistanceRatioCalculation(others_colony2)
others_colony2 = others_colony2.dropna()
print 'Total number of others colony data triplets (AD) ' + str(others_colony2.index.size)
outside3_colony2 = DistanceRatioCalculation(outside3_colony2)
outside3_colony2 = outside3_colony2.dropna()
print 'Total number of outside lab 3 colony data triplets  ' + str(outside3_colony2.index.size)

Total number of colony data triplets (AD) 1359
Total number of suspected colony data triplets (AD) 1360
Total number of others colony data triplets (AD) 595
Total number of outside lab 3 colony data triplets  50


In [14]:
merged_coulter_data = DistanceRatioCalculation(merged_coulter_data)
merged_coulter_data = merged_coulter_data.dropna()
print 'Total number of coulter data triplets (AD) ' + str(merged_coulter_data.index.size)
bishayee_coulter2 = DistanceRatioCalculation(bishayee_coulter2)
bishayee_coulter2 = bishayee_coulter2.dropna()
print 'Total number of suspected coulter data triplets (AD) ' + str(bishayee_coulter2.index.size)
others_coulter2 = DistanceRatioCalculation(others_coulter2)
others_coulter2 = others_coulter2.dropna()
print 'Total number of others coulter data triplets (AD) ' + str(others_coulter2.index.size)
outside1_coulter2 = DistanceRatioCalculation(outside1_coulter2)
outside1_coulter2 = outside1_coulter2.dropna()
print 'Total number of outside lab1 data triplets (AD) ' + str(outside1_coulter2.index.size)
outside2_coulter2 = DistanceRatioCalculation(outside2_coulter2)
outside2_coulter2 = outside2_coulter2.dropna()
print 'Total number of outside lab2 data triplets (AD) ' + str(outside2_coulter2.index.size)

Total number of coulter data triplets (AD) 2873
Total number of suspected coulter data triplets (AD) 1727
Total number of others coulter data triplets (AD) 929
Total number of outside lab1 data triplets (AD) 97
Total number of outside lab2 data triplets (AD) 120


In [None]:
colony_ur = merged_colony_data.as_matrix(['UpperRatio'])
colony_ur=np.squeeze(colony_ur)
colony_ur = colony_ur[~np.isnan(colony_ur)]
colony_lr = merged_colony_data.as_matrix(['LowerRatio'])
colony_lr=np.squeeze(colony_ur)
colony_lr = colony_lr[~np.isnan(colony_lr)]
coulter_ur = merged_coulter_data.as_matrix(['UpperRatio'])
coulter_ur=np.squeeze(coulter_ur)
coulter_lr = merged_coulter_data.as_matrix(['LowerRatio'])
coulter_lr=np.squeeze(coulter_lr)

In [None]:
#Print empirical CDF plot from a data vector x. 
def cdf_plot(x):
    x_data = np.sort(x)
    y_data = 1. * np.arange(len(x_data))/(len(x_data)-1)
    plt.plot(x_data, y_data)

In [None]:
print coulter_ur.shape 
sample_uniform = np.random.uniform(0,1,1000)
cdf_plot(coulter_ur)
cdf_plot(coulter_lr)
cdf_plot(sample_uniform)

In [None]:
print colony_ur.shape
sample_uniform = np.random.uniform(0,1,1000)
cdf_plot(colony_ur)
cdf_plot(colony_lr)
cdf_plot(sample_uniform)

In [None]:
def ReturnSample(l,n):
    indexes=rnd.sample(range(1,len(l)),n)  
    return np.asarray([l[i] for i in indexes])

In [None]:
def uniformkspermutation(data,permutations,sample_size=100):
    """
    This function is designed to take a set of data and perform a KS statistic permutation test 
    against an uniform distribution sample of size 10000. The output is an array of sixe trials of KS statistics. 
    
    inputs: data is the total population of data
    permutations: is the number of permutations 
    sample_size: is the sample size of the data draw for each permutation
    """    
    result_array = np.ones(permutations,)
    for t in range(0,permutations):
        permutationsample = ReturnSample(data,sample_size)
        test_result =stats.kstest(permutationsample, 'uniform', N=1000)
        result_array[t] = test_result[0]
    return np.squeeze(result_array)

In [None]:
result = uniformkspermutation(coulter_ur,1000, 200)
plt.hist(result)

The implementation of the ks-test for two samples in scipy is implemented as follows: 

```
data1 = np.sort(data1)
    data2 = np.sort(data2)
    n1 = data1.shape[0]
    n2 = data2.shape[0]
    data_all = np.concatenate([data1, data2])
    cdf1 = np.searchsorted(data1, data_all, side='right') / (1.0*n1)
    cdf2 = np.searchsorted(data2, data_all, side='right') / (1.0*n2)
    d = np.max(np.absolute(cdf1 - cdf2))
    # Note: d absolute not signed distance
    en = np.sqrt(n1 * n2 / float(n1 + n2))
    try:
        prob = distributions.kstwobign.sf((en + 0.12 + 0.11 / en) * d)
    except:
        prob = 1.0

    return Ks_2sampResult(d, prob)
```

If you see, it assummes that the two samples make up the whole distribution space. Hence we need to be careful on how we provide the data to not over-specify the information. 

In [None]:
sample_test1 = ReturnSample(colony_ur,100)
sample_test2 = ReturnSample(colony_ur,1000)
result = stats.ks_2samp(sample_test1, sample_test2)
print result

In [None]:
def ks2samplepermutation(data_benchmark, data_test, trials, sample_size=100):
    """
    This function is designed to take a set of the data "data_benchmark" and perform a KS statistic permutation test 
    against a larger sample to determine wether "data_benchmark" could have been produced from data with the same
    statistical nature as the "data_test"
    """
    if (len(data_benchmark)-len(data_test))/len(data_benchmark) < 0.5:
        print 'The benchmarking data represents more than half of the total sample size, the test might be meaningless'
    result_array = np.ones(trials)
    for t in range(0,trials):
        result = stats.ks_2samp(data_benchmark, data_test)
        result_array[t] = result[0]
    return np.squeeze(result_array)    

In [None]:
sample_test1 = ReturnSample(colony_ur,200)
sample_test2 = ReturnSample(colony_ur,2000)
result = ks2samplepermutation(sample_test1, sample_test2, 1000)

In [None]:
plt.hist(result)

In [None]:
def ks2samplepermutation(data_benchmark, data_test, trials, sample_size=100):
    """
    This function is designed to take a set of the data "data_benchmark" and perform a KS statistic permutation test 
    against a larger sample to determine wether "data_benchmark" could have been produced from data with the same
    statistical nature as the "data_test"
    """
    if (len(data_benchmark)-len(data_test))/len(data_benchmark) < 0.5:
        print 'The benchmarking data represents more than half of the total sample size, the test might be meaningless'
    result_array = np.ones(trials)
    for t in range(0,trials):
        result = stats.ks_2samp(data_benchmark, data_test)
        result_array[t] = result[0]
    return np.squeeze(result_array)   