In [1]:
import tsv
import numpy as np
import pandas as pd

In [2]:
root_path = "../data/poly_ro/"

## Parse poly_truth.tsv
This file gives us the true count of each transcript.

In [3]:
# Poly_truth.tsv: true counts for each transcript
poly_truth = open(root_path + "poly_truth.tsv")
lines = poly_truth.readlines()
poly_truth.close()

# unlike poly_mo, first line does not have column labels
poly_truth = [['transcript_id','count']]
poly_truth.extend(line[:-1].split('\t') for line in lines)

df_poly_truth = pd.DataFrame.from_records(poly_truth[1:], columns=poly_truth[0])

In [4]:
df_poly_truth['transcript_id']=df_poly_truth['transcript_id'].astype(str)
df_poly_truth['count']=df_poly_truth['count'].astype(int)

## Parse quant_bootstraps.tsv
This file gives us the bootstrap data (201 rounds of sample taking).

In [5]:
quant_bootstraps = tsv.TsvReader(open(root_path + "quant_bootstraps.tsv"))

quant_boot = [line for line in quant_bootstraps]

In [6]:
df_quant_boot = pd.DataFrame.from_records(quant_boot[1:], columns=quant_boot[0])

We find and retrieve the intersecting transcript ids of poly_truth and quant_bootstraps, and sort each id's data by ascending order. There are transcripts in quant_bootstraps that don't show up in poly_truth, we'll deal with them later.

In [7]:
set_qb_id = set(df_quant_boot.columns)
set_pt_id = set(df_poly_truth.transcript_id)
intersect_ids = set_qb_id & set_pt_id 

sort_qb = []
use_id = []
for id in intersect_ids:
    listed = list(df_quant_boot[id])
    listed.sort()
    use_id.append(id)
    sort_qb.append(listed)
sort_qb = list(map(list,zip(*sort_qb)))

In [8]:
df_qb_sorted = pd.DataFrame.from_records(sort_qb, columns=use_id)

## Find confidence interval
Since we have already sorted each transcript id's data, we can find an empirical confidence interval of 95% by locating the numbers at index `(total_length) * 2.5%` and `(total_length) * 97.5%`, which would be the lower and upper bound.

In [9]:
df_poly_truth = df_poly_truth.set_index(['transcript_id'])

sum = len(sort_qb)
percent2dot5 = df_qb_sorted.loc[int(sum*0.025)-1]
percent97dot5 = df_qb_sorted.loc[int(sum*0.975)-1]

## Find the failed transcripts  
Compare the counts given by poly_truth with the lower and upper bound we found earlier. If not in range we treat it as a failed transcript, else true.

In [10]:
true_id = []
false_id = []
for id in use_id:
    down = float(percent2dot5[id])
    up = float(percent97dot5[id])
    true_count = float(df_poly_truth.loc[id])
    if down < true_count < up: 
        true_id.append(id)
    else: 
        false_id.append(id)

We go back to deal with the 'diff' transcript ids we ignored earlier. The counts of these diff transcript ids are zero, and we assume that these are true transcripts.

In [12]:
extend_true = list(set_qb_id.difference(set_pt_id))
true_id.extend(extend_true)

In [13]:
all_id = true_id[:]
all_id.extend(false_id)

We add a label of 1 representing true transcripts and 0 representing failed transcripts for easy grouping later on.

In [14]:
label = [1 if i < len(true_id) else 0 for i in range(len(true_id) + len(false_id))]
    
labeled_id = [all_id,label]
labeled = list(map(list,zip(*labeled_id)))

## Parse quant.sf
This file gives us some attributes of the transcripts.

In [15]:
quant_file = open(root_path + "quant.sf")
lines = quant_file.readlines()
quant_file.close()
quant = [line[:-1].split('\t') for line in lines]

In [16]:
df_quant = pd.DataFrame.from_records(quant[1:], columns=quant[0])

df_quant.Name = df_quant.Name.astype(str)
df_quant.Length = df_quant.Length.astype(int)
df_quant.EffectiveLength = df_quant.EffectiveLength.astype(float)
df_quant.TPM = df_quant.TPM.astype(float)
df_quant.NumReads = df_quant.NumReads.astype(float)

In [17]:
df_labeled_id = pd.DataFrame.from_records(labeled, columns=['Name','label'])
df_labeled_id.Name = df_labeled_id.Name.astype(str)
df_labeled = df_labeled_id.merge(df_quant, on='Name')

label = df_labeled.pop('label')
df_labeled.insert(5,'label',label)

data = df_labeled

## Find common properties of failed transcripts
We group the data by true and failed transcripts, and observe the mean, std, max and min.  
Observing the mean, the average TPM and NumReads of failed transcripts is a lot bigger than the true ones.  
With the std, we find that failed transcripts tend to have a significantly larger variance of TPM and NumReads.

In [18]:
data.groupby([data.label]).mean()

Unnamed: 0_level_0,Length,EffectiveLength,TPM,NumReads,label
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,2320.422083,2121.332135,41.485945,1378.079501,0.0
1,1901.681056,1703.008206,0.154861,6.681882,1.0


In [19]:
data.groupby([data.label]).max()

Unnamed: 0_level_0,Name,Length,EffectiveLength,TPM,NumReads
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,ENST00000610279,101518,101318.991,23356.420222,1109005.0
1,ENST00000610276,109224,109024.991,2435.717783,31105.09


In [20]:
data.groupby([data.label]).min()

Unnamed: 0_level_0,Name,Length,EffectiveLength,TPM,NumReads
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,ENST00000000233,82,10.861,0.0,0.0
1,ENST00000000412,21,9.784,0.0,0.0


In [21]:
data.groupby([data.label]).std()

Unnamed: 0_level_0,Length,EffectiveLength,TPM,NumReads
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,2302.710811,2302.636007,396.200319,13629.239858
1,2059.80843,2059.407366,9.489371,152.179457
