# Uncertainty Quantification

## Overview
    we will analyze why we cannot get the right count for some transcripts using the output of salmon. 

## Analyze tools
    we will mainly use dataframe of pandas to analyze the data.

In [1]:
import tsv
import numpy as np
import pandas as pd

# data Preprocess

## Poly Truth
    Read the file poly_truth.tsv
    Poly_truth.tsv: true counts for each transcript

In [2]:
# Poly_truth.tsv: true counts for each transcript
poly_truth = open("../data/poly_mo/poly_truth.tsv")
lines = poly_truth.readlines()
poly_truth.close()
# print l
count = 0
poly_truth = []
for line in lines:
    line = line[:-1]
    l = line.split('\t')
    poly_truth.append(l)

df_poly_truth = pd.DataFrame.from_records(poly_truth[1:], columns=poly_truth[0])

In [3]:
df_poly_truth['transcript_id']=df_poly_truth['transcript_id'].astype(str)
df_poly_truth['count']=df_poly_truth['count'].astype(int)

In [4]:
df_poly_truth.describe()

Unnamed: 0,count
count,26959.0
mean,1483.678
std,15040.54
min,1.0
25%,17.0
50%,142.0
75%,735.0
max,1316896.0


In [5]:
df_poly_truth.head(5)

Unnamed: 0,transcript_id,count
0,ENST00000608495,1
1,ENST00000382369,55
2,ENST00000360321,54
3,ENST00000400269,92
4,ENST00000382352,2653


In [6]:
truth_id = df_poly_truth.transcript_id

## Quant_bootstraps
    Read the file quant_bootstraps.tsv
    Quant_bootstraps.tsv :containing the matrix of bootstrap experiments containing the final count for each transcript in each round of bootstrapping with a row be a bootstrap output and columns be list of transcripts. 

In [7]:
# Quant_bootstraps.tsv :containing the matrix of bootstrap experiments 
# containing the final count for each transcript in each round of bootstrapping 
# with a row be a bootstrap output and columns be list of transcripts. 

#??? what's the meaning? count of every round?
quant_bootstraps = tsv.TsvReader(open("../data/poly_mo/quant_bootstraps.tsv"))
count = 0
quant_boot = []
for parts in quant_bootstraps:
    quant_boot.append(parts)
#     print len(parts)
print(len(quant_boot))

201


In [8]:
df_quant_boot = pd.DataFrame.from_records(quant_boot[1:], columns=quant_boot[0])
# print(len(quant_boot[1,:]))

In [2]:
df_quant_boot.describe()

In [9]:
id_qb = list(df_quant_boot.columns)

### Sort the quant_boot Results
#### Attention: there are some ids in truth_id but not in quant_boot

In [11]:
sort_qb = []
use_id = []
for id in truth_id:
    try:
        listed = list(df_quant_boot[id])        
    except KeyError:
#         print('has No '+id) # there are some ids in truth_id but not in quant_boot
        pass
    else:
        use_id.append(id)
        listed.sort()
        sort_qb.append(listed)

### reverse sort_qb

In [12]:
sort_qb = list(map(list,zip(*sort_qb)))

### transfer to dataframe

In [13]:
df_qb_sorted = pd.DataFrame.from_records(sort_qb, columns=use_id)

## Filter the False

### find the value of 2.5% & 97.5% and the false transcript is out of this range

In [14]:
df_poly_truth = df_poly_truth.set_index(['transcript_id'])

In [15]:
sum = len(sort_qb)
print(int(sum*0.025))
print(int(sum*0.975))
percent2dot5 = df_qb_sorted.loc[int(sum*0.025)-1]
percent97dot5 = df_qb_sorted.loc[int(sum*0.975)-1]

5
195


## divide the transcript_id into two group
    if the true_count of the id is in the range(2.5%,97.5%)，we will set it in the true_id group.
    if the true_coutn of the id is out of the range(2.5%,97.5%), we set it in the false_id group.

In [16]:
true_id = []
false_id = []
for id in use_id:
    down = float(percent2dot5[id])
    up = float(percent97dot5[id])
    true_count = df_poly_truth.loc[id]
    true_count = float(true_count)
    if true_count>down and true_count<up:
        true_id.append(id)
    else:
        false_id.append(id)

### directly set the transcript_id whose true_count is zeros into the true_id group

#### get the different set of id from quant_boot and poly_truth.
    This set include the transcripts_id whose true count is zero.

In [19]:
extend_true = list(set(id_qb).difference(set(truth_id)))

#### get the extended true_id

In [21]:
true_id.extend(extend_true)

### concatenate the true and false id
    we put the true_id and false_id in the same list in sequence, and then we will add label for them.

In [25]:
# concatenate the true and false id in one list
all_id = list(true_id)
all_id.extend(false_id)

### add label for the list
    set label for every transcript_id(success(true_id,set as 1),fail(false_id,set as 0))
    And them we will merge this labeled list with list of properties in order to get a list which include both properties and label of every transcript.

In [26]:
# add label for the id
label = []
for i in range(len(true_id)):
    label.append(1)
for i in range(len(false_id)):
    label.append(0)
    
labeled_id = [all_id,label]
labeled = list(map(list,zip(*labeled_id)))

## Read Quant.sf
    Read the quant.sf file.
    Quant.sf :estimated quantifications for each transcript

In [27]:
# Quant.sf :estimated quantifications for each transcript
quant_file = open("../data/poly_mo/quant.sf")
lines = quant_file.readlines()
quant_file.close()
count = 0
quant = []
for line in lines:
    line = line[:-1]
    l = line.split('\t')
    quant.append(l)

In [28]:
df_quant = pd.DataFrame.from_records(quant[1:], columns=quant[0])

In [29]:
df_quant.Name = df_quant.Name.astype(str)
df_quant.Length = df_quant.Length.astype(int)
df_quant.EffectiveLength = df_quant.EffectiveLength.astype(float)
df_quant.TPM = df_quant.TPM.astype(float)
df_quant.NumReads = df_quant.NumReads.astype(float)

## Merge quant.sf and labeled_id to get the useful data for training
    labeled_id is a list of transcript_id togather with label(success(true，set as 1) or fail(flase,set as 0))  
    And we will add the label with the protery from quant.sf in order to analyze the properties of different label.
    Then it will be easy for us to analyze the relation between properties and label and the difference between group of different label.

#### merge the data

In [30]:
df_labeled_id = pd.DataFrame.from_records(labeled, columns=['Name','label'])
df_labeled_id.Name = df_labeled_id.Name.astype(str)
df_labeled = df_labeled_id.merge(df_quant, on='Name')

#### put label as the last property

In [31]:
label = df_labeled.pop('label')
df_labeled.insert(5,'label',label)

#### set "data = df_labeled" for easy processing the data below 

In [32]:
data = df_labeled

## Analyze the difference between true & false data

### we can easily found that the mean value of TPM & NumReads has great difference between false and true

In [33]:
data.groupby([data.label]).mean()

Unnamed: 0_level_0,Length,EffectiveLength,TPM,NumReads
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,2445.00735,2245.894343,52.556885,1784.370671
1,1905.694197,1706.993431,0.841197,20.235059


### the max value of TPM & NumReads alse has great difference between false and true

In [34]:
data.groupby([data.label]).max()

Unnamed: 0_level_0,Name,Length,EffectiveLength,TPM,NumReads
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,ENST00000610278,101518,101318.991,23356.420222,1109005.0
1,ENST00000610279,109224,109024.991,10710.459004,37690.85


In [35]:
data.groupby([data.label]).min()

Unnamed: 0_level_0,Name,Length,EffectiveLength,TPM,NumReads
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,ENST00000000233,158,10.987,0.0,0.0
1,ENST00000000412,21,9.784,0.0,0.0


## Eq_classes.txt

In [None]:
# Eq_classes.txt: list of equivalence classes and their information