# Uncertainty Quantification

## Overview
    we will analyze why we cannot get the right count for some transcripts using the output of salmon. 

## Analyze tools
    we will mainly use dataframe of pandas to analyze the data.

In [2]:
import tsv
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# import seaborn as sns
# %matplotlib inline

## root path

In [3]:
root_path = "../data/poly_mo/"

# data Preprocess

## Poly Truth
    Read the file poly_truth.tsv
    Poly_truth.tsv: true counts for each transcript

In [4]:
# Poly_truth.tsv: true counts for each transcript
poly_truth = open(root_path+"poly_truth.tsv")
lines = poly_truth.readlines()
poly_truth.close()
# print l
count = 0
poly_truth = []
for line in lines:
    line = line[:-1]
    l = line.split('\t')
    poly_truth.append(l)

df_poly_truth = pd.DataFrame.from_records(poly_truth[1:], columns=poly_truth[0])

In [5]:
df_poly_truth['transcript_id']=df_poly_truth['transcript_id'].astype(str)
df_poly_truth['count']=df_poly_truth['count'].astype(int)

In [6]:
id_in_true_count = df_poly_truth.transcript_id

## Quant_bootstraps
    Read the file quant_bootstraps.tsv
    Quant_bootstraps.tsv :containing the matrix of bootstrap experiments containing the final count for each transcript in each round of bootstrapping with a row be a bootstrap output and columns be list of transcripts. 

In [7]:
# Quant_bootstraps.tsv :containing the matrix of bootstrap experiments 
# containing the final count for each transcript in each round of bootstrapping 
# with a row be a bootstrap output and columns be list of transcripts. 

quant_bootstraps = tsv.TsvReader(open(root_path+"quant_bootstraps.tsv"))
count = 0
quant_boot = []
for parts in quant_bootstraps:
    quant_boot.append(parts)
#     print len(parts)
# print(len(quant_boot))

In [8]:
df_quant_boot = pd.DataFrame.from_records(quant_boot[1:], columns=quant_boot[0])
# print(len(quant_boot[1,:]))

In [9]:
df_quant_boot = df_quant_boot.astype('float')

In [10]:
df_quant_boot_mean = df_quant_boot.mean()

In [11]:
df_quant_boot_std = df_quant_boot.std()

In [12]:
id_in_quant_boot = list(df_quant_boot.columns)

### Get All use id
#### Attention: there are some ids in truth_id but not in quant_boot

In [13]:
sort_qb = []
use_id = []
for id in id_in_true_count:
    try:
        listed = list(df_quant_boot[id])        
    except KeyError:
#         print('has No '+id) # there are some ids in truth_id but not in quant_boot
        pass
    else:
        use_id.append(id)

## Get the distance between true_count and mean of bootstrapping

### solve the distance of use_id

In [14]:
df_poly_truth = df_poly_truth.set_index(['transcript_id'])

In [15]:
# dist = []
dist=[]
for id in use_id:
    mean = df_quant_boot_mean[id]
    true_count = df_poly_truth.loc[id]
    distance = true_count - mean
    distance = float(distance)
#     dist['id']=distance
    dist.append(distance)

### get the distance of  extended true_id

In [16]:
extend_id = list(set(id_in_quant_boot).difference(set(use_id)))
use_id.extend(extend_id)

In [17]:
for id in extend_id:
    mean = df_quant_boot_mean[id]
    true_count = 0
    distance = mean - true_count
    dist.append(distance)

### add label for the list
    set distance as label for every transcript_id
    And them we will merge this labeled list with list of properties in order to get a list which include both properties and label of every transcript.

In [18]:
labeled_id = [use_id,dist]
labeled = list(map(list,zip(*labeled_id)))

In [19]:
df_labeled_id = pd.DataFrame.from_records(labeled, columns=['Name','label'])
df_labeled_id.Name = df_labeled_id.Name.astype(str)

## Read Quant.sf
    Read the quant.sf file.
    Quant.sf :estimated quantifications for each transcript

In [21]:
# Quant.sf :estimated quantifications for each transcript
quant_file = open(root_path+"quant.sf")
lines = quant_file.readlines()
quant_file.close()
count = 0
quant = []
for line in lines:
    line = line[:-1]
    l = line.split('\t')
    quant.append(l)

In [22]:
df_quant = pd.DataFrame.from_records(quant[1:], columns=quant[0])

In [23]:
df_quant.Name = df_quant.Name.astype(str)
df_quant.Length = df_quant.Length.astype(int)
df_quant.EffectiveLength = df_quant.EffectiveLength.astype(float)
df_quant.TPM = df_quant.TPM.astype(float)
df_quant.NumReads = df_quant.NumReads.astype(float)

## Merge quant.sf and labeled_id to get the useful data for training
    labeled_id is a list of transcript_id togather with label(success(true，set as 1) or fail(flase,set as 0))  
    And we will add the label with the protery from quant.sf in order to analyze the properties of different label.
    Then it will be easy for us to analyze the relation between properties and label and the difference between group of different label.

#### merge the data

In [24]:
df_labeled = df_labeled_id.merge(df_quant, on='Name')

In [25]:
label = df_labeled.pop('label')
df_labeled.insert(5,'label',label)

In [26]:
quant_boot_std = df_quant_boot_std.tolist()
quant_boot_mean = df_quant_boot_mean.tolist()

In [27]:
df_labeled.insert(5,'quant_boot_mean',quant_boot_mean)
df_labeled.insert(6,'quant_boot_std',quant_boot_std)

In [28]:
df_labeled.head(10)

Unnamed: 0,Name,Length,EffectiveLength,TPM,NumReads,quant_boot_mean,quant_boot_std,label
0,ENST00000608495,1672,1472.991,0.0,0.0,0.0,0.0,1.0
1,ENST00000382369,1420,1220.991,1.180968,46.855146,0.0,0.0,9.348416
2,ENST00000360321,1575,1375.991,0.91208,40.780781,0.0,0.0,12.419083
3,ENST00000400269,1022,822.991,2.705958,72.364073,0.0,0.0,21.022501
4,ENST00000382352,3229,3029.991,20.638372,2032.0,0.0,0.0,617.25
5,ENST00000342665,4627,4427.991,13.886148,1998.0,0.0,0.0,807.185
6,ENST00000609179,578,379.005,14.940537,184.0,0.0,0.0,77.61
7,ENST00000217233,2499,2299.991,3.581666,267.681402,45.651584,12.63618,113.376762
8,ENST00000449710,1070,870.991,8.234501,233.054648,41.580917,11.378288,116.0365
9,ENST00000422053,1533,1333.991,4.643081,201.26395,70.977499,10.91263,60.696738


In [90]:
df_labeled.describe()

Unnamed: 0,Length,EffectiveLength,TPM,NumReads,quant_boot_mean,quant_boot_std,label
count,93109.0,93109.0,93109.0,93109.0,93109.0,93109.0,93109.0
mean,2008.924164,1810.144491,10.7401,357.9084,357.9084,8.375432,73.778214
std,2132.499138,2132.174682,201.478857,6924.434,6924.414,27.312889,1282.693605
min,21.0,9.784,0.0,0.0,0.0,0.0,-57221.321454
25%,640.0,441.002,0.0,0.0,0.0,0.0,0.0
50%,1344.0,1144.991,0.0,0.0,0.0,0.0,0.0
75%,2640.0,2439.991,0.173865,6.0,6.949153,4.249446,2.265762
max,109224.0,109024.991,23356.420222,1109005.0,1109140.0,1112.644975,207755.50795


In [29]:
data = df_labeled

# Training

## Prepare the Training data

In [30]:
import sklearn
from sklearn.utils import shuffle  

## Shuffle the data

In [105]:
sfdata = shuffle(data) # make the data random
# input_data = sfdata[['Length','EffectiveLength','TPM','NumReads','quant_boot_mean','quant_boot_std']]
input_data = sfdata[['Length','EffectiveLength','quant_boot_mean','quant_boot_std']]

input_label = sfdata['label']

In [106]:
train = int(len(input_data)*9/10)
test = train+1
train_data = input_data[0:train]
train_label = input_label[0:train]
test_data = input_data[test:]
test_label = input_label[test:]

In [107]:
train_data = train_data.as_matrix()
train_label = train_label.as_matrix()
test_data = test_data.as_matrix()
test_label = test_label.as_matrix()

In [37]:
# trian_data = np.reshape(train_data,(83798,6,1))

## Some models

In [63]:
def cal_mse(regr,test_label):
    error = (regr-test_label)
#     error = error.tolist()
#     error_sq = error.apply(square)
    error_sq = error**2
    mse = error_sq.sum()/len(error)
    return mse

## Neural Network

In [39]:
import keras
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Activation
from keras.layers import Dropout

Using TensorFlow backend.


In [118]:
model = Sequential()
model.add(Dense(1, input_shape=(4,)))
model.add(Activation('sigmoid'))
model.add(Dense(1))
# sgd = keras.optimizers.SGD(lr=0.01, decay=1e-6, momentum=0.9, nesterov=True)
rmsp = keras.optimizers.RMSprop(lr=0.001, rho=0.9, epsilon=1e-06)
model.compile(optimizer=rmsp,loss='mse')

In [None]:
model.fit(train_data, train_label, epochs=3, batch_size=16)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10

In [110]:
pred_distance = model.predict(test_data)

In [111]:
cal_mse(pred_distance,test_label)

14750512325.551928

In [None]:
model.save('nn.h5')

## Linear Regression

In [48]:
from sklearn.linear_model import LinearRegression

In [50]:
lr = LinearRegression(fit_intercept=True, normalize=False, copy_X=True, n_jobs=1)

In [51]:
lr.fit(train_data,train_label)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [52]:
pred = lr.predict(test_data)

In [66]:
print(pred[0:10])

[ 16.39705994  16.46410104  16.4096305   16.41684938  16.31698863
  15.78670591  20.69657738  16.83202467  16.37960305   8.25260346]


In [68]:
print(test_label[0:10])

[ 0.          0.          0.29358114  0.          0.          0.          0.
  0.          0.          0.        ]


In [71]:
test_label[10:20]

array([   0.        ,    0.        ,  533.6       ,    0.        ,
         71.515     ,    0.        ,    0.        ,    0.        ,
          2.11996099,    0.        ])

In [64]:
lr_mse = cal_mse(pred,test_label)

In [91]:
train_pred = lr.predict(train_data)

In [92]:
train_mse = cal_mse(train_pred,train_label)

In [93]:
train_mse

223663.12443406248

In [65]:
lr_mse

569298.01021590026

## SVR

In [94]:
from sklearn.svm import SVR

In [96]:
clf = SVR(C=1.0, epsilon=0.2)
clf.fit(train_data,train_label)

SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.2, gamma='auto',
  kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

In [97]:
svr_regr = clf.predict(test_data)

In [99]:
svr_mse = cal_mse(svr_regr,test_label)

In [100]:
svr_mse

4069770.9581566448