This notebook contains code for training a RandomForest regressor to predict measurement biases of miRXplore datasets from TGIRT-seq.

The [R RandomForest](https://cran.r-project.org/web/packages/randomForest/randomForest.pdf) model is used here because neither the [scikit-learn random forest](https://github.com/scikit-learn/scikit-learn/issues/5442) nor the *train* module from [caret](https://stats.stackexchange.com/questions/135671/how-does-caret-handle-factors) can handle categorically-labeled data.

In [1]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import r2_score
from sklearn.model_selection import KFold, train_test_split, GridSearchCV
import seaborn as sns
from sequencing_tools.viz_tools import color_encoder, simpsons_palette, mixed_sort
from helper_function import *
from feature_selection import make_pca_df, pca_biplot, loading_plot, plot_outliers,\
                        labeling_expression
from rf_modeling import h2o_randomForest, R_randomForest, test_nucleotides, train_to_cat,\
                        rename_col, k_fold_cv, plot_kfold_reg, plot_var,\
                        plot_R2, plot_test

# Use the same 15-pt size for axis titles and for x/y tick labels in every figure.
# plt.rc accepts a tuple of groups, so one call sets `labelsize` on all three.
plt.rc(('axes', 'xtick', 'ytick'), labelsize=15)




Read in the miRNA count data, combine the counts from replicates, and keep only the NTT data. The first and last 3 nucleotides of each miRNA are extracted as predictors, and $\Delta\log_{10}$ CPM (observed minus expected $\log_{10}$ CPM) is computed and used as the target:

In [2]:
# Build the NTT modelling table:
#   1. melt the wide count matrix to long form (one row per sample x miRNA),
#   2. strip the trailing replicate number from the sample name and sum the
#      counts of replicates belonging to the same prep,
#   3. merge in the per-miRNA nucleotide columns returned by get_seq_base
#      (positions 0,1,2 and -3,-2,-1, i.e. the first and last three bases),
#   4. compute CPM within each prep and Y = log10(CPM) - log10(expected CPM),
#   5. keep only the rows of the NTT prep.
df = (
    pd.read_csv('../data/miR_counts.csv')
    .rename(columns={'id': 'seq_id'})
    .pipe(pd.melt, var_name='prep', value_name='seq_count', id_vars='seq_id')
    # regex=True is spelled out: pandas >= 2.0 treats the pattern as a literal
    # string by default, which would leave the replicate suffix in place and
    # silently stop the replicates from being summed together.
    .assign(prep=lambda d: d.prep.str.replace('[0-9]+$', '', regex=True))
    .groupby(['prep', 'seq_id'], as_index=False)
    .agg({'seq_count': 'sum'})
    .merge(get_seq_base(shuffle=[0, 1, 2, -3, -2, -1]))
    .assign(cpm=lambda d: d.groupby('prep').seq_count.transform(count_to_cpm))
    # NOTE(review): 962 is presumably the number of equimolar miRNAs in the
    # miRXplore reference, giving a uniform expected CPM -- confirm.
    .assign(expected_cpm=lambda d: 1e6 / 962)
    .assign(Y=lambda d: np.log10(d['cpm']) - np.log10(d['expected_cpm']))
    .query('prep == "NTT"')
    .reset_index(drop=True)
)
df.head()

  # Remove the CWD from sys.path while we load stuff.


Unnamed: 0,prep,seq_id,seq_count,head0,head1,head2,tail0,tail1,tail2,cpm,expected_cpm,Y
0,NTT,EBV-1-1,404.0,T,A,A,G,T,T,11.834411,1039.50104,-1.943678
1,NTT,EBV-1-2,42864.0,T,A,T,T,G,A,1255.619332,1039.50104,0.082033
2,NTT,EBV-1-2-star,2511.0,A,A,A,A,G,C,73.554968,1039.50104,-1.150213
3,NTT,EBV-1-3,3967.0,T,A,A,A,C,A,116.205718,1039.50104,-0.951597
4,NTT,EBV-1-3P,3687.0,T,A,G,G,T,C,108.003651,1039.50104,-0.983386


In [3]:
# Control table: identical processing to the main NTT table, except that the
# predictors are the nucleotides at positions 3,4,5 and -6,-5,-4 instead of
# the first and last three bases.
# NOTE(review): presumably these internal positions serve as a negative
# control for the end-nucleotide model -- confirm.
control_df = (
    pd.read_csv('../data/miR_counts.csv')
    .rename(columns={'id': 'seq_id'})
    .pipe(pd.melt, var_name='prep', value_name='seq_count', id_vars='seq_id')
    # regex=True is spelled out: pandas >= 2.0 treats the pattern as a literal
    # string by default, which would leave the replicate suffix in place and
    # silently stop the replicates from being summed together.
    .assign(prep=lambda d: d.prep.str.replace('[0-9]+$', '', regex=True))
    .groupby(['prep', 'seq_id'], as_index=False)
    .agg({'seq_count': 'sum'})
    .merge(get_seq_base(shuffle=[3, 4, 5, -6, -5, -4]))
    .assign(cpm=lambda d: d.groupby('prep').seq_count.transform(count_to_cpm))
    # NOTE(review): 962 is presumably the number of equimolar miRNAs in the
    # miRXplore reference, giving a uniform expected CPM -- confirm.
    .assign(expected_cpm=lambda d: 1e6 / 962)
    .assign(Y=lambda d: np.log10(d['cpm']) - np.log10(d['expected_cpm']))
    .query('prep == "NTT"')
    .reset_index(drop=True)
)
control_df.head()

  # Remove the CWD from sys.path while we load stuff.


Unnamed: 0,prep,seq_id,seq_count,head0,head1,head2,tail0,tail1,tail2,cpm,expected_cpm,Y
0,NTT,EBV-1-1,404.0,C,C,T,G,G,A,11.834411,1039.50104,-1.943678
1,NTT,EBV-1-2,42864.0,C,T,T,A,A,T,1255.619332,1039.50104,0.082033
2,NTT,EBV-1-2-star,2511.0,T,T,C,G,A,T,73.554968,1039.50104,-1.150213
3,NTT,EBV-1-3,3967.0,C,G,G,A,G,C,116.205718,1039.50104,-0.951597
4,NTT,EBV-1-3P,3687.0,C,A,C,T,A,T,108.003651,1039.50104,-0.983386


In [4]:
# Keep only the predictor columns (head*/tail*) and the target Y, then pass the
# frame through train_to_cat.
# The explicit .copy() hands train_to_cat an independent frame: without it the
# assignment made inside the helper triggers pandas' SettingWithCopyWarning,
# because the filtered frame is still flagged as derived from `df`.
model_df = (
    df.filter(regex='head|tail|Y')
    .copy()
    .pipe(train_to_cat)
)
model_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


Unnamed: 0,head0,head1,head2,tail0,tail1,tail2,Y
0,T,A,A,G,T,T,-1.943678
1,T,A,T,T,G,A,0.082033
2,A,A,A,A,G,C,-1.150213
3,T,A,A,A,C,A,-0.951597
4,T,A,G,G,T,C,-0.983386


In [5]:
# Keep only the predictor columns (head*/tail*) and the target Y of the control
# table, then pass the frame through train_to_cat.
# The explicit .copy() hands train_to_cat an independent frame, so that any
# assignment made inside the helper cannot raise SettingWithCopyWarning or be
# applied to a frame still flagged as derived from `control_df`.
control_model_df = (
    control_df.filter(regex='head|tail|Y')
    .copy()
    .pipe(train_to_cat)
)
control_model_df.head()

Unnamed: 0,head0,head1,head2,tail0,tail1,tail2,Y
0,C,C,T,G,G,A,-1.943678
1,C,T,T,A,A,T,0.082033
2,T,T,C,G,A,T,-1.150213
3,C,G,G,A,G,C,-0.951597
4,C,A,C,T,A,T,-0.983386


# Train cross-validation #

In [6]:
# Tune the R randomForest wrapper with an exhaustive grid search and 8-fold
# cross-validation; the best parameter combination is refitted on all rows.
rrf = R_randomForest()

# Candidate hyper-parameters:
#   ntrees -> 500, 510, ..., 590
#   mtry   -> np.arange(2, 10, 5) evaluates to [2, 7]
# NOTE(review): only six predictor columns (head0-2, tail0-2) are shown in
# model_df; confirm that mtry=7 is a meaningful setting for R_randomForest.
rf_param_grid = {
    'ntrees': np.arange(500, 600, 10),
    'mtry': np.arange(2, 10, 5),
}

rf = GridSearchCV(
    estimator=rrf,
    param_grid=rf_param_grid,
    cv=8,
    n_jobs=2,
    refit=True,
    return_train_score=True,
)
rf.fit(model_df.drop('Y', axis=1), model_df['Y'])



GridSearchCV(cv=8, error_score='raise-deprecating',
       estimator=R_randomForest(mtry=2, ntrees=500), fit_params=None,
       iid='warn', n_jobs=2,
       param_grid={'ntrees': array([500, 510, 520, 530, 540, 550, 560, 570, 580, 590]), 'mtry': array([2, 7])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [7]:
# Show the raw cross-validation record of the fitted grid search: a dict of
# arrays with one entry per parameter combination (fit/score timings, etc.).
rf.cv_results_

{'mean_fit_time': array([2.10026595, 1.76597887, 1.85254386, 3.33593315, 2.34250897,
        1.62810728, 1.56776941, 1.59027368, 1.63691756, 1.62188041,
        3.51933551, 3.64386529, 3.73621812, 3.89040071, 3.92218444,
        3.89502111, 3.99940443, 4.37690872, 4.31070188, 4.05204549]),
 'std_fit_time': array([0.44495877, 0.31649419, 0.61853308, 1.14451593, 0.44214217,
        0.09479057, 0.06006114, 0.07732238, 0.08566183, 0.07459732,
        0.06096259, 0.09247295, 0.06900852, 0.16072355, 0.07187828,
        0.07031298, 0.06033658, 0.14355623, 0.1645722 , 0.38188948]),
 'mean_score_time': array([0.06064999, 0.06824648, 0.04638529, 0.11328682, 0.03736156,
        0.03850189, 0.03419396, 0.03177541, 0.03215498, 0.04073399,
        0.0376685 , 0.04728475, 0.03275585, 0.0540697 , 0.05849877,
        0.04292399, 0.06188458, 0.03864127, 0.03610587, 0.03779045]),
 'std_score_time': array([0.03336305, 0.07513566, 0.02669626, 0.10501744, 0.00621575,
        0.00850487, 0.00602824, 0.004952