# 02 - Boruta experiment 1

**Objectives:**
* test the **boruta** feature selection on the logcpm expression matrix
* install with: `pip install boruta`
* see: https://github.com/scikit-learn-contrib/boruta_py
* paper: https://www.jstatsoft.org/article/view/v036i11

In [70]:
import pandas as pd
import numpy as np
import os
import pickle

import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

from boruta import BorutaPy
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, GradientBoostingRegressor

import rpy2.robjects as robjects
from rpy2.robjects import pandas2ri
pandas2ri.activate()

sns.set(style="whitegrid")

---

### Load the data

In [17]:
# Root of the dataset on the local machine (absolute path — adjust when running elsewhere).
wd = '/media/tmo/data/work/datasets/02_ST'

# Input files, both located underneath `wd`.
logcpm_path = f'{wd}/ashley_21.03.2018/logcpm_merge_20180212.pickle'
meta_path = f'{wd}/meta/meta.parquet'

In [19]:
# Metadata table; it is joined onto the expression matrix on 'spot_UID' further down.
meta_df = pd.read_parquet(meta_path)

In [9]:
%%time
logcpm_df = pickle.load(open(logcpm_path, "rb"))

In [11]:
# Name the index so that the reset_index() in the next cell yields a 'spot_UID' column.
logcpm_df.index.name = 'spot_UID'

In [12]:
# Move the index into a regular 'spot_UID' column (used below as the merge key).
# NOTE(review): in-place and not idempotent — re-running this cell on the already
# reset frame would presumably insert an extra 'index' column; run it once per load.
logcpm_df.reset_index(inplace=True)

In [14]:
def add_slide_ID(df):
    """Add a 'slide_ID' column: the part of each 'spot_UID' before the first '__'.

    The frame is modified in place and the same object is returned.
    """
    def slide_of(spot_uid):
        # Everything before the first '__' (the whole string when '__' is absent).
        return str(spot_uid).partition('__')[0]

    df['slide_ID'] = df['spot_UID'].map(slide_of)
    return df

In [15]:
# add_slide_ID writes the new column into the frame it is given and returns that
# same frame, so this rebinding keeps `logcpm_df` pointing at the same object.
logcpm_df = add_slide_ID(logcpm_df)

In [16]:
logcpm_df.head()

Unnamed: 0,spot_UID,-343C11.2,00R_AC107638.2,0610005C13Rik,0610006L08Rik,0610007P14Rik,0610009B22Rik,0610009E02Rik,0610009L18Rik,0610009O20Rik,...,n-R5s93,n-R5s94,n-R5s95,n-R5s96,n-R5s97,n-R5s98,n-TSaga9,n-TStga1,sampleID,slide_ID
0,N05_C2__20_6,2.982974,2.982974,2.982974,2.982974,2.982974,2.982974,4.984943,2.982974,2.982974,...,2.982974,2.982974,2.982974,2.982974,2.982974,2.982974,2.982974,2.982974,N05_C2,N05_C2
1,N05_C2__17_6,2.982974,2.982974,2.982974,2.982974,6.827059,5.924174,2.982974,2.982974,2.982974,...,2.982974,2.982974,2.982974,2.982974,2.982974,2.982974,2.982974,2.982974,N05_C2,N05_C2
2,N05_C2__16_6,2.982974,2.982974,2.982974,2.982974,2.982974,5.609164,2.982974,2.982974,5.609164,...,2.982974,2.982974,2.982974,2.982974,2.982974,2.982974,2.982974,2.982974,N05_C2,N05_C2
3,N05_C2__19_6,2.982974,2.982974,2.982974,2.982974,7.661385,2.982974,2.982974,2.982974,5.157746,...,2.982974,2.982974,2.982974,2.982974,2.982974,2.982974,2.982974,2.982974,N05_C2,N05_C2
4,N05_C2__18_6,2.982974,2.982974,2.982974,2.982974,6.738947,6.738947,2.982974,2.982974,2.982974,...,2.982974,2.982974,2.982974,2.982974,2.982974,2.982974,2.982974,2.982974,N05_C2,N05_C2


In [20]:
# Inner join on 'spot_UID': only spots present in both frames are kept.
# Columns that exist in both inputs come back suffixed — the output below shows
# `slide_ID_y` and `sampleID_y` from the metadata side.
st_df = logcpm_df.merge(meta_df, how='inner', on='spot_UID')

In [21]:
st_df.head()

Unnamed: 0,spot_UID,-343C11.2,00R_AC107638.2,0610005C13Rik,0610006L08Rik,0610007P14Rik,0610009B22Rik,0610009E02Rik,0610009L18Rik,0610009O20Rik,...,endothelial,interneuron,slide_ID_y,region,sampleID_y,GT,age_days,age_months,age,age_GT
0,N05_C2__20_6,2.982974,2.982974,2.982974,2.982974,2.982974,2.982974,4.984943,2.982974,2.982974,...,0.095625,0.106595,N05_C2,AT_OLF,N05_C2,KI,558,18,old,old_KI
1,N05_C2__17_6,2.982974,2.982974,2.982974,2.982974,6.827059,5.924174,2.982974,2.982974,2.982974,...,-0.014931,0.049969,N05_C2,AT_OLF,N05_C2,KI,558,18,old,old_KI
2,N05_C2__16_6,2.982974,2.982974,2.982974,2.982974,2.982974,5.609164,2.982974,2.982974,5.609164,...,-0.049142,-0.042531,N05_C2,AT_OLF,N05_C2,KI,558,18,old,old_KI
3,N05_C2__19_6,2.982974,2.982974,2.982974,2.982974,7.661385,2.982974,2.982974,2.982974,5.157746,...,0.057104,0.143494,N05_C2,AT_OLF,N05_C2,KI,558,18,old,old_KI
4,N05_C2__18_6,2.982974,2.982974,2.982974,2.982974,6.738947,6.738947,2.982974,2.982974,2.982974,...,0.043532,0.098493,N05_C2,AT_OLF,N05_C2,KI,558,18,old,old_KI


In [22]:
# Number of gene columns. NOTE(review): hard-coded — presumably the width of the
# logcpm expression matrix; confirm against the loaded data.
n_genes = 46454
# Column 0 of st_df is 'spot_UID' (see the head() output above), so the slice
# starts at 1 and takes the following n_genes columns.
gene_columns = st_df.columns[1:n_genes+1]

In [25]:
# Feature matrix for Boruta: only the gene columns of the merged frame.
expression_df = st_df[gene_columns]

In [33]:
# Regression target. The double brackets keep it a one-column DataFrame;
# do_boruta flattens it with .values.ravel() before fitting.
AB1_std_dev_yen = st_df[['AB1_StdDev_Yen']]

--- 

### **Boruta experiment 1**

In [27]:
# Base estimator for experiment 1; it is handed to BorutaPy as `estimator` below.
rf = RandomForestRegressor(n_jobs=-1, n_estimators=1000, max_features='sqrt', max_depth=5)

In [65]:
def do_boruta(estimator, verbose=2, seed=42):
    """Run Boruta feature selection with ``estimator`` and return the selected genes.

    Reads the notebook-level globals ``expression_df`` (features),
    ``AB1_std_dev_yen`` (target) and ``gene_columns`` (feature names).

    Parameters
    ----------
    estimator : model passed to ``BorutaPy`` as its ``estimator``.
    verbose : int, forwarded to ``BorutaPy``.
    seed : int, forwarded to ``BorutaPy`` as ``random_state``.

    Returns
    -------
    The entries of ``gene_columns`` picked out by the fitted selector's
    ``support_`` attribute.
    """
    feat_selector = BorutaPy(estimator=estimator, verbose=verbose, random_state=seed)

    # DataFrame.as_matrix() is deprecated (removed in pandas 1.0); .values gives
    # the same ndarray and matches how the target is extracted on the next line.
    X = expression_df.values
    y = AB1_std_dev_yen.values.ravel()
    feat_selector.fit(X, y)

    return gene_columns[feat_selector.support_]

In [28]:
# Experiment-1 selector built around `rf`; kept at notebook level so that its
# ranking_ and transform() can be inspected in the cells below.
feat_selector = BorutaPy(estimator=rf, verbose=2, random_state=42)

In [None]:
%%time


**TODO**: clean this up

In [55]:
gene_columns[selected_columns]

Index(['B2m', 'C1qa', 'C1qb', 'C1qc', 'C4a', 'C4b', 'Ccl6', 'Cd52', 'Cd68',
       'Cd74', 'Clec7a', 'Cst3', 'Cst7', 'Ctsb', 'Ctsd', 'Ctss', 'Ctsz',
       'Fcer1g', 'Fcgr3', 'Gfap', 'Gm14173', 'Hexb', 'Itgax', 'Lag3', 'Ly86',
       'Lyz2', 'Mpeg1', 'Rplp1', 'Serpina3n', 'Sez6', 'Trem2', 'Tyrobp',
       'mt-Rnr1', 'mt-Rnr2'],
      dtype='object')

In [54]:
feat_selector.ranking_

array([29456,  8630, 29456, ..., 29456, 29456, 29456])

In [45]:
X_filtered = feat_selector.transform(X)

---

### **Boruta experiment 2**

In [None]:
%%time
# Experiment 2: same forest settings as experiment 1's `rf`, except
# max_features=0.01 instead of 'sqrt'. do_boruta runs with its default seed (42).
ex2_selected = do_boruta(estimator=RandomForestRegressor(n_jobs=-1, n_estimators=1000, max_features=0.01, max_depth=5))

In [67]:
ex2_selected

Index(['2810433D01Rik', 'Apoe', 'B2m', 'C1qa', 'C1qb', 'C1qc', 'C3ar1', 'C4a',
       'C4b', 'Ccl3', 'Ccl6', 'Cd14', 'Cd52', 'Cd68', 'Cd74', 'Cd9', 'Clec7a',
       'Clu', 'Csf1r', 'Cst3', 'Cst7', 'Ctsb', 'Ctsd', 'Ctsl', 'Ctss', 'Ctsz',
       'Fcer1g', 'Fcgr2b', 'Fcgr3', 'Fcrls', 'Fth1', 'Gfap', 'Gm26917',
       'Gm5874', 'Gpnmb', 'H2-Aa', 'H2-D1', 'Hexa', 'Hexb', 'Itgax', 'Lag3',
       'Laptm5', 'Lgals3bp', 'Lrrc17', 'Ly86', 'Lyz1', 'Lyz2', 'Mamdc2',
       'Mpeg1', 'Olfml3', 'Olfr912', 'Plek', 'Rpl12', 'Rplp1', 'Serpina3n',
       'Sez6', 'Spp1', 'Trem2', 'Tyrobp', 'Wfdc17', 'mt-Rnr1', 'mt-Rnr2'],
      dtype='object')

---

### **Boruta experiment 3**

In [None]:
%%time
# Experiment 3: same forest settings as experiment 2, except max_features=0.05
# instead of 0.01. do_boruta runs with its default seed (42).
ex3_selected = do_boruta(estimator=RandomForestRegressor(n_jobs=-1, n_estimators=1000, max_features=0.05, max_depth=5))

In [69]:
ex3_selected

Index(['Apoe', 'B2m', 'C1qa', 'C1qb', 'C1qc', 'C4a', 'C4b', 'Ccl6', 'Cd52',
       'Cd68', 'Cd74', 'Clec7a', 'Cst3', 'Cst7', 'Ctsb', 'Ctsd', 'Ctss',
       'Ctsz', 'Fcer1g', 'Fcgr3', 'Gfap', 'Hexa', 'Hexb', 'Itgax', 'Lag3',
       'Laptm5', 'Ly86', 'Lyz1', 'Lyz2', 'Mpeg1', 'Serpina3n', 'Sez6', 'Trem2',
       'Tyrobp', 'mt-Rnr1', 'mt-Rnr2'],
      dtype='object')

---

# **HITS**

References to Alzheimer's-related papers that mention one of the selected genes:

* **Fcer1g**
    * https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3677161/
* **Lyz2**
    * https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4198361/
* **Hexb**
    * https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2612729/
* **B2m**
    * https://www.nature.com/articles/nm.3898
* **Serpina3n**
    * https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1609963/
* **Trem2**
    * https://www.ncbi.nlm.nih.gov/pubmed/28426958
* **Sez6**
    * https://molecularneurodegeneration.biomedcentral.com/articles/10.1186/s13024-016-0134-z
* **Cst3**
    * https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3390601/
* **Tyrobp**
    * https://www.ncbi.nlm.nih.gov/gene/7305

---

# Gradient Boosting yay!

* TODO: use a stochastic GBM regressor