# 06 Region classifier

**Objectives:**
* train a classifier to predict Regions (a multiclass classification problem)
* explain classifications in different regions with LIME

In [1]:
import pandas as pd
import numpy as np
import os
import pickle

import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

from boruta import BorutaPy

from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, GradientBoostingRegressor, GradientBoostingClassifier
from sklearn.model_selection import RandomizedSearchCV, cross_val_score, ShuffleSplit

import lime 
import sklearn.datasets
from lime.lime_tabular import LimeTabularExplainer

import rpy2.robjects as robjects
from rpy2.robjects import pandas2ri
# Activate rpy2's pandas2ri conversion support (pandas <-> R objects).
pandas2ri.activate()

# Use seaborn's "whitegrid" style for the plots in this notebook.
sns.set(style="whitegrid")

---

## Load the data

In [2]:
# Root directory of the datasets. Set the ST_DATA_DIR environment variable to
# run the notebook on another machine; the original location is the default.
wd = os.environ.get('ST_DATA_DIR', '/media/tmo/data/work/datasets/02_ST')

# Pickled logcpm expression table and the parquet metadata table.
logcpm_path = os.path.join(wd, 'ashley_21.03.2018', 'logcpm_merge_20180212.pickle')
meta_path = os.path.join(wd, 'meta', 'meta.parquet')

In [3]:
%%time
# Load the metadata table; it is joined to the expression data on
# ('spot_UID', 'slide_ID') further down.
meta_df = pd.read_parquet(meta_path)

CPU times: user 122 ms, sys: 274 ms, total: 396 ms
Wall time: 193 ms


In [4]:
%%time
logcpm_df = pickle.load(open(logcpm_path, "rb"))

logcpm_df.index.name = 'spot_UID'
logcpm_df.reset_index(inplace=True)
logcpm_df.rename(columns={'sampleID': 'slide_ID'}, inplace=True)

CPU times: user 16 s, sys: 6.89 s, total: 22.9 s
Wall time: 22.9 s


In [5]:
# Inner join of expression values and metadata: only rows whose
# (spot_UID, slide_ID) pair occurs in both tables are kept.
st_df = pd.merge(
    left=logcpm_df,
    right=meta_df,
    on=['spot_UID', 'slide_ID'],
    how='inner',
)

In [6]:
# Cast the slide and genotype (GT) identifiers to pandas categoricals.
for column in ('slide_ID', 'GT'):
    st_df[column] = st_df[column].astype('category', copy=False)

# NOTE(review): 'age' is derived from 'age_GT' rather than from an 'age' column, so
# both columns end up holding identical values -- possibly a copy-paste slip; confirm intent.
for column in ('age', 'age_GT'):
    st_df[column] = st_df['age_GT'].astype('category', copy=False)

In [7]:
# Number of gene (expression) columns expected in st_df.
n_genes = 46454

# Gene columns are selected by position: columns 1 .. n_genes of st_df.
# Assumes column 0 is 'spot_UID' and the genes follow contiguously -- TODO confirm column order.
gene_columns = st_df.columns[slice(1, n_genes + 1)]

In [8]:
# Expression matrix: the gene columns of st_df only (no identifiers or metadata).
expression_df = st_df[gene_columns]

In [9]:
# Sanity check on the expected dimensions: 10327 rows x 46454 gene columns.
assert expression_df.shape == (10327, 46454)

In [108]:
# Lookup table mapping each Region_predict value to its Level_01 / Level_02 group.
# keep_default_na=False keeps the literal string 'NA' as a label instead of parsing it as missing.
region_levels_df = pd.read_csv('region_levels.csv', sep=',', keep_default_na=False)

In [110]:
# Single-column frame holding the genotype ('GT') label of every row.
genotype_df = st_df[['GT']]

In [111]:
# Single-column frame with the 'Region_predict' label of every row (copied so st_df is not affected).
region_df = st_df[['Region_predict']].copy()

In [112]:
# Attach the region levels (Level_01, Level_02) to every row.
# * Built from st_df directly, so re-running this cell is idempotent (re-merging an
#   already merged region_df would produce suffixed duplicate columns).
# * how='left' keeps exactly one output row per st_df row, in st_df order, so the
#   labels stay aligned row by row with expression_df.
# * validate='many_to_one' raises if the lookup table lists a Region_predict value twice.
region_df = st_df[['Region_predict']].merge(
    region_levels_df, on='Region_predict', how='left', validate='many_to_one')

# Fail early when a region has no entry in the lookup table, instead of silently
# dropping rows (inner merge) or feeding missing labels to the classifiers.
missing_levels = region_df[['Level_01', 'Level_02']].isna().any(axis=1)
assert not missing_levels.any(), (
    'Region_predict values missing from region_levels.csv: {}'.format(
        sorted(region_df.loc[missing_levels, 'Region_predict'].astype(str).unique())))

In [113]:
# Categorical label Series (one per target granularity): the raw region, then the
# Level_01 and Level_02 groupings. Despite the '_df' suffix these are pandas Series.
region_cat_df, region_level1_df, region_level2_df = (
    region_df[label_column].astype('category', copy=False)
    for label_column in ('Region_predict', 'Level_01', 'Level_02')
)

---



In [120]:
# Number of rows per Level_01 group (class balance of the coarse target).
region_df['Level_01'].value_counts()

CX    4104
BS    3028
NA    1651
HP    1544
Name: Level_01, dtype: int64

In [119]:
# Number of rows per Level_02 group (class balance of the finer target).
region_df['Level_02'].value_counts()

TH       1931
FB       1155
HPd      1122
HY       1097
AUD       860
OLF       749
CTXsp     709
PTL       585
SSp       532
NA        496
HPs       422
COM       295
RSP       262
ENTI      112
Name: Level_02, dtype: int64

---
## Extract *all-relevant* feature set: Region

In [25]:
# Base estimator handed to Boruta: a depth-limited random forest using all available cores.
boruta_rf = RandomForestClassifier(n_jobs=-1, n_estimators=1000, max_features='sqrt', max_depth=5)

def train_feature_selector(X_df=expression_df,  # the transcriptome expression vectors
                           y_df=region_df,      # the Region column
                           estimator=boruta_rf, verbose=2, seed=42):  # boruta parameters
    """Fit a BorutaPy feature selector on the expression matrix.

    Parameters
    ----------
    X_df : DataFrame of features, one row per sample.
    y_df : Series or single-column DataFrame with exactly one label per row of X_df.
    estimator : estimator passed to BorutaPy.
    verbose : verbosity level passed to BorutaPy.
    seed : value passed to BorutaPy as `random_state`.

    Returns
    -------
    The fitted BorutaPy instance (its `support_` mask is used to pick the selected columns).

    Raises
    ------
    ValueError
        If the number of labels differs from the number of rows in X_df, e.g. when a
        multi-column DataFrame is passed as y_df.

    Notes
    -----
    The default arguments are bound when this cell is executed; pass X_df / y_df
    explicitly if those frames are re-created afterwards.
    """
    # NOTE(review): n_estimators='auto' presumably lets Boruta pick the number of trees,
    # superseding the n_estimators set on the estimator -- confirm against the BorutaPy docs.
    feature_selector = BorutaPy(estimator=estimator, verbose=verbose, random_state=seed, n_estimators='auto')

    # DataFrame.as_matrix() is deprecated; .values is the equivalent accessor.
    X = X_df.values
    # np.asarray handles Series, single-column DataFrames and categoricals alike.
    y = np.asarray(y_df).ravel()

    # A multi-column y_df would be flattened into several labels per row; refuse it
    # here with a clear message instead of failing deep inside the fit.
    if y.shape[0] != X.shape[0]:
        raise ValueError(
            'y_df must provide exactly one label per row of X_df: '
            'got {} labels for {} rows'.format(y.shape[0], X.shape[0]))

    feature_selector.fit(X, y)

    return feature_selector

In [None]:
# Use the single-column categorical Region labels. region_df also carries the
# Level_01 / Level_02 columns at this point, so flattening it would yield three
# labels per row instead of one.
Region_feature_selector = train_feature_selector(y_df=region_cat_df)

In [27]:
# Names of the gene columns selected by the fitted selector's boolean support_ mask.
Region_features = gene_columns[Region_feature_selector.support_].tolist()

In [30]:
# Persist the selected gene names, one per line, without index or header.
pd.DataFrame(Region_features).to_csv('06_region_features.txt', index=False, header=False)

## Extract *all-relevant* feature set: Level 01

In [None]:
# Run the feature selection against the Level_01 labels.
level1_feature_selector = train_feature_selector(y_df = region_level1_df)

In [124]:
# Names of the gene columns selected for the Level_01 target.
Level1_features = gene_columns[level1_feature_selector.support_].tolist()

In [126]:
# Persist the selected gene names, one per line, without index or header.
pd.DataFrame(Level1_features).to_csv('06_level1_features.txt', index=False, header=False)

## Extract *all-relevant* feature set: Level 02

In [None]:
# Run the feature selection against the Level_02 labels.
level2_feature_selector = train_feature_selector(y_df = region_level2_df)

Iteration: 	1 / 100
Confirmed: 	0
Tentative: 	46454
Rejected: 	0


In [None]:
# Names of the gene columns selected for the Level_02 target.
Level2_features = gene_columns[level2_feature_selector.support_].tolist()

In [None]:
# Persist the selected gene names, one per line, without index or header.
pd.DataFrame(Level2_features).to_csv('06_level2_features.txt', index=False, header=False)