# Project Notebook III: Feature Selection for Supervised Learning

David Chen

In [1]:
import pandas as pd 
import numpy as np
import os
import matplotlib.pyplot as plt
%matplotlib inline

import sagemaker
from sagemaker import get_execution_role
import boto3

## My custom helper functions
from helpers import *

## Step 1. Sagemaker Resources & Processed Data Saved in S3

In [2]:
## SageMaker session object; used below to look up the default S3 bucket.
sagemaker_session = sagemaker.Session()
sagemaker_session

<sagemaker.session.Session at 0x7f498c7c3f28>

In [3]:
## Execution role of this notebook instance (same function as sagemaker.get_execution_role,
## reached through the direct import in the first cell).
## NOTE(review): the echoed ARN contains the AWS account ID -- consider clearing this
## output before sharing the notebook.
role = get_execution_role()
role

'arn:aws:iam::644307290749:role/service-role/AmazonSageMaker-ExecutionRole-20210527T073969'

In [4]:
## Default S3 bucket of the session; passed to the helper calls that load the
## processed CSVs and, at the end, save the selected features.
bucket = sagemaker_session.default_bucket()
bucket

'sagemaker-us-west-1-644307290749'

In [5]:
## Load processed features (samples in rows, genes in columns):
genes_uri = get_s3_uri("assets/processed_data/genes.csv", bucket)
genes = pd.read_csv(genes_uri, index_col=0)  # first CSV column becomes the row index
genes.shape

(999, 20528)

In [6]:
## Load processed patient annotations (with defined class label).
## No index column is set here: the sample IDs stay in the regular "Sample ID" column.
clinical_uri = get_s3_uri("assets/processed_data/clinical.csv", bucket)
clinical = pd.read_csv(clinical_uri)
clinical.shape

(999, 15)

In [7]:
## Check that patient IDs in the 2 DataFrames match, element by element and in order.
## The model fit further down pairs rows of `genes` with `clinical["Label"]` purely by
## position, so a mismatch must stop the notebook instead of merely displaying False.
## np.array_equal also returns False (rather than raising) when the lengths differ.
ids_match = np.array_equal(genes.index.to_numpy(), clinical["Sample ID"].to_numpy())
assert ids_match, "Sample IDs in `genes` and `clinical` differ or are ordered differently"
ids_match

True

### Exclude genes with missing values

Only a small fraction of genes (approx. $1.5\%$) have missing values. Excluding them should not have a major impact on the machine learning outcome.

After dropping NaNs, there are over 20,000 features and 999 observations in the dataset.

In [8]:
## Gene columns holding at least one NaN, and the share of all columns they represent
has_nan = genes.isna().any(axis=0)  # one boolean per gene column
genesWithNan = list(genes.columns[has_nan])
len(genesWithNan) / len(genes.columns)

0.01553975058456742

In [9]:
## Keep only the gene columns that are fully observed (drop a column if any sample is NaN)
genes = genes.dropna(axis="columns", how="any")
genes.shape

(999, 20209)

## Step 2. L1-based Feature Selection

Reference: sklearn documentation on [feature selection](https://scikit-learn.org/stable/modules/feature_selection.html)

I choose L1 because it is a rather "aggressive" approach, which may work well here given our feature space is very high-dimensional.

There are alternative approaches such as Recursive Feature Elimination (RFE). However, such approaches may be too computationally expensive to be a good choice for a very large feature space.

In [10]:
from sklearn.svm import LinearSVC
from sklearn.feature_selection import SelectFromModel

In [11]:
## L1-penalised linear SVM used purely as a sparse feature selector.
## The L1 penalty drives most coefficients to exactly zero; a smaller C means
## stronger regularisation and therefore fewer surviving genes.
lsvc = LinearSVC(
    penalty="l1",
    dual=False,  # LinearSVC supports the L1 penalty only in the primal formulation
    C=0.01,
)

In [12]:
## Fit the selector; rows of `genes` and the label vector are matched by position.
## NOTE(review): the fit uses every sample in `genes`. If a held-out test set is later
## drawn from these same rows, the selection will already have seen it -- confirm intended.
labels = clinical["Label"].values
lsvc.fit(genes, labels)



LinearSVC(C=0.01, dual=False, penalty='l1')

In [13]:
## Wrap the already-fitted SVM (prefit=True, so no refit) as a feature selector
model = SelectFromModel(estimator=lsvc, prefit=True)

In [14]:
## Reduce `genes` to the columns retained by the selector.
## Row/column labels are re-attached in the export step further down.
genes_l1 = model.transform(genes)
genes_l1.shape

(999, 33)

In [15]:
## Set aside the labels of the selected gene features:
## boolean mask over the columns of `genes` -> names of the columns that were kept
support_mask = model.get_support()
selected_features = genes.columns[support_mask]
selected_features  # Entrez gene IDs

Index(['155465', '7916', '696', '57150', '158067', '729767', '1131', '22866',
       '1602', '2300', '161835', '9118', '10265', '3787', '55915', '116372',
       '284348', '84789', '9313', '193629', '93210', '5460', '25837', '255426',
       '140032', '81893', '26781', '6627', '254122', '222183', '7031', '11013',
       '83887'],
      dtype='object')

In [16]:
## Number of selected genes
n_selected = len(selected_features)
n_selected  # should match genes_l1.shape[1]

33

The downstream result of L1 feature selection can be useful. For example, we can compare model performance via A/B testing when each of the following is used as the input:

* All genes in the processed `genes` DataFrame
* 2-dimensional PCA features (which we saw overlap well with the positive class in Notebook II)
* L1-selected features

## Step 3. Export L1-features

In [17]:
## Wrap the selected feature values in a DataFrame, attaching the labels
## of the original `genes` frame in a single constructor call.
genes_l1 = pd.DataFrame(
    genes_l1,
    index=genes.index,          # observations
    columns=selected_features,  # selected features
)
genes_l1.head()

Unnamed: 0,155465,7916,696,57150,158067,729767,1131,22866,1602,2300,...,255426,140032,81893,26781,6627,254122,222183,7031,11013,83887
TCGA-3C-AAAU-01,0.744,1.2784,-0.5802,-0.6921,0.2543,-728.237,-0.8139,-1.727,1.2323,-0.5828,...,-0.8564,-110.8859,0.8984,-0.7966,-0.3612,-0.815,1.7296,-0.6914,0.6873,-0.4827
TCGA-3C-AALI-01,0.854,0.5455,-0.3288,-1.056,0.3772,-728.237,-1.1093,-0.5511,0.7238,-0.4548,...,-0.7491,-110.8859,0.3764,1.3914,0.2696,-0.3699,0.758,-0.3852,0.3087,0.0989
TCGA-3C-AALJ-01,0.5845,0.4624,-0.4364,-2.3766,0.491,-728.237,-1.1788,-1.727,0.16,0.346,...,-0.0659,-110.8859,-0.0003,-1.7618,0.0888,-0.6935,0.5507,-0.3153,1.0042,-0.5672
TCGA-3C-AALK-01,0.198,0.7335,-0.2444,0.1833,0.3688,-728.237,-0.0368,0.0906,-0.1117,-0.0171,...,0.301,-110.8859,-1.2495,-0.6335,-0.0343,-0.361,0.6755,1.1335,0.0023,-1.4099
TCGA-4H-AAAK-01,0.2268,0.7309,-0.7818,0.1617,0.9663,-728.237,-0.7991,0.8768,0.1532,0.2638,...,-0.811,-110.8859,-0.7703,-0.6064,-0.7436,0.8746,0.8174,0.8219,0.5726,-0.6057


In [18]:
## Export the L1-selected features next to the other processed data.
## directS3Save comes from helpers (not shown here); presumably it writes the
## DataFrame to the given key in `bucket` -- verify against helpers.py.
l1_features_key = "assets/processed_data/genes_l1_selected.csv"
directS3Save(genes_l1, bucket, l1_features_key)