In [1]:
# Importing everything we need 
# The usual packages
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns
%matplotlib inline

# ignore warnings
import warnings
warnings.filterwarnings('ignore')

# Scalars
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.preprocessing import OneHotEncoder

In [2]:
# import dataset

df_preprocess = pd.read_csv('data/df_preprocess.csv', index_col=[0])

df_preprocess.T.head(50)

Unnamed: 0,2,3,4,5,6,7,8,9,10,11,...,101748,101749,101751,101752,101753,101754,101755,101756,101758,101765
encounter_id,64410,500364,16680,35754,55842,63768,12522,15738,28236,36900,...,443811536,443816024,443835140,443835512,443841992,443842016,443842022,443842070,443842340,443867222
patient_nbr,86047875,82442376,42519267,82637451,84259809,114882984,48330783,63555939,89869032,77391171,...,189481478,106392411,175326800,139605341,184875899,183087545,188574944,140199494,120975314,175429310
race,Non_Caucasian,Caucasian,Caucasian,Caucasian,Caucasian,Caucasian,Caucasian,Caucasian,Non_Caucasian,Non_Caucasian,...,Caucasian,Caucasian,Caucasian,Non_Caucasian,Non_Caucasian,Caucasian,Non_Caucasian,Non_Caucasian,Caucasian,Caucasian
gender,Female,Male,Male,Male,Male,Male,Female,Female,Female,Male,...,Female,Female,Male,Female,Male,Female,Female,Female,Female,Male
age,[20-30),[30-40),[40-50),[50-60),[60-70),[70-80),[80-90),[90-100),[40-50),[60-70),...,[40-50),[70-80),[70-80),[40-50),[40-50),[70-80),[40-50),[60-70),[80-90),[70-80)
admission_type_id,emergency,emergency,emergency,urgent,elective,emergency,urgent,elective,emergency,urgent,...,emergency,elective,elective,elective,emergency,emergency,emergency,emergency,emergency,emergency
discharge_disposition_id,home,home,home,home,home,home,home,transferred_SNF,home,home,...,transferred_other,home_with_home_health_IV,home_with_home_health_IV,home,home,home,home,home,home,home
admission_source_id,emergency_room,emergency_room,emergency_room,other,other,emergency_room,transfer,transfer,emergency_room,transfer,...,emergency_room,physician_referral,physician_referral,physician_referral,emergency_room,emergency_room,emergency_room,emergency_room,emergency_room,emergency_room
time_in_hospital,2,2,1,3,4,5,13,12,9,7,...,14,3,13,3,13,9,14,2,5,6
medical_specialty,unknown,unknown,unknown,unknown,unknown,unknown,unknown,InternalMedicine,unknown,unknown,...,unknown,other,unknown,unknown,unknown,unknown,unknown,unknown,unknown,unknown


### One-Hot Encoding

Algorithms can't read in categorical data directly, so we need to one-hot encode the categorical variables first. This means "binarizing" the categories. For example, `admission_type_id` value = 'emergency' becomes 0, 'urgent' becomes 1 and so on. These values have no ordered relationships, as in 'urgent' isn't "better" than 'emergency', which is why we preprocess using one-hot encoding rather than label encoding. 

In [4]:
# check

df_preprocess.columns

Index(['encounter_id', 'patient_nbr', 'race', 'gender', 'age',
       'admission_type_id', 'discharge_disposition_id', 'admission_source_id',
       'time_in_hospital', 'medical_specialty', 'num_lab_procedures',
       'num_procedures', 'num_medications', 'number_outpatient',
       'number_emergency', 'number_inpatient', 'diag_1', 'diag_2', 'diag_3',
       'number_diagnoses', 'max_glu_serum', 'A1Cresult', 'change',
       'diabetesMed', 'readmitted', 'num_med_changes', 'race_orig',
       'readmitted_orig'],
      dtype='object')

In [5]:
# instantiate the OneHotEncoder

ohe = OneHotEncoder()

# fit the OneHotEncoder on the features

categorical = df_preprocess[['race','gender','age','admission_type_id','discharge_disposition_id','admission_source_id', \
                'medical_specialty','diag_1','diag_2','diag_3','max_glu_serum','A1Cresult','change','diabetesMed']]

# transform the features

encoded = ohe.fit_transform(categorical) 
encoded

<69989x77 sparse matrix of type '<class 'numpy.float64'>'
	with 979846 stored elements in Compressed Sparse Row format>

In [6]:
# get each feature's categories 

ohe.categories_

[array(['Caucasian', 'Non_Caucasian'], dtype=object),
 array(['Female', 'Male'], dtype=object),
 array(['[20-30)', '[30-40)', '[40-50)', '[50-60)', '[60-70)', '[70-80)',
        '[80-90)', '[90-100)'], dtype=object),
 array(['elective', 'emergency', 'other', 'unknown', 'urgent'],
       dtype=object),
 array(['expired', 'home', 'home_with_home_health_IV', 'hospice',
        'left_ama', 'other', 'transferred_SNF', 'transferred_other',
        'transferred_shortterm_hospital', 'unknown'], dtype=object),
 array(['emergency_room', 'other', 'physician_referral', 'transfer',
        'unknown'], dtype=object),
 array(['Cardiology', 'Emergency/Trauma', 'Family/GeneralPractice',
        'InternalMedicine', 'other', 'unknown'], dtype=object),
 array(['circulatory', 'diabetes', 'digestive', 'genitourinary', 'injury',
        'musculoskeletal', 'neoplasms', 'other', 'respiratory'],
       dtype=object),
 array(['circulatory', 'diabetes', 'digestive', 'genitourinary', 'injury',
        'musculoskel

In [7]:
# put into a dataframe to get column names

encoded_df = pd.DataFrame.sparse.from_spmatrix(encoded, index=df_preprocess.index)
encoded_df.columns = ohe.get_feature_names(categorical.columns)

# show

encoded_df.head()

Unnamed: 0,race_Caucasian,race_Non_Caucasian,gender_Female,gender_Male,age_[20-30),age_[30-40),age_[40-50),age_[50-60),age_[60-70),age_[70-80),...,max_glu_serum_None,max_glu_serum_Norm,A1Cresult_>7,A1Cresult_>8,A1Cresult_None,A1Cresult_Norm,change_Ch,change_No,diabetesMed_No,diabetesMed_Yes
2,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0
3,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
4,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
5,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0
6,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0


In [8]:
# convert values of 0 and 1 from float to int

encoded_df = encoded_df.astype('int64')

In [10]:
# show

encoded_df.head()

Unnamed: 0,race_Caucasian,race_Non_Caucasian,gender_Female,gender_Male,age_[20-30),age_[30-40),age_[40-50),age_[50-60),age_[60-70),age_[70-80),...,max_glu_serum_None,max_glu_serum_Norm,A1Cresult_>7,A1Cresult_>8,A1Cresult_None,A1Cresult_Norm,change_Ch,change_No,diabetesMed_No,diabetesMed_Yes
2,0,1,1,0,1,0,0,0,0,0,...,1,0,0,0,1,0,0,1,0,1
3,1,0,0,1,0,1,0,0,0,0,...,1,0,0,0,1,0,1,0,0,1
4,1,0,0,1,0,0,1,0,0,0,...,1,0,0,0,1,0,1,0,0,1
5,1,0,0,1,0,0,0,1,0,0,...,1,0,0,0,1,0,0,1,0,1
6,1,0,0,1,0,0,0,0,1,0,...,1,0,0,0,1,0,1,0,0,1


In [14]:
# compare to the above as a spot check

df_preprocess.iloc[2,:]

encounter_id                         16680
patient_nbr                       42519267
race                             Caucasian
gender                                Male
age                                [40-50)
admission_type_id                emergency
discharge_disposition_id              home
admission_source_id         emergency_room
time_in_hospital                         1
medical_specialty                  unknown
num_lab_procedures                      51
num_procedures                           0
num_medications                          8
number_outpatient                        0
number_emergency                         0
number_inpatient                         0
diag_1                           neoplasms
diag_2                           neoplasms
diag_3                            diabetes
number_diagnoses                         5
max_glu_serum                         None
A1Cresult                             None
change                                  Ch
diabetesMed

In [32]:
# get names of columns as a result of one-hot encoding

ohe.get_feature_names(categorical.columns)

array(['race_Caucasian', 'race_Non_Caucasian', 'gender_Female',
       'gender_Male', 'age_[20-30)', 'age_[30-40)', 'age_[40-50)',
       'age_[50-60)', 'age_[60-70)', 'age_[70-80)', 'age_[80-90)',
       'age_[90-100)', 'admission_type_id_elective',
       'admission_type_id_emergency', 'admission_type_id_other',
       'admission_type_id_unknown', 'admission_type_id_urgent',
       'discharge_disposition_id_expired',
       'discharge_disposition_id_home',
       'discharge_disposition_id_home_with_home_health_IV',
       'discharge_disposition_id_hospice',
       'discharge_disposition_id_left_ama',
       'discharge_disposition_id_other',
       'discharge_disposition_id_transferred_SNF',
       'discharge_disposition_id_transferred_other',
       'discharge_disposition_id_transferred_shortterm_hospital',
       'discharge_disposition_id_unknown',
       'admission_source_id_emergency_room', 'admission_source_id_other',
       'admission_source_id_physician_referral',
       'admis

After spot checking that the one-hot-encoding is done correctly, let's now join this new `encoded_df` back to the data set, drop the original features we've encoded.

In [15]:
# join encoded_df with df_preprocess

df_model = pd.merge(df_preprocess, encoded_df, how='left', left_index=True, right_index=True)

In [16]:
df_model.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 69989 entries, 2 to 101765
Columns: 105 entries, encounter_id to diabetesMed_Yes
dtypes: Sparse[int64, 0](77), int64(12), object(16)
memory usage: 29.2+ MB


In [17]:
# check 

df_model.head()

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,medical_specialty,...,max_glu_serum_None,max_glu_serum_Norm,A1Cresult_>7,A1Cresult_>8,A1Cresult_None,A1Cresult_Norm,change_Ch,change_No,diabetesMed_No,diabetesMed_Yes
2,64410,86047875,Non_Caucasian,Female,[20-30),emergency,home,emergency_room,2,unknown,...,1,0,0,0,1,0,0,1,0,1
3,500364,82442376,Caucasian,Male,[30-40),emergency,home,emergency_room,2,unknown,...,1,0,0,0,1,0,1,0,0,1
4,16680,42519267,Caucasian,Male,[40-50),emergency,home,emergency_room,1,unknown,...,1,0,0,0,1,0,1,0,0,1
5,35754,82637451,Caucasian,Male,[50-60),urgent,home,other,3,unknown,...,1,0,0,0,1,0,0,1,0,1
6,55842,84259809,Caucasian,Male,[60-70),elective,home,other,4,unknown,...,1,0,0,0,1,0,1,0,0,1


In [18]:
# spot check the above

df_model.loc[6]['medical_specialty_unknown']

1

In [19]:
# save this df_model to another dataframe before dropping some categorical features

df_model_predrop = df_model.copy()

In [20]:
# note that we are dropping race, gender and readmitted here as well since we want our clustering to be 
# race and gender-blind

df_model_cluster = df_model.drop(columns=['encounter_id', 'patient_nbr', 'race', 'gender', 'age', \
       'admission_type_id', 'discharge_disposition_id', 'admission_source_id', \
       'medical_specialty', 'diag_1', 'diag_2', 'diag_3', 'max_glu_serum', 'A1Cresult', 'change', \
       'diabetesMed', 'race_orig', 'readmitted_orig', 'race_Caucasian', 'race_Non_Caucasian', \
       'gender_Female', 'gender_Male', 'readmitted'])

In [21]:
# output to csv

df_model_cluster.to_csv('data/df_premodel_cluster.csv')

#df_model_classify.to_csv('data/df_model_classify.csv')

Since one-hot encoding the categorical variables, we note that the dataset now has a total of 105 columns. We would like to reduce the number of features since having many features doesn't necessarily mean the modeling will improve, and some features might actually just be noise data that the model doesn't need. We would like to see if there are additional features we should drop as part of feature selection before we begin clustering. One feature selection method is to remove low-variance features using `VarianceThreshold`. The idea is that low-variance features don't provide meaninful relations in the data anyway.

Side note: Principal Component Analysis (PCA) is a common dimension reduction (and multicolinearity reduction) method, but we won't use it here. What PCA does is that it finds linear combinations of features with highest variance and converts that information into principal components (PC), which are like vectors. The reason that we won't use PCA in our work is because PC's aren't interpretable, meaning, we can't know what the features are in the linear combinations. This is a concern since we're interested in finding patterns in our clustering later on.

### Drop more columns based on variance

In [22]:
# check columns

df_model_cluster.columns

Index(['time_in_hospital', 'num_lab_procedures', 'num_procedures',
       'num_medications', 'number_outpatient', 'number_emergency',
       'number_inpatient', 'number_diagnoses', 'num_med_changes',
       'age_[20-30)', 'age_[30-40)', 'age_[40-50)', 'age_[50-60)',
       'age_[60-70)', 'age_[70-80)', 'age_[80-90)', 'age_[90-100)',
       'admission_type_id_elective', 'admission_type_id_emergency',
       'admission_type_id_other', 'admission_type_id_unknown',
       'admission_type_id_urgent', 'discharge_disposition_id_expired',
       'discharge_disposition_id_home',
       'discharge_disposition_id_home_with_home_health_IV',
       'discharge_disposition_id_hospice', 'discharge_disposition_id_left_ama',
       'discharge_disposition_id_other',
       'discharge_disposition_id_transferred_SNF',
       'discharge_disposition_id_transferred_other',
       'discharge_disposition_id_transferred_shortterm_hospital',
       'discharge_disposition_id_unknown',
       'admission_source_id_e

The proper step before applying variance feature selector is to scale the data first. We choose MinMaxScaler because it preserves the shape of the original distribution, unlike StandardScaler that scales the data to have a mean of 0 and a variance of 1. This is a consideration as we think about the clustering we are doing as a modeling step. 

In [23]:
# scale the data

# fit the scaler 

my_mm = MinMaxScaler().fit(df_model_cluster)

# transform the features

df_cluster_s = my_mm.transform(df_model_cluster)

In [24]:
# check

df_cluster_s

array([[0.07692308, 0.07633588, 0.83333333, ..., 1.        , 0.        ,
        1.        ],
       [0.07692308, 0.32824427, 0.16666667, ..., 0.        , 0.        ,
        1.        ],
       [0.        , 0.38167939, 0.        , ..., 0.        , 0.        ,
        1.        ],
       ...,
       [0.07692308, 0.34351145, 1.        , ..., 1.        , 0.        ,
        1.        ],
       [0.30769231, 0.57251908, 0.16666667, ..., 0.        , 0.        ,
        1.        ],
       [0.38461538, 0.09160305, 0.5       , ..., 1.        , 1.        ,
        0.        ]])

In [27]:
scaled_df = pd.DataFrame(df_cluster_s, index=df_model_cluster.index, columns=df_model_cluster.columns)

scaled_df.head()

Unnamed: 0,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,number_diagnoses,num_med_changes,age_[20-30),...,max_glu_serum_None,max_glu_serum_Norm,A1Cresult_>7,A1Cresult_>8,A1Cresult_None,A1Cresult_Norm,change_Ch,change_No,diabetesMed_No,diabetesMed_Yes
2,0.076923,0.076336,0.833333,0.15,0.047619,0.0,0.083333,0.230769,0.0,1.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0
3,0.076923,0.328244,0.166667,0.1875,0.0,0.0,0.0,0.307692,0.25,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
4,0.0,0.381679,0.0,0.0875,0.0,0.0,0.0,0.153846,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
5,0.153846,0.229008,1.0,0.1875,0.0,0.0,0.0,0.461538,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0
6,0.230769,0.526718,0.166667,0.25,0.0,0.0,0.0,0.307692,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0


In [28]:
# check that non-boolean values are in the first 9 columns and assign these to a variable

scaled_df_nonbool = scaled_df.iloc[:,:9]

scaled_df_nonbool

Unnamed: 0,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,number_diagnoses,num_med_changes
2,0.076923,0.076336,0.833333,0.1500,0.047619,0.00000,0.083333,0.230769,0.00
3,0.076923,0.328244,0.166667,0.1875,0.000000,0.00000,0.000000,0.307692,0.25
4,0.000000,0.381679,0.000000,0.0875,0.000000,0.00000,0.000000,0.153846,0.00
5,0.153846,0.229008,1.000000,0.1875,0.000000,0.00000,0.000000,0.461538,0.00
6,0.230769,0.526718,0.166667,0.2500,0.000000,0.00000,0.000000,0.307692,0.00
...,...,...,...,...,...,...,...,...,...
101754,0.615385,0.374046,0.333333,0.4000,0.000000,0.00000,0.000000,0.461538,0.25
101755,1.000000,0.549618,1.000000,0.3125,0.000000,0.02381,0.000000,0.461538,0.25
101756,0.076923,0.343511,1.000000,0.2000,0.023810,0.02381,0.083333,0.461538,0.00
101758,0.307692,0.572519,0.166667,0.2625,0.000000,0.02381,0.000000,0.461538,0.25


After we've scaled the data, we can go ahead with the feature selector, `VarianceThreshold`. It removes features whose variance don't meet a threshold we set. The variance formula<sup>1</sup> is as follows:

$$Var[X] = p(1-p)$$

In our case, we'll set p = 0.95, meaning, it'll remove features that are either 1 or 0 in more than 95% of the values. 

In [29]:
# only apply variance threshold on boolean features

from sklearn.feature_selection import VarianceThreshold

X = scaled_df.iloc[:,9:]

# set threshold = (.95 * (1 - .95))

sel = VarianceThreshold(threshold=(.95 * (1 - .95)))

# fit and transform boolean features

sel.fit_transform(X)

array([[0., 0., 0., ..., 1., 0., 1.],
       [0., 0., 0., ..., 0., 0., 1.],
       [1., 0., 0., ..., 0., 0., 1.],
       ...,
       [0., 0., 1., ..., 1., 0., 1.],
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 1., 1., 0.]])

In [30]:
# return true or false for each feature - whether or not it has met the threshold

sel.get_support()

array([False, False,  True,  True,  True,  True,  True, False,  True,
        True, False,  True,  True, False,  True,  True, False, False,
       False,  True, False, False, False,  True, False,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True, False,  True,  True,  True,  True,  True,  True,  True,
       False,  True, False, False,  True,  True,  True,  True,  True,
       False,  True, False, False,  True,  True,  True, False, False,
       False, False, False,  True,  True,  True,  True,  True,  True,
        True])

In [31]:
# create a new dataframe that keeps features where get_support() = true

X_variance = X[X.columns[sel.get_support(indices=True)]]

In [32]:
# check 

X_variance

Unnamed: 0,age_[40-50),age_[50-60),age_[60-70),age_[70-80),age_[80-90),admission_type_id_elective,admission_type_id_emergency,admission_type_id_unknown,admission_type_id_urgent,discharge_disposition_id_home,...,diag_3_neoplasms,diag_3_other,diag_3_respiratory,A1Cresult_>8,A1Cresult_None,A1Cresult_Norm,change_Ch,change_No,diabetesMed_No,diabetesMed_Yes
2,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0
3,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
4,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
5,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0
6,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
101754,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
101755,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
101756,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0
101758,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0


After applying `VarianceThreshold`, 24 non-boolean features are removed, and so 49 non-boolean features remain. Let's join these back with the boolean features we set aside earlier.

In [34]:
# create new dataframe with variables that remain after feature selection

df_model_cluster2 = pd.merge(scaled_df_nonbool, X_variance, how='left', left_index=True, right_index=True)

In [35]:
# check

df_model_cluster2[df_model_cluster2.duplicated()==True]

Unnamed: 0,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,number_diagnoses,num_med_changes,age_[40-50),...,diag_3_neoplasms,diag_3_other,diag_3_respiratory,A1Cresult_>8,A1Cresult_None,A1Cresult_Norm,change_Ch,change_No,diabetesMed_No,diabetesMed_Yes
11151,0.153846,0.160305,0.166667,0.175,0.0,0.0,0.0,0.307692,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0
19017,0.076923,0.160305,0.166667,0.15,0.0,0.0,0.0,0.153846,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0


We've just checked that there are no duplicate rows in this new dataframe, so we can proceed to the clustering modeling stage. Let's go ahead and save to csv.

In [36]:
# output to csv

df_model_cluster2.to_csv('data/df_model_cluster.csv')

References:

<sup>1</sup> https://scikit-learn.org/stable/modules/feature_selection.html#variance-threshold