# Feature Selection Version 3.0

### 1. General Set Up

In [1]:
# Automatically reload external modules, so edits to imported project code are picked up
# without restarting the kernel
# (see https://ipython.org/ipython-doc/3/config/extensions/autoreload.html for more information)
%load_ext autoreload
%autoreload 2

# Set up system path to include our "anoog" python package.
# The path is relative, so it is resolved against the kernel's current working directory.
import sys
sys.path.append('../src/src')

### 2. Import Packages / Modules

In [2]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

import os
import numpy as np
import pandas as pd


import anoog

import matplotlib.pyplot as plt
import plotly_express as px

### 3. Loading Data

In [3]:
# Options forwarded to anoog's loader: tsfresh extraction mode, selection mode NONE,
# and no train/test split (one single frame is returned and used below).
load_options = dict(
    extraction=anoog.io.extraction_mode.TSFRESH,
    selection=anoog.io.selection_mode.NONE,
    train_test_split=False,
)

# Load the 2021-11-09 recordings for the two listed users.
df = anoog.io.load_data(
    "../data/2021-11-09/2021-11-09",
    ['tippolit', 'vkorzev'],
    **load_options,
)

Feature Extraction: 100%|██████████████████████████████████████████████████████████████| 10/10 [01:19<00:00,  7.91s/it]
 'Voltage__query_similarity_count__query_None__threshold_0.0'
 'Current__query_similarity_count__query_None__threshold_0.0'] did not have any finite values. Filling with zeros.


### 4. Feature Selection [SelectKBest]

For more information: https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.SelectKBest.html#sklearn.feature_selection.SelectKBest.get_feature_names_out

In [4]:
from sklearn.feature_selection import SelectKBest, f_regression

# Univariate selection: score each candidate feature against the target and keep the k best.
# Score function: choose between 'f_regression' or 'mutual_info_regression' (see link above); choose k!
# NOTE(review): the 0:649 slice hard-codes the candidate-feature range and the last column
# is taken as the target - confirm both against what load_data returns.
# NOTE(review): later sections fit classifiers on the same target; if it is a class label,
# a classification score function may be the better fit - confirm.
kbest_features = df.iloc[:, 0:649]
kbest_target = df.iloc[:, -1]
selectKBest = SelectKBest(score_func=f_regression, k=5)
selectKBest.fit(kbest_features, kbest_target)

np.set_printoptions(suppress=True)  # print floats in fixed-point instead of scientific notation
# Uncomment one of the following lines to inspect the fitted selector:
#selectKBest.scores_        # score of every candidate feature
#selectKBest.get_support()  # boolean mask of the selected features

  corr /= X_norms


In [5]:
def check(list1, list2): #function for feature name mask
    """Pick the entries of ``list2`` whose position is flagged in ``list1``.

    Parameters:
        list1: sequence of flags, e.g. the boolean mask returned by a
            selector's ``get_support()``.
        list2: sequence of names, looked up by the positions of ``list1``.

    Returns:
        A new list holding ``list2[i]`` for every position ``i`` where
        ``list1[i] == True``, in their original order.
    """
    # The explicit comparison with True is kept on purpose: only flags equal to True
    # count as selected. list2 is indexed by position, so list1 may be shorter than list2.
    return [list2[position] for position, flag in enumerate(list1) if flag == True]

# Map the SelectKBest boolean mask onto the column names to get the selected feature names.
important_features = check(selectKBest.get_support(), df.columns.tolist())
#print(important_features) #remove '#' to see all selected features

In [6]:
# New DataFrame restricted to the columns picked by SelectKBest; displayed to show which features were selected.
df_new = df.loc[:, important_features]
df_new

Unnamed: 0,Audio__length,Audio__count_below_mean,Audio__cid_ce__normalize_True,Audio__number_peaks__n_1,Audio__number_peaks__n_5
0.0,779.0,367.0,50.302242,263.0,69.0
1.0,638.0,289.0,44.315737,200.0,53.0
2.0,420.0,197.0,34.953408,145.0,34.0
3.0,788.0,340.0,46.843048,256.0,66.0
4.0,429.0,220.0,36.731115,149.0,33.0
5.0,646.0,329.0,45.257508,221.0,56.0
6.0,520.0,259.0,41.193902,182.0,46.0
7.0,494.0,253.0,39.698692,163.0,38.0
8.0,398.0,175.0,35.167249,129.0,34.0
9.0,532.0,269.0,40.800686,187.0,43.0


### 5. Feature Selection [Variance Thresholding]

For more Information: https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.VarianceThreshold.html

In [7]:
from sklearn.feature_selection import VarianceThreshold

# Features whose variance does not exceed this value are dropped (select threshold here).
VARIANCE_THRESHOLD = 99999

# Fit on the candidate features only: the last column is used as the target `y` by the
# other selectors in this notebook, so it must not be scored as a feature itself.
# The mask from get_support() is therefore one entry shorter than df.columns, but it stays
# position-aligned with the column names because only the trailing column is left out.
# Plain fit() is sufficient - the transformed array was never used; the selected columns
# are picked from df via the support mask.
varianceSelect = VarianceThreshold(threshold=VARIANCE_THRESHOLD)
varianceSelect.fit(df.iloc[:, :-1])
np.set_printoptions(suppress = True) #Visualize Data better
#varianceSelect.variances_ #Remove "#" to see variances of each feature
#varianceSelect.get_support() #Remove "#" to see a boolean mask variance scores

In [8]:
# Map the variance-threshold mask onto the column names to get the selected feature names.
important_features2 = check(varianceSelect.get_support(), df.columns.tolist())

In [9]:
# New DataFrame restricted to the columns that passed the variance threshold.
df_new2 = df.loc[:, important_features2]
df_new2

Unnamed: 0,Audio__variation_coefficient,"Audio__fft_aggregated__aggtype_""variance""",Voltage__sum_values,Voltage__abs_energy,Voltage__sum_of_reoccurring_data_points,"Voltage__fft_coefficient__attr_""real""__coeff_0","Voltage__fft_coefficient__attr_""abs""__coeff_0","Voltage__fft_aggregated__aggtype_""variance""",Voltage__friedrich_coefficients__coeff_3__m_3__r_30,Current__abs_energy,"Current__fft_aggregated__aggtype_""variance"""
0.0,-393.930269,9141.065258,15553.062272,310639.795876,2075.330377,15553.062272,15553.062272,2331.452903,1019.68184,8379.635236,11372.943171
1.0,-223.468057,6710.341832,12663.958916,251461.840957,948.652417,12663.958916,12663.958916,1584.009049,1572.018156,7093.573803,7753.985006
2.0,-65.589229,2829.417154,8251.376135,162193.52725,889.187369,8251.376135,8251.376135,552.07621,1044.826421,8744.081087,2855.793583
3.0,-179.322551,10492.060479,15564.792882,307554.794399,1424.623252,15564.792882,15564.792882,1916.875744,37.690582,8226.798846,9792.052586
4.0,112.1626,2625.958988,8419.762921,165336.211098,1607.404372,8419.762921,8419.762921,609.623435,-1203.832486,6856.944879,3304.303732
5.0,196.041722,5680.99336,12601.773851,245957.694791,1299.654412,12601.773851,12601.773851,1530.879913,183.259007,10631.327879,7713.456849
6.0,1457.900408,3716.606976,10133.178563,197574.583501,1175.481051,10133.178563,10133.178563,1014.212857,-465.310155,9085.067543,5108.647841
7.0,-7340.92154,3906.352136,9597.496332,186559.257722,480.177967,9597.496332,9597.496332,877.124886,-223.784985,9378.793193,4548.73638
8.0,-122.716564,2515.287607,7721.278543,149880.567778,439.758224,7721.278543,7721.278543,535.517011,624.69329,8402.435553,2692.32486
9.0,-234.370743,4437.306538,10298.234433,199433.259417,1216.344194,10298.234433,10298.234433,579.415826,-109.269432,9294.082906,3710.387793


### 6. Feature Selection [Sequential Feature Selection] (Forward) {Work in Progress}

For more details: https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.SequentialFeatureSelector.html 

Sequential Feature Selection vs. Model-based Selection. May be interesting: https://scikit-learn.org/stable/auto_examples/feature_selection/plot_select_from_model_diabetes.html#sphx-glr-auto-examples-feature-selection-plot-select-from-model-diabetes-py

For more Information about Decision Tree: https://scikit-learn.org/stable/modules/tree.html

from sklearn.feature_selection import SequentialFeatureSelector
from sklearn import tree

# Estimator whose cross-validated score drives the search.
# NOTE: despite the variable name, this is a random forest and not a single decision tree.
Dtree = RandomForestClassifier(max_depth=2, random_state=0)

# Forward selection, as stated in the section title: start with no features and greedily
# add the one that improves the score most, until 2 features are selected.
# The earlier "backward" setting contradicted the title and had to eliminate features one
# by one starting from all 649 candidates, refitting the estimator for every remaining
# candidate at every step - far more model fits than the two forward steps needed here.
# cv (cross validation) is left at its default.
sfs = SequentialFeatureSelector(Dtree, n_features_to_select=2, direction="forward")
sfs.fit(df.iloc[:, 0:649], df.iloc[:, -1])



### 7. Feature Selection [Model-Based Selection] {Work in Progress}

For more Information: https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.SelectFromModel.html#sklearn.feature_selection.SelectFromModel             
For more Information about Decision Tree: https://scikit-learn.org/stable/modules/tree.html

In [35]:
from sklearn.feature_selection import SelectFromModel
from sklearn import tree

# Tree construction permutes the features at each split, so an unseeded tree can select
# different features on every run. The seed is fixed (same value as the random forest in
# section 6) to make the selection reproducible; all other parameters stay at their defaults.
Dtree = tree.DecisionTreeClassifier(random_state=0)

# Keep the features whose importance in the fitted tree passes SelectFromModel's default threshold.
SelectM = SelectFromModel(estimator=Dtree)
SelectM.fit(df.iloc[:, 0:649], df.iloc[:, -1])
#SelectM.get_support() #Remove "#" to see boolean Mask over features

SelectFromModel(estimator=DecisionTreeClassifier())

In [33]:
# Map the SelectFromModel mask onto the column names to get the selected feature names.
important_features3 = check(SelectM.get_support(), df.columns.tolist())

In [34]:
# New DataFrame restricted to the columns chosen by the model-based selector.
df_new3 = df.loc[:, important_features3]
df_new3

Unnamed: 0,Audio__count_above_mean
0.0,412.0
1.0,349.0
2.0,223.0
3.0,448.0
4.0,209.0
5.0,317.0
6.0,261.0
7.0,241.0
8.0,223.0
9.0,263.0
