In [1]:
from feature_selector import *

import pandas as pd
import numpy as np
import seaborn as sns

from scipy.spatial import distance
from scipy.cluster import hierarchy

import matplotlib.pyplot as plt

from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer

In [2]:
df_ionizable_train: pd.DataFrame = pd.read_csv("../../Data/ionizable_dataset_72_train_divprio.csv", delimiter=';')
df_ionizable_test: pd.DataFrame = pd.read_csv("../../Data/ionizable_dataset_72_test_divprio.csv", delimiter=';')


FS_io_train: FeatureSelector = FeatureSelector(df_ionizable_train)
FS_io_test: FeatureSelector = FeatureSelector(df_ionizable_train)

FS_io_train.scale_data(inplace=True)
FS_io_test.scale_data(inplace=True)


df_neutral_train: pd.DataFrame = pd.read_csv("../../Data/neutral_dataset_111_train_divprio.csv", delimiter=';')
df_neutral_test: pd.DataFrame = pd.read_csv("../../Data/neutral_dataset_111_test_divprio.csv", delimiter=';')

FS_ne_train: FeatureSelector = FeatureSelector(df_neutral_train)
FS_ne_test: FeatureSelector = FeatureSelector(df_neutral_test)

FS_ne_train.scale_data(inplace=True)
FS_ne_test.scale_data(inplace=True)

df_full_train: pd.DataFrame = pd.read_csv("../../Data/full_dataset_train_divprio.csv", delimiter=';')
df_full_test: pd.DataFrame = pd.read_csv("../../Data/full_dataset_test_divprio.csv", delimiter=';')


FS_full_train: FeatureSelector = FeatureSelector(df_full_train)
FS_full_test: FeatureSelector = FeatureSelector(df_full_test)

FS_full_train.scale_data(inplace=True)
print(FS_full_train.df)
FS_full_test.scale_data(inplace=True)


df_all =  [df_ionizable_train, df_ionizable_test, df_neutral_train, df_neutral_test, df_full_train, df_full_test]

         apol       ASA      ASA+      ASA-     ASA_H     ASA_P    a_acc   
0    0.039739  0.052654  0.051855  0.044585  0.048074  0.048610  0.05698  \
1    0.076320  0.075341  0.000000  0.150000  0.000000  0.231884  0.00000   
2    0.053171  0.076801  0.015419  0.134993  0.000000  0.236377  0.05698   
3    0.067034  0.081523  0.109528  0.035056  0.058578  0.112676  0.05698   
4    0.014515  0.030469  0.028113  0.027999  0.026754  0.030642  0.02849   
..        ...       ...       ...       ...       ...       ...      ...   
141  0.064770  0.078362  0.028776  0.122582  0.102203  0.000000  0.00000   
142  0.062131  0.075601  0.031786  0.113588  0.098601  0.000000  0.00000   
143  0.064770  0.077528  0.028777  0.120921  0.101115  0.000000  0.00000   
144  0.059492  0.072341  0.034204  0.104288  0.094350  0.000000  0.00000   
145  0.063372  0.074162  0.021167  0.119882  0.094987  0.004104  0.00000   

     a_acid     a_aro    a_base  ...  vsurf_Wp4  vsurf_Wp5  vsurf_Wp6   
0       0.0  0

# Low variance features
We try to detect every feature that has a variance below the threshold

## IONIZABLE

In [3]:
FS_io_train.df.loc[:, FS_io_train.df.columns != "Log_MP_RATIO"].var(axis=1).mean() * 0.1

0.0010069034753557255

In [4]:
print("===== TRAIN =====")
df_ionizable_train_lv, col_ionizable_train_lv = FS_io_train.remove_low_variance(variance_threshold=0.05)
print("===== TEST =====")
df_ionizable_test_lv, col_ionizable_test_lv = FS_io_test.remove_low_variance(variance_threshold=0.05)



col_ionizable_lv: set = set(col_ionizable_train_lv) & set(col_ionizable_test_lv)
print(col_ionizable_lv)
print("length of the feature with low variance that are common for train and test: ", len(col_ionizable_lv))



df_ionizable_train = df_ionizable_train.drop(list(col_ionizable_lv), axis=1)
df_ionizable_test = df_ionizable_test.drop(list(col_ionizable_lv), axis=1)

===== TRAIN =====
===== TEST =====
{'MACCS(-30)', 'a_nBr', 'density', 'petitjeanSC', 'MACCS(-26)', 'vsurf_Wp8', 'GCUT_PEOE_1', 'MACCS(-20)', 'BCUT_SLOGP_3', 'GCUT_SLOGP_3', 'vsurf_EDmin2', 'MACCS(-40)', 'MACCS(--4)', 'MACCS(-39)', 'vsurf_R', 'MACCS(-21)', 'MACCS(-16)', 'MACCS(-22)', 'MACCS(-15)', 'MACCS(-44)', 'MACCS(-49)', 'MACCS(166)', 'GCUT_PEOE_0', 'BCUT_PEOE_0', 'MACCS(-31)', 'vsurf_EDmin3', 'MACCS(-23)', 'MACCS(-35)', 'vsurf_EDmin1', 'MACCS(-10)', 'VDistEq', 'a_ICM', 'BCUT_SMR_0', 'E_stb', 'MACCS(--9)', 'vsurf_CW2', 'nmol', 'MACCS(--6)', 'GCUT_SMR_3', 'MACCS(--5)', 'MACCS(-12)', 'npr2', 'GCUT_SMR_0', 'MACCS(-17)', 'VDistMa', 'GCUT_SMR_1', 'a_nB', 'vsurf_G', 'GCUT_PEOE_3', 'a_nP', 'VAdjMa', 'BCUT_PEOE_3', 'MACCS(-27)', 'MACCS(--2)', 'MACCS(-18)', 'BCUT_SMR_3', 'a_nI', 'MACCS(-68)', 'MACCS(-48)', 'petitjean', 'MACCS(-29)', 'MACCS(--7)', 'MACCS(-69)', 'MACCS(--1)', 'MACCS(-64)', 'MACCS(-14)', 'BCUT_SLOGP_0', 'vsurf_CW1', 'MACCS(-46)', 'PEOE_VSA_FHYD', 'MACCS(--3)'}
length of the fea

## NEUTRAL

In [5]:
print("===== TRAIN =====")
df_neutral_train_lv, col_neutral_train_lv =FS_ne_train.remove_low_variance(variance_threshold=0.05)
print("===== TEST =====")
df_neutral_test_lv, col_neutral_test_lv = FS_ne_test.remove_low_variance(variance_threshold=0.05)

col_neutral_lv: set = set(col_neutral_train_lv) & set(col_neutral_test_lv)
print(col_neutral_lv)
print("length of the feature that are common for train and test: ", len(col_neutral_lv))

df_neutral_train = df_neutral_train.drop(list(col_neutral_lv), axis=1)
df_neutral_test = df_neutral_test.drop(list(col_neutral_lv), axis=1)

===== TRAIN =====
===== TEST =====
{'MACCS(-30)', 'h_pKa', 'GCUT_PEOE_1', 'MACCS(-20)', 'BCUT_SLOGP_3', 'GCUT_SLOGP_3', 'vsurf_EDmin2', 'MACCS(-40)', 'MACCS(--4)', 'MACCS(-39)', 'vsurf_R', 'MACCS(-15)', 'MACCS(166)', 'MACCS(-44)', 'MACCS(-49)', 'GCUT_PEOE_0', 'BCUT_PEOE_0', 'MACCS(-13)', 'MACCS(-24)', 'MACCS(-31)', 'MACCS(-35)', 'vsurf_EDmin1', 'vsurf_EDmin3', 'MACCS(-10)', 'VDistEq', 'a_ICM', 'BCUT_SMR_0', 'E_stb', 'MACCS(--9)', 'vsurf_CW2', 'nmol', 'MACCS(--6)', 'GCUT_SMR_3', 'MACCS(--5)', 'MACCS(-12)', 'npr2', 'GCUT_SMR_0', 'VDistMa', 'FCharge', 'h_pKb', 'GCUT_PEOE_3', 'a_nP', 'vsurf_G', 'MACCS(-34)', 'std_dim2', 'VAdjMa', 'BCUT_PEOE_3', 'MACCS(-41)', 'MACCS(-27)', 'MACCS(--2)', 'MACCS(-18)', 'BCUT_SMR_3', 'a_nI', 'MACCS(-68)', 'MACCS(-48)', 'VAdjEq', 'MACCS(-29)', 'MACCS(--7)', 'MACCS(--1)', 'MACCS(-14)', 'BCUT_SLOGP_0', 'vsurf_CW1', 'a_nB', 'PEOE_VSA_FHYD', 'MACCS(--3)'}
length of the feature that are common for train and test:  65


## FULL

In [6]:
print("===== TRAIN =====")
df_full_train_lv, col_full_train_lv = FS_full_train.remove_low_variance(variance_threshold=0.05)
print("===== TEST =====")
df_full_test_lv, col_full_test_lv = FS_full_test.remove_low_variance(variance_threshold=0.05)

col_full_lv: set = set(col_full_train_lv) & set(col_full_test_lv)
print(col_full_lv)
print("length of the feature that are common for train and test: ", len(col_full_lv))

df_full_train = df_full_train.drop(list(col_full_lv), axis=1)
df_full_test = df_full_test.drop(list(col_full_lv), axis=1)

===== TRAIN =====
===== TEST =====
{'MACCS(-30)', 'GCUT_PEOE_1', 'MACCS(-20)', 'GCUT_SLOGP_3', 'BCUT_SLOGP_3', 'vsurf_EDmin2', 'MACCS(-40)', 'MACCS(--4)', 'MACCS(-39)', 'vsurf_R', 'MACCS(-15)', 'MACCS(166)', 'MACCS(-44)', 'GCUT_PEOE_0', 'BCUT_PEOE_0', 'MACCS(-31)', 'vsurf_EDmin3', 'MACCS(-35)', 'vsurf_EDmin1', 'MACCS(-10)', 'VDistEq', 'a_ICM', 'BCUT_SMR_0', 'E_stb', 'MACCS(--9)', 'vsurf_CW2', 'nmol', 'MACCS(--6)', 'GCUT_SMR_3', 'MACCS(--5)', 'MACCS(-12)', 'npr2', 'GCUT_SMR_0', 'VDistMa', 'h_pKb', 'GCUT_PEOE_3', 'a_nP', 'vsurf_G', 'VAdjMa', 'BCUT_PEOE_3', 'MACCS(-27)', 'MACCS(--2)', 'MACCS(-18)', 'BCUT_SMR_3', 'a_nI', 'MACCS(-68)', 'MACCS(-48)', 'petitjean', 'MACCS(-29)', 'MACCS(--7)', 'MACCS(--1)', 'MACCS(-14)', 'BCUT_SLOGP_0', 'vsurf_CW1', 'a_nB', 'PEOE_VSA_FHYD', 'MACCS(--3)'}
length of the feature that are common for train and test:  57


# High correlation feature

## IONIZABLE

In [7]:
df_io_train_correlation: pd.DataFrame = FS_io_train.get_correlation(df_ionizable_train_lv)
df_io_train_correlation = FS_io_train.remove_highly_correlated(df_io_train_correlation, df_ionizable_train_lv, threshold=0.9)

df_io_test_correlation: pd.DataFrame = FS_io_test.get_correlation(df_ionizable_test_lv)
df_io_test_correlation = FS_io_test.remove_highly_correlated(df_io_test_correlation, df_ionizable_test_lv, threshold=0.9)




In [13]:
# df_full_train_lv.loc[:,"vsurf_Wp8"]

df_io_train_removed_feat =  df_ionizable_train_lv.columns.difference(df_io_train_correlation.columns)
df_io_test_removed_feat =  df_ionizable_test_lv.columns.difference(df_io_test_correlation.columns)
# io_feat_corr = np.intersect1d(df_io_train_removed_feat.columns, df_io_test_removed_feat.columns)
print("Number of features with strong correlations: ", )
print(df_io_train_removed_feat)

Number of features with strong correlations: 
Index(['E_nb', 'FASA-', 'FASA_P', 'FCharge', 'MACCS(-11)', 'MACCS(-33)',
       'MACCS(-41)', 'MACCS(-42)', 'MACCS(-47)', 'MACCS(-51)', 'MACCS(-52)',
       'MACCS(-55)', 'MACCS(-58)', 'MACCS(-60)', 'MACCS(-61)', 'MACCS(-63)',
       'MACCS(-67)', 'MACCS(-70)', 'MACCS(-73)', 'MACCS(-76)', 'MACCS(-81)',
       'MACCS(-88)', 'MACCS(-96)', 'MACCS(103)', 'MACCS(110)', 'MACCS(117)',
       'MACCS(119)', 'MACCS(124)', 'MACCS(134)', 'MACCS(142)', 'MACCS(148)',
       'MACCS(153)', 'MACCS(159)', 'MACCS(160)', 'MACCS(161)', 'MACCS(164)',
       'PEOE_VSA_FPOS', 'Q_PC+', 'Q_PC-', 'Q_VSA_FPOL', 'Q_VSA_FPOS', 'RPC+',
       'RPC-', 'SMR', 'VAdjEq', 'VSA', 'Weight', 'b_ar', 'b_count', 'b_heavy',
       'b_rotN', 'chi0', 'chi0_C', 'chi1', 'h_pstrain', 'mr', 'opr_brigid',
       'pmi2', 'pmi3', 'radius', 'rings', 'vdw_vol', 'vol', 'vsa_acid',
       'vsurf_D2', 'vsurf_D3', 'vsurf_D4', 'vsurf_D5', 'vsurf_EWmin2',
       'vsurf_EWmin3', 'vsurf_HL1', 'vsurf_

## Cluster analysis

In [None]:
import pandas as pd
from feature_selector import *

d = {"A": [1,2,3,4], "B": [2,5,6,7], "C": [3,6,8,9], "D":[4,7,9,10]}
df = pd.DataFrame(data=d, index=["A", "B", "C", "D"])
df

FS = FeatureSelector(df)
FS.remove_highly_correlated(df)