# Basic Filter Based Feature Selection Methods

* **Remove Constant Features**
  1. Remove Constant Features Using Variance Threshold
  2. Remove Constant Features Using Standard Deviation
  3. Remove Constant Features Using Categorical Variables
* **Remove Quasi-Constant Features**
* **Remove Duplicated Features**
* **Using the Feature Engine**
  1. Remove Constant Features Using Feature Engine
  2. Remove Quasi-Constant Features Using Feature Engine
  3. Remove Duplicated Features Using Feature Engine

* **Stack Feature Selection in a Pipeline**
  1. Using our own function
  2. Using the Feature Engine

# Import Necessary Libraries

In [None]:
import numpy as np
import pandas as pd
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)
pd.set_option("display.width", 500)
pd.set_option("display.float_format", lambda x: "%.4f" % x)

from sklearn.model_selection import train_test_split
from sklearn.feature_selection import VarianceThreshold

# Import Dataset

In [None]:
def load_dataset(dir, drop_labels, target):
  """Read a CSV file and separate it into features and target.

  Args:
    dir: Path (or file-like object) handed to ``pd.read_csv``.
    drop_labels: Column labels removed to form the feature matrix.
    target: Label of the target column.

  Returns:
    Tuple ``(df, X, y)``: the full frame, the frame without
    ``drop_labels``, and the ``target`` column.
  """
  frame = pd.read_csv(dir)
  features = frame.drop(columns=drop_labels)
  labels = frame[target]
  return frame, features, labels

In [None]:
# NOTE(review): absolute Google Drive path -- presumably requires Drive to be
# mounted in Colab; adjust when running elsewhere. The name `dir` also shadows
# the Python builtin of the same name for the rest of the notebook.
dir = "/content/drive/MyDrive/Colab Notebooks/datasets/FS/dataset.csv"
# Every column except "target" is treated as a feature.
df, X, y = load_dataset(dir, drop_labels=["target"], target="target")

# Split Dataset

In [None]:
def create_train_test(data, drop_labels, target, test_size=0.33, random_state=1):
  """Split ``data`` into train/test feature frames and target series.

  Args:
    data: DataFrame holding both the feature columns and the target column.
    drop_labels: Column labels excluded from the feature matrix.
    target: Label of the target column.
    test_size: Forwarded to ``train_test_split``. Defaults to 0.33.
    random_state: Seed forwarded to ``train_test_split``. Defaults to 1,
      the value that used to be hard-coded, so existing calls are unchanged.

  Returns:
    Tuple ``(X_train, X_test, y_train, y_test)``.
  """
  features = data.drop(labels=drop_labels, axis=1)
  X_train, X_test, y_train, y_test = train_test_split(features,
                                                      data[target],
                                                      test_size=test_size,
                                                      random_state=random_state)
  return X_train, X_test, y_train, y_test

In [None]:
# Uses the create_train_test default test_size=0.33; note that the helper
# functions further down re-split the CSV with their own default of 0.3.
X_train, X_test, y_train, y_test = create_train_test(df, drop_labels=["target"], target="target")

# Remove Constant Features

Constant features are features in a dataset that have the same value for all observations. In other words, these features have zero variance. Constant features do not provide any useful information for learning models and often introduce unnecessary noise. Therefore, removing constant features can improve model performance and reduce computational overhead. Identifying and removing constant features is particularly important in large-scale datasets as these features can negatively impact model performance.

In [None]:
# threshold=0 flags only the columns whose training-set variance is zero.
selected_feature = VarianceThreshold(threshold=0)
# Fit on the training split only, so the test split does not influence the selection.
selected_feature.fit(X_train)
# get_support() is a boolean mask over the columns; summing it counts the retained ones.
print("Non-Constant Features: ", sum(selected_feature.get_support()))

Non-Constant Features:  258


In [None]:
# Invert the support mask to get the names of the columns VarianceThreshold would drop.
constant = X_train.columns[~selected_feature.get_support()]

print("Constant Features: ", len(constant))

Constant Features:  42


In [None]:
constant

Index(['var_23', 'var_33', 'var_34', 'var_36', 'var_44', 'var_61', 'var_73', 'var_80', 'var_81', 'var_87', 'var_89', 'var_92', 'var_97', 'var_99', 'var_104', 'var_112', 'var_113', 'var_120', 'var_122', 'var_127', 'var_133', 'var_135', 'var_158', 'var_167', 'var_171', 'var_178', 'var_180', 'var_182', 'var_183', 'var_195', 'var_196', 'var_201', 'var_212', 'var_215', 'var_223', 'var_225', 'var_227', 'var_248', 'var_280', 'var_283', 'var_294', 'var_297'], dtype='object')

In [None]:
X_train["var_99"].unique()

array([0])

In [None]:
# Sanity check: each flagged column should show exactly one distinct value.
for col in constant:
    print(col, X_train[col].unique())

var_23 [0]
var_33 [0]
var_34 [0]
var_36 [0]
var_44 [0]
var_61 [0]
var_73 [0]
var_80 [0]
var_81 [0]
var_87 [0]
var_89 [0.]
var_92 [0]
var_97 [0]
var_99 [0]
var_104 [0]
var_112 [0]
var_113 [0]
var_120 [0]
var_122 [0]
var_127 [0]
var_133 [0]
var_135 [0]
var_158 [0]
var_167 [0]
var_171 [0]
var_178 [0.]
var_180 [0.]
var_182 [0]
var_183 [0]
var_195 [0]
var_196 [0]
var_201 [0]
var_212 [0]
var_215 [0]
var_223 [0]
var_225 [0]
var_227 [0.]
var_248 [0]
var_280 [0.]
var_283 [0.]
var_294 [0]
var_297 [0]


In [None]:
# Names of the columns that survive the variance filter (mask is True for retained columns).
feature_names = X_train.columns[selected_feature.get_support()]

In [None]:
feature_names

Index(['var_1', 'var_2', 'var_3', 'var_4', 'var_5', 'var_6', 'var_7', 'var_8', 'var_9', 'var_10',
       ...
       'var_289', 'var_290', 'var_291', 'var_292', 'var_293', 'var_295', 'var_296', 'var_298', 'var_299', 'var_300'], dtype='object', length=258)

In [None]:
# Apply the selector fitted on the training split to both splits.
# NOTE(review): with scikit-learn's default output settings transform() returns
# plain arrays without column labels -- confirm for the installed version.
New_X_train = selected_feature.transform(X_train)
New_X_test = selected_feature.transform(X_test)

# Both splits should now have the same, reduced number of columns.
New_X_train.shape, New_X_test.shape

((33500, 258), (16500, 258))

In [None]:
# Keep only the non-constant columns. Selecting by name from the original
# frames (instead of wrapping the arrays returned by transform) preserves the
# row index and the per-column dtypes. The same selection is applied to the
# test split so both splits keep an identical set of columns.
X_train = X_train[feature_names]
X_test = X_test[feature_names]

In [None]:
X_train.head()

Unnamed: 0,var_1,var_2,var_3,var_4,var_5,var_6,var_7,var_8,var_9,var_10,var_11,var_12,var_13,var_14,var_15,var_16,var_17,var_18,var_19,var_20,var_21,var_22,var_24,var_25,var_26,var_27,var_28,var_29,var_30,var_31,var_32,var_35,var_37,var_38,var_39,var_40,var_41,var_42,var_43,var_45,var_46,var_47,var_48,var_49,var_50,var_51,var_52,var_53,var_54,var_55,var_56,var_57,var_58,var_59,var_60,var_62,var_63,var_64,var_65,var_66,var_67,var_68,var_69,var_70,var_71,var_72,var_74,var_75,var_76,var_77,var_78,var_79,var_82,var_83,var_84,var_85,var_86,var_88,var_90,var_91,var_93,var_94,var_95,var_96,var_98,var_100,var_101,var_102,var_103,var_105,var_106,var_107,var_108,var_109,var_110,var_111,var_114,var_115,var_116,var_117,var_118,var_119,var_121,var_123,var_124,var_125,var_126,var_128,var_129,var_130,var_131,var_132,var_134,var_136,var_137,var_138,var_139,var_140,var_141,var_142,var_143,var_144,var_145,var_146,var_147,var_148,var_149,var_150,var_151,var_152,var_153,var_154,var_155,var_156,var_157,var_159,var_160,var_161,var_162,var_163,var_164,var_165,var_166,var_168,var_169,var_170,var_172,var_173,var_174,var_175,var_176,var_177,var_179,var_181,var_184,var_185,var_186,var_187,var_188,var_189,var_190,var_191,var_192,var_193,var_194,var_197,var_198,var_199,var_200,var_202,var_203,var_204,var_205,var_206,var_207,var_208,var_209,var_210,var_211,var_213,var_214,var_216,var_217,var_218,var_219,var_220,var_221,var_222,var_224,var_226,var_228,var_229,var_230,var_231,var_232,var_233,var_234,var_235,var_236,var_237,var_238,var_239,var_240,var_241,var_242,var_243,var_244,var_245,var_246,var_247,var_249,var_250,var_251,var_252,var_253,var_254,var_255,var_256,var_257,var_258,var_259,var_260,var_261,var_262,var_263,var_264,var_265,var_266,var_267,var_268,var_269,var_270,var_271,var_272,var_273,var_274,var_275,var_276,var_277,var_278,var_279,var_281,var_282,var_284,var_285,var_286,var_287,var_288,var_289,var_290,var_291,var_292,var_293,var_295,var_296,var_298,var_299,var_300
21388,0,0,0.0,2.94,0.0,0,0,0,0,0,0.0,0.0,0.0,0,3,0.0,0.0,0.0,0,0,295.8336,0.0,0,0,0.0,0.0,0,1.82,0.0,3,0.0,289.6704,0,0,0.0,0.0,0.0,0.0,0,0,107926.1007,0.0,0.0,99,0.0,0,0,0,0,275.4432,0,0.0,0,0,0,3,0,0.0,0.0,0,0,0,0,1,0,0,2,298.9152,222.4131,0.0,0.0,0.0,0,0,0,0.0,0.0,0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0.0,0,0.0,0,3,0.0,0.0,37.24,0,0,0.0,0,0,0.0,0.0,0,0,0,0.0,0.0,0.0,0,0.0,0.0,3,0.0,0.0,0,0.0,0.0,0.0,0,0.0,0,0.0,0.0,0,0.0,0,0,0.0,0,1,0.0,0.0,0,0.0,0.0,0,0,0.0,0,0,0,0.0,0.0,0,0,0,0,0.0,0,1,0,0,1,0,0.0,0.0,0.0,0,0.0,0,0.0,0.0,0.0,0.0,0,0,0,0,0.0,0,0.0,0.0,0.0,0,0.0,0.0,3,0.0,0,0.0,0.0,0,0,0,0,0.0,0,289.6704,0,0,0.0,0,3,283.5072,0,0,0,0,0,0.0,0,0,0.0,0,0.0,0.0,0.0,0,0.0,0,0,0,0,0,0.0,0.0,0.0,0,0.0,0,0,0,0.0,1,0,0.0,0.0,0.0,0,0.0,0,0,0.0,0.0,0,0,0,0.0,0.0,0,0.0,0,0,0,0,0.0,0,0.0,0,0.0,0,0.0,0,0,0,0,0.0,0.0
42853,0,0,0.0,0.0,0.0,0,0,0,0,0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,0,0,0.0,0.0,0,0,0.0,0.0,0,1.84,0.0,3,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0,0,66066.7968,0.0,0.0,99,0.0,0,0,0,0,0.0,0,0.0,0,0,0,3,0,0.0,0.0,0,0,0,0,1,0,0,1,0.0,0.0,0.0,0.0,0.0,0,0,0,0.0,0.0,0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0.0,0,0.0,0,3,0.0,0.0,22.08,0,0,0.0,0,0,0.0,0.0,0,0,0,0.0,0.0,0.0,0,0.0,0.0,0,0.0,0.0,0,0.0,0.0,0.0,0,0.0,0,0.0,0.0,0,0.0,0,0,0.0,0,0,0.0,0.0,0,0.0,0.0,0,0,0.0,0,0,0,0.0,0.0,0,0,0,0,0.0,0,1,0,0,1,0,0.0,0.0,0.0,0,0.0,0,0.0,0.0,0.0,0.0,0,0,0,0,0.0,0,0.0,0.0,0.0,0,0.0,0.0,3,0.0,0,0.0,0.0,0,0,0,0,0.0,0,0.0,0,0,0.0,0,0,0.0,0,0,0,0,0,0.0,0,0,0.0,0,0.0,0.0,0.0,0,0.0,0,0,0,0,0,0.0,0.0,0.0,0,0.0,0,0,0,0.0,0,0,0.0,0.0,0.0,0,0.0,0,0,0.0,0.0,0,0,0,0.0,0.0,0,0.0,0,0,0,0,0.0,0,0.0,0,0.0,0,0.0,0,0,0,0,0.0,0.0
41860,0,0,0.0,2.88,0.0,0,0,0,0,0,0.0,0.0,0.0,0,3,0.0,0.0,0.0,0,0,2.73,0.0,0,0,0.0,0.0,0,1.84,0.0,3,0.0,2.73,0,0,0.0,0.0,0.0,0.0,0,0,127365.93,0.0,0.0,3,0.0,0,0,0,0,1.9251,0,0.0,0,0,0,3,0,0.0,0.0,0,0,0,0,1,0,0,1,2.97,0.2079,0.0,0.0,0.0,0,0,0,0.0,0.0,0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0.0,0,0.0,0,3,0.0,0.0,25.92,0,0,0.0,0,0,0.0,0.0,0,0,0,0.0,0.0,0.0,0,0.0,0.0,3,0.0,0.0,0,0.0,0.0,0.0,0,0.0,0,0.0,0.0,0,0.0,0,0,0.0,0,1,0.0,0.0,0,0.0,0.0,0,0,0.0,0,0,0,0.0,0.0,0,0,0,0,0.0,0,1,0,0,1,0,0.0,0.0,0.0,0,0.0,0,0.0,0.0,0.0,0.0,0,0,0,0,0.0,0,0.0,0.0,0.0,0,0.0,0.0,3,0.0,0,0.0,0.0,0,0,0,0,0.0,0,3.0,0,0,0.0,0,3,2.82,0,0,0,0,0,0.0,0,0,0.0,0,0.0,0.0,0.0,0,0.0,0,0,0,0,0,0.0,0.0,0.0,0,0.0,0,0,0,0.0,1,0,0.0,0.0,0.0,0,0.0,0,0,0.0,0.0,0,0,0,0.0,0.0,0,0.0,0,0,0,0,0.0,0,0.0,0,0.0,0,0.0,0,0,0,0,0.0,0.0
33322,0,0,0.0,5.94,0.0,0,0,0,0,0,0.0,0.0,0.0,0,3,0.0,0.0,0.0,0,0,232.8,0.0,0,0,0.0,0.0,0,1.96,0.0,3,0.0,119.4228,0,0,0.0,0.0,386.745,0.0,0,0,46945.7208,0.0,0.0,1,2.91,0,0,0,0,149.8662,0,276.0,0,0,0,3,0,0.0,0.0,0,0,0,0,1,0,0,2,120.6414,5.19,0.0,0.0,0.0,1,0,0,0.0,0.0,0,0.0,378.603,601.77,0,0,0.0,0.0,0.0,0.0,0.0,0,0.0,0,3,0.0,11.4,38.13,0,0,0.0,0,0,33.84,0.0,0,0,0,0.0,0.0,0.0,0,0.0,2.97,3,0.0,0.0,0,0.0,0.0,46.56,0,0.0,0,14.7,8.64,0,35280.0,0,0,0.0,0,1,0.0,0.0,0,0.0,2.82,0,0,0.0,0,0,0,0.0,390.816,0,0,0,0,0.0,0,1,0,0,1,0,0.0,0.0,386.745,0,0.0,0,11.52,0.0,43.68,0.0,0,0,0,0,0.0,0,288.0,0.0,0.0,1,50.49,0.0,3,0.0,0,0.0,0.0,0,0,0,0,2.91,0,114.5484,0,0,0.0,0,3,196.413,0,0,0,0,0,0.0,0,0,0.0,0,0.0,0.0,0.0,0,0.0,0,0,0,0,0,0.0,0.0,0.0,0,0.0,0,0,0,13.8,2,0,0.0,0.0,0.0,0,0.0,0,0,0.0,2.79,0,0,0,34809.7176,33.84,0,33.84,0,0,0,0,0.0,0,0.0,0,0.0,0,0.0,0,0,0,0,0.0,0.0
16289,0,0,0.0,2.94,0.0,0,0,0,0,0,0.0,0.0,0.0,0,3,0.0,0.0,0.0,0,0,2.79,0.0,0,0,0.0,0.0,0,1.86,0.0,3,0.0,2.79,0,0,0.0,0.0,0.0,0.0,0,0,107926.1007,0.0,0.0,3,0.0,0,0,0,0,2.5662,0,0.0,0,0,0,3,0,0.0,0.0,0,0,0,0,1,0,0,2,2.73,2.3571,0.0,0.0,0.0,0,0,0,0.0,0.0,0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0.0,0,0.0,0,3,0.0,0.0,21.85,0,0,0.0,0,0,0.0,0.0,0,0,0,0.0,0.0,0.0,0,0.0,0.0,3,0.0,0.0,0,0.0,0.0,0.0,0,0.0,0,0.0,0.0,0,0.0,0,0,0.0,0,1,0.0,0.0,0,0.0,0.0,0,0,0.0,0,0,0,0.0,0.0,0,0,0,0,0.0,0,1,0,0,1,0,0.0,0.0,0.0,0,0.0,0,0.0,0.0,0.0,0.0,0,0,0,0,0.0,0,0.0,0.0,0.0,0,0.0,0.0,3,0.0,0,0.0,0.0,0,0,0,0,0.0,0,2.76,0,0,0.0,0,3,2.82,0,0,0,0,0,0.0,0,0,0.0,0,0.0,0.0,0.0,0,0.0,0,0,0,0,0,0.0,0.0,0.0,0,0.0,0,0,0,0.0,1,0,0.0,0.0,0.0,0,0.0,0,0,0.0,0.0,0,0,0,0.0,0.0,0,0.0,0,0,0,0,0.0,0,0.0,0,0.0,0,0.0,0,0,0,0,0.0,0.0


## Remove Constant Features Using Variance Threshold

In [None]:
def remove_constant_features_variance_threshold(dir, drop_labels, target, test_size=0.3, threshold=0):
  """Drop low-variance columns selected by ``VarianceThreshold``.

  The CSV is re-read and re-split on every call, and the selector is fitted
  on the training split only.

  Args:
    dir: Path of the CSV file handed to ``pd.read_csv``.
    drop_labels: Column labels excluded from the feature matrix.
    target: Label of the target column.
    test_size: Forwarded to ``train_test_split``.
    threshold: Variance threshold forwarded to ``VarianceThreshold``
      (0 removes only constant columns).

  Returns:
    Tuple ``(X_train, X_test)`` restricted to the retained columns. The
    frames keep their original row index and column dtypes.
  """
  # Load Dataset
  df = pd.read_csv(dir)

  # Split Dataset (the target halves of the split are not needed here)
  X_train, X_test, _, _ = train_test_split(df.drop(labels=drop_labels, axis=1),
                                           df[target],
                                           test_size=test_size,
                                           random_state=1)
  # Using VarianceThreshold
  selected_feature = VarianceThreshold(threshold=threshold)
  selected_feature.fit(X_train)
  keep_mask = selected_feature.get_support()
  print("Non-Constant Features: ", int(keep_mask.sum()))
  print("Constant Features: ", int((~keep_mask).sum()))

  # Select by column name rather than rebuilding frames from the transformed
  # arrays: that would reset the row index (losing alignment with the target)
  # and collapse every column to one common dtype.
  feature_names = X_train.columns[keep_mask]
  X_train = X_train.loc[:, feature_names]
  X_test = X_test.loc[:, feature_names]
  return X_train, X_test

In [None]:
# Re-reads the CSV and re-splits it (test_size=0.3), overwriting the earlier X_train / X_test.
X_train, X_test = remove_constant_features_variance_threshold(dir, drop_labels=["target"], target="target")

Non-Constant Features:  258
Constant Features:  42


## Remove Constant Features Using Standard Deviation

In [None]:
def remove_constant_features_std(dir, drop_labels, target, test_size=0.3):
  """Drop columns whose training-set standard deviation is exactly zero.

  The CSV is re-read and re-split on every call. ``Series.std`` is used, so
  the feature columns are expected to be numeric.

  Args:
    dir: Path of the CSV file handed to ``pd.read_csv``.
    drop_labels: Column labels excluded from the feature matrix.
    target: Label of the target column.
    test_size: Forwarded to ``train_test_split``.

  Returns:
    Tuple ``(X_train, X_test)`` without the constant columns.
  """
  # Load Dataset
  df = pd.read_csv(dir)

  # Split Dataset (the target halves of the split are not needed here)
  X_train, X_test, _, _ = train_test_split(df.drop(labels=drop_labels, axis=1),
                                           df[target],
                                           test_size=test_size,
                                           random_state=1)
  # Using Standard Deviation: compute each column's std once and reuse it.
  column_std = {feature: X_train[feature].std() for feature in X_train.columns}
  constant_features = [feature for feature, std in column_std.items() if std == 0]
  print("Constant Features: ", len(constant_features))
  # A NaN std (e.g. an all-NaN column) compares unequal to 0 and is kept.
  non_constant_features = [feature for feature, std in column_std.items() if std != 0]
  print("Non-Constant Features: ", len(non_constant_features))

  # Return new frames instead of mutating the split results in place.
  X_train = X_train.drop(columns=constant_features)
  X_test = X_test.drop(columns=constant_features)
  return X_train, X_test

In [None]:
# Same task as above, using a zero standard deviation as the criterion.
X_train, X_test = remove_constant_features_std(dir, drop_labels=["target"], target="target")

Constant Features:  42
Non-Constant Features:  258


## Remove Constant Features Using Categorical Variables

In [None]:
def remove_constant_features_categorical(dir, drop_labels, target, test_size=0.3):
  """Drop columns that hold a single distinct value in the training split.

  Uses ``Series.nunique``, so it also works for non-numeric (categorical)
  columns. The CSV is re-read and re-split on every call.

  Args:
    dir: Path of the CSV file handed to ``pd.read_csv``.
    drop_labels: Column labels excluded from the feature matrix.
    target: Label of the target column.
    test_size: Forwarded to ``train_test_split``.

  Returns:
    Tuple ``(X_train, X_test)`` without the constant columns.
  """
  # Load Dataset
  df = pd.read_csv(dir)

  # Split Dataset (the target halves of the split are not needed here)
  X_train, X_test, _, _ = train_test_split(df.drop(labels=drop_labels, axis=1),
                                           df[target],
                                           test_size=test_size,
                                           random_state=1)
  # Using Categorical Variables: count distinct values once per column.
  distinct_counts = {feature: X_train[feature].nunique() for feature in X_train.columns}
  constant_features = [feature for feature, count in distinct_counts.items() if count == 1]
  print("Constant Features: ", len(constant_features))
  non_constant_features = [feature for feature, count in distinct_counts.items() if count > 1]
  print("Non-Constant Features: ", len(non_constant_features))

  # Return new frames instead of mutating the split results in place.
  X_train = X_train.drop(columns=constant_features)
  X_test = X_test.drop(columns=constant_features)
  return X_train, X_test

In [None]:
# Same task again, this time flagging columns with exactly one distinct value.
X_train, X_test = remove_constant_features_categorical(dir, drop_labels=["target"], target="target")

Constant Features:  42
Non-Constant Features:  258


# Remove Quasi-Constant Features

Quasi-constant features are features in a dataset that have mostly the same value, but they are not entirely constant. In other words, these features have a high percentage of observations with the same value, but there are also some rare cases where the value differs. Quasi-constant features typically introduce unnecessary noise during model training and can negatively impact model performance. Therefore, identifying and removing quasi-constant features is important for improving model performance and reducing redundant information.

In [None]:
# We only need to change the threshold value: with threshold=0.01 the selector
# also removes low-variance (quasi-constant) columns. The helper's printed
# labels still read "Constant" / "Non-Constant" in this case.
X_train, X_test = remove_constant_features_variance_threshold(dir, drop_labels=["target"], target="target", threshold=0.01)

Non-Constant Features:  213
Constant Features:  87


In [None]:
def remove_quasi_constant_features(dir, drop_labels, target, test_size=0.3, threshold=0.998):
  """Drop columns dominated by a single value in the training split.

  A column is quasi-constant when the share of its most frequent value
  (among non-missing entries) is greater than ``threshold``. The CSV is
  re-read and re-split on every call.

  Args:
    dir: Path of the CSV file handed to ``pd.read_csv``.
    drop_labels: Column labels excluded from the feature matrix.
    target: Label of the target column.
    test_size: Forwarded to ``train_test_split``.
    threshold: Minimum share (exclusive) of the predominant value for a
      column to be removed. Defaults to 0.998, the previously hard-coded value.

  Returns:
    Tuple ``(X_train, X_test)`` without the quasi-constant columns.
  """
  # Load Dataset
  df = pd.read_csv(dir)

  # Split Dataset (the target halves of the split are not needed here)
  X_train, X_test, _, _ = train_test_split(df.drop(labels=drop_labels, axis=1),
                                           df[target],
                                           test_size=test_size,
                                           random_state=1)
  # Using the share of the predominant value
  quasi_constant_features = []
  non_quasi_constant_features = []
  for feature in X_train.columns:
    shares = X_train[feature].value_counts(normalize=True)
    # value_counts ignores NaN, so an all-NaN column yields an empty result;
    # such a column is kept rather than raising an IndexError.
    predominant = shares.max() if not shares.empty else 0.0
    if predominant > threshold:
      quasi_constant_features.append(feature)
    else:
      non_quasi_constant_features.append(feature)
  print("Non Quasi-Constant Features: ", len(non_quasi_constant_features))
  print("Quasi-Constant Features: ", len(quasi_constant_features))

  # Return new frames instead of mutating the split results in place.
  X_train = X_train.drop(columns=quasi_constant_features)
  X_test = X_test.drop(columns=quasi_constant_features)
  return X_train, X_test

In [None]:
# Flags columns whose most frequent value covers more than 99.8% of the training rows.
X_train, X_test = remove_quasi_constant_features(dir, drop_labels=["target"], target="target")

Non Quasi-Constant Features:  163
Quasi-Constant Features:  137


# Remove Duplicated Features

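Duplicated features are features within a dataset that contain the same or nearly identical information. These features add unnecessary redundancy and complexity during model training, often resulting in increased computational costs. Duplicated features can degrade model performance and lead to unnecessary computational overhead. Therefore, it is important to remove them from the dataset or otherwise handle them. The methods in this section detect exact duplicates only, i.e. columns whose values are identical in every row of the training set; nearly identical features are not detected by these methods.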

In [None]:
def remove_duplicated_features(dir, drop_labels, target, test_size=0.3, report_duplicated_feature_pairs=False):
  """Drop columns that duplicate an earlier column in the training split.

  Columns are compared pairwise; the first column of each group of identical
  columns is kept and the later ones are removed from both splits. The CSV is
  re-read and re-split on every call.

  Args:
    dir: Path of the CSV file handed to ``pd.read_csv``.
    drop_labels: Column labels excluded from the feature matrix.
    target: Label of the target column.
    test_size: Forwarded to ``train_test_split``.
    report_duplicated_feature_pairs: When True, print each kept column
      together with the list of columns that duplicate it.

  Returns:
    Tuple ``(X_train, X_test)`` containing only the kept columns.
  """
  def _is_real_numeric(column):
    return (pd.api.types.is_numeric_dtype(column)
            and not pd.api.types.is_complex_dtype(column))

  def _same_values(left, right):
    """Return True when two columns hold the same value in every row."""
    if left.equals(right):
      return True
    # Series.equals is dtype-strict: an int column and a float column holding
    # the same numbers compare unequal. For numeric columns of different
    # dtypes, compare the values as floats (NaN in the same rows counts as
    # equal) so such duplicates are not missed.
    if left.dtype != right.dtype and _is_real_numeric(left) and _is_real_numeric(right):
      a = left.to_numpy(dtype="float64", na_value=np.nan)
      b = right.to_numpy(dtype="float64", na_value=np.nan)
      return bool(((a == b) | (np.isnan(a) & np.isnan(b))).all())
    return False

  # Load Dataset
  df = pd.read_csv(dir)

  # Split Dataset (the target halves of the split are not needed here)
  X_train, X_test, _, _ = train_test_split(df.drop(labels=drop_labels, axis=1),
                                           df[target],
                                           test_size=test_size,
                                           random_state=1)
  # Using own function
  duplicated_feature_pairs = {}   # kept column -> later columns identical to it
  duplicated_features = []        # columns to remove, in discovery order
  already_duplicated = set()      # same content as the list, for O(1) lookups
  all_var = list(X_train.columns)
  for i, feature_1 in enumerate(all_var):
    if feature_1 in already_duplicated:
      continue
    duplicated_feature_pairs[feature_1] = []
    for feature_2 in all_var[i + 1:]:
      # A column already assigned to an earlier group needs no further checks.
      if feature_2 in already_duplicated:
        continue
      if _same_values(X_train[feature_1], X_train[feature_2]):
        duplicated_feature_pairs[feature_1].append(feature_2)
        duplicated_features.append(feature_2)
        already_duplicated.add(feature_2)

  print("Duplicated Features: ", len(duplicated_features))
  non_duplicated_features = [var for var in all_var if var not in already_duplicated]
  print("Non-Duplicated Features: ", len(non_duplicated_features))
  kept_features = list(duplicated_feature_pairs)
  X_train = X_train[kept_features]
  X_test = X_test[kept_features]

  if report_duplicated_feature_pairs:
    print(50*"#")
    for feature, duplicates in duplicated_feature_pairs.items():
      if len(duplicates) > 0:
        print(feature, duplicates)
  return X_train, X_test

In [None]:
# report_duplicated_feature_pairs=True also prints each kept column with its duplicates.
X_train, X_test = remove_duplicated_features(dir, drop_labels=["target"], target="target", report_duplicated_feature_pairs=True)

Duplicated Features:  56
Non-Duplicated Features:  244
var_6 ['var_151']
var_7 ['var_116']
var_23 ['var_33', 'var_34', 'var_36', 'var_44', 'var_61', 'var_73', 'var_80', 'var_81', 'var_87', 'var_92', 'var_97', 'var_99', 'var_104', 'var_112', 'var_113', 'var_120', 'var_122', 'var_127', 'var_133', 'var_135', 'var_158', 'var_167', 'var_171', 'var_182', 'var_183', 'var_195', 'var_196', 'var_201', 'var_212', 'var_215', 'var_223', 'var_225', 'var_248', 'var_294', 'var_297']
var_37 ['var_148']
var_43 ['var_106']
var_60 ['var_216']
var_66 ['var_69']
var_67 ['var_287']
var_71 ['var_289']
var_84 ['var_199']
var_89 ['var_178', 'var_180', 'var_227', 'var_280', 'var_283']
var_143 ['var_296']
var_149 ['var_239']
var_177 ['var_250']
var_187 ['var_285']
var_221 ['var_263']
var_226 ['var_232']
var_229 ['var_269']


# Using the Feature Engine

In [None]:
# Import Necessary Libraries
!pip install feature_engine

from feature_engine.selection import DropConstantFeatures, DropDuplicateFeatures
from sklearn.pipeline import Pipeline



In [None]:
# Remove Constant Features Using Feature Engine
def remove_constant_features_using_feature_engine(dir, drop_labels, target, test_size=0.3):
  """Drop constant columns found by Feature-engine's ``DropConstantFeatures``.

  The selector is created with ``tol=1`` and fitted on the training split
  only; ``missing_values='raise'`` is passed through to the selector. The CSV
  is re-read and re-split on every call.

  Args:
    dir: Path of the CSV file handed to ``pd.read_csv``.
    drop_labels: Column labels excluded from the feature matrix.
    target: Label of the target column.
    test_size: Forwarded to ``train_test_split``.

  Returns:
    Tuple ``(X_train, X_test)`` without the constant columns.
  """
  # Load Dataset
  df = pd.read_csv(dir)

  # Split Dataset (the target halves of the split are not needed here)
  X_train, X_test, _, _ = train_test_split(df.drop(labels=drop_labels, axis=1),
                                           df[target],
                                           test_size=test_size,
                                           random_state=1)
  # Using the Feature Engine
  selected_feature = DropConstantFeatures(tol=1, variables=None, missing_values='raise')
  selected_feature.fit(X_train)

  constant_features = selected_feature.features_to_drop_
  print("Constant Features: ", len(constant_features))
  to_drop = set(constant_features)
  non_constant_features = [var for var in X_train.columns if var not in to_drop]
  print("Non-Constant Features: ", len(non_constant_features))

  # Return new frames instead of mutating the split results in place.
  X_train = X_train.drop(columns=list(constant_features))
  X_test = X_test.drop(columns=list(constant_features))
  return X_train, X_test

In [None]:
# Feature-engine counterpart of the constant-feature removal above.
X_train, X_test = remove_constant_features_using_feature_engine(dir, drop_labels=["target"], target="target")

Constant Features:  42
Non-Constant Features:  258


In [None]:
# Remove Quasi-Constant Features Using Feature Engine
def remove_quasi_constant_features_using_feature_engine(dir, drop_labels, target, test_size=0.3, tol=0.998):
  """Drop quasi-constant columns found by ``DropConstantFeatures``.

  The selector is fitted on the training split only;
  ``missing_values='raise'`` is passed through to the selector. The CSV is
  re-read and re-split on every call.

  Args:
    dir: Path of the CSV file handed to ``pd.read_csv``.
    drop_labels: Column labels excluded from the feature matrix.
    target: Label of the target column.
    test_size: Forwarded to ``train_test_split``.
    tol: Forwarded to ``DropConstantFeatures``. Defaults to 0.998, the
      previously hard-coded value.

  Returns:
    Tuple ``(X_train, X_test)`` without the quasi-constant columns.
  """
  # Load Dataset
  df = pd.read_csv(dir)

  # Split Dataset (the target halves of the split are not needed here)
  X_train, X_test, _, _ = train_test_split(df.drop(labels=drop_labels, axis=1),
                                           df[target],
                                           test_size=test_size,
                                           random_state=1)
  # Using the Feature Engine
  selected_feature = DropConstantFeatures(tol=tol, variables=None, missing_values='raise')
  selected_feature.fit(X_train)

  constant_features = selected_feature.features_to_drop_
  print("Quasi-Constant Features: ", len(constant_features))
  to_drop = set(constant_features)
  non_constant_features = [var for var in X_train.columns if var not in to_drop]
  print("Non-Quasi-Constant Features: ", len(non_constant_features))

  # Return new frames instead of mutating the split results in place.
  X_train = X_train.drop(columns=list(constant_features))
  X_test = X_test.drop(columns=list(constant_features))
  return X_train, X_test

In [None]:
# Feature-engine counterpart of the quasi-constant removal (tol=0.998).
X_train, X_test = remove_quasi_constant_features_using_feature_engine(dir, drop_labels=["target"], target="target")

Quasi-Constant Features:  137
Non-Quasi-Constant Features:  163


In [None]:
# Remove duplicated Features Using Feature Engine
def remove_duplicated_features_using_feature_engine(dir, drop_labels, target, test_size=0.3, report_duplicated_feature_pairs=False):
  """Drop duplicated columns found by Feature-engine's ``DropDuplicateFeatures``.

  The selector is fitted on the training split only;
  ``missing_values='raise'`` is passed through to the selector. The CSV is
  re-read and re-split on every call.

  Args:
    dir: Path of the CSV file handed to ``pd.read_csv``.
    drop_labels: Column labels excluded from the feature matrix.
    target: Label of the target column.
    test_size: Forwarded to ``train_test_split``.
    report_duplicated_feature_pairs: When True, print every entry of the
      selector's ``duplicated_feature_sets_``.

  Returns:
    Tuple ``(X_train, X_test)`` without the duplicated columns.
  """
  # Load Dataset
  df = pd.read_csv(dir)

  # Split Dataset (the target halves of the split are not needed here)
  X_train, X_test, _, _ = train_test_split(df.drop(labels=drop_labels, axis=1),
                                           df[target],
                                           test_size=test_size,
                                           random_state=1)
  # Using the Feature Engine
  selected_feature = DropDuplicateFeatures(variables=None, missing_values='raise')
  selected_feature.fit(X_train)

  duplicated_features = selected_feature.features_to_drop_
  print("Duplicated Features: ", len(duplicated_features))
  to_drop = set(duplicated_features)
  non_duplicated_features = [var for var in X_train.columns if var not in to_drop]
  print("Non-Duplicated Features: ", len(non_duplicated_features))

  # Keeping the non-duplicated columns is equivalent to dropping the flagged
  # ones, preserves column order and avoids mutating the split results in place.
  X_train = X_train[non_duplicated_features]
  X_test = X_test[non_duplicated_features]

  if report_duplicated_feature_pairs:
    print("Duplicated Features Pairs: ")
    for i in selected_feature.duplicated_feature_sets_:
      print(i)

  return X_train, X_test

In [None]:
# Feature-engine counterpart of the duplicate removal; also prints the groups of identical columns.
X_train, X_test = remove_duplicated_features_using_feature_engine(dir, drop_labels=["target"], target="target", report_duplicated_feature_pairs=True)

Duplicated Features:  57
Non-Duplicated Features:  243
Duplicated Features Pairs: 
{'var_151', 'var_6'}
{'var_7', 'var_116'}
{'var_89', 'var_23', 'var_225', 'var_73', 'var_120', 'var_248', 'var_36', 'var_104', 'var_92', 'var_178', 'var_212', 'var_223', 'var_196', 'var_34', 'var_182', 'var_80', 'var_215', 'var_158', 'var_201', 'var_167', 'var_113', 'var_87', 'var_227', 'var_81', 'var_112', 'var_183', 'var_44', 'var_195', 'var_61', 'var_127', 'var_133', 'var_171', 'var_280', 'var_99', 'var_97', 'var_135', 'var_294', 'var_283', 'var_180', 'var_33', 'var_297', 'var_122'}
{'var_37', 'var_148'}
{'var_106', 'var_43'}
{'var_60', 'var_216'}
{'var_66', 'var_69'}
{'var_287', 'var_67'}
{'var_289', 'var_71'}
{'var_199', 'var_84'}
{'var_143', 'var_296'}
{'var_149', 'var_239'}
{'var_250', 'var_177'}
{'var_187', 'var_285'}
{'var_221', 'var_263'}
{'var_232', 'var_226'}
{'var_229', 'var_269'}


# Stack Feature Selection in a Pipeline

In [None]:
def basic_feature_selection_pipeline(dir, drop_labels, target, test_size=0.30, threshold=0.998):
  """Run constant, quasi-constant and duplicate removal in sequence.

  Every step is decided on the training split and then applied to both
  splits. The CSV is re-read and re-split on every call.

  Args:
    dir: Path of the CSV file handed to ``pd.read_csv``.
    drop_labels: Column labels excluded from the feature matrix.
    target: Label of the target column.
    test_size: Forwarded to ``train_test_split``.
    threshold: Share of the predominant value above which a column counts
      as quasi-constant. Defaults to 0.998, the previously hard-coded value.

  Returns:
    Tuple ``(X_train, X_test)`` containing only the surviving columns.
  """
  def _is_real_numeric(column):
    return (pd.api.types.is_numeric_dtype(column)
            and not pd.api.types.is_complex_dtype(column))

  def _same_values(left, right):
    """Return True when two columns hold the same value in every row."""
    if left.equals(right):
      return True
    # Series.equals is dtype-strict; compare numeric columns of different
    # dtypes by value (NaN in the same rows counts as equal).
    if left.dtype != right.dtype and _is_real_numeric(left) and _is_real_numeric(right):
      a = left.to_numpy(dtype="float64", na_value=np.nan)
      b = right.to_numpy(dtype="float64", na_value=np.nan)
      return bool(((a == b) | (np.isnan(a) & np.isnan(b))).all())
    return False

  # Load Dataset
  df = pd.read_csv(dir)

  # Split Dataset (the target halves of the split are not needed here)
  X_train, X_test, _, _ = train_test_split(df.drop(labels=drop_labels, axis=1),
                                           df[target],
                                           test_size=test_size,
                                           random_state=1)
  # Remove Constant Features Using Standard Deviation
  constant_features = [feature for feature in X_train.columns if X_train[feature].std() == 0]
  print("Constant Features: ", len(constant_features))
  X_train = X_train.drop(columns=constant_features)
  X_test = X_test.drop(columns=constant_features)

  # Remove Quasi Constant Features Using the Share of the Predominant Value
  quasi_constant_features = []
  for feature in X_train.columns:
    shares = X_train[feature].value_counts(normalize=True)
    # value_counts ignores NaN, so an all-NaN column yields an empty result;
    # such a column is kept rather than raising an IndexError.
    predominant = shares.max() if not shares.empty else 0.0
    if predominant > threshold:
      quasi_constant_features.append(feature)
  print("Quasi-Constant Features: ", len(quasi_constant_features))
  X_train = X_train.drop(columns=quasi_constant_features)
  X_test = X_test.drop(columns=quasi_constant_features)

  # Remove Duplicated Features Using My Own Function
  duplicated_feature_pairs = {}   # kept column -> later columns identical to it
  duplicated_features = []        # columns to remove, in discovery order
  already_duplicated = set()      # same content as the list, for O(1) lookups
  all_var = list(X_train.columns)
  for i, feature_1 in enumerate(all_var):
    if feature_1 in already_duplicated:
      continue
    duplicated_feature_pairs[feature_1] = []
    for feature_2 in all_var[i + 1:]:
      if feature_2 in already_duplicated:
        continue
      if _same_values(X_train[feature_1], X_train[feature_2]):
        duplicated_feature_pairs[feature_1].append(feature_2)
        duplicated_features.append(feature_2)
        already_duplicated.add(feature_2)

  print("Duplicated Features: ", len(duplicated_features))
  non_duplicated_features = [var for var in all_var if var not in already_duplicated]
  print("All Features For Train: ", len(non_duplicated_features))
  kept_features = list(duplicated_feature_pairs)
  X_train = X_train[kept_features]
  X_test = X_test[kept_features]

  return X_train, X_test

In [None]:
# Runs the three hand-written filters back to back on a fresh split of the CSV.
X_train, X_test = basic_feature_selection_pipeline(dir, drop_labels=["target"], target="target")

Constant Features:  42
Quasi-Constant Features:  95
Duplicated Features:  6
All Features For Train:  157


In [None]:
def basic_feature_selection_pipeline_using_feature_engine(dir, drop_labels, target, test_size=0.30, tol=0.998):
  """Chain Feature-engine's constant and duplicate filters in a Pipeline.

  The pipeline is fitted on the training split and then applied to both
  splits. The CSV is re-read and re-split on every call.

  Args:
    dir: Path of the CSV file handed to ``pd.read_csv``.
    drop_labels: Column labels excluded from the feature matrix.
    target: Label of the target column.
    test_size: Forwarded to ``train_test_split``.
    tol: Forwarded to ``DropConstantFeatures``. Defaults to 0.998, the
      previously hard-coded value.

  Returns:
    Tuple ``(X_train, X_test)`` as produced by ``pipe.transform``.
  """
  # Load Dataset
  df = pd.read_csv(dir)

  # Split Dataset (the target halves of the split are not needed here)
  X_train, X_test, _, _ = train_test_split(df.drop(labels=drop_labels, axis=1),
                                           df[target],
                                           test_size=test_size,
                                           random_state=1)
  # Remove Constant and Duplicated Features Using the Feature Engine
  pipe = Pipeline([
    ('constant', DropConstantFeatures(tol=tol)),
    ('duplicated', DropDuplicateFeatures()),
  ])
  pipe.fit(X_train)
  X_train = pipe.transform(X_train)
  X_test = pipe.transform(X_test)
  # The 'constant' step drops constant and quasi-constant columns in one pass.
  print("Constant Features: ", len(pipe.named_steps['constant'].features_to_drop_))
  print("Duplicated Features: ", len(pipe.named_steps['duplicated'].features_to_drop_))
  print("All Features For Train: ", X_train.shape[1])
  return X_train, X_test

In [None]:
# Same selection as the hand-written pipeline, expressed as a scikit-learn Pipeline of Feature-engine steps.
X_train, X_test = basic_feature_selection_pipeline_using_feature_engine(dir, drop_labels=["target"], target="target")

Constant Features:  137
Dplicated Features:  6
All Features For Train:  157
