In [14]:
# Load necessary libraries
import pandas as pd
import numpy as np
from sklearn.feature_selection import VarianceThreshold, mutual_info_classif

## Filter based methods

### Variance threshold method

In [6]:
# Create a dummy dataset with costant, quasi-constnat feature
dummy_data = {
    'feature1': [0, 0, 0, 0, 0],
    'feature2': [0, 1, 0, 1, 0],
    'feature3': [1, 2, 3, 4, 5]
}
dummy_df = pd.DataFrame(dummy_data)

# Apply VarianceThreshold
# Here, we set the threshold to 0 to remove features with zero variance
selector = VarianceThreshold(threshold=0)
X_variance_threshold = selector.fit_transform(dummy_df)

# Display selected features
print("Selected features using VarianceThreshold:")
print(dummy_df.columns[selector.get_support()])

Selected features using VarianceThreshold:
Index(['feature2', 'feature3'], dtype='object')


### Correlation method

In [7]:
# Create a dummy dataset with correlated features
correlated_data = {
    'feature1': [1, 2, 3, 4, 5],
    'feature2': [2, 4, 6, 8, 10],  # Perfectly correlated with feature1
    'feature3': [5, 4, 3, 2, 1],   # Negatively correlated with feature1
    'feature4': [1, 1, 1, 1, 1]    # Constant feature
}
correlated_df = pd.DataFrame(correlated_data)

# Calculate the correlation matrix
correlation_matrix = correlated_df.corr()

# Display the correlation matrix
print("Correlation matrix:")
print(correlation_matrix)

# Set a threshold for removing correlated features
threshold = 0.9

# Identify features to drop by going over the correlation matrix 2d grid
columns_to_drop = set()
for i in range(len(correlation_matrix.columns)):
    for j in range(i):
        if abs(correlation_matrix.iloc[i, j]) > threshold:
            colname = correlation_matrix.columns[i]
            columns_to_drop.add(colname)

# Drop the correlated features
reduced_df = correlated_df.drop(columns=columns_to_drop)

# Display the remaining features
print("Remaining features after removing correlated features:")
print(reduced_df.columns)


Correlation matrix:
          feature1  feature2  feature3  feature4
feature1       1.0       1.0      -1.0       NaN
feature2       1.0       1.0      -1.0       NaN
feature3      -1.0      -1.0       1.0       NaN
feature4       NaN       NaN       NaN       NaN
Remaining features after removing correlated features:
Index(['feature1', 'feature4'], dtype='object')


In [15]:
# Create a dummy dataset with features and a target variable
dummy_data = {
    'feature1': [1, 2, 3, 4, 5],
    'feature2': [2, 4, 6, 8, 10],
    'feature3': [5, 4, 3, 2, 1],
    'feature4': [1, 1, 1, 1, 1],
    'target': [0, 1, 0, 1, 0]
}
dummy_df = pd.DataFrame(dummy_data)

# Separate features and target
X = dummy_df.drop(columns=['target'])
y = dummy_df['target']

# Calculate mutual information
mutual_info = mutual_info_classif(X, y)

# Create a DataFrame to display the mutual information scores
mutual_info_df = pd.DataFrame({'Feature': X.columns, 'Mutual Information': mutual_info})

# Display the mutual information scores
print("Mutual Information scores for each feature:")
print(mutual_info_df)


Mutual Information scores for each feature:
    Feature  Mutual Information
0  feature1            0.000000
1  feature2            0.000000
2  feature3            0.000000
3  feature4            1.383333
