# Feature Filtering

This notebook performs an initial selection of features for model training. Since high correlations between features can lead to overfitting, the dataset is first analyzed for correlations, and one feature from each highly correlated pair is removed.

A more specific reduction of the feature space is performed later during model training, as part of hyperparameter tuning, because the optimal feature space dimension can depend strongly on the model used.

In [None]:
import pandas as pd
import seaborn as sns
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
import numpy as np

In [None]:
# ---- LOAD DATA ----
# Read the labeled feature matrix and separate the target column from the
# predictor columns.

df = pd.read_csv("../data/processed/labeled_feature_matrix.csv")
y = df.loc[:, "performance_class"]
X = df.drop("performance_class", axis=1)

# Print how many samples fall into each target class.
value_counts = y.value_counts()
print(value_counts)


# Filter by Variance

This step aims to remove features with exactly zero variance, as they do not contain any useful information.

In [None]:
# ---- FILTER BY VARIANCE ----
# Fit a VarianceThreshold selector on X, report which features it removes,
# replace X by the retained features and plot the per-feature variances.

variance_threshold = 0
var_selector = VarianceThreshold(threshold=variance_threshold)
X_var_filtered = var_selector.fit_transform(X)
var_scores = var_selector.variances_

feature_names = X.columns
variance_df = pd.DataFrame({"Feature": feature_names, "Variance": var_scores})
variance_df = variance_df.sort_values(by="Variance", ascending=True)

# Features removed by the selector. The list is derived from the selector's
# own support mask (instead of re-implementing the comparison) so that the
# report always matches what is actually dropped, whatever the threshold is.
removed_features = set(X.columns[~var_selector.get_support()])
weak_features = [name for name in variance_df["Feature"] if name in removed_features]
print(f"Removed low variance features (variance threshold = {variance_threshold}):", weak_features)

# Show retained features
selected_var_features = X.columns[var_selector.get_support()]
print(f"Retained features after variance based filtering : {list(selected_var_features)}")

# Update X with the filtered features. fit_transform output is wrapped in a
# new DataFrame here, so X gets a fresh default index.
X = pd.DataFrame(X_var_filtered, columns=selected_var_features)

# ---- PLOT ----
# Horizontal bar chart of all variances (including the removed features),
# sorted ascending.

plt.figure(figsize=(6, 22))
bars = plt.barh(variance_df["Feature"], variance_df["Variance"], color='#296099')
plt.xlabel("Variance")
plt.ylabel("Feature")
for bar in bars:
    # Only bars shorter than 55000 get a numeric label, placed 200 data units
    # to the right of the bar end and vertically centred on the bar.
    if bar.get_width() < 55000:
        plt.text(bar.get_width() + 200, bar.get_y() + bar.get_height()/2, f'{bar.get_width():.10f}', va='center')

# NOTE(review): the file name says "correlation" although this figure shows
# variances - looks like a copy-paste slip; left unchanged because other
# documents may reference this path. Confirm before renaming.
plt.savefig("../data/figures/02_feature_correlation_chart.pdf", bbox_inches="tight")
plt.show()

## FEATURE SCALING
After filtering out features with low variance, the remaining features are scaled to have a mean of 0 and a standard deviation of 1. This is done to ensure that all features have the same scale, which is important for many machine learning algorithms.

In [None]:
# ---- FEATURE SCALING ----
# Standardise every remaining feature, then wrap the scaled array in a
# DataFrame again so the column names are kept.

scaler = StandardScaler().fit(X)
X_scaled = pd.DataFrame(data=scaler.transform(X), columns=X.columns)

# Filter Highly Correlated Features

Features that are highly correlated with each other are redundant and can lead to overfitting. To avoid this, we remove one feature from each highly correlated pair.

In [None]:
# Pairwise Pearson correlation between all scaled features
corr_matrix = X_scaled.corr(method="pearson")

# Create a custom colormap
def custom_colormap(threshold=0.9):
    """Return a diverging colormap with both extreme bands highlighted.

    The base palette is a dark-centred seaborn diverging palette. All colormap
    entries that lie entirely above ``+threshold`` or entirely below
    ``-threshold`` are replaced by a single highlight colour. The mapping
    between entries and values assumes the colormap is applied to the value
    range [-1, 1] (e.g. a correlation heatmap drawn with ``vmin=-1, vmax=1``).

    Args:
        threshold: Absolute value in [0, 1] beyond which entries are
            highlighted. Defaults to 0.9.

    Returns:
        A ``ListedColormap`` with the same number of entries as the base
        palette.

    Raises:
        ValueError: If ``threshold`` is outside [0, 1].
    """
    if not 0 <= threshold <= 1:
        raise ValueError("threshold must be between 0 and 1")

    # NOTE(review): 610 is outside the documented 0-359 hue range of
    # diverging_palette; presumably it wraps around to 250 - confirm.
    cmap = sns.diverging_palette(610, 610, as_cmap=True, center="dark", s=90)
    cmap_colors = cmap(np.arange(cmap.N))
    highlight_color = np.array([106/255, 160/255, 209/255, 1])

    # Number of entries that fall completely into one tail of [-1, 1].
    # (1 - threshold) / 2 is the fraction of the value range below -threshold;
    # truncating keeps only entries that are fully inside that tail. The same
    # count is used at the upper end so both bands are symmetric.
    band_size = int((1 - threshold) / 2 * cmap.N)
    cmap_colors[cmap.N - band_size:] = highlight_color  # values > +threshold
    cmap_colors[:band_size] = highlight_color           # values < -threshold
    return ListedColormap(cmap_colors)

# ---- FIND, PLOT AND REMOVE HIGHLY CORRELATED FEATURES ----

# Find feature pairs with correlation < -0.9 or > 0.9. Only the strict lower
# triangle (j < i) is scanned, which excludes the diagonal and reports every
# pair exactly once as (later column, earlier column, correlation).
corr_threshold = 0.9
high_corr_pairs = []
for i in range(len(corr_matrix.columns)):
    for j in range(i):
        corr_value = corr_matrix.iloc[i, j]
        if abs(corr_value) > corr_threshold:
            high_corr_pairs.append((corr_matrix.columns[i], corr_matrix.columns[j], corr_value))


plt.figure(figsize=(12, 12))
# vmin/vmax pin the colour scale to the full correlation range. Without them
# the scale follows the data minimum, so neither the centre of the diverging
# palette nor its highlighted extremes would line up with fixed correlation
# values.
heatmap = sns.heatmap(corr_matrix, annot=False, cmap=custom_colormap(), vmin=-1, vmax=1)
plt.subplots_adjust(bottom=0.3)
print(high_corr_pairs)

# Remove one feature from each pair, but only if neither member of the pair
# has already been marked for removal. Checking against features_to_remove
# (rather than X_scaled.columns, which is not modified until the figure has
# been saved) keeps a feature whose only highly correlated partner is
# already being dropped.
features_to_remove = set()
for first_feature, second_feature, _ in high_corr_pairs:
    if first_feature not in features_to_remove and second_feature not in features_to_remove:
        features_to_remove.add(first_feature)

# ---- PLOT ----

plt.savefig("../data/figures/02_feature_correlation_heatmap.pdf",
            bbox_inches='tight',
            dpi=300)
plt.show()

# Actually perform the dropping after saving the figure. Every name in
# features_to_remove comes from corr_matrix.columns, i.e. from X_scaled.
X_scaled = X_scaled.drop(columns=list(features_to_remove))

print(f"Removed features: {features_to_remove}")


## Calculate ANOVA Scores
ANOVA F-scores are calculated to determine the importance of each feature for the classification task. The scores are later compared with the SHAP feature importance scores to evaluate the feature selection process and to assess whether ANOVA F-scores are a good indicator of feature importance.

In [None]:
# ---- ANOVA F-SCORES ----
# k="all" keeps every feature; the selector is fitted to obtain one score per
# feature.
anova_selector = SelectKBest(f_classif, k="all")
X_anova = anova_selector.fit_transform(X_scaled, y)
selected_features_anova = X_scaled.columns[anova_selector.get_support()]

feature_names = X_scaled.columns
anova_scores = anova_selector.scores_

# Score table, strongest feature first
anova_df = pd.DataFrame(
    {"Feature": feature_names, "ANOVA Score": anova_scores}
).sort_values(by="ANOVA Score", ascending=False)

# Features whose score is below 0.5
weak_features = anova_df.loc[anova_df["ANOVA Score"] < 0.5, "Feature"].tolist()
print("Schwache Features (ANOVA Score < 0.5):", weak_features)

# ---- PLOT ----
# The bar chart uses the unsorted scores, i.e. the column order of X_scaled.

plt.figure(figsize=(10, 4))
plt.bar(x=feature_names, height=anova_scores, color="#296099")
plt.xticks(rotation=90)
plt.xlabel("Feature")
plt.ylabel("ANOVA Score")
plt.savefig("../data/figures/02_feature_anova_chart.pdf", bbox_inches="tight")
plt.show()


## SAVE FILTERED FEATURE MATRIX

In [None]:
# ---- SAVE FILTERED FEATURE MATRIX ----
# Append the target column to the filtered, scaled features and write the
# result to disk.

output_path = "../data/features/filtered_labeled_feature_matrix.csv"

# The index of y is reset so that concat aligns the labels with the feature
# rows by position (presumably X_scaled carries a default 0..n-1 index because
# it was rebuilt from the scaler output - confirm if the pipeline changes).
X_scaled_with_y = pd.concat([X_scaled, y.reset_index(drop=True)], axis=1)
X_scaled_with_y.to_csv(output_path, index=False)

# Report the path that was actually written.
print(f"Feature Selection finished! Filtered FeatureSpace was saved to {output_path}")
