# Import Libraries

In [16]:
import pandas as pd

from feature_engine.selection import (
    DropConstantFeatures,
    DropDuplicateFeatures,
    SmartCorrelatedSelection,
)
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import make_scorer, f1_score
from collections import Counter



# Read Data

In [17]:
# Load the v2 train / validate / test splits (in that order) from the
# shared data directory.
train_df, validate_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../../data/train_data_v2.csv",
        "../../data/validate_data_v2.csv",
        "../../data/test_data_v2.csv",
    )
)


# Define features and target

In [18]:

# For each split, separate the predictors (every column except "Severity")
# from the "Severity" target column.
X_train, y_train = train_df.drop(columns=["Severity"]), train_df["Severity"]
X_validate, y_validate = validate_df.drop(columns=["Severity"]), validate_df["Severity"]
X_test, y_test = test_df.drop(columns=["Severity"]), test_df["Severity"]


# Remove constant features

Constant features are those which contain only 1 value for all the observations.

In [19]:
# with tol=1 we tell the transformer to remove constant features
# (quasi-constant features are not targeted at this setting)
constant = DropConstantFeatures(tol=1)

# The constant features are learned from the train set only; the fitted
# transformer is applied to the validate and test sets later on.
constant.fit(X_train)

In [20]:
# After fitting, the names of the features flagged as constant are stored in
# the `features_to_drop_` attribute; its length is the number of flagged columns.

len(constant.features_to_drop_)

2

In [21]:
# Display the names of the features flagged as constant on the train set

constant.features_to_drop_

['Roundabout', 'Turning_Loop']

In [22]:
# Sanity check: 'Roundabout' was flagged as constant, so it should hold a
# single distinct value across all observations of the train set.

X_train['Roundabout'].unique()


array([0])

In [23]:
# Sanity check: 'Turning_Loop' was flagged as constant, so it should hold a
# single distinct value across all observations of the train set.

X_train['Turning_Loop'].unique()


array([0])

In [24]:
# Apply the fitted constant-feature filter to every split. The column count
# of train, validate and test (in that order) is printed before and after.

for split in (X_train, X_validate, X_test):
    print('Number of variables before removing constant: ', split.shape[1])

X_train, X_validate, X_test = (
    constant.transform(split) for split in (X_train, X_validate, X_test)
)

for split in (X_train, X_validate, X_test):
    print('Number of variables after removing constant: ', split.shape[1])

Number of variables before removing constant:  74
Number of variables before removing constant:  74
Number of variables before removing constant:  74
Number of variables after removing constant:  72
Number of variables after removing constant:  72
Number of variables after removing constant:  72


# Remove duplicated features
That is, features that are identical

In [25]:
# Transformer used here to detect groups of identical features
duplicates = DropDuplicateFeatures()

# find duplicated features in the train set
duplicates.fit(X_train)

In [26]:
# After fitting, the groups of identical variables can be seen in the
# attribute `duplicated_feature_sets_`

duplicates.duplicated_feature_sets_

[]

# Drop Correlated Features Smartly
With this class, one feature is retained from each group of correlated features, chosen according to one of the following characteristics:

- the variance
- the cardinality
- the importance derived from a machine learning model

The transformer keeps the feature with the highest variance, cardinality or model performance, depending on the value we choose for the `selection_method` parameter.



In [27]:
# Weighted F1 is used as the scoring function so that the multi-class target
# is handled when features are ranked by model performance.
scorer = make_scorer(f1_score, average="weighted")

rf_model = RandomForestClassifier(n_estimators=10, random_state=1)

selection_methods = ["cardinality", "variance", "model_performance"]
methods = ["pearson", "spearman", "kendall"]

# Tallies how many times each feature is dropped across all
# (selection method, correlation method) combinations.
drop_counts = Counter()

for selection_method in selection_methods:
    # Only the model-based criterion is given an estimator.
    if selection_method == "model_performance":
        estimator = rf_model
    else:
        estimator = None

    for method in methods:
        print(f"\n- Testing Selection Method: {method}-{selection_method}")

        smart_corr = SmartCorrelatedSelection(
            method=method,
            threshold=0.7,
            missing_values="ignore",
            selection_method=selection_method,
            estimator=estimator,
            scoring=scorer,
        )
        smart_corr.fit(X_train, y_train)

        dropped = smart_corr.features_to_drop_
        drop_counts.update(dropped)
        kept = [name for name in X_train.columns if name not in dropped]

        print("Correlated Feature Groups:", smart_corr.correlated_feature_sets_)
        print("Dropped Features:", dropped)
        print("Selected Features:", kept)



- Testing Selection Method: pearson-cardinality
Correlated Feature Groups: [{'Start_Time', 'End_Time'}, {'Weather_Condition_Rain', 'Weather_Condition_Clear / Cloudy'}]
Dropped Features: ['Start_Time', 'Weather_Condition_Rain']
Selected Features: ['End_Time', 'Distance(mi)', 'Temperature(F)', 'Humidity(%)', 'Pressure(in)', 'Visibility(mi)', 'Wind_Speed(mph)', 'Precipitation(in)', 'Amenity', 'Bump', 'Crossing', 'Give_Way', 'Junction', 'No_Exit', 'Railway', 'Station', 'Stop', 'Traffic_Calming', 'Traffic_Signal', 'Duration', 'Region', 'State_CA', 'State_FL', 'State_Others', 'State_TX', 'City_Charlotte', 'City_Houston', 'City_Los Angeles', 'City_Miami', 'City_Orlando', 'City_Others', 'Weather_Condition_Clear / Cloudy', 'Weather_Condition_Fog / Low Visibility', 'Weather_Condition_Others', 'Weather_Condition_Snow', 'Weather_Condition_Thunderstorms / Severe Weather', 'Weather_Year', 'Weather_Month', 'Weather_Day', 'Weather_Hour', 'Wind_Direction_CALM', 'Wind_Direction_E', 'Wind_Direction_ENE'

In [28]:
# Rank the features by how many times they were dropped, most frequent first
most_common_dropped = drop_counts.most_common()

# Emit the header followed by one "<feature>: <count> times" line per feature
report_lines = [f"{feature}: {count} times" for feature, count in most_common_dropped]
print("\nMost Commonly Dropped Features:", *report_lines, sep="\n")


Most Commonly Dropped Features:
Start_Time: 9 times
Weather_Condition_Rain: 9 times
Weather_Hour: 4 times
End_Time: 2 times


Correlated Feature Groups: 
- [{'End_Time', 'Start_Time'}, {'Weather_Condition_Clear / Cloudy', 'Weather_Condition_Rain'}] or
- [{'Weather_Hour', 'End_Time', 'Start_Time'}, {'Weather_Condition_Clear / Cloudy', 'Weather_Condition_Rain'}]

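Across the 9 combinations tested (3 selection methods × 3 correlation methods), 'Start_Time' and 'Weather_Condition_Rain' were dropped every time, 'Weather_Hour' was dropped 4 times and 'End_Time' 2 times. We therefore drop 'Start_Time', 'Weather_Hour' & 'Weather_Condition_Rain'.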

In [14]:
# Remove the correlated variables chosen above from every split. The column
# count of train, validate and test (in that order) is printed before and after.

correlated_to_drop = ["Start_Time", "Weather_Hour", "Weather_Condition_Rain"]

for split in (X_train, X_validate, X_test):
    print('Number of variables before removing correlated: ', split.shape[1])

X_train, X_validate, X_test = (
    split.drop(columns=correlated_to_drop) for split in (X_train, X_validate, X_test)
)

for split in (X_train, X_validate, X_test):
    print('Number of variables after removing correlated: ', split.shape[1])

Number of variables before removing correlated:  72
Number of variables before removing correlated:  72
Number of variables before removing correlated:  72
Number of variables after removing correlated:  69
Number of variables after removing correlated:  69
Number of variables after removing correlated:  69


# Save to CSV

In [15]:
# Re-attach each target column to its reduced feature matrix ...
train_df, validate_df, test_df = (
    pd.concat([features, target], axis=1)
    for features, target in (
        (X_train, y_train),
        (X_validate, y_validate),
        (X_test, y_test),
    )
)

# ... and persist the three splits as the v3 datasets (row index omitted).
for frame, csv_path in (
    (train_df, "../../data/train_data_v3.csv"),
    (validate_df, "../../data/validate_data_v3.csv"),
    (test_df, "../../data/test_data_v3.csv"),
):
    frame.to_csv(csv_path, index=False)