In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_selection import RFECV
from sklearn.ensemble import RandomForestClassifier

In [2]:
# Never truncate the column list when a wide frame is displayed.
pd.options.display.max_columns = None

In [3]:
# Read the feature table that the rest of this notebook selects columns from.
df_features_clean = pd.read_csv(
    filepath_or_buffer='../data/notebooks/features_difference.csv',
)

Feature Selection

In [4]:
# Separate the label column from the predictors: y is the 'winner' column,
# X is every other column of the loaded frame.
y = df_features_clean['winner']
X = df_features_clean.drop(columns=['winner'])

In [5]:
# Show the predictor matrix dimensions followed by its column labels.
print(f"{X.shape} \n {X.columns}")

(4584, 82) 
 Index(['year', 'month', 'day_of_week', 'height_diff', 'weight_diff',
       'reach_diff', 'leg_reach_diff', 'sig_strikes_landed_per_minute_diff',
       'sig_strikes_absorbed_per_minute_diff', 'takedowns_avg_diff',
       'submission_avg_diff', 'knockdown_avg_diff', 'fight_time_avg_diff',
       'avg_rounds_diff', 'avg_time_diff', 'avg_knockdowns_diff',
       'avg_sig_attempts_diff', 'avg_sig_strikes_diff',
       'avg_total_strikes_attempts_diff', 'avg_total_strikes_diff',
       'avg_sub_attempts_diff', 'avg_takedowns_diff',
       'avg_takedown_attempts_diff', 'avg_head_strikes_diff',
       'avg_head_attempts_diff', 'avg_body_strikes_diff',
       'avg_body_attempts_diff', 'avg_leg_strikes_diff',
       'avg_leg_attempts_diff', 'avg_distance_diff',
       'avg_distance_attempts_diff', 'avg_clinch_strikes_diff',
       'avg_clinch_attempts_diff', 'avg_ground_strikes_diff',
       'avg_ground_attempts_diff', 'wins_before_diff', 'losses_before_diff',
       'total_fights

Variance Threshold

In [6]:
# Drop near-constant predictors: keep only the columns whose variance exceeds 0.05.
# NOTE(review): X is not rescaled before this step in the visible cells, so the
# cutoff is applied to raw variances and depends on each feature's units —
# confirm that this is intended.
vt = VarianceThreshold(threshold=0.05)

# Only the fitted support mask is used below; fit_transform() also built a
# transformed array that was immediately discarded, so fit() is sufficient.
vt.fit(X)
vt_selected = X.columns[vt.get_support()]
vt_dropped = set(X.columns) - set(vt_selected)

# sorted() gives the dropped names a stable print order; a raw set of strings
# can print in a different order on every kernel restart.
print('Columns selected: ', vt_selected, '\n\n', len(vt_selected),
    '\n\n', 'Columns dropped: ', sorted(vt_dropped), '\n\n', len(vt_dropped))

# Rebind X to the surviving columns; later cells continue from this subset.
X = X[vt_selected]

Columns selected:  Index(['year', 'month', 'day_of_week', 'height_diff', 'weight_diff',
       'reach_diff', 'leg_reach_diff', 'sig_strikes_landed_per_minute_diff',
       'sig_strikes_absorbed_per_minute_diff', 'takedowns_avg_diff',
       'submission_avg_diff', 'knockdown_avg_diff', 'fight_time_avg_diff',
       'avg_rounds_diff', 'avg_time_diff', 'avg_knockdowns_diff',
       'avg_sig_attempts_diff', 'avg_sig_strikes_diff',
       'avg_total_strikes_attempts_diff', 'avg_total_strikes_diff',
       'avg_sub_attempts_diff', 'avg_takedowns_diff',
       'avg_takedown_attempts_diff', 'avg_head_strikes_diff',
       'avg_head_attempts_diff', 'avg_body_strikes_diff',
       'avg_body_attempts_diff', 'avg_leg_strikes_diff',
       'avg_leg_attempts_diff', 'avg_distance_diff',
       'avg_distance_attempts_diff', 'avg_clinch_strikes_diff',
       'avg_clinch_attempts_diff', 'avg_ground_strikes_diff',
       'avg_ground_attempts_diff', 'wins_before_diff', 'losses_before_diff',
       'total_

Recursive Feature Elimination (RFECV)

In [7]:
# Recursive feature elimination with cross-validation: repeatedly remove one
# feature at a time (step=1) and keep the subset with the best 5-fold
# stratified cross-validated accuracy.
model = RandomForestClassifier(random_state=42)
cv = StratifiedKFold(n_splits=5)

# n_jobs=-1 evaluates the CV folds in parallel. The estimator is seeded, so the
# selected features are unchanged; only wall-clock time drops for what is
# otherwise a long serial run of forest fits.
rfecv = RFECV(estimator=model, step=1, cv=cv, scoring='accuracy', n_jobs=-1)
rfecv.fit(X, y)

rfe_selected = X.columns[rfecv.support_]
rfe_dropped = set(X.columns) - set(rfe_selected)

# sorted() gives the dropped names a stable print order across kernel restarts.
print('Columns selected: ', rfe_selected, '\n\n', len(rfe_selected),
    '\n\n', 'Columns dropped: ', sorted(rfe_dropped), '\n\n', len(rfe_dropped))

# Rebind X to the surviving columns; later cells continue from this subset.
X = X[rfe_selected]

Columns selected:  Index(['year', 'month', 'height_diff', 'weight_diff', 'reach_diff',
       'leg_reach_diff', 'sig_strikes_landed_per_minute_diff',
       'sig_strikes_absorbed_per_minute_diff', 'takedowns_avg_diff',
       'submission_avg_diff', 'knockdown_avg_diff', 'fight_time_avg_diff',
       'avg_rounds_diff', 'avg_time_diff', 'avg_knockdowns_diff',
       'avg_sig_attempts_diff', 'avg_sig_strikes_diff',
       'avg_total_strikes_attempts_diff', 'avg_total_strikes_diff',
       'avg_sub_attempts_diff', 'avg_takedowns_diff',
       'avg_takedown_attempts_diff', 'avg_head_strikes_diff',
       'avg_head_attempts_diff', 'avg_body_strikes_diff',
       'avg_body_attempts_diff', 'avg_leg_strikes_diff',
       'avg_leg_attempts_diff', 'avg_distance_diff',
       'avg_distance_attempts_diff', 'avg_clinch_strikes_diff',
       'avg_clinch_attempts_diff', 'avg_ground_strikes_diff',
       'avg_ground_attempts_diff', 'wins_before_diff', 'losses_before_diff',
       'total_fights_before_d

Feature Importance Pruning

In [None]:
# Keep only the highest-ranked predictors according to the random forest's
# feature_importances_, fitted on the current X and y.
N_TOP_FEATURES = 63  # how many of the most important features to retain

# `model` is the RandomForestClassifier created in the RFECV cell; it is fitted
# here so its feature_importances_ can be read.
model.fit(X, y)
importances = pd.Series(model.feature_importances_, index=X.columns)

# Index of feature names ordered from most to least important.
top_features = importances.sort_values(ascending=False).head(N_TOP_FEATURES).index

importance_dropped = set(X.columns) - set(top_features)

# sorted() gives the dropped names a stable print order across kernel restarts.
print('Columns selected: ', top_features, '\n\n', len(top_features),
    '\n\n', 'Columns dropped: ', sorted(importance_dropped), '\n\n', len(importance_dropped))

# Rebind X to the retained columns, in descending order of importance.
X = X[top_features]

Columns selected:  Index(['year', 'fight_time_avg_diff', 'days_since_debut_diff',
       'win_pct_before_diff', 'knockdown_avg_diff',
       'sig_strikes_landed_per_minute_diff', 'takedowns_avg_diff',
       'days_since_last_win_diff', 'avg_ground_strikes_diff',
       'avg_ground_attempts_diff', 'avg_body_strikes_diff',
       'control_time_roll_5_diff', 'avg_clinch_strikes_diff',
       'avg_sub_attempts_diff', 'takedowns_ema_diff', 'avg_body_attempts_diff',
       'avg_clinch_attempts_diff', 'avg_takedowns_diff', 'avg_time_diff',
       'takedowns_success_rate_roll_5_diff', 'weight_diff',
       'takedown_attempts_ema_diff', 'avg_rounds_diff', 'submission_avg_diff',
       'avg_takedown_attempts_diff', 'avg_leg_attempts_diff', 'reach_diff',
       'avg_knockdowns_diff', 'losses_before_diff', 'head_strikes_ema_diff',
       'avg_leg_strikes_diff', 'knockdowns_ema_diff', 'avg_sig_strikes_diff',
       'sig_strikes_ema_diff', 'takedown_attempts_roll_5_diff',
       'avg_head_strikes_di

In [11]:
# Sanity check: (rows, columns) of X after the selection steps above.
X.shape

(4584, 63)

In [12]:
# Write the retained feature names to disk as a single-column table
# (column header 'feature', no index column).
top_features_df = pd.DataFrame(data=top_features, columns=['feature'])
top_features_df.to_csv(
    path_or_buf='../data/notebooks/top_features.csv',
    index=False,
)

Data Preview

In [None]:
# A pair plot draws one panel per pair of columns, so plotting the whole frame
# (83 columns) would request an 83 x 83 grid of axes, which is too slow to
# render and too dense to read. Only the highest-ranked features are plotted,
# coloured by the label.
PAIRPLOT_N_FEATURES = 5  # number of top-ranked features to include

# top_features is ordered from most to least important, so the first entries
# are the strongest predictors.
pairplot_cols = list(top_features[:PAIRPLOT_N_FEATURES])
sns.pairplot(df_features_clean[pairplot_cols + ['winner']], hue='winner')
plt.show()

In [None]:
# Pairwise correlation heatmap of the loaded feature table.
plt.figure(figsize=(35,25))

# numeric_only=True keeps corr() from raising on any non-numeric column
# (pandas 2.x no longer drops such columns silently).
corr_matrix = df_features_clean.corr(numeric_only=True)

# Pin the colour scale to the full correlation range so the neutral midpoint of
# the diverging 'coolwarm' map corresponds to zero correlation.
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', vmin=-1, vmax=1)
plt.title('Correlation Matrix')
plt.show()

Concatenate and Save

In [13]:
# Put the selected predictors and the label side by side (both re-indexed from
# zero so rows align by position) and write the result to CSV without an index.
df_final = pd.concat(
    objs=[X.reset_index(drop=True), y.reset_index(drop=True)],
    axis='columns',
)
df_final.to_csv(
    path_or_buf='../data/notebooks/features_selected.csv',
    index=False,
)