# Feature Engineering

-   Create new features based on insights from EDA
-   Handle categorical variables (encoding)
-   Normalize or standardize numerical features
-   Feature selection

## Feature Ideas


In [7]:
%load_ext autoreload
%autoreload 2

import sys
import os
import pandas as pd
import numpy as np

# Add the project root to the Python path so the `src` package is importable.
# The root is taken to be the parent of the current working directory
# (the directory the notebook kernel was started in).
project_root = os.path.abspath(os.pardir)
# Guard the append so re-running this cell does not add duplicate entries.
if project_root not in sys.path:
    sys.path.append(project_root)

from src.features.preprocessing import (
    preprocess_data,
    engineer_features,
    select_features
)
from src.features.feature_eng import (
    FEATURES,
    get_feature_names
)


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [29]:
# Read both interim parquet files into DataFrames
df_all_years = pd.read_parquet(
    path='../data/02_interim/df_all_years.parquet'
)
df_2016_plus = pd.read_parquet(
    path='../data/02_interim/df_2016_plus.parquet'
)

In [4]:
def process_and_analyze_df(df, df_name):
    """Run the feature pipeline on ``df`` and print a diagnostic report.

    The report is written to stdout in this order: a preview of the
    engineered feature columns, the list of selected columns, the shapes
    before and after preprocessing, per-column NaN counts, and summary
    statistics of the preprocessed frame.

    Args:
        df: Input DataFrame handed to ``engineer_features`` and
            ``preprocess_data``.
        df_name: Human-readable label used in the report header.

    Returns:
        The frame returned by ``preprocess_data(df)``.
    """
    print(f"\nProcessing {df_name}:")

    # Stage 1: engineer features and preview only the engineered columns.
    engineered = engineer_features(df)
    print("Engineered features:")
    engineered_cols = get_feature_names("engineered")
    engineered_preview = engineered[engineered_cols].head()
    print(engineered_preview)

    # Stage 2: feature selection on the engineered frame (shown for inspection
    # only; the returned frame comes from preprocess_data below).
    selected = select_features(engineered)
    print("\nSelected features:")
    selected_columns = selected.columns.tolist()
    print(selected_columns)

    # Stage 3: full preprocessing straight from the raw input. Per the original
    # author's note, preprocess_data combines engineering and selection.
    preprocessed = preprocess_data(df)

    # Shape comparison between input and preprocessed output.
    original_shape, preprocessed_shape = df.shape, preprocessed.shape
    print(f"\nOriginal shape: {original_shape}")
    print(f"Preprocessed shape: {preprocessed_shape}")

    # Missing-value count per column.
    print("\nNaN values in preprocessed data:")
    nan_counts = preprocessed.isna().sum()
    print(nan_counts)

    # Summary statistics via DataFrame.describe().
    print("\nBasic statistics of preprocessed data:")
    summary_stats = preprocessed.describe()
    print(summary_stats)

    return preprocessed

In [None]:
# Inspect the point_difference column. Bracket indexing is used instead of
# attribute access, which can collide with DataFrame attributes/methods and
# reports a missing column as AttributeError rather than KeyError.
df_all_years["point_difference"]

In [41]:
# Run the processing/report helper once per dataset and keep each result
df_all_years_preprocessed = process_and_analyze_df(
    df=df_all_years,
    df_name="All Years Data",
)
df_2016_plus_preprocessed = process_and_analyze_df(
    df=df_2016_plus,
    df_name="2016+ Data",
)


Processing All Years Data:
Engineered features:
   totalYards_last_3  totalYards_last_10  totalYards_season_to_date  \
0                NaN                 NaN                        NaN   
1                NaN                 NaN                        NaN   
2                NaN                 NaN                        NaN   
3                NaN                 NaN                        NaN   
4                NaN                 NaN                        NaN   

   totalYards_weighted  firstDowns_last_3  firstDowns_last_10  \
0                  NaN                NaN                 NaN   
1                  NaN                NaN                 NaN   
2                  NaN                NaN                 NaN   
3                  NaN                NaN                 NaN   
4                  NaN                NaN                 NaN   

   firstDowns_season_to_date  firstDowns_weighted  possessionTime_last_3  \
0                        NaN                  NaN        

In [43]:
# Save the preprocessed data
output_path_all_years = '../data/03_processed/preprocessed_all_years.parquet'
output_path_2016_plus = '../data/03_processed/preprocessed_2016_plus.parquet'

# Make sure the target directory exists before writing; to_parquet is not
# expected to create missing parent directories, so a fresh checkout without
# the processed-data folder would otherwise fail here.
for output_path in (output_path_all_years, output_path_2016_plus):
    os.makedirs(os.path.dirname(output_path), exist_ok=True)

# index=False: the DataFrame index is not written to the files.
df_all_years_preprocessed.to_parquet(output_path_all_years, index=False)
df_2016_plus_preprocessed.to_parquet(output_path_2016_plus, index=False)

print(f"\nPreprocessed all years data saved to: {output_path_all_years}")
print(f"Preprocessed 2016+ data saved to: {output_path_2016_plus}")



Preprocessed all years data saved to: ../data/03_processed/preprocessed_all_years.parquet
Preprocessed 2016+ data saved to: ../data/03_processed/preprocessed_2016_plus.parquet
