In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

import sys
sys.path.append('../')
from src.processor import FantasyDataProcessor
from src.scraper import ProFootballReferenceScraper
from src.feature_engineering import FantasyFeatureEngineer
from src.modelling import FantasyModel

scraper = ProFootballReferenceScraper(data_dir="../data")
processor = FantasyDataProcessor(data_dir="../data")
engineer = FantasyFeatureEngineer(
    data_dir="../data",
    metadata_cols=["player", "year"],
    target_cols=["standard_fantasy_points", "standard_fantasy_points_per_game", "ppr_fantasy_points", "ppr_fantasy_points_per_game", "value_over_replacement"]
)
model = FantasyModel(data_dir="../data")

# Data Collection

Let's use the ProFootballReferenceScraper to collect NFL player and team statistics.

In [6]:
scraper.scrape_years(start_year=1998, end_year=1998)

Scraping years:   0%|          | 0/1 [00:00<?, ?it/s]2025-07-21 15:23:29,391 - src.scraper - INFO - Scraping data from 1998
2025-07-21 15:23:33,571 - src.scraper - INFO - Requesting https://www.pro-football-reference.com/years/1998/fantasy.htm
2025-07-21 15:23:33,610 - src.scraper - ERROR - Error fetching https://www.pro-football-reference.com/years/1998/fantasy.htm: 403 Client Error: Forbidden for url: https://www.pro-football-reference.com/years/1998/fantasy.htm
2025-07-21 15:23:33,610 - src.scraper - ERROR - Failed to get data for https://www.pro-football-reference.com/years/1998/fantasy.htm
2025-07-21 15:23:38,004 - src.scraper - INFO - Requesting https://www.pro-football-reference.com/years/1998/passing.htm
2025-07-21 15:23:38,042 - src.scraper - ERROR - Error fetching https://www.pro-football-reference.com/years/1998/passing.htm: 403 Client Error: Forbidden for url: https://www.pro-football-reference.com/years/1998/passing.htm
2025-07-21 15:23:38,043 - src.scraper - ERROR - Faile

# Data Processing

Build the silver layer from the scraped data

In [7]:
processor.process_all_data()

Processing files matching: *_player_fantasy_stats.csv: 100%|██████████| 25/25 [00:00<00:00, 231.45it/s]
2025-07-21 15:25:58,392 - src.processor - INFO - Saved player_fantasy_stats.csv to ../data/silver/player_fantasy_stats.csv
Processing files matching: *_player_receiving_stats.csv: 100%|██████████| 25/25 [00:00<00:00, 175.46it/s]
2025-07-21 15:26:04,842 - src.processor - INFO - Saved player_receiving_stats.csv to ../data/silver/player_receiving_stats.csv
Processing files matching: *_player_receiving_advanced_stats.csv: 100%|██████████| 7/7 [00:00<00:00, 179.93it/s]
2025-07-21 15:26:06,340 - src.processor - INFO - Saved player_receiving_advanced_stats.csv to ../data/silver/player_receiving_advanced_stats.csv
Processing files matching: *_player_rushing_stats.csv: 100%|██████████| 25/25 [00:00<00:00, 181.14it/s]
2025-07-21 15:26:10,871 - src.processor - INFO - Saved player_rushing_stats.csv to ../data/silver/player_rushing_stats.csv
Processing files matching: *_player_rushing_advanced_st

## Feature Engineering

Determine which features have the most potential
Prepare features for use in the model


Let's see what the silver table looks like.

In [4]:
silver_data = engineer.load_silver_table()

2025-07-21 16:20:52,804 - src.feature_engineering - INFO - Loaded silver table: 12749 rows


In [4]:
print("Missing values in silver data:")
print(silver_data.isnull().sum())

# Check data types
print("\nData types in silver data:")
print(silver_data.dtypes)

# Check for duplicates
print(f"\nDuplicate rows in silver data: {silver_data.duplicated().sum()}")

Missing values in silver data:
player                          0
year                            0
team                            0
age                             0
standard_fantasy_points         0
                               ..
team_points_3_yr_avg            0
team_yards_3_yr_avg             0
team_plays_3_yr_avg             0
team_yards_per_play_3_yr_avg    0
awards                          0
Length: 185, dtype: int64

Data types in silver data:
player                           object
year                              int64
team                             object
age                               int64
standard_fantasy_points         float64
                                 ...   
team_points_3_yr_avg            float64
team_yards_3_yr_avg             float64
team_plays_3_yr_avg             float64
team_yards_per_play_3_yr_avg    float64
awards                          float64
Length: 185, dtype: object

Duplicate rows in silver data: 0


In [None]:
engineer.build_gold_table(must_include_features=['age'])

## Modelling

- Start with linear regression
- Become more advanced
- Forward, Backward, Feature Selection
- Models per target

In [14]:
gold_data = model.load_gold_table()

In [None]:
metadata_cols = ["player", "year", "team"]
target_cols = ["ppr_fantasy_points", "standard_fantasy_points", "ppr_fantasy_points_per_game", "standard_fantasy_points_per_game", "value_over_replacement"]
feature_cols = [col for col in gold_data.columns if col not in metadata_cols + target_cols]

X = gold_data[feature_cols]
y = gold_data["ppr_fantasy_points"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

pipeline = model.create_pipeline(LinearRegression())

pipeline.fit(X_train, y_train)

score = pipeline.score(X_test, y_test)
print(f"Score: {score}")

Score: 0.6106441430693899


In [23]:
y_pred = pipeline.predict(X_test)

preds = pd.DataFrame({
    "player": gold_data["player"],
    "year": gold_data["year"],
    "team": gold_data["team"],
    "ppr_fantasy_points": y_pred,
})

print(preds['year' == 2024])

ValueError: array length 2550 does not match index length 12749