## NFL API

Webscrape NFL data from "http://site.api.espn.com/apis/site/v2/sports/football/nfl/scoreboard" for a range of dates, iterating week by week, to collect scores for predicting future matchup winners and losers.

In [None]:
# Libraries
import requests
from datetime import datetime, timedelta
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.model_selection import train_test_split
import xgboost as xgb

In [None]:
# Uncomment if needed
# %pip install requests

In [None]:
# Looking at 2024 regular season of 272 games
start_date = '20240905'
end_date = '20250105'

## Iterate through dates

Loop through each week within the defined date range.

In [None]:
start_date_dt = datetime.strptime(start_date, '%Y%m%d')
end_date_dt = datetime.strptime(end_date, '%Y%m%d')

date_list = []
current_date = start_date_dt

# Add specific dates outside of the regular schedule
specific_dates = [
    '20240906', # Friday, September 6th 2024
    '20241129', # Friday, November 29th 2024 (Thanksgiving Friday)
    '20241221', # Saturday, December 21st 2024
    '20241225', # Wednesday, December 25th 2024 (Christmas)
    '20241228', # Saturday, December 28th 2024
    '20250104'  # Saturday, January 4th 2025
]
date_list.extend(specific_dates)


while current_date <= end_date_dt:
    # Check if the current day is Sunday (weekday() returns 6 for Sunday)
    if current_date.weekday() == 6:
        date_list.append(current_date.strftime('%Y%m%d'))
    # Check if the current day is Thursday (weekday() returns 3 for Thursday)
    elif current_date.weekday() == 3:
        date_list.append(current_date.strftime('%Y%m%d'))
    # Check if the current day is Monday (weekday() returns 0 for Monday)
    elif current_date.weekday() == 0:
        date_list.append(current_date.strftime('%Y%m%d'))
    current_date += timedelta(days=1) # Increment by one day to check all days

# Remove duplicates and sort the date list
date_list = sorted(list(set(date_list)))

print(date_list)
print(f"Total number of dates: {len(date_list)}")

['20240905', '20240906', '20240908', '20240909', '20240912', '20240915', '20240916', '20240919', '20240922', '20240923', '20240926', '20240929', '20240930', '20241003', '20241006', '20241007', '20241010', '20241013', '20241014', '20241017', '20241020', '20241021', '20241024', '20241027', '20241028', '20241031', '20241103', '20241104', '20241107', '20241110', '20241111', '20241114', '20241117', '20241118', '20241121', '20241124', '20241125', '20241128', '20241129', '20241201', '20241202', '20241205', '20241208', '20241209', '20241212', '20241215', '20241216', '20241219', '20241221', '20241222', '20241223', '20241225', '20241226', '20241228', '20241229', '20241230', '20250102', '20250104', '20250105']
Total number of dates: 59


In [None]:
all_game_data = []

base_url = "http://site.api.espn.com/apis/site/v2/sports/football/nfl/scoreboard"

for date in date_list:
    complete_url = f"{base_url}?dates={date}&calendar=blacklist"
    response = requests.get(complete_url)

    if response.status_code == 200:
        data = response.json()
        if 'events' in data:
            for event in data['events']:
                game_info = {
                    'game_id': event.get('id'),
                    'date': event.get('date'),
                    'status': event.get('status', {}).get('type', {}).get('detail'),
                    'teams': [],
                    'scores': []
                }
                if 'competitions' in event:
                    for competition in event['competitions']:
                        if 'competitors' in competition:
                            for competitor in competition['competitors']:
                                game_info['teams'].append(competitor.get('team', {}).get('displayName'))
                                game_info['scores'].append(competitor.get('score'))
                all_game_data.append(game_info)
    else:
        print(f"Error fetching data for date {date}: {response.status_code}")

print(f"Total number of games found: {len(all_game_data)}")

Total number of games found: 272


In [None]:
    if response.status_code == 200:
        data = response.json()
        if 'events' in data:
            for event in data['events']:
                game_info = {
                    'game_id': event.get('id'),
                    'date': event.get('date'),
                    'status': event.get('status', {}).get('type', {}).get('detail'),
                    'teams': [],
                    'scores': []
                }
                if 'competitions' in event:
                    for competition in event['competitions']:
                        if 'competitors' in competition:
                            for competitor in competition['competitors']:
                                game_info['teams'].append(competitor.get('team', {}).get('displayName'))
                                game_info['scores'].append(competitor.get('score'))
                all_game_data.append(game_info)
    else:
        print(f"Error fetching data for date {date}: {response.status_code}")

In [None]:
df_all_game_data = pd.DataFrame(all_game_data)
display(df_all_game_data.head())

Unnamed: 0,game_id,date,status,teams,scores
0,401671789,2024-09-06T00:40Z,Final,"[Kansas City Chiefs, Baltimore Ravens]","[27, 20]"
1,401671805,2024-09-07T00:15Z,Final,"[Philadelphia Eagles, Green Bay Packers]","[34, 29]"
2,401671744,2024-09-08T17:00Z,Final,"[Atlanta Falcons, Pittsburgh Steelers]","[10, 18]"
3,401671617,2024-09-08T17:00Z,Final,"[Buffalo Bills, Arizona Cardinals]","[34, 28]"
4,401671719,2024-09-08T17:00Z,Final,"[Chicago Bears, Tennessee Titans]","[24, 17]"


In [None]:
# Analyze the structure and check for missing values
print("DataFrame Info:")
df_all_game_data.info()

print("\nMissing Values:")
print(df_all_game_data.isnull().sum())

# Extract winning and losing teams and scores
def get_winner_loser(row):
    teams = row['teams']
    scores = row['scores']
    if len(teams) == 2 and len(scores) == 2:
        try:
            score1 = int(scores[0])
            score2 = int(scores[1])
            if score1 > score2:
                return teams[0], teams[1], score1, score2
            elif score2 > score1:
                return teams[1], teams[0], score2, score1
            else:
                return None, None, None, None # Handle ties if necessary, though rare in NFL
        except ValueError:
            return None, None, None, None # Handle cases where scores are not valid numbers
    return None, None, None, None # Handle cases with unexpected number of teams or scores

df_all_game_data[['winning_team', 'losing_team', 'winning_score', 'losing_score']] = df_all_game_data.apply(get_winner_loser, axis=1, result_type='expand')

# Display the first few rows of the updated DataFrame
print("\nDataFrame with Winner/Loser Information:")
display(df_all_game_data.head())

DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 286 entries, 0 to 285
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   game_id  286 non-null    object
 1   date     286 non-null    object
 2   status   286 non-null    object
 3   teams    286 non-null    object
 4   scores   286 non-null    object
dtypes: object(5)
memory usage: 11.3+ KB

Missing Values:
game_id    0
date       0
status     0
teams      0
scores     0
dtype: int64

DataFrame with Winner/Loser Information:


Unnamed: 0,game_id,date,status,teams,scores,winning_team,losing_team,winning_score,losing_score
0,401671789,2024-09-06T00:40Z,Final,"[Kansas City Chiefs, Baltimore Ravens]","[27, 20]",Kansas City Chiefs,Baltimore Ravens,27,20
1,401671805,2024-09-07T00:15Z,Final,"[Philadelphia Eagles, Green Bay Packers]","[34, 29]",Philadelphia Eagles,Green Bay Packers,34,29
2,401671744,2024-09-08T17:00Z,Final,"[Atlanta Falcons, Pittsburgh Steelers]","[10, 18]",Pittsburgh Steelers,Atlanta Falcons,18,10
3,401671617,2024-09-08T17:00Z,Final,"[Buffalo Bills, Arizona Cardinals]","[34, 28]",Buffalo Bills,Arizona Cardinals,34,28
4,401671719,2024-09-08T17:00Z,Final,"[Chicago Bears, Tennessee Titans]","[24, 17]",Chicago Bears,Tennessee Titans,24,17


Calculate wins and losses

In [None]:
# Calculate wins for each team
wins = df_all_game_data['winning_team'].value_counts().reset_index()
wins.columns = ['team', 'wins']

# Calculate losses for each team
losses = df_all_game_data['losing_team'].value_counts().reset_index()
losses.columns = ['team', 'losses']

# Merge wins and losses DataFrames
team_performance = pd.merge(wins, losses, on='team', how='outer').fillna(0)

# Calculate total games played
team_performance['total_games'] = team_performance['wins'] + team_performance['losses']

# Calculate win percentage
team_performance['win_percentage'] = (team_performance['wins'] / team_performance['total_games']) * 100

# Display the win percentages
display(team_performance[['team', 'win_percentage']].sort_values(by='win_percentage', ascending=False))

Unnamed: 0,team,win_percentage
10,Detroit Lions,88.888889
15,Kansas City Chiefs,83.333333
25,Philadelphia Eagles,83.333333
20,Minnesota Vikings,77.777778
3,Buffalo Bills,72.222222
31,Washington Commanders,72.222222
2,Baltimore Ravens,70.588235
17,Los Angeles Chargers,66.666667
9,Denver Broncos,61.111111
11,Green Bay Packers,61.111111


## Model Selection

Selecting a model capable of handling time-series data and performing binary classification (win vs loss)

**Model Options**:

1.  **Logistic Regression**: A simple yet often effective baseline for binary classification. While not inherently a time-series model, it can work well with time-based features we've created (like rolling averages).
2.  **Gradient Boosting Machines (e.g., XGBoost, LightGBM)**: These are powerful ensemble models that can capture complex non-linear relationships in the data and have shown strong performance in many tabular data tasks. The time-series aspect is handled by the features we engineer.
3.  **Time Series Specific Models (e.g., using `pmdarima` for ARIMA variants or potentially more complex models like LSTMs if the time dependency is very strong and data volume is high)**: While typically used for forecasting numerical values, some time series models can be adapted for classification or their outputs used as features in a classification model. However, given the discrete nature of games and the features we've created, simpler models might be sufficient and easier to interpret.

For this task, given the features we've engineered (which capture historical context), a Gradient Boosting Machine like **XGBoost** is a strong candidate. It's known for its performance and ability to handle tabular data effectively. Logistic Regression could also be a good starting point as a simpler, interpretable model.

We will proceed using **XGBoost** for the model training.

## Data preparation for time series

Prepare the game data for time-series analysis by sorting the data by date.


In [None]:
df_all_game_data['date'] = pd.to_datetime(df_all_game_data['date'])
df_all_game_data_sorted = df_all_game_data.sort_values(by='date')
display(df_all_game_data_sorted.head())

Unnamed: 0,game_id,date,status,teams,scores,winning_team,losing_team,winning_score,losing_score
0,401671789,2024-09-06 00:40:00+00:00,Final,"[Kansas City Chiefs, Baltimore Ravens]","[27, 20]",Kansas City Chiefs,Baltimore Ravens,27,20
1,401671805,2024-09-07 00:15:00+00:00,Final,"[Philadelphia Eagles, Green Bay Packers]","[34, 29]",Philadelphia Eagles,Green Bay Packers,34,29
2,401671744,2024-09-08 17:00:00+00:00,Final,"[Atlanta Falcons, Pittsburgh Steelers]","[10, 18]",Pittsburgh Steelers,Atlanta Falcons,18,10
3,401671617,2024-09-08 17:00:00+00:00,Final,"[Buffalo Bills, Arizona Cardinals]","[34, 28]",Buffalo Bills,Arizona Cardinals,34,28
4,401671719,2024-09-08 17:00:00+00:00,Final,"[Chicago Bears, Tennessee Titans]","[24, 17]",Chicago Bears,Tennessee Titans,24,17


## Feature Engineering

Calculate rolling averages for scores and integrate win/loss data.

In [None]:
# Calculate rolling average scores for each team
# We need to explode the teams and scores columns to properly group by team and date
df_exploded = df_all_game_data_sorted.explode(['teams', 'scores'])

# Convert scores to numeric, coercing errors
df_exploded['scores'] = pd.to_numeric(df_exploded['scores'], errors='coerce')

# Sort by team and date for correct rolling calculation
df_exploded = df_exploded.sort_values(by=['teams', 'date'])

# Calculate rolling average score for each team over the last 5 games
df_exploded['rolling_avg_score'] = df_exploded.groupby('teams')['scores'].transform(lambda x: x.rolling(window=5, min_periods=1).mean())

# To merge rolling averages back correctly, we need a way to link the exploded rows back to the original game
# Let's create a temporary identifier for home and away teams in the exploded dataframe
df_exploded['team_type'] = df_exploded.groupby('game_id').cumcount().apply(lambda x: 'home' if x == 0 else 'away')

# Now pivot the exploded dataframe to get home and away rolling averages in separate columns for each game
df_rolling_avg_pivoted = df_exploded.pivot_table(index='game_id', columns='team_type', values='rolling_avg_score').reset_index()
df_rolling_avg_pivoted.rename(columns={'home': 'home_rolling_avg_score', 'away': 'away_rolling_avg_score'}, inplace=True)


# Merge rolling averages back to the original dataframe
df_all_game_data_sorted = pd.merge(df_all_game_data_sorted, df_rolling_avg_pivoted, on='game_id', how='left')

# Merge the team performance (wins/losses) data
# We need to merge win percentages for both home and away teams
df_all_game_data_sorted = pd.merge(df_all_game_data_sorted, team_performance[['team', 'win_percentage']].rename(columns={'team': 'home_team', 'win_percentage': 'home_win_percentage'}), left_on=df_all_game_data_sorted['teams'].apply(lambda x: x[0]), right_on='home_team', how='left').drop(columns=['home_team'])
df_all_game_data_sorted = pd.merge(df_all_game_data_sorted, team_performance[['team', 'win_percentage']].rename(columns={'team': 'away_team', 'win_percentage': 'away_win_percentage'}), left_on=df_all_game_data_sorted['teams'].apply(lambda x: x[1]), right_on='away_team', how='left').drop(columns=['away_team'])


# Display the first few rows with new features
display(df_all_game_data_sorted.head())

Unnamed: 0,game_id,date,status,teams,scores,winning_team,losing_team,winning_score,losing_score,away_rolling_avg_score,home_rolling_avg_score,home_win_percentage,away_win_percentage
0,401671789,2024-09-06 00:40:00+00:00,Final,"[Kansas City Chiefs, Baltimore Ravens]","[27, 20]",Kansas City Chiefs,Baltimore Ravens,27,20,27.0,20.0,83.333333,70.588235
1,401671805,2024-09-07 00:15:00+00:00,Final,"[Philadelphia Eagles, Green Bay Packers]","[34, 29]",Philadelphia Eagles,Green Bay Packers,34,29,34.0,29.0,83.333333,61.111111
2,401671744,2024-09-08 17:00:00+00:00,Final,"[Atlanta Falcons, Pittsburgh Steelers]","[10, 18]",Pittsburgh Steelers,Atlanta Falcons,18,10,18.0,10.0,44.444444,58.823529
3,401671617,2024-09-08 17:00:00+00:00,Final,"[Buffalo Bills, Arizona Cardinals]","[34, 28]",Buffalo Bills,Arizona Cardinals,34,28,34.0,28.0,72.222222,50.0
4,401671719,2024-09-08 17:00:00+00:00,Final,"[Chicago Bears, Tennessee Titans]","[24, 17]",Chicago Bears,Tennessee Titans,24,17,17.0,24.0,33.333333,16.666667


## Target Variable Definition

Define the target variable for the prediction model, which is the winner of the game.

In [None]:
# Define the target variable
# We can create a binary target where 1 indicates the home team won, and 0 indicates the away team won.
# We need to compare the 'winning_team' with the home team in the 'teams' list.

df_all_game_data_sorted['home_team_won'] = df_all_game_data_sorted.apply(lambda row: 1 if row['winning_team'] == row['teams'][0] else 0, axis=1)

# Display the first few rows with the target variable
display(df_all_game_data_sorted.head())

Unnamed: 0,game_id,date,status,teams,scores,winning_team,losing_team,winning_score,losing_score,away_rolling_avg_score,home_rolling_avg_score,home_win_percentage,away_win_percentage,home_team_won
0,401671789,2024-09-06 00:40:00+00:00,Final,"[Kansas City Chiefs, Baltimore Ravens]","[27, 20]",Kansas City Chiefs,Baltimore Ravens,27,20,27.0,20.0,83.333333,70.588235,1
1,401671805,2024-09-07 00:15:00+00:00,Final,"[Philadelphia Eagles, Green Bay Packers]","[34, 29]",Philadelphia Eagles,Green Bay Packers,34,29,34.0,29.0,83.333333,61.111111,1
2,401671744,2024-09-08 17:00:00+00:00,Final,"[Atlanta Falcons, Pittsburgh Steelers]","[10, 18]",Pittsburgh Steelers,Atlanta Falcons,18,10,18.0,10.0,44.444444,58.823529,0
3,401671617,2024-09-08 17:00:00+00:00,Final,"[Buffalo Bills, Arizona Cardinals]","[34, 28]",Buffalo Bills,Arizona Cardinals,34,28,34.0,28.0,72.222222,50.0,1
4,401671719,2024-09-08 17:00:00+00:00,Final,"[Chicago Bears, Tennessee Titans]","[24, 17]",Chicago Bears,Tennessee Titans,24,17,17.0,24.0,33.333333,16.666667,1


## Model Training

Split the data into training and testing sets and train the XGBoost model.

In [None]:
# Define features (X) and target (y)
# Exclude columns that are not features or the target
features = ['home_rolling_avg_score', 'away_rolling_avg_score', 'home_win_percentage', 'away_win_percentage']
X = df_all_game_data_sorted[features]
y = df_all_game_data_sorted['home_team_won']

# For time series data, it's important to split the data chronologically
# Let's use the first 80% of the data for training and the remaining 20% for testing

split_index = int(len(df_all_game_data_sorted) * 0.8)

X_train = X.iloc[:split_index]
X_test = X.iloc[split_index:]
y_train = y.iloc[:split_index]
y_test = y.iloc[split_index:]

# Initialize and train the XGBoost Classifier
xgb_model = xgb.XGBClassifier(objective='binary:logistic', eval_metric='logloss')
xgb_model.fit(X_train, y_train)

print("Model training complete.")

Model training complete.


## Model Evaluation

Evaluate the performance of the trained XGBoost model.

In [None]:
# Make predictions on the test set
y_pred = xgb_model.predict(X_test)
y_pred_proba = xgb_model.predict_proba(X_test)[:, 1] # Get probabilities for the positive class

# Calculate evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred_proba)

# Print the evaluation metrics
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-Score: {f1:.4f}")
print(f"ROC AUC Score: {roc_auc:.4f}")

Accuracy: 0.6552
Precision: 0.8000
Recall: 0.5714
F1-Score: 0.6667
ROC AUC Score: 0.7043


## Prediction

Use the trained XGBoost model to predict the outcome of games.

In [None]:
# Make predictions on the test set
y_pred = xgb_model.predict(X_test)

# Add the predictions to the test set DataFrame for comparison
X_test['predicted_home_team_won'] = y_pred

# Display the test set with actual and predicted outcomes
# We'll merge with the original sorted data to show team names and other game info
predictions_df = pd.merge(X_test, df_all_game_data_sorted[['game_id', 'teams', 'date', 'winning_team', 'home_team_won']], left_index=True, right_index=True, how='left')

display(predictions_df[['date', 'teams', 'winning_team', 'home_team_won', 'predicted_home_team_won']].head())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test['predicted_home_team_won'] = y_pred


Unnamed: 0,date,teams,winning_team,home_team_won,predicted_home_team_won
228,2024-12-22 18:00:00+00:00,"[Washington Commanders, Philadelphia Eagles]",Washington Commanders,1,0
229,2024-12-22 18:00:00+00:00,"[New York Jets, Los Angeles Rams]",Los Angeles Rams,0,0
230,2024-12-22 18:00:00+00:00,"[Chicago Bears, Detroit Lions]",Detroit Lions,0,0
231,2024-12-22 18:00:00+00:00,"[Cincinnati Bengals, Cleveland Browns]",Cincinnati Bengals,1,1
232,2024-12-22 18:00:00+00:00,"[Atlanta Falcons, New York Giants]",Atlanta Falcons,1,1


## Summary:

### Insights or Next Steps

*   While the model demonstrates some predictive capability (around 65% F1-score), the performance indicates room for improvement. Further feature engineering using more complex game statistics or player data, such as trades or more in-game metrics, could enhance the model's accuracy.
*   Investigate alternative time series modeling techniques, such as ARIMA, or ensemble methods to potentially capture more intricate patterns in the data and improve prediction performance.
