In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from scipy.stats import pearsonr

In [None]:
# Read the NFL scores/spreads CSV into a DataFrame.
scores_csv_path = "../CSV_files/spreadspoke_scores.csv"
df = pd.read_csv(scores_csv_path)
# Bare trailing expression: the notebook renders the frame as this cell's output.
df


In [None]:
# Keep only the seasons after 2018 (per the original note: the 2019-2020 season
# through the 2023-2024 season).
# .copy() makes five_year_df an independent frame, so the column assignment below
# modifies it directly instead of writing into a filtered slice of df, which is
# what triggers pandas' SettingWithCopyWarning.
five_year_df = df[df['schedule_season'] > 2018].copy()
# Rewrite the favorite ID 'LV' to 'LVR' so the Raiders are counted under a single
# ID in every season. No row mask is needed: every remaining row already
# satisfies schedule_season > 2018.
five_year_df['team_favorite_id'] = five_year_df['team_favorite_id'].replace('LV', 'LVR')
# Show first 10 rows of filtered results
five_year_df.head(10)

In [None]:
# Tally how often each team ID appears in the favorite column, i.e. how many
# games each team entered as the favorite.
favorite_ids = five_year_df["team_favorite_id"]
favorite_counts = favorite_ids.value_counts()
favorite_counts

In [None]:
# Bar chart: number of times each team was picked as the favorite.
fig, ax = plt.subplots(figsize=(10, 6))
favorite_counts.plot.bar(ax=ax)
ax.set_title('Number of Times Each NFL Team was Picked as Favorite (2019-2023)')
ax.set_xlabel('Team')
ax.set_ylabel('Count')
# Slant the team labels so neighbouring ticks do not overlap.
plt.xticks(rotation=45, ha='right')
fig.tight_layout()
plt.show()

In [None]:
# Per favorite ID, count the games in which the home score exceeded the away score.
# NOTE(review): this mask selects games the HOME side won, not games the favorite
# won. A favored away team that wins is not counted, and a favored away team
# that loses is. Deciding whether the favorite itself won presumably needs the
# home/away team identity, which this cell does not use - TODO confirm and fix.
home_scored_more = five_year_df["score_home"] > five_year_df["score_away"]
favorite_wins = five_year_df.loc[home_scored_more, "team_favorite_id"].value_counts()
favorite_wins

In [None]:
# Bar chart: the favorite_wins tally for each favored team.
fig, ax = plt.subplots(figsize=(10, 6))
favorite_wins.plot.bar(ax=ax)
ax.set_title('Number of Wins for Teams Favored (2019-2023)')
ax.set_xlabel('Team')
ax.set_ylabel('Number of Wins')
# Slant the team labels so neighbouring ticks do not overlap.
plt.xticks(rotation=45, ha='right')
fig.tight_layout()
plt.show()

In [None]:
# Share of favored games counted as wins, per team, expressed in percent.
# The division aligns the two Series on their team-ID index, so a team that is
# present in favorite_counts but absent from favorite_wins comes out as NaN.
win_percentage = favorite_wins.div(favorite_counts).mul(100)
# Highest percentage first.
win_percentage_sorted = win_percentage.sort_values(ascending=False)
win_percentage_sorted

In [None]:
# Bar chart of the ten highest win percentages among favored teams.
top_win_percentages = win_percentage_sorted.head(10)
fig, ax = plt.subplots(figsize=(12, 8))
top_win_percentages.plot.bar(ax=ax, color='skyblue')
ax.set_title('Top 10 Winning Percentages of Teams Most Favored (2019-2023)')
ax.set_xlabel('Team')
ax.set_ylabel('Win Percentage')
# Slant the team labels so neighbouring ticks do not overlap.
plt.xticks(rotation=45, ha='right')
fig.tight_layout()
plt.show()

In [None]:
# Build a two-column table (Team, Count) from the favorite tallies. It keeps the
# order of favorite_counts, so the first 10 rows are the 10 most-picked favorites.
# The dict keys already name the columns, so no separate rename is needed.
favorite_counts_df = pd.DataFrame({'Team': favorite_counts.index, 'Count': favorite_counts.values})
# Start the table on its own line: without the newline the DataFrame's header row
# is appended to the sentence and the columns no longer line up with the data.
print(f"The top 10 most picked favorites are:\n{favorite_counts_df.head(10)}")

In [None]:
# Bar chart of the first ten rows of favorite_counts_df (the most-picked favorites).
top_picked = favorite_counts_df.head(10)
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(top_picked['Team'], top_picked['Count'])
ax.set_title('Top 10 Most Picked Favorites (2019-2023)')
ax.set_xlabel('Team')
ax.set_ylabel('Count')
# Slant the team labels so neighbouring ticks do not overlap.
plt.xticks(rotation=45, ha='right')
fig.tight_layout()
plt.show()

In [None]:
# Turn the win-percentage Series into a table: the team-ID index becomes the
# 'Team' column and the values become the 'Win Percentage' column.
win_percentage_df = (
    win_percentage
    .rename_axis('Team')
    .reset_index(name='Win Percentage')
)

# Order the table from the highest win percentage to the lowest.
win_percentage_df_sorted = win_percentage_df.sort_values('Win Percentage', ascending=False)
top_ten_win_percentages = win_percentage_df_sorted.head(10)
print(f"The top 10 winning percentages of teams most favorited are:\n{top_ten_win_percentages}")

In [None]:
# Bar chart of the ten highest win percentages, taken from the sorted table.
best_ten = win_percentage_df_sorted.head(10)
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(best_ten['Team'], best_ten['Win Percentage'])
ax.set_title('Top 10 Winning Percentages of Teams Most Favored (2019-2023)')
ax.set_xlabel('Team')
ax.set_ylabel('Win Percentage')
# Slant the team labels so neighbouring ticks do not overlap.
plt.xticks(rotation=45, ha='right')
fig.tight_layout()
plt.show()

In [None]:
# Assemble the per-team table used for the correlation check below: favorite
# picks, favorite wins and the resulting win percentage. The correlation itself
# is not computed in this cell.
# Both inputs are given their final column names before merging, so the merged
# frame already carries 'Team', 'Favorite Picks' and 'Favorite Wins'.
picks_by_team = favorite_counts_df.rename(columns={'Count': 'Favorite Picks'})
wins_by_team = favorite_wins.rename('Favorite Wins')
correlation_df = pd.merge(
    picks_by_team,
    wins_by_team,
    left_on='Team',
    right_index=True,
    how='outer',
)
correlation_df['Win Percentage'] = correlation_df['Favorite Wins'].div(correlation_df['Favorite Picks']).mul(100)
# Most-picked teams first, so head(10) is the top 10 by number of picks.
correlation_df = correlation_df.sort_values('Favorite Picks', ascending=False)
print(correlation_df.head(10))

In [None]:
# Scatter plot of favorite picks against favorite wins for the first ten rows of
# correlation_df, followed by Pearson's r for the same ten points.
top_ten = correlation_df.head(10)
picks = top_ten['Favorite Picks']
wins = top_ten['Favorite Wins']

fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(picks, wins, color='blue', alpha=0.7)
ax.set_title('Correlation Between Favorite Picks and Favorite Wins for Top 10 Teams')
ax.set_xlabel('Favorite Picks')
ax.set_ylabel('Favorite Wins')
ax.grid(True)
fig.tight_layout()
plt.show()

# Pearson's r measures the strength of the linear relationship. The p-value tests
# H0: there is no relationship between picked favorites and favorites who win.
pearson_corr, pearson_p_value = pearsonr(picks, wins)
print("Pearson Correlation Coefficient:", pearson_corr)
# At p < .05 (the 5% significance level) H0 may be rejected.
print("Pearson Correlation p-value:", pearson_p_value)