In [230]:
import pandas as pd
import numpy as np
from plotnine import *

In [231]:
# Function to get first and last year
def first_and_last(group):
    return group[group['Year'].isin([group['Year'].min(), group['Year'].max()])]

In [232]:
# Custom function for formatting y-axis labels
def format_y_labels(value):
    if value >= 1000:
        return f"{int(value/1000)}k"
    else:
        return f"{int(value)}"

In [233]:
# Custom colors
custom_colors = {
    "MLS": "#212F88",
    "NWSL": "#E31837"
}

In [234]:
# Read and preprocess the data
soccer = pd.read_csv("soccer_attendance_mls_nwsl.csv")

In [235]:
# Remove commas from the Attendance column and convert to numeric
soccer['Attendance'] = soccer['Attendance'].str.replace(',', '').astype(float)

In [236]:
soccer['Date'] = pd.to_datetime(soccer['Date'], format='%m/%d/%Y')
soccer['Year'] = soccer['Date'].dt.year

In [237]:
# Filter data for both leagues
soccer_both_leagues = soccer[soccer['Both Leagues'] == "Yes"]
soccer_both_leagues = soccer_both_leagues[~soccer_both_leagues['Team'].isin(["LAFC", "New York City FC"])]

In [238]:
# Get the first NWSL year
first_NWSL_year = soccer_both_leagues[soccer_both_leagues['League'] == "NWSL"].groupby('City')['Year'].min().reset_index().rename(columns={'Year': 'start_year'})

In [239]:
# Filter soccer data based on the first NWSL year
soccer_filtered = pd.merge(soccer_both_leagues, first_NWSL_year, on='City')
soccer_filtered = soccer_filtered[soccer_filtered['Year'] >= soccer_filtered['start_year']]

In [240]:
summary_soccer = (soccer_filtered.groupby(['League', 'City', 'Team', 'Year'])['Attendance']
                  .mean()
                  .reset_index()
                  .groupby(['League', 'City', 'Team'])
                  .apply(first_and_last)
                  .reset_index(drop=True))

In [241]:
# Overall start year
overall_start_year = first_NWSL_year['start_year'].min()

In [242]:
# Prepare overall data
overall = (soccer.groupby(['League', 'Year'])['Attendance']
           .mean()
           .reset_index()
           .query(f"Year >= {overall_start_year}")
           .groupby('League')
           .apply(first_and_last)
           .reset_index(drop=True))
overall['City'] = 'ALL'
overall['Team'] = overall['League']

In [243]:
# Prepare simplified data
simplified_data = pd.concat([summary_soccer, overall])

In [244]:
simplified_data

Unnamed: 0,League,City,Team,Year,Attendance
0,MLS,Chicago,Chicago Fire,2013,15228.235294
1,MLS,Chicago,Chicago Fire,2024,18389.6
2,MLS,Houston,Houston Dynamo,2014,20117.294118
3,MLS,Houston,Houston Dynamo,2024,17461.3
4,MLS,Kansas City,Sporting Kansas City,2022,18343.5
5,MLS,Kansas City,Sporting Kansas City,2024,23168.636364
6,MLS,Los Angeles,LA Galaxy,2022,22841.235294
7,MLS,Los Angeles,LA Galaxy,2024,23284.444444
8,MLS,New York City,New York Red Bulls,2013,19616.444444
9,MLS,New York City,New York Red Bulls,2024,18578.3


In [245]:
# Prepare data for ribbons
ribbon_data = simplified_data.pivot_table(index=['City', 'Year'], columns='League', values='Attendance').reset_index()
ribbon_data['Year'] = pd.to_datetime(ribbon_data['Year'].astype(str) + '-01-01')  # Convert 'Year' to datetime
ribbon_data = ribbon_data.groupby('City').apply(lambda x: x.set_index('Year').resample('Y').first().interpolate(method='linear').reset_index()).reset_index(drop=True)
ribbon_data['ymin'] = ribbon_data[['MLS', 'NWSL']].min(axis=1)
ribbon_data['ymax'] = ribbon_data[['MLS', 'NWSL']].max(axis=1)
ribbon_data = ribbon_data.dropna()

In [246]:
# Extract year for x-axis breaks
year_min = ribbon_data['Year'].dt.year.min()
year_max = ribbon_data['Year'].dt.year.max()

In [173]:
ribbon_data

League,Year,City,MLS,NWSL,ymin,ymax
0,2013-12-31,ALL,19317.190625,4608.444444,4608.444444,19317.190625
11,2024-12-31,ALL,23251.747212,11403.485714,11403.485714,23251.747212
12,2013-12-31,Chicago,15228.235294,1710.636364,1710.636364,15228.235294
23,2024-12-31,Chicago,18389.6,8317.0,8317.0,18389.6
24,2014-12-31,Houston,20117.294118,4650.083333,4650.083333,20117.294118
34,2024-12-31,Houston,17461.3,6118.25,6118.25,17461.3
35,2022-12-31,Kansas City,18343.5,7656.636364,7656.636364,18343.5
37,2024-12-31,Kansas City,23168.636364,11500.0,11500.0,23168.636364
38,2022-12-31,Los Angeles,22841.235294,19104.818182,19104.818182,22841.235294
40,2024-12-31,Los Angeles,23284.444444,19864.714286,19864.714286,23284.444444


In [247]:
# Extract year for x-axis breaks
year_min = ribbon_data['Year'].dt.year.min()
year_max = ribbon_data['Year'].dt.year.max()

In [248]:
# Melt ribbon_data for plotting
ribbon_data_melted = ribbon_data.melt(id_vars=['City', 'Year'], value_vars=['MLS', 'NWSL'], var_name='League', value_name='Attendance')
ribbon_data_melted['City_League'] = ribbon_data_melted['City'] + '_' + ribbon_data_melted['League']

In [323]:
plot = (ggplot()
        # Adding a ribbon plot to show the range between ymin and ymax for each City
        + geom_ribbon(data=ribbon_data,
                      mapping=aes(x='Year', ymin='ymin', ymax='ymax', group='City'),
                      fill='grey', alpha=0.5)

        # Adding a dashed line plot to show the Attendance over the years, colored by League and grouped by City_League
        + geom_line(data=ribbon_data_melted,
                    mapping=aes(x='Year', y='Attendance', color='League', group='City_League'),
                    size=1.2, linetype="dashed")

        # Adding points to the line plot for each data point
        + geom_point(data=ribbon_data_melted,
                     mapping=aes(x='Year', y='Attendance', color='League'),
                     size=3)

        # Faceting the plot by City, with each facet having its own y-scale
        + facet_wrap('~City', scales='free_y')

        # Customizing the color scale for the League variable
        + scale_color_manual(values=custom_colors, labels={"MLS": "Major League Soccer (MLS)", "NWSL": "National Women's Soccer League (NWSL)"})

        # Customizing the x-axis to show breaks every 2 years and label them with the last two digits of the year
        + scale_x_date(breaks=pd.date_range(start=f'{year_min}-01-01', end=f'{year_max}-01-01', freq='2Y'),
                       labels=lambda x: [f"'{str(i.year)[-2:]}" for i in x])

        # Customizing the y-axis labels to format large numbers as 'k' (e.g., 20000 -> 20k)
        + scale_y_continuous(labels=lambda breaks: [format_y_labels(b) for b in breaks])

        # Applying a minimal theme with customized text and element settings
        + theme_minimal(base_size=15)
        + theme(
            panel_grid_minor_x=element_blank(),  # Removing minor grid lines on the x-axis
            panel_grid_minor_y=element_blank(),  # Removing minor grid lines on the y-axis
            plot_title=element_text(ha='center', size=18, weight='bold', margin={'t': 20, 'b': 10}),  # Centering the title with custom size, weight, and margin
            plot_subtitle=element_text(ha='center', size=14, margin={'b': 20}),  # Centering the subtitle with custom size and margin
            strip_text=element_text(weight='bold'),  # Making facet strip text bold
            axis_title=element_text(size=14),  # Customizing axis titles
            axis_text=element_text(size=12),  # Customizing axis text
            legend_position='bottom',  # Placing the legend at the bottom
            legend_title=element_blank(),  # Removing the legend title
            legend_text=element_text(size=12),  # Customizing the legend text size
            plot_caption=element_text(ha='left', size=10, color="grey", style="italic")  # Adjusting the caption
        )
        # Adding labels for the axes, title, subtitle, and caption
        + labs(
            x='Year',
            y='Average Annual Match Attendance',
            title="Entering the Golden Age of U.S. Women's Soccer",
            subtitle="Men's soccer (MLS) maintains broad popularity - Women's soccer (NWSL) is closing the gap",
            caption = "Note: ALL includes 41 MLS (men's) and NWSL (women's) teams. The 10 cities shown are those that have had both MLS and NWSL teams since at least 2022.\n\nAuthor: Timothy A. Model"
        )
)


In [325]:
# Save or show the plot
plot.save('soccer_attendance_plot.png', width=14, height=10, dpi=300)

