GIANLUIGI BONIGLIA

In [None]:
# Install the Altair plotting library used by every chart in this notebook.
# NOTE(review): this line is not Python syntax; it presumably relies on IPython
# "automagic" to run as the `%pip` magic -- confirm when running outside Jupyter.
pip install altair 

In [None]:
# Import Altair and print the installed version, so the notebook output records
# which release the charts were built with.
import altair as alt
print(alt.__version__)

In [None]:
import altair as alt
import pandas as pd
import numpy as np

# ---------------------------------------------------------------------------
# Data loading, cleaning, derived columns and the shared dropdown selections.
# Everything defined here (`data`, `df`, `selection1`, `selection2`,
# `location_selection`, `game_selection`) is reused by the chart cells below.
# ---------------------------------------------------------------------------

# Machine-specific location of the shot-level CSV; change it when running elsewhere.
DATA_PATH = '/Users/gianluigiboniglia/Downloads/wnba-shots-2021.csv'

data = pd.read_csv(DATA_PATH)
alt.data_transformers.disable_max_rows()  # allow charts on more rows than Altair's default limit

# Work on a copy so that `data` keeps the raw, uncleaned rows.
df = data.copy()

# Clean missing values. Entirely empty columns are removed FIRST: dropping
# incomplete rows first would discard every row as soon as a single column is
# completely empty.
df.dropna(how='all', axis=1, inplace=True)  # drop columns where ALL values are missing
df.dropna(how='any', axis=0, inplace=True)  # drop rows with ANY missing value

# Team names offered by both team dropdowns. They are read from the cleaned
# frame so the menus cannot list a missing value, and sorted for easier lookup.
team_options = sorted(df['shooting_team'].unique().tolist())

# Dropdown + point selection for the first team.
team_dropdown1 = alt.binding_select(options=team_options, name='Team 1')
selection1 = alt.selection_point(fields=['shooting_team'], bind=team_dropdown1)

# Dropdown + point selection for the second team (used for comparisons).
team_dropdown2 = alt.binding_select(options=team_options, name='Team 2')
selection2 = alt.selection_point(fields=['shooting_team'], bind=team_dropdown2)

# Convert "seconds remaining" into elapsed minutes, rounded to whole minutes.
# 2400 s = 40 min, presumably the regulation game length -- TODO confirm how
# overtime rows are encoded in `game_seconds_remaining`.
df['Minute'] = ((2400 - df['game_seconds_remaining']) / 60).round()

# A shot is 'Home' when the shooting team is the home team, otherwise 'Away'.
df['Location'] = np.where(df['shooting_team'] == df['home_team_name'], 'Home', 'Away')

# Game label for the game dropdown. The same pairing can occur in more than one
# game, so `game_id` is appended to keep each label tied to exactly one game;
# without it the per-game score charts would merge rematches.
# (assumes `game_id` identifies a single game -- it is the only id column used
# elsewhere in this notebook.)
df['Game'] = (
    df['home_team_name'] + ' vs. ' + df['away_team_name']
    + ' (' + df['game_id'].astype(str) + ')'
)

# Dropdown + point selection for the shot location (home/away).
location_dropdown = alt.binding_select(options=['Home', 'Away'], name='Location')
location_selection = alt.selection_point(fields=['Location'], bind=location_dropdown)

# Dropdown + point selection for a single game.
game_dropdown = alt.binding_select(options=df['Game'].unique().tolist(), name='Game')
game_selection = alt.selection_point(fields=['Game'], bind=game_dropdown)


## **What is the distribution of a team's shot attempts?**

To answer this question, I created two charts. The first is a histogram of the spatial distribution of attempted shots. The x-axis shows the x-coordinate, defined as 'Horizontal location in feet of shot attempt where the hoop would be located at 25 feet', and the y-axis shows the number of shots. I created this chart to show where along the x-axis players shoot most often: most shots are taken from the central part of the court. The second chart is a scatter plot whose x-axis is the same x-coordinate and whose y-axis is the y-coordinate, defined as 'Vertical location in feet of shot attempt with respect to the target hoop'. I created this chart to show the areas of the court from which players shoot most frequently: most shots are taken either under the rim or near the three-point line.

In [None]:
# Histogram of shot attempts along the horizontal court coordinate, filtered
# by the team chosen in the 'Team 1' dropdown.

# Keep only shots with usable coordinates: -214748340 is the value this
# notebook treats as an invalid coordinate.
has_valid_coordinates = (
    (df['coordinate_x'] != -214748340) & (df['coordinate_y'] != -214748340)
)
filtered_df = df[has_valid_coordinates]

# One bin definition shared by the x-axis and the tooltip so both describe the
# same bars.
x_bins = alt.Bin(maxbins=30)

chart_1 = (
    alt.Chart(filtered_df)
    .mark_bar()
    .encode(
        x=alt.X('coordinate_x:Q', bin=x_bins, title='Coordinate X (Orizzontale)'),
        y=alt.Y('count()', title='Numero di Tentativi'),
        # Only binned/aggregated fields go in the tooltip. A raw field next to
        # count() becomes an extra group-by key in Vega-Lite, which would split
        # every bar into one piece per distinct value and report per-piece
        # counts instead of the bin total.
        tooltip=[
            alt.Tooltip('coordinate_x:Q', bin=x_bins, title='Coordinate X (Orizzontale)'),
            alt.Tooltip('count()', title='Numero di Tentativi'),
        ],
    )
    .add_params(selection1)        # attach the 'Team 1' dropdown
    .transform_filter(selection1)  # keep only the selected team's shots
    .properties(
        title="Distribuzione dei Tentativi di Tiro per Squadra-LA",
        width=300,
        height=400,
    )
)

# Display the chart
chart_1

In [None]:
# Scatter plot of shot locations on the court, filtered by the team chosen in
# the 'Team 1' dropdown. It is built on the cleaned frame `df` (not the raw
# `data`) so that it shows the same rows as the companion histogram.
chart_2 = (
    alt.Chart(df)
    .mark_point()
    .encode(
        # Fixed axis domains keep the court framing stable between teams.
        x=alt.X('coordinate_x:Q', scale=alt.Scale(domain=[0, 50])),
        y=alt.Y('coordinate_y:Q', scale=alt.Scale(domain=[-5, 89])),
        tooltip=['shot_type', 'shooting_team', 'game_id'],  # shot details on hover
    )
    .add_params(selection1)        # attach the 'Team 1' dropdown
    .transform_filter(selection1)  # keep only the selected team's shots
    .transform_filter(
        # Drop shots whose coordinates carry the invalid value -214748340.
        (alt.datum.coordinate_x != -214748340) & (alt.datum.coordinate_y != -214748340)
    )
    .properties(
        title="Spatial Distribution of Shots-LA",
        width=300,
        height=400,
    )
)

chart_2

# How do two different teams compare in terms of successful or failed shots?

To answer this question, I created a stacked bar chart that shows how two teams compare in terms of successful and missed shots. The x-axis shows the two teams the user selects for comparison, and the y-axis shows the count of successful plus missed shots, highlighted in two different colors. The chart shows that the number of attempted shots can differ substantially between two teams, while the ratio of made to missed shots does not vary significantly.


In [None]:
# Shots per (team, outcome): one row for every shooting_team / made_shot pair.
counting = (
    df.groupby(["shooting_team", "made_shot"])
    .size()
    .reset_index(name="counting")
)

# Wide layout: one row per team, one column per outcome; absent pairs become 0.
pivoted = counting.pivot(index='shooting_team', columns='made_shot', values='counting')
pivoted = pivoted.fillna(0)

# Positional relabelling of the outcome columns.
# (assumes the pivot yields exactly two columns, missed first and made second)
pivoted.columns = ['missed_shots', 'made_shots']

# Made-to-missed ratio; a missed count of 0 is swapped for pd.NA before dividing.
missed_or_na = pivoted['missed_shots'].replace(0, pd.NA)
pivoted['shot_ratio'] = pivoted['made_shots'] / missed_or_na

# Bring the team name back as a regular column and show the summary table.
final_df = pivoted.reset_index()
print(final_df)


# Stacked bar chart comparing made/missed shots for the teams picked in the
# 'Team 1' and 'Team 2' dropdowns.
chart_3_color = alt.Color(
    'made_shot:N',
    scale=alt.Scale(domain=[True, False], range=['orange', 'blue']),
    legend=alt.Legend(title="Shot Type", orient='bottom'),
)
chart_3_tooltip = [
    alt.Tooltip('shooting_team:N', title='Team'),
    alt.Tooltip('made_shot:N', title='Successful Shot'),
    alt.Tooltip('counting:Q', title='Number of Shots'),
]

chart_3 = (
    alt.Chart(counting)
    .mark_bar()
    .encode(
        x=alt.X('shooting_team:N', title='Team'),
        y=alt.Y('counting:Q', title='Shots Attempts'),
        color=chart_3_color,  # one colour per shot outcome
        tooltip=chart_3_tooltip,
    )
    .add_params(selection1, selection2)         # both team dropdowns
    .transform_filter(selection2 | selection1)  # keep rows matching either selection
    .properties(
        title="Comparison Teams: Successful or Failed Shots",
        width=300,
        height=300,
    )
)

chart_3

# Is the distribution of shot attempts of a team different when playing home than when playing away?

To answer this question, I created two charts. The first is a grouped bar chart showing the number of home and away shot attempts for each team. It shows that some teams are much more confident at home and take more shots there (Chicago, Las Vegas). Counterintuitively, it also shows that some teams take more shots in hostile environments (Washington, Phoenix). The second is a scatter plot with coordinate_x on the x-axis and coordinate_y on the y-axis. It shows that the teams that are more confident at home take more three-point shots when playing on their own court. In this scatter plot, the user can choose to view the distribution of either home or away shot attempts.

In [None]:
# Classify every shot as taken at home or away in one vectorised step instead
# of a row-by-row loop. np.select takes the first matching condition, which is
# the same precedence as an if/elif chain; rows matching neither team stay
# 'unknown'.
shooting_team = df['shooting_team']
df['location'] = np.select(
    [shooting_team == df['home_team_name'], shooting_team == df['away_team_name']],
    ['home', 'away'],
    default='unknown',
)

# Remove rows where the location couldn't be determined. The explicit copy
# keeps later column assignments on `df` from targeting a slice of the old frame.
df = df[df['location'] != 'unknown'].copy()

# Bar chart of the number of shot attempts at home vs. away for the team chosen
# in the 'Team 1' dropdown.
chart_4 = alt.Chart(df).mark_bar().encode(
    x=alt.X('location:N', title='Location'),
    y=alt.Y('count()', title='Number of Shot Attempts'),
    color=alt.Color('location:N', title='Location'),
    tooltip=['shooting_team']
).properties(
    title='Shot Attempt Distribution: Home vs. Away',
    width=300,
    height=400
).add_params(  # attach the 'Team 1' dropdown
    selection1
).transform_filter(  # keep only the selected team's shots
    selection1
).interactive()  # enable pan/zoom

chart_4

In [None]:
# Classify every shot as taken at home or away in one vectorised step instead
# of a row-by-row loop. np.select takes the first matching condition, which is
# the same precedence as an if/elif chain; rows matching neither team stay
# 'unknown'. The column is recomputed here so this cell can run on its own.
shooting_team = df['shooting_team']
df['location'] = np.select(
    [shooting_team == df['home_team_name'], shooting_team == df['away_team_name']],
    ['home', 'away'],
    default='unknown',
)

# Remove rows where the location is still unknown. The explicit copy keeps
# later column assignments on `df` from targeting a slice of the old frame.
df = df[df['location'] != 'unknown'].copy()

# Scatter plot of shot locations, filtered by the 'Team 1' and 'Location'
# dropdowns and coloured by home/away.
chart_5 = alt.Chart(df).mark_circle().encode(
    x=alt.X('coordinate_x:Q', title='Coordinate X', scale=alt.Scale(domain=[0, 50])),
    y=alt.Y('coordinate_y:Q', title='Coordinate Y', scale=alt.Scale(domain=[-5, 89])),
    color=alt.Color('location:N', title='Location'),
    tooltip=['game_id', 'shooting_team', 'location', 'coordinate_x', 'coordinate_y']
).properties(
    title='Shot Locations: Home vs. Away',
    width=300,
    height=400
).add_params(  # attach the 'Team 1' dropdown
    selection1
).add_params(  # attach the 'Location' dropdown
    location_selection
).transform_filter(  # keep only the selected team's shots
    selection1
).transform_filter(  # keep only the selected location (matches the 'Location' column)
    location_selection
).transform_filter(  # drop shots whose coordinates carry the invalid value
    (alt.datum.coordinate_x != -214748340) & (alt.datum.coordinate_y != -214748340)
).interactive()  # enable pan/zoom

chart_5


# How do the shot statistics (successful vs. failed) compare per quarter?

To answer this question, I created a grouped bar chart that distinguishes between two cases: made shots and missed shots. The x-axis shows the quarter number and the y-axis shows the count of shots. As seen in the chart, the count of missed shots is on the left (in blue) and the count of made shots is on the right (in orange). The chart shows that some teams, under pressure as time runs out, make more shots in the last quarter (Indiana, New York).



In [None]:
# Give every shot a weight of 1 so that summing the column yields a shot count.
df['shot_count'] = 1

# Encodings for the per-quarter chart, named up front for readability.
chart_6_x = alt.X('qtr:O', title='Quarter')
chart_6_y = alt.Y('sum(shot_count):Q', title='Number of Shots')  # shots summed per quarter
chart_6_color = alt.Color('made_shot:N', title='Shot Attempt')

# Bar chart of shots per quarter, coloured by shot outcome and filtered by the
# team chosen in the 'Team 1' dropdown.
chart_6 = (
    alt.Chart(df)
    .mark_bar()
    .encode(
        x=chart_6_x,
        y=chart_6_y,
        color=chart_6_color,
        tooltip=['sum(shot_count)', 'shooting_team'],
    )
    .properties(
        title='Shot Result per quarter-LA',
        width=300,
        height=300,
    )
    .add_params(selection1)        # attach the 'Team 1' dropdown
    .transform_filter(selection1)  # keep only the selected team's shots
    .interactive()                 # enable pan/zoom
)

chart_6

# How do the scores of two teams compare for a certain game?

To answer this question, I created two line charts of the score over the course of a game. They employ different strategies, each with minor shortcomings that I haven't yet been able to resolve completely. I'm presenting both because neither is entirely foolproof, owing to potential inconsistencies in the data (multiple score updates within a single minute); they represent different interpretations of how to handle those inconsistencies, and which approach is better may depend on the specifics of the data. Ideally, a more robust solution would pre-process the data so that each minute has only one, correctly updated score.

The first approach takes the first score recorded for each minute. The underlying assumption is that the first score entry for a given minute is representative of the score at that time; the first() aggregation in the groupby() operation reflects this. Its strength is that it is simple and relatively efficient: it avoids complex cumulative sum calculations. Its weakness is that if there are multiple score updates within a minute (e.g., due to quick successive scores), it may not reflect the actual score at the end of that minute, because it only captures the very first score of that minute.




In [None]:
# For every shot, record the name and the current score of the side selected by
# the 'Location' flag: the home columns when it is 'Home', the away columns otherwise.
is_home_shot = df['Location'] == 'Home'
df['team_name'] = np.where(is_home_shot, df['home_team_name'], df['away_team_name'])
df['team_score'] = np.where(is_home_shot, df['home_score'], df['away_score'])

# Order the rows so that, inside each (Game, team, Minute) group, the lowest
# team_score comes first.
df = df.sort_values(['Game', 'team_name', 'Minute', 'team_score'])

# One row per game / team / minute, keeping the first score of that group.
df_grouped = (
    df.groupby(['Game', 'team_name', 'Minute'])['team_score']
    .first()
    .reset_index()
)

# Encodings for the score-evolution line chart.
chart_7_x = alt.X('Minute', title='Minute', scale=alt.Scale(domain=[0,50]))
chart_7_y = alt.Y('team_score:Q', title='Score', axis=alt.Axis(labelFontSize=14))
chart_7_color = alt.Color('team_name:N', scale=alt.Scale(range=['black', 'red']))

# Line chart (with point markers) of each team's score over the minutes of the
# game chosen in the 'Game' dropdown.
chart_7 = (
    alt.Chart(df_grouped)
    .mark_line(point=True)
    .encode(
        x=chart_7_x,
        y=chart_7_y,
        color=chart_7_color,
        tooltip=['team_name:N', 'Minute:Q', 'team_score:Q'],
    )
    .properties(
        title='Evolution of Score per Team',
        width=400,
        height=400,
    )
    .add_params(game_selection)        # attach the 'Game' dropdown
    .transform_filter(game_selection)  # keep only the selected game
    .interactive()                     # enable pan/zoom
)

chart_7

The second approach takes the maximum score recorded in each minute. The assumption is that if there are multiple entries within a minute, the largest value is likely the most up-to-date score. Its strength is that it may be slightly more accurate than taking the first score when score increments are recorded frequently. Its weakness is that the max() aggregation is not a perfectly reliable way to get an accurate score: it will not correctly show a decrease in score if, for example, a score was reversed, and it may still not represent the cumulative score at the end of each minute if scores are updated frequently.

In [None]:
# Order the shots by game, minute and score. max() below does not depend on row
# order; the sort is retained so `df` is left in the same order as before.
df = df.sort_values(['Game', 'Minute', 'home_score', 'away_score'])

# Highest home and away score seen in each minute of each game. This treats
# several entries within the same minute as successive score increments.
df_agg = (
    df.groupby(['Game', 'home_team_name', 'away_team_name', 'Minute'])[['home_score', 'away_score']]
    .max()
    .reset_index()
)

# Reshape to long format for Altair: one row per (Game, team, Minute) with that
# team's score. This is done with column selection + concat rather than a
# Python loop over rows.
home_rows = df_agg[['Game', 'home_team_name', 'Minute', 'home_score']].rename(
    columns={'home_team_name': 'team', 'home_score': 'score'}
)
away_rows = df_agg[['Game', 'away_team_name', 'Minute', 'away_score']].rename(
    columns={'away_team_name': 'team', 'away_score': 'score'}
)
# A stable sort on the shared index interleaves the rows as home, away, home,
# away, ... for consecutive minutes.
df_team = (
    pd.concat([home_rows, away_rows])
    .sort_index(kind='stable')
    .reset_index(drop=True)
)


# Shared encodings for the score-evolution chart.
base = alt.Chart(df_team).encode(
    x='Minute:O',
    y=alt.Y('score:Q', title='Score'),
    color=alt.Color('team:N', title='Team'),
    tooltip=['Game', 'team', 'Minute', 'score']
)

line = base.mark_line(point=True)  # line mark with a point at every data row

# Line chart of both teams' scores for the game chosen in the 'Game' dropdown.
chart_8 = line.properties(
    title='Score Evolution by Team',
    width=400,
    height=400
).add_params(  # attach the 'Game' dropdown
    game_selection
).transform_filter(  # keep only the selected game
    game_selection
).interactive()  # enable pan/zoom

chart_8

I had to create two separate dashboards because there was a problem with interfering dropdowns that I couldn't resolve. This approach allows all the charts to be displayed correctly.

In [None]:
# First dashboard: the two shot-distribution charts side by side on top, the
# team comparison and per-quarter charts side by side below.
row1 = chart_1 | chart_2  # `|` places charts next to each other

row3 = alt.hconcat(chart_3, chart_6, spacing=20)

dashboard = row1 & row3  # `&` stacks the rows vertically

dashboard

In [None]:
# Second dashboard: the home/away charts side by side on top, the two
# score-evolution charts side by side below.
row2 = chart_4 | chart_5  # `|` places charts next to each other
row4 = chart_7 | chart_8

dashboard = row2 & row4  # `&` stacks the rows vertically

dashboard