In [9]:
# Libraries to help with reading and manipulating data
import numpy as np
import pandas as pd
import requests
from pathlib import Path

# Libraries to help with data visualization
import matplotlib.pyplot as plt
from scipy.stats import linregress

In [10]:
# Game-by-game records for the 5 NBA teams that appear in the Google search results
searched_teamWL_path = "data/searched_teams_results.csv"
team_record_data = pd.read_csv(searched_teamWL_path)

# Wrap the parsed CSV in the DataFrame the rest of this cell works on
team_recordsDF = pd.DataFrame(team_record_data)

# Parse the "Date" strings into datetimes
team_recordsDF["Date"] = pd.to_datetime(team_recordsDF["Date"], format="%a, %b %d, %Y")

# Label every game with the Sunday that closes its week (dayofweek: Monday=0 ... Sunday=6),
# so the records can later be lined up with the weekly Google search data
days_until_sunday = 6 - team_recordsDF['Date'].dt.dayofweek
team_recordsDF['Week'] = team_recordsDF['Date'] + pd.to_timedelta(days_until_sunday, unit='D')

# Placeholder columns for the running weekly win percentage and the previous
# week's win percentage; they start at 0.0 and are populated further down
for placeholder in ('Weekly Win%', 'PW_win_pct'):
    team_recordsDF[placeholder] = 0.0

# Columns that are not needed for the weekly win-percentage analysis
unused_columns = [
    'G', 'Start (ET)', 'Unnamed: 4', 'Unnamed: 5', 'Unnamed: 6', 'Opponent',
    'Unnamed: 9', 'Tm', 'Opp', 'Streak', 'Notes',
]
team_recordsDF.drop(columns=unused_columns, inplace=True)

def compute_win_percentages(records):
    """Compute running win percentages for every game of every team.

    Parameters
    ----------
    records : pandas.DataFrame
        Game log with 'TEAM', 'Date', 'Week' and 'game_result' columns.
        A 'game_result' of 'W' counts as a win; any other value does not.

    Returns
    -------
    pandas.DataFrame
        One row per row of ``records`` (same index labels, ordered by team,
        week and date) with three columns:

        * 'Weekly Win%' - wins / games for the team within the row's week,
          up to and including that game.
        * 'PW_win_pct'  - the team's final 'Weekly Win%' for the previous week
          in which it played; NaN throughout the team's first week, because
          there is no earlier week to report.
        * 'Cum Win%'    - wins / games for the team over all games up to and
          including that game.
    """
    ordered = records.sort_values(by=['TEAM', 'Week', 'Date'])
    team_key = ordered['TEAM']
    week_key = [ordered['TEAM'], ordered['Week']]
    won = ordered['game_result'].eq('W').astype(int)

    # Cumulative record: running wins over running games played, per team
    season_games = won.groupby(team_key)
    cum_pct = season_games.cumsum() / (season_games.cumcount() + 1)

    # Weekly record: the same running ratio, restarted at every (team, week)
    week_games = won.groupby(week_key)
    games_into_week = week_games.cumcount()
    weekly_pct = week_games.cumsum() / (games_into_week + 1)

    # On the first game of a week, the team's preceding game closed the
    # previous week, so its running weekly percentage is that week's final
    # value. Keep it only on those week-opening rows, then copy it to every
    # other game of the same week.
    pct_after_prior_game = weekly_pct.groupby(team_key).shift(1)
    prev_week_pct = (
        pct_after_prior_game
        .where(games_into_week.eq(0))
        .groupby(week_key)
        .transform('first')
    )

    return pd.DataFrame({
        'Weekly Win%': weekly_pct,
        'PW_win_pct': prev_week_pct,
        'Cum Win%': cum_pct,
    })


# Write the computed columns back onto the game log. Assignment aligns on the
# index labels, so team_recordsDF keeps its current row order (this relies on
# the index labels being unique).
win_pct_columns = compute_win_percentages(team_recordsDF)
for column in ['Weekly Win%', 'PW_win_pct', 'Cum Win%']:
    team_recordsDF[column] = win_pct_columns[column]

# For every (TEAM, Week) pair, find the index label of the row with the latest game date
latest_dates_idx = team_recordsDF.groupby(['TEAM', 'Week'])['Date'].idxmax()

# Keep only those rows (each holds the complete winning percentage for its week)
# and remove the per-game columns that are no longer needed.
# Only columns are dropped here: passing index=1 to drop() would delete the row
# labelled 1 (or raise KeyError when that label is absent) instead of selecting
# the column axis. drop() returns a new frame, so team_recordsDF is left untouched.
latest_records = (
    team_recordsDF.loc[latest_dates_idx]
    .drop(columns=['Date', 'game_result', 'W', 'L'])
)

# Discard the 2023 weeks to better match the search data
latest_records = latest_records[latest_records['Week'].dt.year == 2024]

# Display the first records for each 'TEAM' and 'Week'
latest_records.head(25)

Unnamed: 0,TEAM,Week,Weekly Win%,PW_win_pct,Cum Win%
34,Boston Celtics,2024-01-07,0.666667,1.0,0.8
38,Boston Celtics,2024-01-14,0.5,0.666667,0.769231
42,Boston Celtics,2024-01-21,0.75,0.5,0.767442
45,Boston Celtics,2024-01-28,0.666667,0.75,0.76087
49,Boston Celtics,2024-02-04,0.75,0.666667,0.76
52,Boston Celtics,2024-02-11,1.0,0.75,0.773585
54,Boston Celtics,2024-02-18,1.0,1.0,0.781818
56,Boston Celtics,2024-02-25,1.0,1.0,0.789474
59,Boston Celtics,2024-03-03,1.0,1.0,0.8
62,Boston Celtics,2024-03-10,0.333333,1.0,0.777778


In [13]:
import csv

# Destination for the weekly per-team statistics
output_file = "data/team_weekly_stats.csv"

# Column names written as the first line of the CSV file
header = ["TEAM", "Week", "PW_win_pct","Weekly_Win%", "Cum_Win%"]

# Write the header, then one line per (team, week) record
with open(output_file, "w", newline='') as datafile:
    writer = csv.writer(datafile)
    writer.writerow(header)

    for _, record in latest_records.iterrows():
        # The three percentages are written with three decimal places,
        # in the same order as the header
        percentages = [
            f"{record[column]:.3f}"
            for column in ("PW_win_pct", "Weekly Win%", "Cum Win%")
        ]
        writer.writerow([record["TEAM"], record["Week"], *percentages])