In [1]:
# Libraries to help with reading and manipulating data
import numpy as np
import pandas as pd
import requests
from pathlib import Path

# Libraries to help with data visualization
import matplotlib.pyplot as plt
from scipy.stats import linregress

In [6]:
# Read in Google-query data files
google_query_path = "data/Google_Bigquery_Top_Search_Jan_Jun_2024.csv"
google_query_data = pd.read_csv(google_query_path)

# Read the CSV into a Pandas DataFrame
google_queryDF = pd.DataFrame(google_query_data)

# Convert 'week' column to datetime format
google_queryDF['week'] = pd.to_datetime(google_queryDF['week'])

# Group by month, term. and DMA and aggregate the data
monthly_data = google_queryDF.groupby(['term', 'dma_name', google_queryDF['week'].dt.to_period('M')]).agg({
    'score': 'mean',
    'rank': 'mean',
    'refresh_date': 'max',  # Get the latest refresh_date in the month
    'dma_id': 'first',       # Get the first dma_id in the month
  }).reset_index()

monthly_data.rename(columns={'week': 'month'}, inplace=True)

# Display the monthly data
print(monthly_data)
monthly_data.tail()

#Print out a sample case (searches for "Celtics" in SLC DMA) of aggregating search terms by month 
filtered_df = google_queryDF[(google_queryDF['term'] == 'Celtics') & (google_queryDF['dma_name'] == 'Salt Lake City UT')]
print(filtered_df)

month_filtered_df = monthly_data[(monthly_data['term'] == 'Celtics') & (monthly_data['dma_name'] == 'Salt Lake City UT')]
month_filtered_df

                       term               dma_name    month  score  rank  \
0              Aaron Gordon  Abilene-Sweetwater TX  2024-01    NaN  10.0   
1              Aaron Gordon  Abilene-Sweetwater TX  2024-02    NaN  10.0   
2              Aaron Gordon  Abilene-Sweetwater TX  2024-03    NaN  10.0   
3              Aaron Gordon  Abilene-Sweetwater TX  2024-04    NaN  10.0   
4              Aaron Gordon  Abilene-Sweetwater TX  2024-05    NaN  10.0   
...                     ...                    ...      ...    ...   ...   
50669  When is Father's Day          Zanesville OH  2024-01    NaN  14.0   
50670  When is Father's Day          Zanesville OH  2024-02    NaN  14.0   
50671  When is Father's Day          Zanesville OH  2024-03    NaN  14.0   
50672  When is Father's Day          Zanesville OH  2024-04    NaN  14.0   
50673  When is Father's Day          Zanesville OH  2024-05    NaN  14.0   

      refresh_date  dma_id  
0       2024-05-13     662  
1       2024-05-13     662  


Unnamed: 0,term,dma_name,month,score,rank,refresh_date,dma_id
9344,Celtics,Salt Lake City UT,2024-01,8.25,24.0,2024-05-13,770
9345,Celtics,Salt Lake City UT,2024-02,6.571429,24.0,2024-05-13,770
9346,Celtics,Salt Lake City UT,2024-03,6.833333,24.0,2024-05-13,770
9347,Celtics,Salt Lake City UT,2024-04,14.5,24.0,2024-05-13,770
9348,Celtics,Salt Lake City UT,2024-05,17.0,24.0,2024-05-11,770


In [7]:
# Create a dictionary to associate Teams with Terms
team_terms = {
    'Boston Celtics': ['Celtics'],
    'Dallas Mavericks': ['Mavericks'],
    'Denver Nuggets': ['Aaron Gordon', 'Nikola Jokic', 'Nuggets vs Timberwolves'],
    'Minnesota Timberwolves': ['Mike Conley', 'Nuggets vs Timberwolves', 'Naz Reid'],
    'New York Knicks': ['Knicks', 'Donte DiVincenzo']
}

# Filter seach dataset of 'Terms' based on the 'Teams' using list comprehension:
# all_terms = ['Term1', 'Term2', 'Term3', 'Term4', 'Term5', 'Term6']

# Create an empty list to store the filtered DataFrames with the 'Team' value
filtered_dfs = []

# # Iterate through each team in team_terms and filter the DataFrame based on each team's terms
for team, terms in team_terms.items():
    filtered_df = google_queryDF[google_queryDF['term'].apply(lambda x: x in terms)].copy()
    filtered_df['Team'] = team  # Add the 'Team' value to the filtered DataFrame
    filtered_dfs.append(filtered_df)

# Concatenate all filtered DataFrames into a single DataFrame
filtered_combined_df = pd.concat(filtered_dfs)

# Access and print the combined filtered DataFrame
print("Combined Filtered DataFrame:")
print(filtered_combined_df[filtered_combined_df['Team']=="Minnesota Timberwolves"])

print(filtered_combined_df["week"].value_counts())
print(filtered_combined_df["week"].nunique())

#This approach provides a flexible and efficient way to associate terms with teams and filter a large dataset based on these associations.

Combined Filtered DataFrame:
            week  score  rank refresh_date               dma_name  dma_id  \
38148 2024-01-28    NaN    24   2024-05-15     Portland-Auburn ME     500   
38149 2024-02-04    NaN    24   2024-05-15     Portland-Auburn ME     500   
38150 2024-03-03    NaN    24   2024-05-15     Portland-Auburn ME     500   
38151 2024-03-10    NaN    24   2024-05-15     Portland-Auburn ME     500   
38152 2024-03-17    NaN    24   2024-05-15     Portland-Auburn ME     500   
...          ...    ...   ...          ...                    ...     ...   
86377 2024-02-11    NaN     3   2024-05-11  Abilene-Sweetwater TX     662   
86378 2024-03-24    NaN     3   2024-05-11  Abilene-Sweetwater TX     662   
86379 2024-03-31    NaN     3   2024-05-11  Abilene-Sweetwater TX     662   
86380 2024-04-07    NaN     3   2024-05-11  Abilene-Sweetwater TX     662   
86381 2024-05-05   19.0     3   2024-05-11  Abilene-Sweetwater TX     662   

                          term                

In [8]:
#Output the file containing the search terms filtered by NBA teams
import csv

# Define the output file path
output_file = "data/terms_by_teams.csv"

# Define the header for the CSV file
header = ["Team", "Week", "Term","Rank","dma_name","dma_id","score","refresh_date"]

# Open the output file and write the header
with open(output_file, "w", newline='') as datafile:
    writer = csv.writer(datafile)
    writer.writerow(header)

    # Iterate through the rows of the DataFrame and write each row to the CSV file
    for index, row in filtered_combined_df.iterrows():
        Team = row["Team"]
        Week = row["week"]
        term = row["term"]
        rank = row["rank"]
        dma_name = row["dma_name"]
        dma_id = row["dma_id"]
        score = row["score"]
        refresh_date = row["refresh_date"]
        data_row = [Team, Week, term, score, rank, refresh_date,dma_name,dma_id]
        writer.writerow(data_row)


In [12]:
# Study data files
NBA_2024_searches = "data/terms_by_teams.csv"
NBA_2024_records = "data/teams_playoffs.csv"


NBA_query_data = pd.read_csv(NBA_2024_searches)
NBA_records_data = pd.read_csv(NBA_2024_records)

# Read the CSV into a Pandas DataFrame
NBA_queryDF = pd.DataFrame(NBA_query_data)
NBA_recordsDF = pd.DataFrame(NBA_records_data)

# Display the data table for preview
print(NBA_queryDF)
#print(NBA_recordsDF)

#Merge the two DataFrames NBA_queryDF and NBA_recordsDF on the fields "Team" and "Week"
NBA_analysis_df = pd.merge(NBA_queryDF, NBA_recordsDF, on=['Team', 'Week'])

# Display the data table for preview
NBA_analysis_df

                  Team                 Week     Term  Rank  dma_name  \
0       Boston Celtics  2024-01-14 00:00:00  Celtics  26.0        24   
1       Boston Celtics  2024-01-21 00:00:00  Celtics  25.0        24   
2       Boston Celtics  2024-02-25 00:00:00  Celtics  23.0        24   
3       Boston Celtics  2024-03-03 00:00:00  Celtics  27.0        24   
4       Boston Celtics  2024-03-17 00:00:00  Celtics  25.0        24   
...                ...                  ...      ...   ...       ...   
23557  New York Knicks  2024-03-17 00:00:00   Knicks   NaN         4   
23558  New York Knicks  2024-04-07 00:00:00   Knicks   NaN         4   
23559  New York Knicks  2024-04-14 00:00:00   Knicks   NaN         4   
23560  New York Knicks  2024-04-21 00:00:00   Knicks   NaN         4   
23561  New York Knicks  2024-05-05 00:00:00   Knicks  56.0         4   

           dma_id                  score  refresh_date  
0      2024-05-11     Portland-Auburn ME           500  
1      2024-05-11    

Unnamed: 0,Team,Week,Term,Rank,dma_name,dma_id,score,refresh_date,PW_win_pct,Weekly_win_pct,Cum_win_pct,Round
0,Boston Celtics,2024-01-14 00:00:00,Celtics,26.0,24,2024-05-11,Portland-Auburn ME,500,0.667,0.50,0.769,
1,Boston Celtics,2024-01-21 00:00:00,Celtics,25.0,24,2024-05-11,Portland-Auburn ME,500,0.500,0.75,0.767,
2,Boston Celtics,2024-02-25 00:00:00,Celtics,23.0,24,2024-05-11,Portland-Auburn ME,500,1.000,1.00,0.789,
3,Boston Celtics,2024-03-03 00:00:00,Celtics,27.0,24,2024-05-11,Portland-Auburn ME,500,1.000,1.00,0.800,
4,Boston Celtics,2024-03-17 00:00:00,Celtics,25.0,24,2024-05-11,Portland-Auburn ME,500,0.333,1.00,0.791,
...,...,...,...,...,...,...,...,...,...,...,...,...
23557,New York Knicks,2024-03-17 00:00:00,Knicks,,4,2024-05-11,Abilene-Sweetwater TX,662,0.333,1.00,0.597,
23558,New York Knicks,2024-04-07 00:00:00,Knicks,,4,2024-05-11,Abilene-Sweetwater TX,662,0.500,0.50,0.590,
23559,New York Knicks,2024-04-14 00:00:00,Knicks,,4,2024-05-11,Abilene-Sweetwater TX,662,0.500,1.00,0.610,
23560,New York Knicks,2024-04-21 00:00:00,Knicks,,4,2024-05-11,Abilene-Sweetwater TX,662,,,,1.0


In [13]:
correlation = NBA_analysis_df['Round'].corr(NBA_analysis_df['Rank'])
print(f"Pearson correlation coefficient between 'Round' and 'Rank': {correlation}")

correlation = NBA_analysis_df['Cum_win_pct'].corr(NBA_analysis_df['Rank'])
print(f"Pearson correlation coefficient between 'Cum_win_pct' and 'Rank': {correlation}")

Pearson correlation coefficient between 'Round' and 'Rank': 0.127711418878646
Pearson correlation coefficient between 'Cum_win_pct' and 'Rank': -0.3881474294044364


In [28]:
#Import the necessary libraries:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error


# Handle missing values in the dataset:
# Check for missing values in the dataset
print(NBA_analysis_df.isnull().sum())

# Handle missing values by dropping rows with NaN values
NBA_analysis_df['Rank'].fillna(0, inplace=True)
NBA_analysis_df['Round'].fillna(0, inplace=True)

#Define the independent variable 'Rank' and the dependent variable 'Cum_win_pct':
X = NBA_analysis_df[['Round']]  # Independent variable (Rank)
y = NBA_analysis_df['Rank']  # Dependent variable (Cumulative Win Percentage)

#Split the data into training and testing sets:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#Fit a linear regression model to the training data:
model = LinearRegression()
model.fit(X_train, y_train)

#Make predictions using the model:
y_pred = model.predict(X_test)

#Evaluate the model performance:
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse}")

#Interpret the model coefficients:
print("Intercept:", model.intercept_)
print("Coefficient:", model.coef_[0])

#Predict how changes in search rankings may impact what round a playoff team reaches:
# Example prediction for a specific team ranking
playoff_round = 4
predicted_rank = model.predict([[playoff_round]])

print(f"Predicted search rank for playoff round {playoff_round} (NBA championship): {predicted_rank[0]}")
#By following these steps and running the provided Python code, you can conduct regression analysis to model the relationship between 'Rank' and 'Cum_win_pct' in your DataFrame and predict how changes in team rankings may impact their cumulative win percentages.

Team                 0
Week                 0
Term                 0
Rank                 0
dma_name             0
dma_id               0
score                0
refresh_date         0
PW_win_pct        4581
Weekly_win_pct    4581
Cum_win_pct       4581
Round                0
dtype: int64
Mean Squared Error: 298.100792110836
Intercept: 5.464241709402251
Coefficient: 23.18737102104605
Predicted search rank for playoff round 4 (NBA championship): 98.21372579358645


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  NBA_analysis_df['Round'].fillna(0, inplace=True)
