In [1]:
# Libraries to help with reading and manipulating data
import numpy as np
import pandas as pd
import requests
from pathlib import Path

# Libraries to help with data visualization
import matplotlib.pyplot as plt
from scipy.stats import linregress

In [2]:
# Study data files
NBA_2024_searches = "data/terms_by_teams.csv"
NBA_2024_records = "../ucb-data-analytics-project1-group5/data/teams_playoffs.csv"


NBA_query_data = pd.read_csv(NBA_2024_searches)
NBA_records_data = pd.read_csv(NBA_2024_records)

# Read the CSV into a Pandas DataFrame
NBA_queryDF = pd.DataFrame(NBA_query_data)
NBA_recordsDF = pd.DataFrame(NBA_records_data)

# Display the data table for preview
print(NBA_queryDF)
#print(NBA_recordsDF)

#Merge the two DataFrames NBA_queryDF and NBA_recordsDF on the fields "Team" and "Week"
NBA_analysis_df = pd.merge(NBA_queryDF, NBA_recordsDF, on=['Team', 'Week'])

# Display the data table for preview
NBA_analysis_df

                  Team                 Week     Term  Rank  dma_name  \
0       Boston Celtics  2024-01-14 00:00:00  Celtics  26.0        24   
1       Boston Celtics  2024-01-21 00:00:00  Celtics  25.0        24   
2       Boston Celtics  2024-02-25 00:00:00  Celtics  23.0        24   
3       Boston Celtics  2024-03-03 00:00:00  Celtics  27.0        24   
4       Boston Celtics  2024-03-17 00:00:00  Celtics  25.0        24   
...                ...                  ...      ...   ...       ...   
23557  New York Knicks  2024-03-17 00:00:00   Knicks   NaN         4   
23558  New York Knicks  2024-04-07 00:00:00   Knicks   NaN         4   
23559  New York Knicks  2024-04-14 00:00:00   Knicks   NaN         4   
23560  New York Knicks  2024-04-21 00:00:00   Knicks   NaN         4   
23561  New York Knicks  2024-05-05 00:00:00   Knicks  56.0         4   

           dma_id                  score  refresh_date  
0      2024-05-11     Portland-Auburn ME           500  
1      2024-05-11    

Unnamed: 0,Team,Week,Term,Rank,dma_name,dma_id,score,refresh_date,PW_win_pct,Weekly_win_pct,Cum_win_pct,Round
0,Boston Celtics,2024-01-14 00:00:00,Celtics,26.0,24,2024-05-11,Portland-Auburn ME,500,0.667,0.50,0.769,
1,Boston Celtics,2024-01-21 00:00:00,Celtics,25.0,24,2024-05-11,Portland-Auburn ME,500,0.500,0.75,0.767,
2,Boston Celtics,2024-02-25 00:00:00,Celtics,23.0,24,2024-05-11,Portland-Auburn ME,500,1.000,1.00,0.789,
3,Boston Celtics,2024-03-03 00:00:00,Celtics,27.0,24,2024-05-11,Portland-Auburn ME,500,1.000,1.00,0.800,
4,Boston Celtics,2024-03-17 00:00:00,Celtics,25.0,24,2024-05-11,Portland-Auburn ME,500,0.333,1.00,0.791,
...,...,...,...,...,...,...,...,...,...,...,...,...
23557,New York Knicks,2024-03-17 00:00:00,Knicks,,4,2024-05-11,Abilene-Sweetwater TX,662,0.333,1.00,0.597,
23558,New York Knicks,2024-04-07 00:00:00,Knicks,,4,2024-05-11,Abilene-Sweetwater TX,662,0.500,0.50,0.590,
23559,New York Knicks,2024-04-14 00:00:00,Knicks,,4,2024-05-11,Abilene-Sweetwater TX,662,0.500,1.00,0.610,
23560,New York Knicks,2024-04-21 00:00:00,Knicks,,4,2024-05-11,Abilene-Sweetwater TX,662,,,,1.0


In [21]:
correlation = NBA_analysis_df['Round'].corr(NBA_analysis_df['Rank'])
print(f"Pearson correlation coefficient between 'Round' and 'Rank': {correlation}")

correlation = NBA_analysis_df['Cum_win_pct'].corr(NBA_analysis_df['Rank'])
print(f"Pearson correlation coefficient between 'Cum_win_pct' and 'Rank': {correlation}")

Pearson correlation coefficient between 'Round' and 'Rank': 0.127711418878646
Pearson correlation coefficient between 'Cum_win_pct' and 'Rank': -0.3881474294044364


In [28]:
#Import the necessary libraries:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error


# Handle missing values in the dataset:
# Check for missing values in the dataset
print(NBA_analysis_df.isnull().sum())

# Handle missing values by dropping rows with NaN values
NBA_analysis_df['Rank'].fillna(0, inplace=True)
NBA_analysis_df['Round'].fillna(0, inplace=True)

#Define the independent variable 'Rank' and the dependent variable 'Cum_win_pct':
X = NBA_analysis_df[['Round']]  # Independent variable (Rank)
y = NBA_analysis_df['Rank']  # Dependent variable (Cumulative Win Percentage)

#Split the data into training and testing sets:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#Fit a linear regression model to the training data:
model = LinearRegression()
model.fit(X_train, y_train)

#Make predictions using the model:
y_pred = model.predict(X_test)

#Evaluate the model performance:
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse}")

#Interpret the model coefficients:
print("Intercept:", model.intercept_)
print("Coefficient:", model.coef_[0])

#Predict how changes in search rankings may impact what round a playoff team reaches:
# Example prediction for a specific team ranking
playoff_round = 4
predicted_rank = model.predict([[playoff_round]])

print(f"Predicted search rank for playoff round {playoff_round} (NBA championship): {predicted_rank[0]}")
#By following these steps and running the provided Python code, you can conduct regression analysis to model the relationship between 'Rank' and 'Cum_win_pct' in your DataFrame and predict how changes in team rankings may impact their cumulative win percentages.

Team                 0
Week                 0
Term                 0
Rank                 0
dma_name             0
dma_id               0
score                0
refresh_date         0
PW_win_pct        4581
Weekly_win_pct    4581
Cum_win_pct       4581
Round                0
dtype: int64
Mean Squared Error: 298.100792110836
Intercept: 5.464241709402251
Coefficient: 23.18737102104605
Predicted search rank for playoff round 4 (NBA championship): 98.21372579358645


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  NBA_analysis_df['Round'].fillna(0, inplace=True)
