# 🏀 NBA Player Statistics Analysis and Score Prediction
In this project, we aim to predict the points an NBA player will score in the next game using historical data (Assists, Rebounds, Minutes Played, etc.).

**Data Set:** 2024-2025 Season Player Statistics
**Method:**  Regression Analysis & Machine Learning

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
from scipy.stats import shapiro, normaltest, kurtosis, skew
from scipy import stats as ss
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LinearRegression 
from sklearn.metrics import r2_score, mean_squared_error
import warnings
import importlib 

In [None]:
# Load the raw rows. Every statement below needs `df_raw`, so a missing file
# is reported and then re-raised; swallowing it would only defer the failure
# to a less helpful NameError on the next use of `df_raw`.
DATA_PATH = 'data/data.csv'

try:
    df_raw = pd.read_csv(DATA_PATH)
except FileNotFoundError:
    print(f"Error: '{DATA_PATH}' is not found.")
    raise
else:
    print(f"Dataset loaded successfully. Total rows: {df_raw.shape[0]}.")

# Columns that are averaged per player vs. columns where the first value
# seen for the player is kept.
numeric_cols = ['MP', 'FG', 'FGA', '3P', '3PA', 'FT', 'FTA',
                'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'GmSc']
categorical_cols = ['Tm']

# Collapse the raw rows to one row per player.
df_avg_numeric = df_raw.groupby('Player')[numeric_cols].mean()
df_avg_categorical = df_raw.groupby('Player')[categorical_cols].first()
df_avg = pd.concat([df_avg_categorical, df_avg_numeric], axis=1).reset_index()

# Shooting percentages computed from the averaged makes / attempts.
# A player with no attempts yields 0/0 = NaN, which is reported as 0.
df_avg['FG%'] = (df_avg['FG'] / df_avg['FGA']).fillna(0)
df_avg['3P%'] = (df_avg['3P'] / df_avg['3PA']).fillna(0)
df_avg['FT%'] = (df_avg['FT'] / df_avg['FTA']).fillna(0)

# Only columns that actually change name are listed; '3PA' and '3P%' keep
# their original names.
column_mapping = {
    'Player': 'Name',
    'Tm': 'Team',
    '3P': 'ThreeP',
}
df = df_avg.rename(columns=column_mapping)

# Feature matrix and regression target used by the cells that follow.
X = df[["MP", "FG", "FGA", "FG%", "ThreeP", "3PA", "3P%",
        "FT", "FTA", "FT%", "ORB", "TRB", "DRB",
        "AST", "STL", "BLK", "TOV", "PF"]]
y = df["PTS"]

df.head()

## Statistical Analysis and Normality Tests
The normality of the data distribution was analyzed using Shapiro-Wilk and D'Agostino tests.

In [None]:
# Normality diagnostics for the per-player average field goals (FG).
fg_values = X['FG']

# Run both tests named in the section text: D'Agostino-Pearson (normaltest)
# and Shapiro-Wilk. Each returns (statistic, p-value).
stat_dagostino, p_dagostino = normaltest(fg_values)
stat_shapiro, p_shapiro = shapiro(fg_values)

# scipy's kurtosis() defaults to the Fisher definition (normal == 0.0).
print(f"Field Goals (FG) Skewness: {skew(fg_values):.2f}")
print(f"Field Goals (FG) Kurtosis: {kurtosis(fg_values):.2f}")
print(f"D'Agostino Test p-value: {p_dagostino:.20f}")
print(f"Shapiro-Wilk Test p-value: {p_shapiro:.20f}")

# Histogram with a KDE overlay to eyeball the shape the tests describe.
plt.figure(figsize=(10, 5))
sns.histplot(fg_values, kde=True, color='skyblue')
plt.title("Distribution of Field Goals (FG)")
plt.xlabel("FG Values")
plt.show()

In [None]:
# Pairwise correlations between all model features, drawn as an annotated
# heatmap on an explicitly created axes.
feature_corr = X.corr()

fig, ax = plt.subplots(figsize=(16, 10))
sns.heatmap(
    feature_corr,
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    linewidths=0.5,
    ax=ax,
)
ax.set_title('Correlation Heatmap of Features')
plt.show()

## Machine Learning Model (Linear Regression)
The dataset was split into 80% training and 20% testing sets, and the model was trained.

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=10)

# Fit a linear regression on every feature and predict the held-out rows.
# NOTE(review): X includes FG, ThreeP and FT. If PTS in the source data is
# computed from made shots (presumably 2*FG + 3P + FT), the target is an exact
# linear function of the features and the scores below are near-perfect by
# construction - confirm before reading them as predictive skill.
regressor = LinearRegression()
regressor.fit(X_train, y_train)
y_pred = regressor.predict(X_test)

r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

print(f"Model R² Score: {r2:.4f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.20f}")

# Actual vs. predicted scatter. The dashed red line is y = x over the full
# range of the target, i.e. where a perfect prediction would fall.
plt.figure(figsize=(10, 6))
plt.scatter(y_test, y_pred, alpha=0.5, color="blue")
plt.plot([y.min(), y.max()], [y.min(), y.max()], 'r--', lw=2)
plt.xlabel("Actual Points")
plt.ylabel("Predicted Points")
plt.title('Actual vs Predicted Points')
plt.show()

In [None]:

# Reduced model: regress points on made field goals, made threes and made
# free throws only, with an explicit intercept column.
X_opt_df = df.loc[:, ['FG', 'ThreeP', 'FT']]
X_opt_with_const = sm.add_constant(X_opt_df)

# Build the model first, then fit, so the two steps read separately.
ols_model = sm.OLS(y, X_opt_with_const)
regressor_OLS = ols_model.fit()

print(regressor_OLS.summary())

In [None]:

# One hypothetical stat line fed to the fitted OLS model. The values must be
# in the same order as the columns of the design matrix the model was fit on
# (intercept term first).
example_data = [1.0, 6.0, 1.0, 3.0]
prediction = regressor_OLS.predict(example_data)
predicted_pts = prediction[0]
print(f"Predicted Points: {predicted_pts:.2f}")

print("\n--- Players with Similar Performance ---")
# Keep players whose average points are within half a point of the prediction.
pts_gap = (df["PTS"] - predicted_pts).abs()
similar_players = df[pts_gap <= 0.5]
similar_players[['Name', 'Team', 'PTS']].head()

In [None]:

# Maximum absolute difference (in points) between a player's average and the
# model prediction for the player to count as "similar".
similarity_threshold = 0.5

similar_players = df[(df["PTS"] - prediction[0]).abs() <= similarity_threshold]

print(f"\n--- Real Players Within ±{similarity_threshold} Points of Prediction ---")

if similar_players.empty:
    print("No players found close to the prediction.")
else:
    # Show at most five matches together with the stats the OLS model uses.
    display_table = similar_players[['Name', 'Team', 'PTS', 'FG', 'ThreeP', 'FT']].head(5)

    # Imported here so the cell only needs IPython when there is a table to show.
    from IPython.display import display
    display(display_table)

In [None]:
print("\n--- TOP 12 PLAYERS OF THE 2024-2025 SEASON ---")

# Rank players by their average GmSc, highest first, and keep the first 12.
df_top_players = df.sort_values(by='GmSc', ascending=False)

top_12_team = df_top_players[['Name', 'Team', 'GmSc', 'PTS', 'AST', 'TRB']].head(12)

# Saving is best-effort: a filesystem problem (missing 'data' directory,
# permissions, ...) is reported but must not stop the table from being shown.
# 'utf-8-sig' writes a BOM at the start of the file.
try:
    top_12_team.to_csv('data/top_12_player.csv', index=False, encoding='utf-8-sig')
except OSError as e:
    print(f"Could not save 'data/top_12_player.csv': {e}")
else:
    print("✅ 'data/top_12_player.csv' file saved successfully.")

top_12_team