In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from scipy.optimize import minimize, LinearConstraint, Bounds

In [2]:
df=pd.read_csv('data.csv')
df.drop(['Unnamed: 0', 'Player_ID'], axis=1, inplace=True)

In [3]:
#Organize the dataframe into dictionary
df_multi = df.set_index(['PLAYER_NAME']).sort_index()

In [4]:
#Drop players with less than 15 minutes average playing time
avg_minutes = df_multi.groupby('PLAYER_NAME')['MIN_num'].mean()
eligible_players = avg_minutes[avg_minutes >= 15].index
df_min15 = df_multi.loc[eligible_players]
player_means=df_min15.groupby('PLAYER_NAME').mean()
stat_stds = df_min15.groupby('PLAYER_NAME').std()

In [5]:
#Normalize dataframe values

df_score = df_min15[['FG_PCT', 'FT_PCT', 'FG3M', 'REB', 'AST', 'STL', 'BLK', 'TOV', 'PTS']]
df_attempts = df_min15[['FGA', 'FTA']]

scaler = StandardScaler()
mm_scaler = MinMaxScaler()

scaled = scaler.fit_transform(df_score)
att_scaled = mm_scaler.fit_transform(df_attempts)

# Put back into DataFrame
scaled_df = pd.DataFrame(scaled, columns=df_score.columns, index=df_score.index)
att_scaled_df = pd.DataFrame(att_scaled, columns=df_attempts.columns, index=df_attempts.index)

# Combine back with string columns
scaled_score_df = pd.concat([att_scaled_df ,scaled_df], axis=1)

scaled_score_df["FG_PCT"] = scaled_score_df["FGA"] * scaled_score_df["FG_PCT"]
scaled_score_df["FT_PCT"] = scaled_score_df["FTA"] * scaled_score_df["FT_PCT"]
scaled_score_df.drop(['FGA','FTA'], axis=1, inplace=True)

perc_df = scaled_score_df[['FG_PCT', 'FT_PCT']]
perc_scaled = scaler.fit_transform(perc_df)
perc_scaled_df = pd.DataFrame(perc_scaled, columns=perc_df.columns, index=perc_df.index)
scaled_score_df[['FG_PCT', 'FT_PCT']] = perc_scaled_df


In [6]:
#Calculate overall variance for categories among all players
scaled_player_means=scaled_score_df.groupby('PLAYER_NAME').mean()
scaled_player_stds =scaled_score_df.groupby('PLAYER_NAME').std()
total_stds=scaled_player_stds.sum()

In [7]:
scaled_player_cov=scaled_score_df.groupby('PLAYER_NAME').cov()


stats_cols = ['PTS', 'REB', 'AST', 'STL', 'BLK', 'TOV', 'FG_PCT', 'FT_PCT', 'FG3M']
total_cov = pd.DataFrame(0.0, index=stats_cols, columns=stats_cols)
for name, group in scaled_player_cov.groupby('PLAYER_NAME'):
    #total_cov.values=total_cov.values+scaled_player_cov.loc['name'].values
    total_cov = total_cov.add(scaled_player_cov.loc[name], fill_value=0.0)
print (total_cov)  

cov_mat=total_cov.values

  baseCov = np.cov(mat.T)
  c *= np.true_divide(1, fact)
  c *= np.true_divide(1, fact)


               AST         BLK        FG3M      FG_PCT      FT_PCT  \
AST     184.739763   12.890881   23.232372    7.192503   16.032646   
BLK      12.890881  284.736357    9.969845    2.853753   12.305658   
FG3M     23.232372    9.969845  246.945068  121.927854   14.791476   
FG_PCT    7.192503    2.853753  121.927854  321.738168    9.137702   
FT_PCT   16.032646   12.305658   14.791476    9.137702  255.035034   
PTS      37.008680   21.096834  150.490816  158.639381   87.678715   
REB      42.805568   34.799763   29.386129   11.758720   24.842632   
STL      32.758607   12.872863   27.150181   18.047403   17.570389   
TOV      24.468446   14.248845   19.394669   12.809124   17.216476   

               PTS         REB         STL         TOV  
AST      37.008680   42.805568   32.758607   24.468446  
BLK      21.096834   34.799763   12.872863   14.248845  
FG3M    150.490816   29.386129   27.150181   19.394669  
FG_PCT  158.639381   11.758720   18.047403   12.809124  
FT_PCT   87.67

In [8]:
import numpy as np
from scipy.optimize import minimize

def minimize_quad_wrt_simplex(A, tol=1e-9):
    """
    Minimize w^T A w subject to sum(w) = 1 and w >= 0.

    Parameters
    ----------
    A : (n,n) ndarray
        Symmetric cost matrix.
    tol : float
        Tolerance for optimization termination.

    Returns
    -------
    w_opt : (n,) ndarray
        Optimal weights (sum to 1 and non-negative).
    res : OptimizeResult
        Full scipy result for diagnostics.
    """
    A = np.asarray(A, dtype=float)
    n = A.shape[0]
    if A.shape[0] != A.shape[1]:
        raise ValueError("A must be square")

    # objective and gradient (use gradient for faster convergence)
    def obj(w):
        return float(w @ (A @ w))

    def jac(w):
        # gradient: 2*A*w
        return 2.0 * (A @ w)

    # equality constraint: sum(w) - 1 == 0
    cons = {
        'type': 'eq',
        'fun': lambda w: np.sum(w) - 1.0,
        'jac': lambda w: np.ones(n)
    }

    # bounds: w_i >= 0
    bounds = [(0.0, None)] * n
    w0 = np.ones(n)/9

    # run SLSQP
    res = minimize(obj, w0, method='SLSQP', jac=jac,
                   bounds=bounds,
                   constraints=cons,
                   options={'ftol': tol, 'maxiter': 1000})
   
    w_opt = res.x

    if np.sum(w_opt) <= 0:
        raise RuntimeError("Optimization failed to produce positive weights")
    w_opt = w_opt / np.sum(w_opt)

    return w_opt, res

In [9]:
weights, res = minimize_quad_wrt_simplex(cov_mat)
print (weights)

[0.18183757 0.13809232 0.09414026 0.10136576 0.14968239 0.
 0.118063   0.09144886 0.12536985]


In [10]:
url = 'https://www.basketball-reference.com/leagues/NBA_2026_per_game.html'
tables = pd.read_html(url)

df = tables[0]

df = df[['Player', 'Pos', 'Team', 'Age', 'G', 'MP', 'FG', 'FGA', 'FG%', '3P',
          '3PA', 'FT', 'FTA', 'FT%', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PTS']]

# Drop partial rows for players traded mid-season
sorted_df = df.sort_values("Team")
df = sorted_df.drop_duplicates(subset="Player", keep='first')
df.reset_index(drop=True, inplace=True)

# TODO: Choose filtering method to have a smaller pool of relevant players
# Drop all players with less than 18 minutes played
df.drop(df[df['MP'] <= 15.0].index, inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [11]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler
df_score = df[['FG%', 'FT%', '3P', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PTS']]
df_attempts = df[['FGA', 'FTA']]

scaler = StandardScaler()
mm_scaler = MinMaxScaler()

scaled = scaler.fit_transform(df_score)
att_scaled = mm_scaler.fit_transform(df_attempts)

# Put back into DataFrame
scaled_df = pd.DataFrame(scaled, columns=df_score.columns, index=df_score.index)
att_scaled_df = pd.DataFrame(att_scaled, columns=df_attempts.columns, index=df_attempts.index)

# Combine back with string columns
scaled_score_df = pd.concat([df[['Player', 'Pos', 'Team']], att_scaled_df ,scaled_df], axis=1)

In [12]:
scaled_score_df["TOV"] *= -1
scaled_score_df["FG%"] = scaled_score_df["FGA"] * scaled_score_df["FG%"]
scaled_score_df["FT%"] = scaled_score_df["FTA"] * scaled_score_df["FT%"]
scaled_score_df.drop(['FGA','FTA'], axis=1, inplace=True)
perc_df = scaled_score_df[['FG%', 'FT%']]
perc_scaled = scaler.fit_transform(perc_df)
perc_scaled_df = pd.DataFrame(perc_scaled, columns=perc_df.columns, index=perc_df.index)
scaled_score_df[['FG%', 'FT%']] = perc_scaled_df

In [13]:
#scaled_score_df[['AST', 'BLK', '3P', 'FG%', 'FT%', 'PTS', 'TRB', 'STL', 'TOV']]=scaled_score_df[['AST', 'BLK', '3P', 'FG%', 'FT%', 'PTS', 'TRB', 'STL', 'TOV']].mul(weights, axis=1)
scaled_score_df[['AST', 'BLK', '3P', 'FG%', 'FT%', 'PTS', 'TRB', 'STL', 'TOV']]=scaled_score_df[['AST', 'BLK', '3P', 'FG%', 'FT%', 'PTS', 'TRB', 'STL', 'TOV']]*weights.T
print (weights)

[0.18183757 0.13809232 0.09414026 0.10136576 0.14968239 0.
 0.118063   0.09144886 0.12536985]


In [14]:
df_basic_rank = scaled_score_df
df_reordered = df.iloc[:, [0, 2, 1]]
df_basic_rank["Val"] = df_basic_rank.select_dtypes(include=["number"]).sum(axis=1)
df_basic_rank.sort_values(by='Val', ascending=False, inplace=True)
df_basic_rank.reset_index(drop=True, inplace=True)

In [15]:
def highlight_group_boundaries(row):
    # row.name gives the row index
    if (row.name + 1) % 8 == 0:  # every 8th row
        return ["border-bottom: 3px solid black"] * len(row)
    else:
        return [""] * len(row)

In [16]:
df_basic_rank[:170].style.apply(highlight_group_boundaries, axis=1)

Unnamed: 0,Player,Pos,Team,FG%,FT%,3P,TRB,AST,STL,BLK,TOV,PTS,Val
0,Nikola Jokić,C,DEN,0.452994,0.206217,0.063907,0.376353,0.729785,0.128225,0.050769,-0.293023,0.0,1.715228
1,Victor Wembanyama,C,SAS,0.095712,0.218396,0.023433,0.400776,0.093464,0.039381,0.896858,-0.293023,0.0,1.474997
2,Shai Gilgeous-Alexander,PG,OKC,0.250304,0.475685,0.074026,0.01001,0.34246,0.150437,0.079945,-0.014055,0.0,1.368811
3,Tyrese Maxey,PG,PHI,-0.00968,0.325851,0.235924,0.000241,0.416236,0.150437,0.079945,-0.146198,0.0,1.052755
4,Tyler Herro,SG,MIA,0.192041,0.435031,0.154975,0.029548,0.001244,0.083803,0.050769,0.044676,0.0,0.992086
5,Luka Dončić,PG,LAL,0.013786,0.102703,0.225805,0.185855,0.591455,0.194859,0.021594,-0.366436,0.0,0.96962
6,Scottie Barnes,PF,TOR,0.117507,0.115522,-0.017042,0.166316,0.185685,0.128225,0.313348,-0.10215,0.0,0.907412
7,James Harden,PG,LAC,-0.103181,0.494462,0.225805,0.058856,0.5269,0.039381,-0.036757,-0.366436,0.0,0.839029
8,Mikal Bridges,SF,NYK,0.099292,-0.014528,0.074026,-0.004644,0.130352,0.261492,0.196646,0.074041,0.0,0.816679
9,Jalen Williams,SG,OKC,-0.4167,0.367015,-0.047398,-0.033951,0.462346,0.239281,0.138296,0.088723,-0.0,0.797612


In [17]:
import numpy as np
from scipy.optimize import minimize

def minimize_quad_wrt_simplex(A, tol=1e-9):
    """
    Minimize w^T A w subject to sum(w) = 1 and w >= 0.

    Parameters
    ----------
    A : (n,n) ndarray
        Symmetric cost matrix.
    tol : float
        Tolerance for optimization termination.

    Returns
    -------
    w_opt : (n,) ndarray
        Optimal weights (sum to 1 and non-negative).
    res : OptimizeResult
        Full scipy result for diagnostics.
    """
    A = np.asarray(A, dtype=float)
    # Symmetrize A to avoid numerical asymmetry
    #A = 0.5 * (A + A.T)
    n = A.shape[0]
    if A.shape[0] != A.shape[1]:
        raise ValueError("A must be square")

    # objective and gradient (use gradient for faster convergence)
    def obj(w):
        return float(w @ (A @ w))

    def jac(w):
        # gradient: 2*A*w
        return 2.0 * (A @ w)

    # equality constraint: sum(w) - 1 == 0
    cons = {
        'type': 'eq',
        'fun': lambda w: np.sum(w) - 1.0,
        'jac': lambda w: np.ones(n)
    }

    # bounds: w_i >= 0
    bounds = [(0.0, None)] * n

    # initial guess: use closed-form solution (may have negatives) then project to simplex
    # closed form w_cf = A^{-1} 1 / (1^T A^{-1} 1)
    try:
        invA1 = np.linalg.solve(A, np.ones(n))
        w0 = invA1 / (np.sum(invA1))
    except np.linalg.LinAlgError:
        # use pseudo-inverse if A singular or ill-conditioned
        invA = np.linalg.pinv(A)
        w0 = invA @ np.ones(n)
        w0 = w0 / np.sum(w0)

    # if w0 has negatives, project to simplex as warm start
    def project_to_simplex(v):
        # fast algorithm to project onto simplex {x: x>=0, sum x = 1}
        # (from: Duchi et al., 2008)
        v = np.asarray(v, dtype=float)
        u = np.sort(v)[::-1]
        cssv = np.cumsum(u)
        rho = np.nonzero(u * np.arange(1, len(u) + 1) > (cssv - 1))[0][-1]
        theta = (cssv[rho] - 1.0) / (rho + 1.0)
        w = np.maximum(v - theta, 0.0)
        return w

    if np.any(w0 < 0):
        w0 = project_to_simplex(w0)

    # run SLSQP
    res = minimize(obj, w0, method='SLSQP', jac=jac,
                   bounds=bounds,
                   constraints=cons,
                   options={'ftol': tol, 'maxiter': 1000})

    # Ensure numerical feasibility: project final result to simplex (can fix tiny negatives)
    w_opt = res.x
    # clip small negative values and renormalize
    w_opt = np.maximum(w_opt, 0.0)
    if np.sum(w_opt) <= 0:
        raise RuntimeError("Optimization failed to produce positive weights")
    w_opt = w_opt / np.sum(w_opt)

    return w_opt, res