In [48]:
import pandas as pd
# importing classifiers and function for evaluation metrics
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
# importing functions for later use
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import RFE
from sklearn.tree import DecisionTreeRegressor
import plotly.express as px

Hypotheses:
    Centers and point guards will be the easiest positions to predict because they are typically at the extremes of height, wingspan, rebounding, and assists
    Height, wingspan, TRB%, AST%, STL%, and BLK% will be the best indicators of position

Defining Functions to be used in cleaning data

In [49]:
def find_pos(string):
    '''
    Pulls the position code off the end of a combined player string.

    Args:
        string (str) the value in column name that contains the player's name, position, and team

    Returns:
        'C' when the string ends in 'C', otherwise its last two characters
    '''
    # a trailing 'C' is treated as a one-character position; anything else takes two
    return string[-1] if string[-1] == 'C' else string[-2:]
def find_team(string):
    '''
    Pulls the three-character team code out of a combined player string.

    Args:
        string (str) the value in column name that contains the player's name, position, and team

    Returns:
        the three characters that end three characters before the trailing position code
    '''
    # the position suffix is one character when the string ends in 'C', two otherwise
    pos_len = 1 if string[-1] == 'C' else 2
    # three separator characters sit between the team code and the position
    stop = -(pos_len + 3)
    return string[stop - 3:stop]
def keep_only_name(string):
    '''
    Strips the trailing team/position suffix, leaving the leading part of the string.

    Args:
        string (str) the value in column name that contains the player's name, position, and team

    Returns:
        the string without its last 7 characters (when it ends in 'C') or last 8 characters (otherwise)
    '''
    # suffix is one character shorter when the position is the single letter 'C'
    suffix_len = 7 if string[-1] == 'C' else 8
    return string[:-suffix_len]
def feet_to_inches(string):
    '''
    Converts wingspan and height figures to inches by parsing strings i.e. 6'0" -> 72.0

    The text before the first apostrophe is read as whole feet and the text after it
    (with any trailing quote marks removed) as inches. A missing inches part counts as 0,
    so "7'" parses as 84.0, and multi-digit feet values are supported.

    Args:
        string (str) height or wingspan description, e.g. 6'10.5"

    Returns:
        float: the total length in inches

    Raises:
        TypeError: if the value is not a string
        ValueError: if there is no apostrophe or a part is not numeric
    '''
    if not isinstance(string, str):
        raise TypeError('expected a string like 6\'0", got {!r}'.format(string))
    feet_part, apostrophe, inches_part = string.strip().partition("'")
    if not apostrophe:
        raise ValueError('no feet marker (\') found in {!r}'.format(string))
    # drop the closing inch mark(s), whether written as " or as ''
    inches_part = inches_part.strip().strip('"\'').strip()
    inches = float(inches_part) if inches_part else 0.0
    return int(feet_part) * 12 + inches


Importing and Cleaning Wingspan and Height Data

In [50]:
# Load the raw measurements; the 'Name' column initially holds name, team and position together
wingspan_data = pd.read_csv('Wingspan Data.csv')
combined_name = wingspan_data['Name']
# Every derived value is computed from the original combined strings before 'Name' is overwritten
wingspan_data = wingspan_data.assign(
    position = combined_name.apply(find_pos),
    team = combined_name.apply(find_team),
    Name = combined_name.apply(keep_only_name),
    height = wingspan_data['height'].apply(feet_to_inches),
    wingspan = wingspan_data['wingspan'].apply(feet_to_inches),
)

Importing and Cleaning player stats data

In [51]:
import unidecode
# Advanced-stats pages read for this analysis (one table per season)
season_urls = [
    'https://www.basketball-reference.com/leagues/NBA_2021_advanced.html',
    'https://www.basketball-reference.com/leagues/NBA_2020_advanced.html',
    'https://www.basketball-reference.com/leagues/NBA_2019_advanced.html',
]
# DataFrame.append was removed in pandas 2.0, so the season tables are concatenated in one step
player_stats = pd.concat([pd.read_html(url)[0] for url in season_urls], ignore_index = True)
# A few values for Player Name are 'Player' which can be deleted
player_stats = player_stats[player_stats.Player != 'Player']
player_stats = player_stats.rename(columns = {'Player': 'Name', 'Tm': 'team'})
# Takes any accents out of player names
player_stats['Name'] = player_stats['Name'].map(unidecode.unidecode)
# Accounting for differences in team abbreviations
abbrev_diffs = {'CHO': 'CHA', 'BKN': 'BRK', 'PHO': 'PHX'}
# Assign the result back instead of calling replace(..., inplace = True) on a column selection,
# which is not guaranteed to update the parent frame and makes re-running the cell unreliable
wingspan_data['team'] = wingspan_data['team'].replace(abbrev_diffs)
player_stats['team'] = player_stats['team'].replace(abbrev_diffs)

Combining wingspan, height, and player stats data and Cleaning

In [52]:
# Inner join on the player name alone, so only players present in both sources are kept
player_final = pd.merge(player_stats, wingspan_data, how = 'inner', left_on = ['Name'], right_on = ['Name'])
# Everything except the text columns listed here is converted to float
text_cols = ['Name', 'team_x', 'team_y', 'Pos', 'position']
num_cols = player_final.columns.drop(text_cols)
for col in num_cols:
    player_final[col] = player_final[col].apply(float)
# 'M' is MP divided by G
player_final['M'] = player_final['MP'].div(player_final['G'])
# Keeping only players that played more than 20 mins per game and played in more than 20 games
enough_minutes = player_final['M'] > 20
enough_games = player_final['G'] > 20
player_final = player_final[enough_minutes & enough_games]

In [53]:
# number of rows remaining after the filters
player_final.shape[0]

712

Visualizing player attributes against position to see the distribution across the league and look into outliers

In [32]:
# height by position, one colour per player name
px.scatter(data_frame = player_final, x = 'position', y = 'height', color = 'Name')

In [72]:
# wingspan by position, one colour per player name
px.scatter(data_frame = player_final, x = 'position', y = 'wingspan', color = 'Name')

In [11]:
# TRB% by position, one colour per player name
px.scatter(data_frame = player_final, x = 'position', y = 'TRB%', color = 'Name')

In [12]:
# TS% by position, one colour per player name
px.scatter(data_frame = player_final, x = 'position', y = 'TS%', color = 'Name')

In [13]:
# AST% by position, one colour per player name
px.scatter(data_frame = player_final, x = 'position', y = 'AST%', color = 'Name')

Changing positions to numbers


In [54]:
# position code -> integer label, numbered from 1 in the order listed
pos_dict = {code: label for label, code in enumerate(['PG', 'SG', 'SF', 'PF', 'C'], start = 1)}

In [56]:
# Features under consideration must be numeric and have some relation to position; the columns dropped do not fit this criteria
features = player_final.drop(columns = ['Name', 'Rk', 'position', 'Pos', 'Age', 'team_x', 'team_y', 'G', 'MP', 'Unnamed: 19', 'Unnamed: 24', '#', 'M', 'length'])
# Series.map yields integer labels directly. Series.replace on a string column depends on implicit
# downcasting (deprecated in recent pandas) and would silently leave unmapped codes as strings.
target = player_final['position'].map(pos_dict)
unmapped = target.isna()
if unmapped.any():
    raise ValueError('positions missing from pos_dict: {}'.format(sorted(player_final.loc[unmapped, 'position'].astype(str).unique())))
# splits the dataset into training and testing sets (no scaling is applied before this point)
X_train, X_test, y_train, y_test = train_test_split(features, target, random_state = 300)
# Copy of the training frame with the statistic columns relabelled as integers; the "Rk" and
# "position" keys match no column here because both were dropped from `features` above
features = X_train.rename(columns = {"Rk": 0, "position": 1, "PER": 2, "TS%": 3, "3PAr": 4, "FTr": 5, "ORB%": 6, "DRB%": 7, "TRB%": 8, "AST%": 9, "STL%": 10, "BLK%": 11, "TOV%": 12, "USG%": 13, "OWS": 14, "DWS": 15, "WS": 16, "WS/48": 17, "OBPM": 18, "DBPM": 19, "BPM": 20, "VORP": 21, "height": 22, "wingspan": 23})

Selecting features to be used in classifying players by position

In [113]:
# selects the X most important features to be used for our model
def feature_selection(x_train, x_test, y_train, features, num_feats):
    '''
    Runs recursive feature elimination and keeps the num_feats highest-ranked columns.

    Args:
        x_train: training features; when it has a `columns` attribute the selected names come from it
        x_test: testing features with the same columns as x_train
        y_train: training target values
        features: labels used to name the selected columns only when x_train has no `columns`
            attribute; kept in the signature so existing calls still work
        num_feats (int) number of features to retain

    Returns:
        tuple of (transformed training data, transformed testing data, list of selected feature names)
    '''
    # instantiate
    select = RFE(DecisionTreeRegressor(random_state = 300), n_features_to_select = num_feats)

    # fit the RFE selector to the training data
    select.fit(x_train, y_train)

    # transform training and testing sets so only the selected features are retained
    X_train_selected = select.transform(x_train)
    X_test_selected = select.transform(x_test)

    # Names are taken from the data that was actually passed in, paired with the selector's boolean
    # mask, rather than from the global X_train through a hard-coded index offset
    labels = x_train.columns if hasattr(x_train, 'columns') else features
    feats = [label for label, kept in zip(labels, select.get_support()) if kept]

    # prints selected features/Sample Output
    print('Selected features:')
    for name in feats:
        print(name)

    # returns selected features
    return X_train_selected, X_test_selected,  feats

In [114]:
# performs feature selection/Sample Output
selection_result = feature_selection(X_train, X_test, y_train, features, 5)
X_train_selected, X_test_selected, feats = selection_result

Selected features:
TRB%
AST%
USG%
height
wingspan


In [68]:
# classifier display name -> unfitted estimator instance, in the order they are evaluated
estimators = dict([
    ("KNeighborsClassifier", KNeighborsClassifier()),
    ("LinearSVC", LinearSVC(random_state = 3000, max_iter = 1000000)),
    ("GaussianNB", GaussianNB()),
    ("DecisionTreeClassifier", DecisionTreeClassifier()),
])

# sample output
estimators

{'KNeighborsClassifier': KNeighborsClassifier(),
 'LinearSVC': LinearSVC(max_iter=1000000, random_state=3000),
 'GaussianNB': GaussianNB(),
 'DecisionTreeClassifier': DecisionTreeClassifier()}

In [70]:
# fits the four classifiers using a percentage-split approach
def classifiers_percentage_split():
    '''
    Fits every classifier in `estimators` on the selected training features and prints
    train accuracy, test accuracy and a classification report for each.

    Reads the notebook-level variables estimators, X_train_selected, X_test_selected,
    y_train and y_test, so feature_selection must have been run first.
    The decision tree is replaced by one whose max_depth equals the number of selected
    features, with a fixed random_state so repeated runs give the same tree.
    '''
    # number of selected feature columns; shape works for both arrays and DataFrames
    n_selected = X_train_selected.shape[1]
    # for each classifier in the estimators dictionary
    for estimator_name, estimator_object in estimators.items():
        if estimator_name == "DecisionTreeClassifier":
            estimator_object = DecisionTreeClassifier(max_depth = n_selected, random_state = 300)
        # create the model by fitting the selected training data
        estimator_object.fit(X=X_train_selected, y=y_train)

        # make predictions on the test set
        predicted = estimator_object.predict(X = X_test_selected)

        # training accuracy
        train_accuracy = estimator_object.score(X_train_selected, y_train)

        # testing accuracy
        test_accuracy = estimator_object.score(X_test_selected, y_test)

        # formats and prints out the results on the train and test data
        print(f"{estimator_name} :")
        print(f"\tPrediction accuracy on the train data: {train_accuracy * 100:.2f}%")
        print(f"\tPrediction accuracy on the test data: {test_accuracy * 100:.2f}%\n")
        class_report = classification_report(y_true = y_test, y_pred = predicted)
        print(class_report, "\n------------------------------------------------------\n")
        

In [118]:
# fits the four classifiers and returns their accuracies instead of printing them
def classifiers_performance_data():
    '''
    Fits every classifier in `estimators` on the selected training features and collects
    the accuracies in a DataFrame.

    Reads the notebook-level variables estimators, X_train_selected, X_test_selected,
    y_train, y_test and feats, so feature_selection must have been run first.
    The decision tree is replaced by one whose max_depth equals the number of selected
    features, with a fixed random_state so repeated runs give the same tree.

    Returns:
        DataFrame with one row per classifier and the columns
        'CLF', 'Features', 'Train Accuracy' and 'Test Accuracy'
    '''
    # One record per classifier; the frame is built once at the end because
    # DataFrame.append was removed in pandas 2.0
    records = []
    n_selected = X_train_selected.shape[1]
    # for each classifier in the estimators dictionary
    for estimator_name, estimator_object in estimators.items():
        if estimator_name == "DecisionTreeClassifier":
            estimator_object = DecisionTreeClassifier(max_depth = n_selected, random_state = 300)
        # create the model by fitting the selected training data
        estimator_object.fit(X=X_train_selected, y=y_train)

        # training accuracy
        train_accuracy = estimator_object.score(X_train_selected, y_train)

        # testing accuracy
        test_accuracy = estimator_object.score(X_test_selected, y_test)

        records.append({'CLF': estimator_name, 'Features': feats,
                        'Train Accuracy': train_accuracy, 'Test Accuracy': test_accuracy})
    return pd.DataFrame(records, columns = ['CLF', 'Features', 'Train Accuracy', 'Test Accuracy'])

In [110]:
# re-select five features, then report every classifier's performance on them
selection_result = feature_selection(X_train, X_test, y_train, features, 5)
X_train_selected, X_test_selected, feats = selection_result
classifiers_percentage_split()

Selected features:
TRB%
AST%
USG%
height
wingspan
KNeighborsClassifier :
	Prediction accuracy on the train data: 80.34%
	Prediction accuracy on the test data: 67.42%

              precision    recall  f1-score   support

           1       0.71      0.89      0.79        36
           2       0.82      0.56      0.67        48
           3       0.47      0.74      0.57        27
           4       0.59      0.61      0.60        33
           5       0.91      0.62      0.74        34

    accuracy                           0.67       178
   macro avg       0.70      0.68      0.67       178
weighted avg       0.72      0.67      0.68       178
 
------------------------------------------------------

LinearSVC :
	Prediction accuracy on the train data: 57.30%
	Prediction accuracy on the test data: 57.30%

              precision    recall  f1-score   support

           1       0.62      0.83      0.71        36
           2       0.58      0.71      0.64        48
           3      

GaussianNB with four features (height, wingspan, TRB%, AST%) had the best results in terms of precision

In [124]:
import sys, os

# stream that was active before blockPrint() ran, and the devnull handle that replaced it
_saved_stdout = None
_devnull = None

# Disable
def blockPrint():
    '''Sends printed output to os.devnull until enablePrint() is called; repeated calls are no-ops.'''
    global _saved_stdout, _devnull
    if _saved_stdout is None:
        # remember the stream in use right now: inside a notebook kernel this is generally not
        # sys.__stdout__, so restoring sys.__stdout__ would send later output away from the cells
        _saved_stdout = sys.stdout
        _devnull = open(os.devnull, 'w')
        sys.stdout = _devnull

# Restore
def enablePrint():
    '''Restores the stream saved by blockPrint() and closes the devnull handle.'''
    global _saved_stdout, _devnull
    if _saved_stdout is not None:
        sys.stdout = _saved_stdout
        _devnull.close()
        _saved_stdout = None
        _devnull = None

# Compiling accuracy results into a DataFrame to draw conclusions from
# NOTE: the loop rebinds the notebook-level X_train_selected, X_test_selected and feats,
# which are left holding the 7-feature selection once it finishes
result_frames = []
blockPrint()
try:
    for num_feats in range(3, 8):
        X_train_selected, X_test_selected, feats = feature_selection(X_train, X_test, y_train, features, num_feats)
        result_frames.append(classifiers_performance_data())
finally:
    # printing is restored even if one of the fits raises
    enablePrint()
# DataFrame.append was removed in pandas 2.0; the per-run row labels (0-3) are kept as before
final_data = pd.concat(result_frames)
final_data

Unnamed: 0,CLF,Features,Test Accuracy,Train Accuracy
0,KNeighborsClassifier,"[AST%, height, wingspan]",0.623596,0.779026
1,LinearSVC,"[AST%, height, wingspan]",0.522472,0.531835
2,GaussianNB,"[AST%, height, wingspan]",0.589888,0.651685
3,DecisionTreeClassifier,"[AST%, height, wingspan]",0.578652,0.685393
0,KNeighborsClassifier,"[AST%, USG%, height, wingspan]",0.651685,0.780899
1,LinearSVC,"[AST%, USG%, height, wingspan]",0.533708,0.548689
2,GaussianNB,"[AST%, USG%, height, wingspan]",0.567416,0.646067
3,DecisionTreeClassifier,"[AST%, USG%, height, wingspan]",0.707865,0.749064
0,KNeighborsClassifier,"[TRB%, AST%, USG%, height, wingspan]",0.674157,0.803371
1,LinearSVC,"[TRB%, AST%, USG%, height, wingspan]",0.573034,0.573034


In [125]:
# function to see predicted value for given player's position
def position_prediction_dtc(player_name):
    '''
    Predicts position labels for one player with a depth-5 decision tree.

    The tree is fitted on the five named columns of the notebook-level X_train, not on
    X_train_selected, because X_train_selected is rebound by other cells and may hold a
    different number of features than the five columns used for prediction here.

    Args:
        player_name (str) value to match exactly against player_final.Name

    Returns:
        array of predicted labels, one per matching row of player_final

    Raises:
        ValueError: if no row of player_final has that name
    '''
    feature_cols = ['TRB%', 'AST%', 'USG%', 'height', 'wingspan']

    # fixed random_state so repeated calls build the same tree
    dtc = DecisionTreeClassifier(max_depth = 5, random_state = 300)
    dtc.fit(X=X_train[feature_cols], y=y_train)

    player_data = player_final.loc[player_final.Name == player_name, feature_cols]
    if player_data.empty:
        raise ValueError('no rows found for player {!r}'.format(player_name))

    # predict for every row belonging to the player (these rows may overlap the training split)
    predicted = dtc.predict(X = player_data)

    return predicted

In [105]:
# function to see predicted value for given player's position
def position_prediction_gnb(player_name):
    '''
    Predicts position labels for one player with Gaussian naive Bayes.

    The model is fitted on the five named columns of the notebook-level X_train, not on
    X_train_selected, because X_train_selected is rebound by other cells and may hold a
    different number of features than the five columns used for prediction here.

    Args:
        player_name (str) value to match exactly against player_final.Name

    Returns:
        array of predicted labels, one per matching row of player_final

    Raises:
        ValueError: if no row of player_final has that name
    '''
    feature_cols = ['TRB%', 'AST%', 'USG%', 'height', 'wingspan']

    gnb = GaussianNB()
    gnb.fit(X=X_train[feature_cols], y=y_train)

    player_data = player_final.loc[player_final.Name == player_name, feature_cols]
    if player_data.empty:
        raise ValueError('no rows found for player {!r}'.format(player_name))

    # predict for every row belonging to the player (these rows may overlap the training split)
    predicted = gnb.predict(X = player_data)

    return predicted

In [108]:
# example lookup for a single player
position_prediction_gnb(player_name = 'Joe Ingles')

array([3, 3, 3], dtype=int64)