# Cleaning Data

The data sets that we have come with a lot of stats about the players. In this step we go through the data, remove players who did not come from the NCAA (for example, players from Europe), and trim the attributes down to the ones that we believe are important.

In [13]:
import operator
import random
import math
import numpy as np
import utils

def get_table(filename):
    """Read a comma-separated file and keep only the columns used for modelling.

    Every line of the file is stripped and split on ','. A line is kept only
    when it has enough fields to cover every selected column (and more than
    27 fields, the original completeness threshold) and none of its fields
    equals 'NA'. Lines that are too short are skipped instead of raising
    IndexError.

    Each kept row holds, in this order, the source columns:
    player (6), pick (4), MP (31), FG (32), FG% (34), 3P (38), 3P% (40),
    FT (41), PTS (52), SOS (54), NBA PTS (15).
    An empty 3P% field is replaced by 0.0.

    Parameters:
        filename: path of the file to read.

    Returns:
        A list of rows (lists with 11 entries each). Every row is passed to
        utils.convert_to_numeric before it is stored; its return value is
        ignored, so it presumably converts the row in place - TODO confirm.
    """
    # Source column indices, listed in the order they appear in an output row.
    selected_columns = (6, 4, 31, 32, 34, 38, 40, 41, 52, 54, 15)
    three_point_pct_column = 40
    # A row must be long enough for the largest index that is read below.
    min_field_count = max(28, max(selected_columns) + 1)

    table = []
    # The context manager closes the file even if a row fails to convert.
    with open(filename) as infile:
        for line in infile:
            values = line.strip().split(",")

            # Only keep rows that are completely filled in.
            if len(values) < min_field_count:
                continue
            # Get rid of rows with NA in them.
            if 'NA' in values:
                continue

            to_add = []
            for column in selected_columns:
                value = values[column]
                if column == three_point_pct_column and value == '':
                    # A blank 3P% is stored as 0.0.
                    value = 0.0
                to_add.append(value)

            utils.convert_to_numeric(to_add)
            table.append(to_add)
    return table



# Basic KNN_functions

## get_column and compute_distances
Here are 2 helper functions for the KNN algorithm. 

Here is the kNN guess function: it takes the training set and a single test instance, finds the instance's k nearest neighbors in the training set, averages their NBA points per game, and returns that average.

In [14]:
def knn_guess(train_set, test_set, k_val):
    '''
    Predict the label of one instance as the mean label of its k nearest
    training rows.

    The distance from every training row to the instance comes from
    utils.compute_distances(row, test_set, train_set). The label of a row is
    its last column, where "last" is taken from the first training row.

    Unlike a sort-in-place approach, the distances are kept in a separate
    list, so train_set and its rows are left exactly as the caller passed
    them (same order, same row lengths), even if the distance helper raises.

    Parameters:
        train_set: non-empty list of rows; the last column holds the label.
        test_set: the single instance to predict (despite the name, one row).
        k_val: number of neighbors to average; must be at least 1. If it
            exceeds len(train_set), all training rows are averaged.

    Returns:
        The average label of the nearest neighbors.

    Raises:
        ValueError: if train_set is empty or k_val is smaller than 1.
    '''
    if not train_set:
        raise ValueError("train_set must contain at least one row")
    if k_val < 1:
        raise ValueError("k_val must be at least 1")

    label_index = len(train_set[0]) - 1

    distances = [
        utils.compute_distances(row, test_set, train_set) for row in train_set
    ]

    # sorted() is stable, so rows at equal distance keep their train_set order.
    nearest_indices = sorted(range(len(train_set)), key=distances.__getitem__)[:k_val]

    # calculate the average label of the nearest neighbors
    sum_ppg = sum(train_set[index][label_index] for index in nearest_indices)
    return sum_ppg / len(nearest_indices)

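Here is the kNN function that runs over every fold, predicts each held-out player from the remaining folds, and adds the absolute errors to a list so that the mean and median error can be computed at the end.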

In [15]:
def knnPrediction(table, headers, avg_error, median_error, k_val=10):
    """Cross-validate the kNN predictor and record its mean and median error.

    The table is split with utils.k_fold. Each fold is held out once; every
    row of the held-out fold is predicted by knn_guess from the rows of all
    other folds, and the absolute difference between the row's last column
    and the prediction is collected.

    The held-out fold is excluded by position, not by comparing contents, so
    another fold that happens to contain equal rows still contributes to the
    training set.

    Parameters:
        table: list of rows; the last column is the value to predict.
        headers: unused; kept so existing callers keep working.
        avg_error: list that receives the mean absolute error (appended).
        median_error: list that receives the median absolute error (appended).
        k_val: number of neighbors passed to knn_guess (default 10).

    Returns:
        None. Results are appended to avg_error and median_error.
    """
    folds = utils.k_fold(table)
    differences = []

    for test_index, test_fold in enumerate(folds):
        train_set = [
            row
            for fold_index, fold in enumerate(folds)
            if fold_index != test_index
            for row in fold
        ]

        for instance in test_fold:
            my_guess = knn_guess(train_set, instance, k_val)
            #uncomment below to see the specific predicted points per game vs actual NBA points
            #print ("NBA PPG: " + str(instance[-1]))
            #print ("predicted PPG: " + str(my_guess))
            differences.append(abs(instance[-1] - my_guess))

    avg_error.append(np.mean(differences))
    median_error.append(np.median(differences))
            

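The cell below runs our self-made kNN as a small ensemble: each member is trained on a random subset of 4 of the attributes, and the average and median errors are reported. The table of correlation coefficients for all the attributes we used for our kNN classifier is computed further below.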

In [16]:
# Run the self-made kNN as an ensemble over random attribute subsets and
# report the average and median absolute error across the ensemble members.

# Seed the global `random` generator so the attribute subsets drawn below are
# the same on every Restart & Run All.
# NOTE(review): helpers that draw from other random sources are not covered.
RANDOM_SEED = 0
random.seed(RANDOM_SEED)

start_table = get_table('datasets/firstRoundPicks_withCollegeStats.csv')
# The first row returned by get_table is used as the header row.
headers = start_table[0]
table = start_table[1:]

ensemble_size = 6
# Number of attribute columns each ensemble member is trained on.
attributes_per_model = 4

# Candidate attribute columns: 1..9 (column 0 and the last column are not
# used as features below).
list_of_ind = list(range(1, 10))

avg_error = []
median_error = []

for _ in range(ensemble_size):
    # random.sample draws distinct columns without reordering list_of_ind.
    attributes = random.sample(list_of_ind, attributes_per_model)

    # Each row of X holds the chosen attributes followed by the target value
    # (the last column of the original row).
    X = [[row[i] for i in attributes] + [row[-1]] for row in table]

    knnPrediction(X, headers, avg_error, median_error)

total_avg_error = sum(avg_error) / len(avg_error)
total_median_error = sum(median_error) / len(median_error)

print("---------SELF MADE KNN----------")
print("Average error : %.2f" % total_avg_error)
print("Median error : %.2f" % total_median_error)

---------SELF MADE KNN----------
Average error : 4.04
Median error : 3.55


# Scikit-learn

Below does the same as above but using the scikit-learn library. 

In [17]:
from sklearn.neighbors import KNeighborsRegressor

# Candidate attribute columns: 1..9; column 10 is the value being predicted.
list_of_ind = list(range(1, 10))
# Running sum of every ensemble member's prediction, one entry per player.
player_pred = [0 for _ in range(len(table))]
Y = [row[10] for row in table]

# NOTE(review): every model below is fit on all rows of `table` and then
# asked to predict those same rows, so each player is part of its own
# neighborhood. The error printed here is therefore not directly comparable
# to the cross-validated error of the self-made kNN, which also uses
# 10 neighbors instead of 5 - confirm whether that is intended.
for _ in range(ensemble_size):
    # randomly pick 4 distinct attribute columns; random.sample leaves
    # list_of_ind itself untouched
    attributes = random.sample(list_of_ind, 4)

    X = [[row[i] for i in attributes] for row in table]

    n = KNeighborsRegressor(n_neighbors=5)
    n.fit(X, Y)

    # X already holds each player's selected attributes, so one batched
    # predict call replaces a separate call per player.
    for i, prediction in enumerate(n.predict(X)):
        player_pred[i] += prediction

final_pred = [pred / ensemble_size for pred in player_pred]

total_error = 0

for i, player in enumerate(table):
    #uncomment if you want to see each player
    #print("Player -> " , player[0])
    #print("Guessed PPG " , final_pred[i])
    #print("Actual PPG " , player[10])
    #print()
    total_error += abs(final_pred[i] - player[10])

# The value printed is the absolute error averaged over all players.
print("---------------------------------")
print("Total points off for sci-kit learn: %.2f" % (total_error / len(table)))




---------------------------------
Total points off for sci-kit learn: 2.89


Below is a function that computes the correlation coefficient for each attribute, to give a sense of our attributes.

In [18]:
def find_stat_correlation_to_NBA_PTS(table, headers):
    """Print the correlation of each attribute column with column 10.

    Column 10 of the table is fetched once through utils.get_column and used
    as the reference. Every column from index 1 up to, but not including,
    the last header is then correlated with it via np.corrcoef, and one line
    per column is printed in the form
    "<header> correlation coefficent = <r>".

    Parameters:
        table: list of rows, read column-wise through utils.get_column.
        headers: column names; headers[i] labels column i.

    Returns:
        None. The coefficients are only printed.
    """
    reference_values = utils.get_column(table, 10)

    # Column 0 and the final header are left out of the report.
    for column_index, stat_name in enumerate(headers[1:-1], start=1):
        stat_values = utils.get_column(table, column_index)
        # corrcoef returns a 2x2 matrix; the off-diagonal entry is r.
        coefficient = np.corrcoef(stat_values, reference_values)[0][1]
        print(stat_name + " correlation coefficent = " + str(coefficient))

# Print one correlation line per attribute column for the cleaned table.
find_stat_correlation_to_NBA_PTS(table, headers)

Pk correlation coefficent = -0.5273993822093086
MP correlation coefficent = 0.21762848137756619
FG correlation coefficent = 0.36933962519403424
FG% correlation coefficent = -0.1449147250609018
3P correlation coefficent = 0.16441215828431333
3P% correlation coefficent = -0.046193036818175305
FT correlation coefficent = 0.5101851304616711
PTS correlation coefficent = 0.44179743881307093
SOS correlation coefficent = 0.1796059783207265
