# Cleaning Data

The data sets that we have come with a lot of stats about the players. In this step we go through and remove players who did not come from the NCAA (for example, players who came from Europe) and trim the attributes down to the ones that we believe are important.

In [1]:
import operator
import random
import math
import numpy as np

def get_table(filename):
    '''
    Reads a comma-separated stats file and returns the trimmed table.

    Each kept row is a list of 11 entries, in this order (source column
    index in parentheses): player (6), pick (4), MP (31), FG (32),
    FG% (34), 3P (38), 3P% (40), FT (41), PTS (52), SOS (54), NBA PTS (15).
    Entries are converted in place by convert_to_numeric.

    A line is skipped when it is too short to contain every kept column
    or when any of its fields is exactly 'NA'. No line is treated
    specially, so a header line that passes these checks is returned
    like any other row.

    filename: path of the file to read.
    returns: list of rows (lists).
    '''
    # Source column of each kept attribute, in output order.
    keep_columns = (6, 4, 31, 32, 34, 38, 40, 41, 52, 54, 15)
    three_pct_column = 40
    # A usable line must reach the highest column we read; shorter lines
    # would otherwise raise IndexError part-way through building the row.
    min_width = max(keep_columns) + 1

    table = []
    # The with-block closes the file even if a row fails to parse.
    with open(filename) as infile:
        for line in infile:
            values = line.strip().split(",")
            if len(values) < min_width or 'NA' in values:
                continue

            to_add = [values[index] for index in keep_columns]
            # An empty 3P% field is stored as 0.0
            # (presumably the player attempted no threes -- TODO confirm).
            if values[three_pct_column] == '':
                to_add[keep_columns.index(three_pct_column)] = 0.0

            convert_to_numeric(to_add)
            table.append(to_add)
    return table


def convert_to_numeric(values):
    '''
    Coerces the entries of a row, in place, to their natural types.

    Entries that float() accepts become floats; entries that make
    float() raise ValueError keep their text with any leading/trailing
    double-quote characters removed. Nothing is returned.
    '''
    for position, raw in enumerate(values):
        try:
            converted = float(raw)
        except ValueError:
            # not a number: keep the text, minus surrounding quote characters
            converted = raw.strip('"')
        values[position] = converted

# Basic KNN_functions

## get_column and compute_distances
Here are the helper functions for the KNN algorithm (`get_column`, `compute_distances`, and `normalize`). `get_frequencies` is commented out because we currently do not use it.

In [2]:

# Unused helper kept for reference only: the whole definition below sits inside
# a string literal, so get_frequencies is never defined or executed.
# NOTE(review): its description lines are bare text, so they would need to be
# turned into a docstring/comment before this could be re-enabled.
'''
def get_frequencies(table, column_index):
    get values and counts given a table and 
    the column_index
    column = sorted(get_column(table, column_index))
    values = []
    counts = []
    for value in column:
        if value not in values:
            values.append(value)
            counts.append(1)
        else:
            counts[-1] += 1
    return values, counts
'''

def get_column(table, column_index):
    '''
    Returns the entries found at column_index in every row of table,
    in row order, leaving out entries equal to 'NA'.
    '''
    return [entry[column_index]
            for entry in table
            if entry[column_index] != 'NA']

def compute_distances(v1,v2,table):
    '''
    Returns the Euclidean distance between rows v1 and v2 after min-max
    scaling each compared column with the min and max found in table.

    The first column and the last column of the rows are not compared
    (in this notebook those hold the player name and the value being
    predicted). A column whose values in table are all equal has no
    spread to scale by, so it is skipped and adds nothing to the distance.

    v1, v2: rows of equal length.
    table: rows used to obtain each column's min and max.
    returns: float distance.
    '''
    assert(len(v1) == len(v2))

    dist = 0
    for i in range(1, len(v1)-1):
        # Extract the column once and reuse it for both extremes.
        column = get_column(table, i)
        Min = min(column)
        Max = max(column)
        if Max == Min:
            # Constant column: scaling would divide by zero.
            continue
        v1_N = normalize(v1[i], Min, Max)
        v2_N = normalize(v2[i], Min, Max)

        dist += (v1_N - v2_N) ** 2
    return math.sqrt(dist)

def normalize(x, Min, Max):
    '''
    Min-max scales x so that Min maps to 0.0 and Max maps to 1.0.

    x: value to scale.
    Min, Max: extremes of the range x is scaled against.
    returns: (x - Min) / (Max - Min) as a float, or 0.0 when Max equals
             Min (a zero-width range cannot be scaled).
    '''
    spread = Max - Min
    if spread == 0:
        # Avoid ZeroDivisionError on a constant column.
        return 0.0
    # Force float division before dividing, not after.
    return (x - Min) / float(spread)

Here is the kNN guess function. It takes the training set and a single test row, finds the test row's k nearest neighbors in the training set, and returns the average NBA points per game of those neighbors.

In [3]:
def knn_guess(train_set, test_set, k_val):
    '''
    Predicts a value for one row as the mean of its k nearest neighbours.

    train_set: list of rows to search for neighbours; it is neither
               reordered nor modified.
    test_set: the single row to predict for (despite the name, the
              caller passes one row, not a set of rows).
    k_val: number of neighbours to average; if it exceeds the number of
           training rows, every training row is used.
    returns: the mean of column 10 over the k_val training rows closest
             to test_set according to compute_distances.
    raises: ValueError if train_set is empty or k_val is less than 1.
    '''
    if not train_set:
        raise ValueError("train_set must contain at least one row")
    if k_val < 1:
        raise ValueError("k_val must be at least 1")

    # Keep each distance beside its row instead of appending it to the row,
    # so shared rows are never left altered if a distance computation fails.
    scored = [(compute_distances(row, test_set, train_set), row)
              for row in train_set]
    # Sort on the distance only; the sort is stable, so ties keep train_set order.
    scored.sort(key=operator.itemgetter(0))
    top_k = scored[:k_val]

    # calculate the average from the nearest neighbors
    sum_ppg = 0
    for _, player in top_k:
        sum_ppg += player[10]
    return sum_ppg / len(top_k)

Here I created a simple k-fold split function that is just good enough for testing.

In [4]:
def k_fold(table, num_folds=10):
    '''
    Randomly partitions the rows of table into folds of near-equal size.

    table: list of rows; it is copied, so the caller's list is not
           reordered.
    num_folds: number of folds to produce (default 10).
    returns: list of num_folds lists; shuffled rows are dealt out
             round-robin, so fold sizes differ by at most one and some
             folds are empty when there are fewer rows than folds.
    raises: ValueError if num_folds is less than 1.
    '''
    if num_folds < 1:
        raise ValueError("num_folds must be at least 1")

    randomized = table[:]
    # random.shuffle gives an unbiased permutation; swapping each position
    # with an index drawn from the whole range does not.
    random.shuffle(randomized)

    # Slice with a stride: fold f gets rows f, f + num_folds, f + 2*num_folds, ...
    return [randomized[start::num_folds] for start in range(num_folds)]

Here is the kNN function that runs over the folds and prints the results.

In [5]:
def knnPrediction(table, headers, k_val=10):
    '''
    Cross-validates the self-coded kNN predictor and prints the results.

    The table is split with k_fold. Each fold in turn is held out, every
    row in it is predicted by knn_guess from the rows of all other folds,
    and the player (column 0), actual value (column 10) and prediction
    are printed. The mean and median absolute errors are printed last.

    table: list of rows.
    headers: column names; accepted for interface compatibility and not
             used here.
    k_val: number of neighbours passed to knn_guess (default 10).
    returns: None.
    '''
    folds = k_fold(table)
    print("---------------------")
    print("Self-Coded: KNN")
    print("---------------------")
    differences = []
    for held_out_index, held_out in enumerate(folds):
        # Pick training folds by position rather than by comparing fold
        # contents: two folds with equal contents must not both be left out.
        train_set = []
        for fold_index, fold in enumerate(folds):
            if fold_index != held_out_index:
                train_set.extend(fold)

        for j in held_out:
            my_guess = knn_guess(train_set, j, k_val)
            print ("Player: " + j[0])
            print ("NBA PPG: " + str(j[10]))
            print ("predicted PPG: " + str(my_guess))
            diff = j[10] - my_guess
            differences.append(abs(diff))
    print("-----------------------------------------------------------------------------")
    print ("average error: " + str(np.mean(differences)))
    print ("median error: " + str(np.median(differences)))
            
            

This function outputs a table of correlation coefficients between NBA points and each of the attributes we used for our kNN classifier.

In [6]:
def find_stat_correlation_to_NBA_PTS(table, headers):
    '''
    Prints, for each attribute column, its correlation coefficient with
    column 10 of the table.

    The first and last entries of headers are skipped; every header in
    between is printed next to the coefficient np.corrcoef reports for
    the table column at the same index.
    '''
    target = get_column(table, 10)
    for position, label in enumerate(headers[1:-1], 1):
        matrix = np.corrcoef(get_column(table, position), target)
        # top-right cell of the 2x2 matrix: attribute vs. target
        coefficient = matrix[0][-1]
        print (label + " correlation coefficent = " + str(coefficient))

In [7]:
# Driver cell: load the data, then run the self-coded kNN and the
# correlation report on it.

start_table = get_table('datasets/firstRoundPicks_withCollegeStats.csv')
# The first returned row is used as the column headers, the rest as data.
headers, table = start_table[0], start_table[1:]

knnPrediction(table, headers)
find_stat_correlation_to_NBA_PTS(table, headers)

---------------------
Self-Coded: KNN
---------------------
Player: Stanley Johnson\johnsst04
NBA PPG: 6.9
predicted PPG: 10.209999999999997
Player: Deandre Ayton\aytonde01
NBA PPG: 16.3
predicted PPG: 12.239999999999998
Player: Kevin Knox\knoxke01
NBA PPG: 12.7
predicted PPG: 9.95
Player: Joel Embiid\embiijo01
NBA PPG: 24.2
predicted PPG: 10.17
Player: Josh Jackson\jacksjo02
NBA PPG: 12.1
predicted PPG: 12.15
Player: Terry Rozier\roziete01
NBA PPG: 7.7
predicted PPG: 8.200000000000001
Player: Jacob Evans\evansja02
NBA PPG: 0.8
predicted PPG: 6.159999999999999
Player: Aaron Gordon\gordoaa01
NBA PPG: 12.4
predicted PPG: 8.85
Player: Lonzo Ball\balllo01
NBA PPG: 10.0
predicted PPG: 10.629999999999999
Player: Josh Okogie\okogijo01
NBA PPG: 7.9
predicted PPG: 6.210000000000001
Player: Nik Stauskas\stausni01
NBA PPG: 6.8
predicted PPG: 10.96
Player: Kevon Looney\looneke01
NBA PPG: 4.4
predicted PPG: 5.14
Player: Robert Williams\williro04
NBA PPG: 2.5
predicted PPG: 8.15
Player: Moritz Wagne

Player: Dejounte Murray\murrade01
NBA PPG: 6.6
predicted PPG: 6.19
Player: James Young\youngja01
NBA PPG: 2.3
predicted PPG: 6.760000000000001
Player: Willie Cauley-Stein\caulewi01
NBA PPG: 10.1
predicted PPG: 10.889999999999999
Player: Jabari Parker\parkeja01
NBA PPG: 15.1
predicted PPG: 13.64
Player: Shai Gilgeous-Alexander\gilgesh01
NBA PPG: 10.6
predicted PPG: 11.080000000000002
Player: Andrew Wiggins\wiggian01
NBA PPG: 19.4
predicted PPG: 12.830000000000002
Player: Taurean Prince\princta02
NBA PPG: 11.3
predicted PPG: 11.170000000000002
-----------------------------------------------------------------------------
average error: 3.32864
median error: 2.84
Pk correlation coefficent = -0.5994202468383477
MP correlation coefficent = 0.0629618794699425
FG correlation coefficent = 0.2517030362074402
FG% correlation coefficent = 0.0556473898095536
3P correlation coefficent = 0.027406926749840865
3P% correlation coefficent = -0.0555737387878667
FT correlation coefficent = 0.31986899198272

# Scikit-learn

In [32]:
#from sklearn.neighbors import KNeighborsClassifier
#n = KNeighborsClassifier(n_neighbors = 5)
#once we have the data, then X is our training data and Y is our target values

#n.fit(X, y)
#print(n.predict([[test_dealio]]))