# Cleaning Data

The data sets that we have come with a lot of stats about the players. In this step we are going through and removing players that didn't come from the NCAA (Europe) and trimming down the attributes to the things that we believe are important.

In [64]:
from tabulate import tabulate
import operator
import random
import math
import numpy as np

def get_table(filename):
    table = []
    infile = open(filename)
    lines = infile.readlines()
    
    i = 0
    for line in lines:
        add_it = True
        line = line.strip()
        line = line.strip('\n')
        values = line.split(",")
        
        #this is only grabbing columns that are completely filled in
        if len(values) == 57:        
            for val in values:
                #Get rid of columns with NA in them
                if val == 'NA':
                    add_it = False
            if add_it:
                #player - 6
                #MP - 31
                #FG - 32
                #FG% - 34
                #3p - 38
                #3pt% - 40
                #FT - 41
                #PTS - 52
                #SOS - 54
                
                to_add = []
                to_add.append(values[6])
                to_add.append(values[31])
                to_add.append(values[32])
                to_add.append(values[34])
                to_add.append(values[38])
                to_add.append(values[40])
                to_add.append(values[41])
                to_add.append(values[52])
                to_add.append(values[54])
                table.append(to_add)
                
    infile.close()
    return table



# Basic KNN_functions

In [89]:


def get_frequencies(table, column_index):
    '''
    get values and counts given a table and 
    the column_index
    '''
    column = sorted(get_column(table, column_index))
    values = []
    counts = []
    for value in column:
        if value not in values:
            values.append(value)
            counts.append(1)
        else:
            counts[-1] += 1
    return values, counts

def get_column(table, column_index):
    column = []
    for row in table:
        if row[column_index] != 'NA':
            column.append(row[column_index])
    return column

def compute_distances(v1,v2):
    assert(len(v1) == len(v2))
    
    dist = math.sqrt(sum([(int(float(v1[i])) - int(float(v2[i]))) ** 2 for i in range(1,len(v1))]))
    return dist


In [90]:
def knn_guess(train_set, test_set, k_val):
    '''
    classifier using knn given a test set, train set and k value
    '''
    
    init_len = len(train_set)
    
    right = 0
    wrong = 0

    for row in train_set:
        row.append(compute_distances(row, test_set))  

    k = k_val
    
    length_of_rows = len(train_set[0])
    train_set.sort(key=operator.itemgetter(length_of_rows-1))

    top_k = train_set[:k]

    # calculate the averages from the nearest neighbors
    
    avg_list = [[] for i in range(len(top_k[0]))]
    for row in top_k:
        for i in range(1,len(row)):
            avg_list[i].append(float(row[i]))
    
    avg_list = avg_list[1:]
    avgs = [np.mean(i) for i in avg_list]
    
    for row in train_set:
        row.pop()
    
    return avgs

In [91]:
def k_fold(table):
    randomized = table[:]
    n = len(randomized)

    for i in range(n):
        rand_index = random.randrange(0,n)
        randomized[i], randomized[rand_index] = randomized[rand_index], randomized[i]


    folds = [[] for i in range(10)]
    x = 0
    for i in range(len(randomized)):
        if x > 9:
            x = 0
        folds[x].append(randomized[i])
        x += 1
    return folds

In [92]:
def knn_matrix(table):    
    folds = k_fold(table)
    print("---------------------")
    print("Self-Coded: KNN")
    print("---------------------")

    for i in range(len(folds)):
        train_set = []
        for x in folds:
            if x != folds[i]:
                for item in x:
                    train_set.append(item)
        for j in folds[i]:
            my_guess = knn_guess(train_set, j, 10)
            print("----------------------------------------")
            print("Player -> ", j[0])
            print(headers)
            print(my_guess)
            print("----------------------------------------")
            

In [93]:
#fxn that calls KNN-self-done

start_table = get_table('datasets/firstRoundPicks_withCollegeStats.csv')
headers = start_table[0]
table = start_table[1:]
    
knn_matrix(table)

---------------------
Self-Coded: KNN
---------------------
----------------------------------------
Player ->  Marvin Bagley\baglema01
['Player', 'MP', 'FG', 'FG%', '3P', '3P%', 'FT', 'PTS', 'SOS']
[26.522222222222222, 4.677777777777778, 0.523, 1.1444444444444446, 0.38522222222222224, 2.566666666666667, 13.077777777777778, 8.656666666666666, 12.001636868305688]
----------------------------------------
----------------------------------------
Player ->  Malik Monk\monkma01
['Player', 'MP', 'FG', 'FG%', '3P', '3P%', 'FT', 'PTS', 'SOS']
[26.522222222222222, 4.677777777777777, 0.523, 1.1444444444444446, 0.38522222222222224, 2.566666666666667, 13.077777777777778, 8.656666666666666, 9.85353609421622]
----------------------------------------
----------------------------------------
Player ->  Zach Collins\colliza01
['Player', 'MP', 'FG', 'FG%', '3P', '3P%', 'FT', 'PTS', 'SOS']
[28.74, 5.339999999999999, 0.5119, 1.34, 0.3785, 2.82, 14.85, 9.398000000000001, 13.935032251918852]
---------------

# Scikit-learn

In [7]:
from sklearn.neighbors import KNeighborsClassifier
n = KNeighborsClassifier(n_neighbors = 5)
#once we have the data, then X is our training data and Y is our target values

n.fit(X, y)
print(n.predict[[test_dealio]])

NameError: name 'X' is not defined