## Import libraries and packages

In [3]:
# some useful mysklearn package import statements and reloads
import importlib

import mysklearn.myutils
importlib.reload(mysklearn.myutils)
import mysklearn.myutils as myutils

# MyPyTable (table loading) from the local mysklearn package
import mysklearn.mypytable
importlib.reload(mysklearn.mypytable)
from mysklearn.mypytable import MyPyTable 

# classifier implementations from the local mysklearn package
import mysklearn.myclassifiers
importlib.reload(mysklearn.myclassifiers)
from mysklearn.myclassifiers import MyKNeighborsClassifier, MyDummyClassifier, MyNaiveBayesClassifier, MyDecisionTreeClassifier

import mysklearn.myevaluation
importlib.reload(mysklearn.myevaluation)
import mysklearn.myevaluation as myevaluation

from os import path
import utils

ImportError: cannot import name 'MyKNeighborsClassifier' from 'mysklearn.myclassifiers' (/home/NBA-All-Star-CPSC-322/mysklearn/myclassifiers.py)

## Data Import and Setup

In [None]:
# Value handed to every cross-validation call as `random_state`, so all
# experiments in this notebook share the same setting.
RANDOM_STATE = 6

# Load the All-Star data set from the input_data directory into a MyPyTable.
filename = path.join('input_data', 'AllStarData.csv')
basketball_data = MyPyTable().load_from_file(filename)

# One instance of each classifier under comparison. The same instances are
# re-fit on every fold of every experiment further down.
knn_classifier = MyKNeighborsClassifier(n_neighbors=10)
dummy_classifier = MyDummyClassifier()
nb_classifier = MyNaiveBayesClassifier()
decision_tree = MyDecisionTreeClassifier()

# Order matters: the results cells label their output by position in this list.
classifiers = [
    knn_classifier,
    dummy_classifier,
    nb_classifier,
    decision_tree,
]

## Discretize Data

In [None]:
# Discretize every column except the last one into equal-width bins.
# The last column is copied through untouched as the class label when the
# rows are rebuilt below.
bin_count = 10
feature_count = len(basketball_data.column_names) - 1

discrete_columns = []
for col_position in range(feature_count):
    # NOTE(review): the meaning of the second positional argument (False)
    # is defined by MyPyTable.get_column -- presumably it controls whether
    # missing values are kept; confirm, since dropped values would shift rows.
    raw_values = basketball_data.get_column(col_position, False)
    cutoffs = utils.compute_equal_width_cutoffs(raw_values, bin_count)
    bin_labels = myutils.create_output_for_discrete(raw_values, bin_count)
    discrete_columns.append(
        [myutils.discretize_ratings_custom(value, cutoffs, bin_labels) for value in raw_values]
    )

# Transpose the column-major result back into one row per sample and attach
# the untouched label from the original table.
discrete_data = []
for row_position in range(len(discrete_columns[0])):
    # NOTE(review): a column shorter than the first one is silently skipped
    # for this row (no error is raised), which would leave the row with fewer
    # features than its neighbours. Verify all columns have equal length.
    sample = [column[row_position] for column in discrete_columns if row_position < len(column)]
    sample.append(basketball_data.data[row_position][-1])
    discrete_data.append(sample)

## Predict Using Effective Field Goal Percentage (eFG%)

In [None]:
# Single-feature experiment: each sample is described only by the discretized
# value at column index 2 (eFG% according to this section's heading --
# TODO confirm the index against the CSV header). The label is the last column.
efg_data = [sample[2] for sample in discrete_data]
allstars = [sample[-1] for sample in discrete_data]

# Each entry of train_sets / test_sets is a collection of row indices for one fold.
train_sets, test_sets = myevaluation.stratified_kfold_cross_validation(
    efg_data, allstars, n_splits=10, random_state=RANDOM_STATE
)

# True labels of every held-out sample, concatenated in fold order so they
# line up with the concatenated predictions collected below.
test_answers = [allstars[i] for fold in test_sets for i in fold]
test_length = len(test_answers)

# For each classifier collect [all predictions across folds, overall accuracy].
classifier_results = []
for model in classifiers:
    fold_predictions = []
    correct_total = 0
    for train_idx, test_idx in zip(train_sets, test_sets):
        # Turn fold indices into samples; each sample is wrapped in a
        # one-element list because there is only a single feature.
        x_train = [[efg_data[i]] for i in train_idx]
        y_train = [allstars[i] for i in train_idx]
        x_test = [[efg_data[i]] for i in test_idx]
        y_test = [allstars[i] for i in test_idx]

        model.fit(x_train, y_train)
        predicted = model.predict(x_test)

        # normalize=False: presumably the number of correct predictions
        # rather than a fraction, so it can be summed across folds.
        correct_total += myevaluation.accuracy_score(y_test, predicted, normalize=False)
        fold_predictions += predicted

    # Overall accuracy = correct predictions / total held-out samples.
    classifier_results.append([fold_predictions, correct_total / test_length])

## Output Results

In [None]:
# Class labels handed to the report helper (the printed tables use them as
# row/column labels).
headers = ['no', 'yes']

# Display names, listed in the same order as the `classifiers` list so each
# result set is printed under the right name.
classifier_names = [
    'KNN',
    'Dummy',
    'Naive Bayes',
    'Decision Tree',
]

myevaluation.print_classifier_results(
    classifier_names, classifier_results, test_answers, headers
)

KNN--------------------------
Summary:
	Accuracy..: 0.744
	Error Rate: 0.256 

Precision, Recall, F1:
       precision    recall         f1    support
---  -----------  --------  ---------  ---------
yes     0.333333  0.03125   0.0571429        128
no      0.753968  0.979381  0.852018         388 

Confusion Matrix:
       no    yes    Total    Recognition (%)
---  ----  -----  -------  -----------------
no    380      8      388             97.938
yes   124      4      128              3.125


Dummy--------------------------
Summary:
	Accuracy..: 0.752
	Error Rate: 0.248 

Precision, Recall, F1:
       precision    recall        f1    support
---  -----------  --------  --------  ---------
yes     0                0  0               128
no      0.751938         1  0.858407        388 

Confusion Matrix:
       no    yes    Total    Recognition (%)
---  ----  -----  -------  -----------------
no    388      0      388                100
yes   128      0      128                  0


Na

## Effective Field Goal Percentage
The ultimate challenge for every classifier is to beat the 75.2% accuracy of the Dummy classifier.  
Most players in this data set fall into one of the two classes (388 of 516 are "no"), so always predicting the majority class, as Dummy does, is correct most of the time.  

And no classifier is able to do it.  
It's a tie for the silver medal, as both Naive Bayes and KNN reach 74.4% accuracy.  
This leaves bronze to Decision Tree with 74%, just 0.4 percentage points behind silver.  

## More Traits

In [None]:
# Multi-feature experiment: each sample keeps the discretized values at
# column indices 3 through 7. Labels (`allstars`) are reused from the
# eFG% cell above.
trait_positions = (3, 4, 5, 6, 7)
multi_trait_data = [[sample[pos] for pos in trait_positions] for sample in discrete_data]

# Each entry of train_sets / test_sets is a collection of row indices for one fold.
train_sets, test_sets = myevaluation.stratified_kfold_cross_validation(
    multi_trait_data, allstars, n_splits=10, random_state=RANDOM_STATE
)

# True labels of every held-out sample, concatenated in fold order so they
# line up with the concatenated predictions collected below.
test_answers = [allstars[i] for fold in test_sets for i in fold]
test_length = len(test_answers)

# For each classifier collect [all predictions across folds, overall accuracy].
classifier_results = []
for model in classifiers:
    fold_predictions = []
    correct_total = 0
    for train_idx, test_idx in zip(train_sets, test_sets):
        # Turn fold indices into the actual samples and labels.
        x_train = [multi_trait_data[i] for i in train_idx]
        y_train = [allstars[i] for i in train_idx]
        x_test = [multi_trait_data[i] for i in test_idx]
        y_test = [allstars[i] for i in test_idx]

        model.fit(x_train, y_train)
        predicted = model.predict(x_test)

        # normalize=False: presumably the number of correct predictions
        # rather than a fraction, so it can be summed across folds.
        correct_total += myevaluation.accuracy_score(y_test, predicted, normalize=False)
        fold_predictions += predicted

    # Overall accuracy = correct predictions / total held-out samples.
    classifier_results.append([fold_predictions, correct_total / test_length])

In [None]:
# Class labels handed to the report helper (the printed tables use them as
# row/column labels).
headers = ['no', 'yes']

# Display names, in the same order as the `classifiers` list, so each entry
# of classifier_results is reported under the matching name.
classifier_names = [
    'KNN',
    'Dummy',
    'Naive Bayes',
    'Decision Tree',
]

# Print the report for the multi-trait experiment.
myevaluation.print_classifier_results(
    classifier_names, classifier_results, test_answers, headers
)

KNN--------------------------
Summary:
	Accuracy..: 0.793
	Error Rate: 0.207 

Precision, Recall, F1:
       precision    recall        f1    support
---  -----------  --------  --------  ---------
yes     0.744186  0.25      0.374269        128
no      0.79704   0.971649  0.875726        388 

Confusion Matrix:
       no    yes    Total    Recognition (%)
---  ----  -----  -------  -----------------
no    377     11      388             97.165
yes    96     32      128             25


Dummy--------------------------
Summary:
	Accuracy..: 0.752
	Error Rate: 0.248 

Precision, Recall, F1:
       precision    recall        f1    support
---  -----------  --------  --------  ---------
yes     0                0  0               128
no      0.751938         1  0.858407        388 

Confusion Matrix:
       no    yes    Total    Recognition (%)
---  ----  -----  -------  -----------------
no    388      0      388                100
yes   128      0      128                  0


Naive Baye

## Multi Trait Classification Results
### These results are great!
Dummy has retaken its rightful place as the worst classifier.  
Naive Bayes wins with an accuracy of 85.1%, followed by Decision Tree with 84.1% accuracy.  
Next is KNN with 79.3% and lastly Dummy with 75.2% accuracy.  

This means that, with these traits, all three real classifiers do better than simply predicting the majority class every time! Hooray!

In [None]:
# All-feature experiment: every discretized column except the trailing label
# is used. Labels (`allstars`) are reused from the eFG% cell above.
all_traits = [sample[:-1] for sample in discrete_data]

# Each entry of train_sets / test_sets is a collection of row indices for one fold.
train_sets, test_sets = myevaluation.stratified_kfold_cross_validation(
    all_traits, allstars, n_splits=10, random_state=RANDOM_STATE
)

# True labels of every held-out sample, concatenated in fold order so they
# line up with the concatenated predictions collected below.
test_answers = [allstars[i] for fold in test_sets for i in fold]
test_length = len(test_answers)

# For each classifier collect [all predictions across folds, overall accuracy].
classifier_results = []
for model in classifiers:
    fold_predictions = []
    correct_total = 0
    for train_idx, test_idx in zip(train_sets, test_sets):
        # Turn fold indices into the actual samples and labels.
        x_train = [all_traits[i] for i in train_idx]
        y_train = [allstars[i] for i in train_idx]
        x_test = [all_traits[i] for i in test_idx]
        y_test = [allstars[i] for i in test_idx]

        model.fit(x_train, y_train)
        predicted = model.predict(x_test)

        # normalize=False: presumably the number of correct predictions
        # rather than a fraction, so it can be summed across folds.
        correct_total += myevaluation.accuracy_score(y_test, predicted, normalize=False)
        fold_predictions += predicted

    # Overall accuracy = correct predictions / total held-out samples.
    classifier_results.append([fold_predictions, correct_total / test_length])

NameError: name 'discrete_data' is not defined

In [None]:
# Class labels handed to the report helper (the printed tables use them as
# row/column labels).
headers = ['no', 'yes']

# Display names, in the same order as the `classifiers` list, so each entry
# of classifier_results is reported under the matching name.
classifier_names = [
    'KNN',
    'Dummy',
    'Naive Bayes',
    'Decision Tree',
]

# Print the report for the all-traits experiment.
myevaluation.print_classifier_results(
    classifier_names, classifier_results, test_answers, headers
)