# Assignment 2 - Part C: Trying alternative classifiers

This is a skeleton for trying alternative classifiers on the basketball dataset.

In [23]:
import csv

As done in Practicum 6, we define data-loading functions that return the attribute values and the class labels for each of the training and test sets.

In [24]:
# Column names in positional order; the last entry ("SHOT_RESULT") is the class label.
ATTRS = ["LOCATION", "W", "FINAL_MARGIN", "SHOT_NUMBER", "PERIOD", "GAME_CLOCK", "SHOT_CLOCK",
         "DRIBBLES", "TOUCH_TIME", "SHOT_DIST", "PTS_TYPE", "CLOSE_DEF_DIST", "SHOT_RESULT"]

# Number of attribute columns, i.e. every column except the trailing class label.
# Derived from ATTRS so the two can never drift apart (evaluates to 12).
ATTRS_WO_CLASS = len(ATTRS) - 1

def load_train_data(filename):
    """Read labelled training instances from a comma-separated file.

    Rows whose number of fields is not exactly ``ATTRS_WO_CLASS + 1`` are
    skipped (e.g. blank lines).

    Args:
        filename: path of the CSV file to read.

    Returns:
        A ``(train_x, train_y)`` tuple. ``train_x`` is a list of instances,
        each a list with the first ``ATTRS_WO_CLASS`` fields of a row (raw
        strings). ``train_y`` is the list of class labels, taken from the
        last field of each row.
    """
    train_x = []
    train_y = []
    # newline='' lets the csv module do its own line-ending handling, as its
    # documentation recommends for files passed to csv.reader.
    with open(filename, 'rt', newline='') as csvfile:
        for row in csv.reader(csvfile, delimiter=','):
            if len(row) != ATTRS_WO_CLASS + 1:
                continue  # not a complete "attributes + label" record
            train_x.append(row[:ATTRS_WO_CLASS])  # leading fields are the attributes
            train_y.append(row[ATTRS_WO_CLASS])   # trailing field is the class label

    return train_x, train_y

def load_test_data(filename):
    """Read unlabelled test instances and pair them with a made-up ground truth.

    Rows whose number of fields is not exactly ``ATTRS_WO_CLASS`` are skipped.

    The returned labels are NOT real: they start with the hard-coded toy-data
    labels below, and every accepted row beyond those gets the fake label
    "made". They exist only so the evaluation code can be exercised.

    Args:
        filename: path of the CSV file to read.

    Returns:
        A ``(test_x, test_y)`` tuple. ``test_x`` is a list of instances (lists
        of raw strings); ``test_y`` is the fake label list described above.
    """
    test_x = []  # instances to be tested
    test_y = ["made", "made", "made", "missed", "missed", "made", "made",
              "missed", "missed", "made", "made", "made", "missed", "made",
              "missed", "missed", "missed", "made", "missed", "missed"]  # from toy-data - just for testing
    toy_label_count = len(test_y)  # 20 hard-coded labels; later rows need a filler label

    # newline='' lets the csv module do its own line-ending handling, as its
    # documentation recommends for files passed to csv.reader.
    with open(filename, 'rt', newline='') as csvfile:
        accepted_rows = 0
        for row in csv.reader(csvfile, delimiter=','):
            if len(row) != ATTRS_WO_CLASS:
                continue  # not a complete attribute record
            accepted_rows += 1
            test_x.append(row[:ATTRS_WO_CLASS])
            if accepted_rows > toy_label_count:
                # all made, fake ground truth for testing!
                test_y.append("made")

    return test_x, test_y

And then we can use it to load the data.

In [25]:
# Load the attribute lists and class labels of the training set, and the
# attribute lists of the test set. Note that test_y is the made-up ground
# truth produced by load_test_data, not real labels.
train_x, train_y = load_train_data("data/basketball.train.csv")
test_x, test_y = load_test_data("data/basketball.test.csv")

Next, we define the predictions evaluator:

In [26]:
def evaluate(predictions, true_labels):
    """Print the accuracy and error rate of the predictions and return the accuracy.

    Each prediction is compared with the true label at the same index; only
    the first ``len(predictions)`` entries of ``true_labels`` are looked at.

    Args:
        predictions: indexable sequence of predicted labels.
        true_labels: indexable sequence of reference labels.

    Returns:
        The fraction of predictions equal to their reference label.
    """
    total = len(predictions)
    hits = sum(1 for idx in range(total) if predictions[idx] == true_labels[idx])
    misses = total - hits

    print("\tAccuracy:   ", hits / total)
    print("\tError rate: ", misses / total)

    return hits / total  # return accuracy

Scikit-learn needs all the attribute values to be numeric. That is, we need to binarize all the non-numeric attribute values to obtain vectors: records containing only numbers. The `DictVectorizer` class provided by scikit-learn makes this easy.

In [27]:
from sklearn.feature_extraction import DictVectorizer

Note that `train_x` and `test_x` are each a list of lists.

We just need to obtain from each a list of dictionaries (as done in previous practica where each record was a dictionary).

In [28]:
# Turn every training instance (a list of raw strings) into an
# {attribute name: value} dictionary, as DictVectorizer expects.
dicts_train_x = []
for x in train_x:
    d = {}
    # ATTRS[:-1] leaves out the class label, which is not part of train_x elems.
    for i, attr in enumerate(ATTRS[:-1]):
        # Indices 0, 1, 4, 10 ("LOCATION", "W", "PERIOD", "PTS_TYPE") are kept
        # as the strings they are; every other attribute is saved as a float.
        d[attr] = x[i] if i in (0, 1, 4, 10) else float(x[i])
    dicts_train_x.append(d)

Finally, the `fit_transform` method of the vectorizer binarizes the non-numeric attributes in the list of dictionaries, and returns the vector we need.

In [29]:
# Fit the vectorizer on the training dictionaries and transform them in one
# step; .toarray() turns the returned matrix into a plain array.
vectorizer_train = DictVectorizer()
vec_train_x = vectorizer_train.fit_transform(dicts_train_x).toarray()

We do similarly for vectorizing `test_x`.

In [30]:
# Turn every test instance (a list of raw strings) into an
# {attribute name: value} dictionary, exactly as done for the training set.
dicts_test_x = []
for x in test_x:
    d = {}
    for i, attr in enumerate(ATTRS):
        if i < len(ATTRS) - 1:  # test_x elems carry no class label
            val = x[i]
            # save as floats the values for the already-numeric attributes from dataset
            # keep the rest as the strings they are
            if i not in [0, 1, 4, 10]:  # indices for "LOCATION", "W", "PERIOD", "PTS_TYPE" attributes
                val = float(val)
            d[attr] = val
    dicts_test_x.append(d)

# Reuse the vectorizer that was fitted on the training set and only *transform*
# the test data. Calling fit_transform here would re-learn the feature mapping
# from the test set, so the test columns could differ in number or order from
# the columns the classifiers were trained on.
vectorizer_test = vectorizer_train  # same fitted mapping; name kept for later cells
vec_test_x = vectorizer_test.transform(dicts_test_x).toarray()

With `evaluate` defined above, we are ready to learn and apply the models, similarly to Task 3 of Practicum 6, but here using the vectors just obtained for the input sets. First, we import the scikit-learn classifiers we want to try:

In [31]:
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.svm import SVC

In [32]:
# Classifiers to try, keyed by a display name. Commented-out entries are
# configurations that were tried before; uncomment to run them again.
classifiers = {
    #"Decision Tree": DecisionTreeClassifier(),
    #"Nearest Neighbors": KNeighborsClassifier(n_neighbors=2),
    #"Naive Bayes (Gaussian)": GaussianNB(priors=[0.35, 0.65]),  # priors = prior probabilities of the two classes
    #"Random Forest": RandomForestClassifier(n_estimators=5, max_features=None, class_weight={"made":0.3, "missed":0.7}), 
    #"AdaBoost": AdaBoostClassifier(learning_rate=0.5),
    "Support Vector Classification": SVC(kernel="rbf", decision_function_shape="ovo", max_iter=10000)
}

# Track the best-scoring classifier. best_preds / best_clf are initialised up
# front so they always exist, and the first result is always recorded even if
# its accuracy is 0 (otherwise the cell that writes the predictions would fail
# with a NameError).
best_accuracy = 0
best_preds = None
best_clf = None

print("ALL EVALUATION IS WRONG. ONLY FOR TESTING AGAINST MADE UP GROUND TRUTH!\n")
for name, clf in classifiers.items():
    print(name)
    clf.fit(vec_train_x, train_y)
    predictions = clf.predict(vec_test_x)
    accuracy = evaluate(predictions, test_y)  # test_y is just a made up ground truth
    if best_preds is None or accuracy > best_accuracy:
        best_accuracy = accuracy
        best_preds = predictions
        best_clf = name

        
# Best so far (against made up GT): Naive Bayes
#
# The scores below are from Kaggle
#
# Naive Bayes
#    priors=[0.35, 0.65]: 0.57579 
#    priors=[0.3, 0.7]: 0.57433    
#    priors=[0.4, 0.6]: 0.56984
#    priors=[0.25, 0.75]: 0.56305 
#    priors=[0.5, 0.5]: 0.55338
#    priors=[0.6, 0.4]: 0.54682
#
# SVM.SVC
#    (kernel="linear", decision_function_shape="ovo", max_iter=10000): 0.55583
#    (kernel="rbf", decision_function_shape="ovo", max_iter=10000): 0.51233
#
# Random Forest
#    (n_estimators=10, max_features="auto"): 0.54945
#    (n_estimators=10, max_features=15): 0.54913
#    (n_estimators=5, max_features=None, class_weight={"made":0.3, "missed":0.7}): 0.54684
#
# Nearest Neighbors
#    (n=2): 0.52447


ALL EVALUATION IS WRONG. ONLY FOR TESTING AGAINST MADE UP GROUND TRUTH!

Linear Support Vector Classification




	Accuracy:    0.8367346938775511
	Error rate:  0.16326530612244897


In [33]:
OUTPUT_FILE = "data/classifier_basketball.pred.csv"

# Write the best classifier's predictions as a header line followed by one
# "id,prediction" line per instance, with ids starting at 1. Each line break
# is written *before* a record, so the file has no trailing newline.
with open(OUTPUT_FILE, 'w') as output:
    output.write("Id,Target")
    # start=1 yields the 1-based ids directly; the name row_id avoids
    # shadowing the builtin id() for the rest of the notebook.
    for row_id, pred in enumerate(best_preds, start=1):
        output.write("\n%s,%s" % (row_id, pred))
print("Results from %s written to '%s'" % (best_clf, OUTPUT_FILE))

Results from Linear Support Vector Classification written to 'data/classifier_basketball.pred.csv'
