# Assignment 2 - Part C: Trying alternative classifiers

This is a skeleton for trying alternative classifiers on the basketball dataset.

In [33]:
import csv

As in Practicum 6, we define data-loading functions that return the attribute values and the class labels for both the training set and the test set.

In [34]:
# Column order of every record in the labelled CSV; the final entry,
# "SHOT_RESULT", is the class label.
ATTRS = [
    "LOCATION",
    "W",
    "FINAL_MARGIN",
    "SHOT_NUMBER",
    "PERIOD",
    "GAME_CLOCK",
    "SHOT_CLOCK",
    "DRIBBLES",
    "TOUCH_TIME",
    "SHOT_DIST",
    "PTS_TYPE",
    "CLOSE_DEF_DIST",
    "SHOT_RESULT",
]

# Number of attribute columns, i.e. every column except the class label.
ATTRS_WO_CLASS = len(ATTRS) - 1

def load_train_data(filename):
    """Load labelled records from a CSV file and split them into train and test sets.

    Only rows with exactly ``ATTRS_WO_CLASS + 1`` fields are used; any other
    row is skipped. Accepted rows are counted from 1, and every third one is
    put in the test split while the rest go to the training split. All values
    are kept as the strings read from the file.

    Args:
        filename: Path of the comma-delimited CSV file to read.

    Returns:
        A tuple ``(train_x, train_y, test_x, test_y)``. The ``*_x`` items are
        lists of attribute lists (the first ``ATTRS_WO_CLASS`` fields of each
        row) and the ``*_y`` items are the matching class labels (the last
        field of each row).
    """
    train_x, train_y = [], []
    test_x, test_y = [], []

    # newline='' leaves line-ending handling to the csv module, as its
    # documentation recommends for files passed to csv.reader.
    with open(filename, 'rt', newline='') as csvfile:
        accepted = 0  # counts only well-formed rows, so skipped rows do not shift the split
        for row in csv.reader(csvfile, delimiter=','):
            if len(row) != ATTRS_WO_CLASS + 1:
                continue
            accepted += 1
            instance = row[:ATTRS_WO_CLASS]  # leading fields are the attributes
            label = row[ATTRS_WO_CLASS]      # last field is the class label
            if accepted % 3 == 0:  # every third accepted row is held out for testing
                test_x.append(instance)
                test_y.append(label)
            else:
                train_x.append(instance)
                train_y.append(label)

    return train_x, train_y, test_x, test_y

def load_test_data(filename):
    """Load the unlabelled records that the trained classifiers will be used on.

    Only rows with exactly ``ATTRS_WO_CLASS`` fields are kept; any other row
    is skipped. Values are kept as the strings read from the file.

    Args:
        filename: Path of the comma-delimited CSV file to read.

    Returns:
        A list of attribute lists, one per accepted row, in file order.
    """
    test_data = []  # instances to be tested
    # newline='' leaves line-ending handling to the csv module, as its
    # documentation recommends for files passed to csv.reader.
    with open(filename, 'rt', newline='') as csvfile:
        for row in csv.reader(csvfile, delimiter=','):
            if len(row) == ATTRS_WO_CLASS:
                test_data.append(list(row))

    return test_data

And then we can use it to load the data.

In [35]:
# Input files: labelled records for training/evaluation, and the records
# without class labels that the final predictions are made for.
TRAINING_FILE = "data/basketball.train.csv"
TESTING_FILE = "data/basketball.test.csv"
# Destination of the final predictions.
OUTPUT_FILE = "data/classifier_basketball.pred.csv"

# Labelled file -> train split and held-out test split.
(train_x, train_y, test_x, test_y) = load_train_data(TRAINING_FILE)
# Unlabelled file -> instances to predict on.
test_data = load_test_data(TESTING_FILE)

Next, we define an evaluator that compares predictions against the true labels:

In [36]:
def evaluate(predictions, true_labels):
    """Print the accuracy and error rate of the predictions and return the accuracy.

    Args:
        predictions: Sequence of predicted labels (any object supporting
            ``len()`` and iteration, e.g. a list or an array).
        true_labels: Sequence of expected labels, position-aligned with
            ``predictions`` and of the same length.

    Returns:
        The accuracy: the fraction of positions where the predicted label
        equals the true label.

    Raises:
        ValueError: If ``predictions`` is empty, or if the two sequences
            differ in length.
    """
    total = len(predictions)
    # Use len() rather than truthiness so array-like inputs are accepted too.
    if total == 0:
        raise ValueError("cannot evaluate an empty sequence of predictions")
    # Without this check, surplus true labels would be silently ignored.
    if total != len(true_labels):
        raise ValueError(
            "predictions and true_labels differ in length: %d != %d"
            % (total, len(true_labels))
        )

    correct = sum(
        1 for predicted, actual in zip(predictions, true_labels) if predicted == actual
    )
    incorrect = total - correct
    accuracy = correct / total

    print("\tAccuracy:   ", accuracy)
    print("\tError rate: ", incorrect / total)

    return accuracy

Scikit-learn needs all attribute values to be numeric. That is, we need to binarize all the non-numeric attribute values to obtain vectors: records containing only numbers. The `DictVectorizer` class provided by scikit-learn makes this easy.

Note that `train_x` and `test_x` are each a list of lists.

We just need to obtain from each a list of dictionaries (as done in previous practica where each record was a dictionary).

We do similarly for vectorizing test data.

In [37]:
from sklearn.feature_extraction import DictVectorizer

def vectorize(train_test, vectorizer=None):
    """Convert a list of attribute lists into a numeric array with a DictVectorizer.

    Each record is first turned into a dict keyed by attribute name. The
    "LOCATION", "W", "PERIOD" and "PTS_TYPE" values are kept as strings so the
    vectorizer binarizes them; every other value is converted to ``float``.

    Args:
        train_test: List of records. Each record holds the attribute values in
            ``ATTRS`` order, without the class label.
        vectorizer: Optional ``DictVectorizer`` to reuse across calls. If it
            has not been fitted yet it is fitted on ``train_test``; if it is
            already fitted it is only applied, so the output has the same
            columns as the data it was fitted on. When omitted, a new
            vectorizer is created and fitted for this call only.

    Returns:
        A dense array with one row per record.

    Note:
        Without a shared ``vectorizer`` the column layout depends on which
        categorical values occur in ``train_test``, so two independent calls
        are only comparable if both inputs contain the same categories.
    """
    categorical = {"LOCATION", "W", "PERIOD", "PTS_TYPE"}
    feature_names = ATTRS[:-1]  # the class label is not part of the records

    records = []
    for x in train_test:
        record = {}
        for i, attr in enumerate(feature_names):
            val = x[i]
            # Numeric attributes become floats; categorical ones stay strings
            # so that the vectorizer binarizes them.
            record[attr] = val if attr in categorical else float(val)
        records.append(record)

    if vectorizer is None:
        vectorizer = DictVectorizer()

    # A fitted DictVectorizer exposes `vocabulary_`; reuse its column layout.
    if hasattr(vectorizer, "vocabulary_"):
        return vectorizer.transform(records).toarray()
    return vectorizer.fit_transform(records).toarray()

With `evaluate` defined above, we are ready to learn and apply the models, similarly to Task 3 of Practicum 6. Here, however, we use the vectors just obtained for the input sets. We try several classifiers, e.g. Naive Bayes, all imported below:

In [38]:
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.svm import SVC

In [39]:
# Fixed seed so the randomized learners give the same scores on every run.
RANDOM_STATE = 42

classifiers = {
    "Decision Tree": DecisionTreeClassifier(random_state=RANDOM_STATE),
    "Nearest Neighbors": KNeighborsClassifier(n_neighbors=2),
    "Naive Bayes (Gaussian)": GaussianNB(priors=[0.35, 0.65]),  # priors = prior probabilities of the two classes
    # max_features="sqrt" is what the former "auto" setting meant for classifiers;
    # "auto" itself is no longer accepted by recent scikit-learn releases.
    "Random Forest": RandomForestClassifier(
        n_estimators=50, max_features="sqrt", random_state=RANDOM_STATE
    ),
    "AdaBoost": AdaBoostClassifier(learning_rate=0.20, random_state=RANDOM_STATE),
    # "Support Vector Classification": SVC(kernel="rbf", decision_function_shape="ovo", max_iter=10000)
}

vec_test_x = vectorize(test_x)
vec_train_x = vectorize(train_x)

# The two splits are vectorized independently, so make sure they ended up with
# the same number of columns before a model trained on one is applied to the other.
assert vec_train_x.shape[1] == vec_test_x.shape[1], (
    "train and test vectors have different widths: %d != %d"
    % (vec_train_x.shape[1], vec_test_x.shape[1])
)

# Track the classifier with the highest accuracy on the held-out split.
best_accuracy = 0
best_clf_name = ""
best_clf = None

print("Evaluations below are approximate. Consult Kaggle for final scores.")
for name, clf in classifiers.items():
    print(name)
    clf.fit(vec_train_x, train_y)
    predictions = clf.predict(vec_test_x)
    accuracy = evaluate(predictions, test_y)
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_clf_name = name
        best_clf = clf


Evaluations below are approximate. Consult Kaggle for final scores.
Decision Tree
	Accuracy:    0.5352152491721186
	Error rate:  0.46478475082788145
Nearest Neighbors
	Accuracy:    0.5224133753331718
	Error rate:  0.4775866246668282
Naive Bayes (Gaussian)
	Accuracy:    0.5820208383813908
	Error rate:  0.41797916161860915
Random Forest
	Accuracy:    0.580930457959777
	Error rate:  0.4190695420402229
AdaBoost
	Accuracy:    0.6008399967692432
	Error rate:  0.3991600032307568


-----------------------------------------------------------------------------------------------------------------------------

After finding the best classifier, use it on the test data:

In [40]:
# Score obtained with the decision tree from part B; predictions are only
# written when the best classifier found here beats it.
PART_B_ACCURACY = 0.57887

if best_accuracy > PART_B_ACCURACY:
    vec_test_data = vectorize(test_data)  # ready the actual test data
    final_preds = best_clf.predict(vec_test_data)  # use the best classifier we found on the test data

    with open(OUTPUT_FILE, 'w') as output:
        output.write("Id,Target")
        # Ids are 1-based. The loop variable is deliberately not called `id`,
        # which would shadow the builtin for the rest of the notebook.
        for row_id, pred in enumerate(final_preds, start=1):
            output.write("\n%s,%s" % (row_id, pred))

    print("Results from %s written to '%s'" % (best_clf_name, OUTPUT_FILE))
else:
    # Make the no-op visible instead of silently leaving any older file in place.
    print("No classifier beat the part B accuracy of %s; nothing written." % PART_B_ACCURACY)

Results from AdaBoost written to 'data/classifier_basketball.pred.csv'
