### Classification using scikit-learn (with pandas)

In [3]:
import csv
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB

In [4]:
# Cross-platform shim: if the INSTABASE_URI environment variable is present
# (presumably only when running on Instabase -- confirm), file opens are routed
# through ib.open; otherwise the built-in open is kept.
# NOTE(review): `ib` is not defined anywhere in this notebook; it is assumed to
# be injected by the hosting platform.
import os
IB = 'INSTABASE_URI' in os.environ
if IB:
    open = ib.open
else:
    open = open

In [15]:
# Read Cities.csv into dataframe, add column for temperature category
def temperature_category(temperature):
    """Return the category label for one temperature value.

    'cold' below 5, 'cool' below 9, 'warm' below 15, otherwise 'hot'.
    A value that fails every comparison (e.g. NaN) falls through to 'hot'.
    """
    if temperature < 5:
        return 'cold'
    if temperature < 9:
        return 'cool'
    if temperature < 15:
        return 'warm'
    return 'hot'

# Plain 'r' mode: the old 'U' flag is deprecated and rejected by Python 3.11+.
# The with-block closes the file as soon as it has been parsed.
with open('datasets/Cities.csv', 'r') as f:
    cities = pd.read_csv(f)

# Label every row in one pass instead of looking rows up one at a time
cities['category'] = cities['temperature'].map(temperature_category)

# Report how many cities fall in each category, coldest first
for label in ('cold', 'cool', 'warm', 'hot'):
    print(label + ":", int((cities['category'] == label).sum()))

cold: 17
cool: 92
warm: 79
hot: 25


In [16]:
# Preview the first three rows, including the 'category' column added above
cities.head(3)

Unnamed: 0,city,country,latitude,longitude,temperature,category
0,Aalborg,Denmark,57.03,9.92,7.52,cool
1,Aberdeen,United Kingdom,57.17,-2.08,8.1,cool
2,Abisko,Sweden,63.35,18.83,0.2,cold


In [17]:
# Split the cities data: the leading 85% of rows become the training set,
# the remaining rows become the test set.
percenttrain = 0.85
numitems = len(cities)
numtrain = int(numitems*percenttrain)
numtest = numitems - numtrain
print('Training set', numtrain, 'items')
print('Test set', numtest, 'items')
citiesTrain, citiesTest = cities.iloc[:numtrain], cities.iloc[numtrain:]

Training set 181 items
Test set 32 items


### K-nearest-neighbors classification

In [57]:
# Predict temperature category from other features.
# Grid search: every non-empty subset of the candidate features is combined
# with every K in neighbors_; the first combination that reaches the highest
# accuracy on the test set is reported.
import itertools

features_ = ['longitude', 'latitude']
neighbors_ = list(range(1, 100))

def findsubsets(S,m):
    """Return the set of all m-element combinations (as tuples) of S."""
    return set(itertools.combinations(S, m))

best_accuracy = 0
best_features = None
best_neighbors = 0

# True labels of the test rows, in the same positional order as predict() output
actual = citiesTest['category'].to_numpy()
numtest = len(actual)

for m in range(1, len(features_) + 1):
    # Sort the subsets: a set of string tuples iterates in an order that can
    # change between kernel sessions, which would make the tie-break below
    # (first best result wins) unrepeatable.
    for features_tuple in sorted(findsubsets(features_, m)):
        features = list(features_tuple)
        for neighbors in neighbors_:
            classifier = KNeighborsClassifier(n_neighbors=neighbors)
            classifier.fit(citiesTrain[features], citiesTrain['category'])
            predictions = classifier.predict(citiesTest[features])
            # Accuracy = fraction of test rows whose prediction matches the label
            correct = int((predictions == actual).sum())
            accuracy = correct / numtest
            if accuracy > best_accuracy:
                best_accuracy = accuracy
                best_features = features
                best_neighbors = neighbors

print("Best accuracy: ", best_accuracy)
print("Best features: ", best_features)
print("Best K parameter: ", best_neighbors)

Best accuracy:  0.84375
Best features:  ['longitude', 'latitude']
Best K parameter:  5


### <font color="green">Your Turn: K-nearest-neighbors on World Cup Data</font>

In [5]:
# Predict position from one or more of minutes, shots, passes, tackles, saves.
# This cell does all the set-up, including reordering the data to avoid team bias.

# Plain 'r' mode: the old 'U' flag is deprecated and rejected by Python 3.11+.
# The with-block closes the file as soon as it has been parsed.
with open('datasets/Players.csv', 'r') as f:
    players = pd.read_csv(f)

# Reorder by surname, then renumber the rows 0..n-1 so positional slices and
# index labels agree.
players = players.sort_values(by='surname').reset_index(drop=True)

# The leading 95% of rows train the classifiers; the rest are held out for testing
numitems = len(players)
percenttrain = 0.95
numtrain = int(numitems*percenttrain)
numtest = numitems - numtrain
print('Training set', numtrain, 'items')
print('Test set', numtest, 'items')
playersTrain = players[0:numtrain]
playersTest = players[numtrain:]

Training set 565 items
Test set 30 items


  app.launch_new_instance()


In [59]:
# This cell does the classification.
# Try different features and different numbers of neighbors.
# What's the highest accuracy you can get?
# Grid search: every non-empty subset of the candidate features is combined
# with every K in neighbors_; the first combination that reaches the highest
# accuracy on the test set is reported.
import itertools

features_ = ['minutes', 'shots', 'passes', 'tackles', 'saves']
neighbors_ = list(range(1, 100))

def findsubsets(S,m):
    """Return the set of all m-element combinations (as tuples) of S."""
    return set(itertools.combinations(S, m))

best_accuracy = 0
best_features = None
best_neighbors = 0

# True labels of the test rows, in the same positional order as predict() output
actual = playersTest['position'].to_numpy()
numtest = len(actual)

for m in range(1, len(features_) + 1):
    # Sort the subsets: a set of string tuples iterates in an order that can
    # change between kernel sessions, which would make the tie-break below
    # (first best result wins) unrepeatable.
    for features_tuple in sorted(findsubsets(features_, m)):
        features = list(features_tuple)
        for neighbors in neighbors_:
            classifier = KNeighborsClassifier(n_neighbors=neighbors)
            classifier.fit(playersTrain[features], playersTrain['position'])
            predictions = classifier.predict(playersTest[features])
            # Accuracy = fraction of test rows whose prediction matches the label
            correct = int((predictions == actual).sum())
            accuracy = correct / numtest
            if accuracy > best_accuracy:
                best_accuracy = accuracy
                best_features = features
                best_neighbors = neighbors

print("Best accuracy: ", best_accuracy)
print("Best features: ", best_features)
print("Best K parameter: ", best_neighbors)

Best accuracy:  0.7333333333333333
Best features:  ['shots', 'tackles']
Best K parameter:  40


### Decision tree classification

In [65]:
# Predict temperature category from other features.
# Grid search: every non-empty subset of the candidate features is combined
# with every min_samples_split value in splits; the first combination that
# reaches the highest accuracy on the test set is reported.
import itertools

features_ = ['longitude','latitude']
splits = list(range(2, 100))

def findsubsets(S,m):
    """Return the set of all m-element combinations (as tuples) of S."""
    return set(itertools.combinations(S, m))

best_accuracy = 0
best_features = None
best_split = 0

# True labels of the test rows, in the same positional order as predict() output
actual = citiesTest['category'].to_numpy()
numtest = len(actual)

for m in range(1, len(features_) + 1):
    # Sort the subsets: a set of string tuples iterates in an order that can
    # change between kernel sessions, which would make the tie-break below
    # (first best result wins) unrepeatable.
    for features_tuple in sorted(findsubsets(features_, m)):
        features = list(features_tuple)
        for split in splits:
            # Fixed random_state so repeated runs build the same tree
            # (scikit-learn breaks ties between equally good splits randomly).
            dt = DecisionTreeClassifier(min_samples_split=split, random_state=0)
            dt.fit(citiesTrain[features],citiesTrain['category'])
            predictions = dt.predict(citiesTest[features])
            # Accuracy = fraction of test rows whose prediction matches the label
            correct = int((predictions == actual).sum())
            accuracy = correct / numtest
            if accuracy > best_accuracy:
                best_accuracy = accuracy
                best_features = features
                best_split = split

print("Best accuracy: ", best_accuracy)
print("Best features: ", best_features)
print("Best split parameter: ", best_split)

Best accuracy:  0.78125
Best features:  ['longitude', 'latitude']
Best split parameter:  19


### "Forest" of decision trees

In [68]:
# Predict temperature category from other features.
# Grid search: every non-empty subset of the candidate features is combined
# with every forest size in trees_; the first combination that reaches the
# highest accuracy on the test set is reported.
# NOTE: this fits one forest per (subset, size) pair, so the cell is slow.
import itertools

features_ = ['longitude', 'latitude']
trees_ = list(range(1, 500))

def findsubsets(S,m):
    """Return the set of all m-element combinations (as tuples) of S."""
    return set(itertools.combinations(S, m))

best_accuracy = 0
best_features = None
best_trees = 0

# True labels of the test rows, in the same positional order as predict() output
actual = citiesTest['category'].to_numpy()
numtest = len(actual)

for m in range(1, len(features_) + 1):
    # Sort the subsets: a set of string tuples iterates in an order that can
    # change between kernel sessions, which would make the tie-break below
    # (first best result wins) unrepeatable.
    for features_tuple in sorted(findsubsets(features_, m)):
        features = list(features_tuple)
        for trees in trees_:
            # Fixed random_state: a random forest is stochastic, so without a
            # seed the reported best parameters change from run to run.
            rf = RandomForestClassifier(n_estimators=trees, random_state=0)
            rf.fit(citiesTrain[features],citiesTrain['category'])
            predictions = rf.predict(citiesTest[features])
            # Accuracy = fraction of test rows whose prediction matches the label
            correct = int((predictions == actual).sum())
            accuracy = correct / numtest
            if accuracy > best_accuracy:
                best_accuracy = accuracy
                best_features = features
                best_trees = trees

print("Best accuracy: ", best_accuracy)
print("Best features: ", best_features)
print("Best trees parameter: ", best_trees)

Best accuracy:  0.875
Best features:  ['longitude', 'latitude']
Best trees parameter:  15


### <font color="green">Your Turn: Decision tree and forest of trees on World Cup Data</font>

In [6]:
# SINGLE TREE
# Predict position from one or more of minutes, shots, passes, tackles, saves.
# Try different features and different values for min_samples_split.
# What's the highest accuracy you can get?
# Grid search: every non-empty subset of the candidate features is combined
# with every min_samples_split value in splits; the first combination that
# reaches the highest accuracy on the test set is reported.
import itertools

features_ = ['minutes', 'shots', 'passes', 'tackles', 'saves']
splits = list(range(2, 100))

def findsubsets(S,m):
    """Return the set of all m-element combinations (as tuples) of S."""
    return set(itertools.combinations(S, m))

best_accuracy = 0
best_features = None
best_split = 0

# True labels of the test rows, in the same positional order as predict() output
actual = playersTest['position'].to_numpy()
numtest = len(actual)

for m in range(1, len(features_) + 1):
    # Sort the subsets: a set of string tuples iterates in an order that can
    # change between kernel sessions, which would make the tie-break below
    # (first best result wins) unrepeatable.
    for features_tuple in sorted(findsubsets(features_, m)):
        features = list(features_tuple)
        for split in splits:
            # Fixed random_state so repeated runs build the same tree
            # (scikit-learn breaks ties between equally good splits randomly).
            dt = DecisionTreeClassifier(min_samples_split=split, random_state=0)
            dt.fit(playersTrain[features],playersTrain['position'])
            predictions = dt.predict(playersTest[features])
            # Accuracy = fraction of test rows whose prediction matches the label
            correct = int((predictions == actual).sum())
            accuracy = correct / numtest
            if accuracy > best_accuracy:
                best_accuracy = accuracy
                best_features = features
                best_split = split

print("Best accuracy: ", best_accuracy)
print("Best features: ", best_features)
print("Best split parameter: ", best_split)

Best accuracy:  0.8
Best features:  ['minutes', 'shots', 'tackles']
Best split parameter:  35


In [29]:
# FOREST OF TREES
# Predict position from one or more of minutes, shots, passes, tackles, saves.
# Try different values for n_estimators.
# What's the highest accuracy you can get?
# Grid search: every non-empty subset of the candidate features is combined
# with every forest size in trees_; each new best result is printed as it is
# found, and the overall best is reported at the end.
import itertools

# Candidate features. This must be bound to features_ (the name the loops read);
# `features` is only the per-iteration subset and is overwritten below.
features_ = ['minutes', 'shots', 'passes', 'tackles', 'saves']

trees_ = list(range(1, 100))

def findsubsets(S,m):
    """Return the set of all m-element combinations (as tuples) of S."""
    return set(itertools.combinations(S, m))

best_accuracy = 0
best_features = None
best_trees = 0

# True labels of the test rows, in the same positional order as predict() output
actual = playersTest['position'].to_numpy()
numtest = len(actual)

for m in range(1, len(features_) + 1):
    # Sort the subsets: a set of string tuples iterates in an order that can
    # change between kernel sessions, which would make the tie-break below
    # (first best result wins) unrepeatable.
    for features_tuple in sorted(findsubsets(features_, m)):
        features = list(features_tuple)
        for trees in trees_:
            # Fixed random_state: a random forest is stochastic, so without a
            # seed the reported best parameters change from run to run.
            rf = RandomForestClassifier(n_estimators=trees, random_state=0)
            rf.fit(playersTrain[features],playersTrain['position'])
            predictions = rf.predict(playersTest[features])
            # Accuracy = fraction of test rows whose prediction matches the label
            correct = int((predictions == actual).sum())
            accuracy = correct / numtest
            if accuracy > best_accuracy:
                print(accuracy, trees, features)
                best_accuracy = accuracy
                best_features = features
                best_trees = trees

print("Best accuracy: ", best_accuracy)
print("Best features: ", best_features)
print("Best trees parameter: ", best_trees)

0.5666666666666667 1 ['passes']
0.6 4 ['passes']
0.6333333333333333 8 ['shots', 'passes']
0.6666666666666666 16 ['shots', 'passes']
0.7 56 ['shots', 'tackles']
0.7333333333333333 32 ['shots', 'passes', 'saves']
Best accuracy:  0.7333333333333333
Best features:  ['shots', 'passes', 'saves']
Best trees parameter:  32


### Naive Bayes classification

In [18]:
# Predict temperature category from other features.
# Tries every non-empty subset of the candidate features with a Gaussian
# Naive Bayes classifier; the first subset that reaches the highest accuracy
# on the test set is reported.
import itertools

features_ = ['longitude', 'latitude']

def findsubsets(S,m):
    """Return the set of all m-element combinations (as tuples) of S."""
    return set(itertools.combinations(S, m))

best_accuracy = 0
best_features = None

# True labels of the test rows, in the same positional order as predict() output
actual = citiesTest['category'].to_numpy()
numtest = len(actual)

for m in range(1, len(features_) + 1):
    # Sort the subsets: a set of string tuples iterates in an order that can
    # change between kernel sessions, which would make the tie-break below
    # (first best result wins) unrepeatable.
    for features_tuple in sorted(findsubsets(features_, m)):
        features = list(features_tuple)
        nb = GaussianNB()
        nb.fit(citiesTrain[features],citiesTrain['category'])
        predictions = nb.predict(citiesTest[features])
        # Accuracy = fraction of test rows whose prediction matches the label
        correct = int((predictions == actual).sum())
        accuracy = correct / numtest
        if accuracy > best_accuracy:
            best_accuracy = accuracy
            best_features = features

print("Best accuracy: ", best_accuracy)
print("Best features: ", best_features)
# Try other features

Best accuracy:  0.84375
Best features:  ['latitude']


### <font color="green">Your Turn: Naive Bayes on World Cup Data</font>

In [22]:
# Predict position from one or more of minutes, shots, passes, tackles, saves.
# Try different features. What's the highest accuracy you can get?
# Tries every non-empty subset of the candidate features with a Gaussian
# Naive Bayes classifier; the first subset that reaches the highest accuracy
# on the test set is reported.
import itertools

features_ = ['minutes', 'shots', 'passes', 'tackles', 'saves']

def findsubsets(S,m):
    """Return the set of all m-element combinations (as tuples) of S."""
    return set(itertools.combinations(S, m))

best_accuracy = 0
best_features = None

# True labels of the test rows, in the same positional order as predict() output
actual = playersTest['position'].to_numpy()
numtest = len(actual)

for m in range(1, len(features_) + 1):
    # Sort the subsets: a set of string tuples iterates in an order that can
    # change between kernel sessions, which would make the tie-break below
    # (first best result wins) unrepeatable.
    for features_tuple in sorted(findsubsets(features_, m)):
        features = list(features_tuple)
        nb = GaussianNB()
        nb.fit(playersTrain[features],playersTrain['position'])
        predictions = nb.predict(playersTest[features])
        # Accuracy = fraction of test rows whose prediction matches the label
        correct = int((predictions == actual).sum())
        accuracy = correct / numtest
        if accuracy > best_accuracy:
            best_accuracy = accuracy
            best_features = features

print("Best accuracy: ", best_accuracy)
print("Best features: ", best_features)

Best accuracy:  0.7
Best features:  ['shots', 'passes', 'tackles', 'saves']
