### Using R for Machine Learning

In [None]:
%load_ext rpy2.ipython

In [None]:
# Compatibility shim: lets the same notebook run locally and on Instabase
import os
# The session counts as Instabase when INSTABASE_URI is set in the environment
IB = os.environ.get('INSTABASE_URI') is not None
# ib is only referenced when IB is true; it is not defined in this notebook
# (presumably provided by the Instabase runtime -- confirm)
open = open if not IB else ib.open

In [None]:
%%R
library(class)
library(rpart)
library(randomForest)
library(e1071) # for NaiveBayes

#### Open CSV files and load data into data frames

In [None]:
# Read both CSV files fully into strings so they can be passed to R with -i.
# The with-blocks close each file handle as soon as its contents are read,
# instead of leaving the open handles to be cleaned up implicitly.
with open('Cities.csv') as f:
    C = f.read()
with open('Players.csv') as f:
    P = f.read()

In [None]:
%%R -i C -i P
# Parse the CSV text handed over from Python (C and P) into R data frames
players <- read.csv(text = P)
cities <- read.csv(text = C)

### Regression

In [None]:
%%R
# Linear regression for temperature versus latitude
# The model is fitted with a formula plus data= (rather than cities$... terms)
# so its terms carry plain variable names; the coefficient is then called
# 'latitude' and the fit can be reused with predict(reg, newdata = ...)
reg <- lm(temperature ~ latitude, data = cities)
plot(cities$latitude, cities$temperature, xlab='latitude', ylab='temperature',
     col='blue', pch=16)
# Overlay the fitted regression line on the scatterplot
abline(reg, col='red')

In [None]:
%%R
# Correlation coefficient (r value) between latitude and temperature,
# computed over the full cities data and printed
rvalue <- cor(x = cities[['latitude']], y = cities[['temperature']])
print(rvalue)

### Classification set-up

In [None]:
%%R
# Add temperature category column to cities data
# Bins: cold is below 5, cool is 5 up to (not including) 9,
# warm is 9 up to (not including) 15, hot is 15 and above
# cut() labels the whole column in one vectorized step (right = FALSE makes
# each bin include its lower bound); as.character keeps the new column as
# plain strings rather than a factor
categories <- c('cold', 'cool', 'warm', 'hot')
cities$category <- as.character(cut(cities$temperature,
                                    breaks = c(-Inf, 5, 9, 15, Inf),
                                    labels = categories, right = FALSE))
# Report how many cities fall into each category
for (label in categories) {
    cat(label, sum(cities$category == label, na.rm = TRUE), '\n')
}

In [None]:
%%R
# Create training and test sets for cities data
# The first numtrain rows become the training set and the remaining
# numtest rows become the test set
numitems <- nrow(cities)
percenttrain <- 0.85
numtrain <- round(numitems * percenttrain)
numtest <- numitems - numtrain
cat('Training set', numtrain, 'items\n')
cat('Test set', numtest, 'items\n')
citiesTrain <- cities[seq_len(numtrain), ]
# numtrain + seq_len(numtest) selects rows numtrain+1 .. numitems, so no row
# is shared with the training set (numtrain:numitems would reuse row numtrain
# and make the test set one item larger than the count printed above)
citiesTest <- cities[numtrain + seq_len(numtest), ]

In [None]:
%%R
# Create training and test sets for players data - reorder first to avoid team bias
# After sorting by surname, the first numtrain rows become the training set
# and the remaining numtest rows become the test set
numitems <- nrow(players)
percenttrain <- 0.95
numtrain <- round(numitems * percenttrain)
numtest <- numitems - numtrain
cat('Training set', numtrain, 'items\n')
cat('Test set', numtest, 'items\n')
players <- players[order(players$surname), ]
playersTrain <- players[seq_len(numtrain), ]
# numtrain + seq_len(numtest) selects rows numtrain+1 .. numitems, so no row
# is shared with the training set (numtrain:numitems would reuse row numtrain
# and make the test set one item larger than the count printed above)
playersTest <- players[numtrain + seq_len(numtest), ]

### K-nearest-neighbors classification

In [None]:
%%R
# Predict temperature category from other features
# Note ties are broken at random so different runs may get different results
features <- c('longitude','latitude')
neighbors <- 8
# drop = FALSE keeps a data frame even when only a single feature is selected
train <- citiesTrain[, features, drop = FALSE]
test <- citiesTest[, features, drop = FALSE]
labels <- citiesTrain[, 'category']
predictions <- knn(train, test, labels, k = neighbors)
# Calculate accuracy
numtrain <- nrow(citiesTrain)
numtest <- nrow(citiesTest)
# as.character coercion needed to convert factors to values
predicted <- as.character(predictions)
actual <- as.character(citiesTest[, 'category'])
# seq_len() gives no iterations when numtest is 0 (1:numtest would give 1, 0)
for (i in seq_len(numtest)) {
    cat('Predicted:', predicted[i], ' Actual:', actual[i], '\n')
}
# Count matching predictions in one vectorized comparison
correct <- sum(predicted == actual)
cat('Accuracy:', correct/numtest)
# Comment out the print loop, play with other values for neighbors,
# try 'temperature' as feature

### <font color="green">Your Turn: K-nearest neighbors on World Cup Data</font>

In [None]:
%%R
# Predict position from one or more of minutes, shots, passes, tackles, saves
# Try different features and different numbers of neighbors
# What's the highest accuracy you can get?
# Note ties are broken at random so different runs may get different results
features <- c('minutes', 'shots', 'passes', 'tackles', 'saves')
neighbors <- 8
# drop = FALSE keeps a data frame even when only a single feature is selected
train <- playersTrain[, features, drop = FALSE]
test <- playersTest[, features, drop = FALSE]
labels <- playersTrain[, 'position']
predictions <- knn(train, test, labels, k = neighbors)
# Calculate accuracy
numtrain <- nrow(playersTrain)
numtest <- nrow(playersTest)
# as.character coercion needed to convert factors to values
predicted <- as.character(predictions)
actual <- as.character(playersTest[, 'position'])
# Uncomment to list each prediction next to the actual position
# for (i in seq_len(numtest)) {
#     cat('Predicted:', predicted[i], ' Actual:', actual[i], '\n')
# }
# Count matching predictions in one vectorized comparison
correct <- sum(predicted == actual)
cat('Accuracy:', correct/numtest)

### Decision tree classification

In [None]:
%%R
# Predict temperature category from other features
# minsplit: minimum number of observations needed for node split
# minbucket: minimum number of observations in leaf node (default minsplit/3)
features <- c('longitude','latitude')
# drop = FALSE keeps a data frame (with its column names) even when only a
# single feature is selected
train <- citiesTrain[, features, drop = FALSE]
test <- citiesTest[, features, drop = FALSE]
tree <- rpart(citiesTrain[ , 'category']~., train, minsplit=20, minbucket=5)
predictions <- predict(tree, test, type="class")
# Calculate accuracy
numtrain <- nrow(citiesTrain)
numtest <- nrow(citiesTest)
# as.character coercion needed to convert factors to values
predicted <- as.character(predictions)
actual <- as.character(citiesTest[, 'category'])
# seq_len() gives no iterations when numtest is 0 (1:numtest would give 1, 0)
for (i in seq_len(numtest)) {
    cat('Predicted:', predicted[i], ' Actual:', actual[i], '\n')
}
# Count matching predictions in one vectorized comparison
correct <- sum(predicted == actual)
cat('Accuracy:', correct/numtest)
# Comment out the print loop, play with other values for minsplit, minbucket
# Try 'temperature' as feature

### "Forest" of decision trees

In [None]:
%%R
# Predict temperature category from other features
# ntree: number of trees in forest
features <- c('longitude','latitude')
# drop = FALSE keeps a data frame (with its column names) even when only a
# single feature is selected
train <- citiesTrain[, features, drop = FALSE]
test <- citiesTest[, features, drop = FALSE]
forest <- randomForest(as.factor(citiesTrain[ , 'category'])~., train, ntree=10)
predictions <- predict(forest, test)
# Calculate accuracy
numtrain <- nrow(citiesTrain)
numtest <- nrow(citiesTest)
# as.character coercion needed to convert factors to values
predicted <- as.character(predictions)
actual <- as.character(citiesTest[, 'category'])
# Uncomment to list each prediction next to the actual category
# for (i in seq_len(numtest)) {
#     cat('Predicted:', predicted[i], ' Actual:', actual[i], '\n')
# }
# Count matching predictions in one vectorized comparison
correct <- sum(predicted == actual)
cat('Accuracy:', correct/numtest)
# Play with other values for ntree, try 'temperature' as feature

### <font color="green">Your Turn: Decision tree and forest of trees on World Cup Data</font>

In [None]:
%%R
# SINGLE TREE
# Predict position from one or more of minutes, shots, passes, tackles, saves
# Try different features and different values for minsplit and minbucket
# What's the highest accuracy you can get?
features <- c('minutes', 'shots', 'passes', 'tackles', 'saves')
# drop = FALSE keeps a data frame (with its column names) even when only a
# single feature is selected
train <- playersTrain[, features, drop = FALSE]
test <- playersTest[, features, drop = FALSE]
tree <- rpart(playersTrain[ , 'position']~., train, minsplit=20, minbucket=5)
predictions <- predict(tree, test, type="class")
# Calculate accuracy
numtrain <- nrow(playersTrain)
numtest <- nrow(playersTest)
# as.character coercion needed to convert factors to values
predicted <- as.character(predictions)
actual <- as.character(playersTest[, 'position'])
# Uncomment to list each prediction next to the actual position
# for (i in seq_len(numtest)) {
#     cat('Predicted:', predicted[i], ' Actual:', actual[i], '\n')
# }
# Count matching predictions in one vectorized comparison
correct <- sum(predicted == actual)
cat('Accuracy:', correct/numtest)

In [None]:
%%R
# FOREST OF TREES
# Predict position from one or more of minutes, shots, passes, tackles, saves
# Try different features and different values for ntree
# What's the highest accuracy you can get?
features <- c('minutes', 'shots', 'passes', 'tackles', 'saves')
# drop = FALSE keeps a data frame (with its column names) even when only a
# single feature is selected
train <- playersTrain[, features, drop = FALSE]
test <- playersTest[, features, drop = FALSE]
forest <- randomForest(as.factor(playersTrain[ , 'position'])~., train, ntree=10)
predictions <- predict(forest, test)
# Calculate accuracy
numtrain <- nrow(playersTrain)
numtest <- nrow(playersTest)
# as.character coercion needed to convert factors to values
predicted <- as.character(predictions)
actual <- as.character(playersTest[, 'position'])
# Uncomment to list each prediction next to the actual position
# (this debug print compares against the players data, matching the
# accuracy calculation below)
# for (i in seq_len(numtest)) {
#     cat('Predicted:', predicted[i], ' Actual:', actual[i], '\n')
# }
# Count matching predictions in one vectorized comparison
correct <- sum(predicted == actual)
cat('Accuracy:', correct/numtest)

### Naive Bayes Classification

In [None]:
%%R
# Predict temperature category from other features
features <- c('longitude', 'latitude')
# drop = FALSE keeps a data frame (with its column names) even when only a
# single feature is selected
train <- citiesTrain[, features, drop = FALSE]
test <- citiesTest[, features, drop = FALSE]
model <- naiveBayes(train, as.factor(citiesTrain[, 'category']))
predictions <- predict(model, test)
# Calculate accuracy
numtrain <- nrow(citiesTrain)
numtest <- nrow(citiesTest)
# as.character coercion needed to convert factors to values
predicted <- as.character(predictions)
actual <- as.character(citiesTest[, 'category'])
# seq_len() gives no iterations when numtest is 0 (1:numtest would give 1, 0)
for (i in seq_len(numtest)) {
    cat('Predicted:', predicted[i], ' Actual:', actual[i], '\n')
}
# Count matching predictions in one vectorized comparison
correct <- sum(predicted == actual)
cat('Accuracy:', correct/numtest)
# Comment out the print loop, try different features
# Add print(model), look at tables

### <font color="green">Your Turn: Naive Bayes on World Cup Data</font>

In [None]:
%%R
# Predict position from one or more of minutes, shots, passes, tackles, saves
# Try different features
# What's the highest accuracy you can get?
features <- c('minutes', 'shots', 'passes', 'tackles', 'saves')
# drop = FALSE keeps a data frame (with its column names) even when only a
# single feature is selected
train <- playersTrain[, features, drop = FALSE]
test <- playersTest[, features, drop = FALSE]
model <- naiveBayes(train, as.factor(playersTrain[, 'position']))
# print(model)
predictions <- predict(model, test)
# Calculate accuracy
numtrain <- nrow(playersTrain)
numtest <- nrow(playersTest)
# as.character coercion needed to convert factors to values
predicted <- as.character(predictions)
actual <- as.character(playersTest[, 'position'])
# Uncomment to list each prediction next to the actual position
# for (i in seq_len(numtest)) {
#     cat('Predicted:', predicted[i], ' Actual:', actual[i], '\n')
# }
# Count matching predictions in one vectorized comparison
correct <- sum(predicted == actual)
cat('Accuracy:', correct/numtest)

### Clustering

In [None]:
%%R
# K-means clustering of Cities data based on longitude-latitude:
# cluster on the two coordinate columns, then draw the cities on a
# longitude/latitude scatterplot colored by assigned cluster
numclusters <- 5
clus <- kmeans(cities[, c('longitude','latitude')], centers = numclusters)
plot(x = cities[['longitude']], y = cities[['latitude']],
     xlab = 'longitude', ylab = 'latitude',
     pch = 16, col = clus$cluster)
# Try different values for numclusters (K)

In [None]:
%%R
# Now with centroids: same longitude-latitude clustering and scatterplot,
# with each cluster center drawn on top as a larger marker
numclusters <- 5
clus <- kmeans(cities[, c('longitude','latitude')], centers = numclusters)
plot(x = cities[['longitude']], y = cities[['latitude']],
     xlab = 'longitude', ylab = 'latitude',
     pch = 16, col = clus$cluster)
points(clus$centers, cex = 2, pch = 8)

In [None]:
%%R
# K-means clustering of Cities data based on temperature:
# clusters are formed from the temperature column alone, but the cities are
# still drawn by longitude/latitude and colored by assigned cluster
numclusters <- 5
clus <- kmeans(cities[['temperature']], centers = numclusters)
plot(x = cities[['longitude']], y = cities[['latitude']],
     xlab = 'longitude', ylab = 'latitude',
     pch = 16, col = clus$cluster)
# print(clus$centers)
# uncomment to show cluster centers

### <font color="green">Your Turn: Clustering on World Cup Data</font>

In [None]:
%%R
# Create a clustering from the Players data and show it
# visually in a scatterplot. No need for centroids.
YOUR CODE HERE