# COMP 432 PROJECT: Predicting the Colors of Squirrels of Central Park
By Vanessa Razon
<br/>Student ID: 40033699

In [1]:
# LOAD PACKAGES AND DATA
import pandas as pd                   
import numpy as np                    
import sklearn      
import sklearn.linear_model         # For LogisticRegression class        
import sklearn.model_selection      # For cross-validation class
import sklearn.metrics              # For accuracy_score class
import sklearn.neural_network       # For MLPClassifier class
import sklearn.svm                  # For SVC class 
import sklearn.ensemble             # For RandomForestClassifier
import sklearn.tree                 # For DecisionTreeClassifier
import matplotlib.pyplot as plt
# import warnings 
# warnings.filterwarnings("ignore", category=UserWarning)

### Preprocessing the dataset


In [2]:
# Load the dataset
df = pd.read_csv('2018_Central_Park_Squirrel_Census_-_Squirrel_Data.csv')
df.shape # We see that there are currently 3023 data points and 31 columns

# Checking for any missing values in the features/labels that will be used in the project
# Features
print("Number of missing values:")
print("X coordinate: %i"%(df.shape[0]-df['X'].count()))
print("Y coordinate: %i"%(df.shape[0]-df['X'].count()))
print("Time of day: %i"%(df.shape[0]-df['Shift'].count()))
print("Date: %i"%(df.shape[0]-df['Date'].count()))
# Labels (note: since squirrels do not always have a highlight fur color, it is ok to have missing values for
# the highlight color)
print("Primary fur color: %i"%(df.shape[0]-df['Primary Fur Color'].count())) 

# Remove data points with missing values of primary fur color
df = df.dropna(subset=['Primary Fur Color'])
# Set missing highlight fur colors as 'None'
df['Highlight Fur Color'] = df['Highlight Fur Color'].fillna('None')
df.reset_index(drop=True, inplace=True)

Number of missing values:
X coordinate: 0
Y coordinate: 0
Time of day: 0
Date: 0
Primary fur color: 55


In [3]:
# Converting the features/labels into numpy arrays
from sklearn import preprocessing   # For LabelEncoder class

Xcoor = np.array(df['X'])
Ycoor = np.array(df['Y'])
# Time of day: value 0 if shift=AM or 1 if shift=PM
time = np.array([0 if df['Shift'][i]=='AM' else 1 for i in range(df.shape[0])])
# Date: since the data was collected in the span of 2 weeks of October 2018, only the date 
# number will be used for the 'Date' feature
date = np.array([str(df['Date'][i])[2]+str(df['Date'][i])[3] for i in range(df.shape[0])], dtype='int32')

# Since the labels are expressed as categorical variables, categorical encoding will be done 

primary_LE = preprocessing.LabelEncoder()
pColor = primary_LE.fit_transform(df['Primary Fur Color'])

highlight_LE = preprocessing.LabelEncoder()
hColor = highlight_LE.fit_transform(df['Highlight Fur Color'])

### Multiclass Logistic Regression
The different solvers for a multiclass problem in sklearn's LogisticRegression yielded the same accuracies for the training and test sets.
Here, newton-cg was used as the solver.

In [4]:
from sklearn.linear_model import LogisticRegression

def MLR(predictors,labels,folds):
    """ Trains a multiclass logistic regression and performs k-fold cross-validation. 
    Expected output is a multiclass logistic regression model that has been trained using the given predictors and
    labels.
    
    Arguments
    ---------
    predictors: matrix(n_samples,n_features) of predictors to be used in the model
    labels: vector of squirrel's color of shape (n_samples, )
    folds: number of folds used in cross-validation
    
    Example
    -------
    >>>pColor_MLR = MLR(modelInput, labels[0],4)
    held-out accuracy (4-fold): 76.58356%
    >>>pColor_MLR
    LogisticRegression(max_iter=300, multi_class='multinomial', random_state=0, solver='newton-cg')
    
    """
    model = LogisticRegression(multi_class='multinomial', 
                               max_iter=300, solver='newton-cg', random_state=0).fit(predictors,labels)
    kFoldCV(model, predictors, labels,folds)
    return model

### Multilayer Perceptron
Several different MLPs were tested (different number of layers ( > 1), different solver and/or activation functions) and all yielded the same accuracies for the training and test sets

In [5]:
from sklearn.neural_network import MLPClassifier

def MLP(predictors,labels,folds):
    """ Trains a 5-layer multilayer perceptron and performs k-fold cross-validation. 
    Expected output is a MLP model that has been trained using the given predictors and
    labels.
    
    Arguments
    ---------
    predictors: matrix(n_samples,n_features) of predictors to be used in the model
    labels: vector of squirrel's color (1x2 vector of Strings)
    
    Example
    -------
    >>>pColor_MLP = MLP(modelInput, labels[0],4)
    held-out accuracy (4-fold): 83.73771%
    >>>pColor_MLP
    MLPClassifier(hidden_layer_sizes=(10,20,30,20,10), activation='relu', solver='adam', batch_size=100, max_iter=500,
                          learning_rate_init=0.05, random_state=0)
    """
    model = MLPClassifier(hidden_layer_sizes=(10,20,30,40), activation='relu', solver='adam', batch_size=100, max_iter=500,
                          learning_rate_init=0.01, random_state=0).fit(predictors,labels)
    kFoldCV(model, predictors, labels,folds)
    return model
    

### Support Vector Machine

In [6]:
from sklearn.svm import SVC

def SVM(predictors,labels,folds):
    """ Trains a support vector machine and performs k-fold cross-validation. 
    Expected output is a MLP model that has been trained using the given predictors and
    labels.
    
    Arguments
    ---------
    predictors: matrix(n_samples,n_features) of predictors to be used in the model
    labels: vector of squirrel's color (1x2 vector of Strings)
    
    Example
    -------
    >>>pColor_SVM = SVM(modelInput, labels[0],4)
    held-out accuracy (4-fold): 76.58356%
    >>>pColor_SVM
    SVC(kernel='rbf', degree=2, gamma=1)
    """
    model = SVC(kernel='rbf', degree=2, gamma=1).fit(predictors, labels)
    kFoldCV(model, predictors, labels,folds)
    return model


### Decision Tree and Random Forest

In [7]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

def DTC(predictors, labels, folds):
    """ Trains a decision tree classifier and performs k-fold cross-validation. 
    Expected output is a MLP model that has been trained using the given predictors and
    labels.
    
    Arguments
    ---------
    predictors: matrix(n_samples,n_features) of predictors to be used in the model
    labels: vector of squirrel's color (1x2 vector of Strings)
    
    Example
    -------
    >>>pColor_DTC = DTC(modelInput, labels[0],4)
    held-out accuracy (4-fold): 76.58356%
    >>>pColor_DTC
    DecisionTreeClassifier(random_state=0)
    """
    model = DecisionTreeClassifier(random_state=0).fit(predictors, labels)
    kFoldCV(model, predictors, labels,folds)
    return model

def RF(predictors, labels, folds):
    """ Trains a random forest and performs k-fold cross-validation. 
    Expected output is a MLP model that has been trained using the given predictors and
    labels.
    
    Arguments
    ---------
    predictors: matrix(n_samples,n_features) of predictors to be used in the model
    labels: vector of squirrel's color (1x2 vector of Strings)
    
    Example
    -------
    >>>pColor_RF = RF(modelInput, labels[0],4)
    held-out accuracy (4-fold): 76.58356%
    >>>pColor_RF
    RandomForestClassifier(random_state=0)
    """
    model = RandomForestClassifier(n_estimators=30, random_state=0).fit(predictors,labels)
    kFoldCV(model, predictors, labels,folds)
    return model

In [8]:
# Other useful fuctions

def color(colorVector):
    """ Transforms labels for primary and highlight color into original color names. The expected output is a 1d 
        array that holds the primary and highlight colors (Strings) 
    
    Arguments
    ---------
    vector: 1x2 array that holds integer values to be translated as colors
    
    Example
    -------
    >>>vector = np.array([1,10])
    >>>color(vector)
    ['Cinnamon', 'White']
    """
    return np.append(primary_LE.inverse_transform(colorVector[0]), highlight_LE.inverse_transform(colorVector[1]))


from sklearn.model_selection import cross_val_score
from sklearn.model_selection import ShuffleSplit
from sklearn.metrics import accuracy_score

def kFoldCV(model, X_train, y_train,folds=10):
    """ Performs k-fold cross-validation using classification error and prints held-out accuracy
    
    Arguments
    ---------
    model: model that will be used for cross-validation 
    X_train: array of predictors of shape (n_samples, n_features)
    y_train: array of labels of shape (n_samples, )
    folds: int number of folds used in cross-validation
    
    Example
    -------
    >>>kFoldCV(mlp, predictors, labels, 5)
    held-out accuracy (5-fold): 54.42314%
    """
    scores = cross_val_score(model, X_train, y_train, cv=folds, scoring='accuracy',)
    print("Accuracy of model on training set by %d-fold cross-validation: %.5f%%"%(folds,scores.mean()*100))
    return scores.mean()*100
    
def testAcc(model, X_test, y_test):
    """ Computes classification error on test set and prints held-out accuracy
    
    Arguments
    ---------
    model: model that will be used for testing
    X_test: array of predictors of shape (n_samples, n_features)
    y_test: array of labels of shape (n_samples, )
    
    Example
    -------
    >>>testAcc(model, X_test, y_test)
    
    """
    pred = model.predict(X_test)
    print("Accuracy of model on test set: %.5f%%" %(accuracy_score(y_test, pred)*100))
    return accuracy_score(y_test, pred)*100

In [9]:
## FINDING SOME "BETTER" ALTERNATIVES

def MLR2(X_train,y_train,X_test,y_test,folds):
    xvals = np.linspace(25,200,8)
    solvers = ['newton-cg', 'saga', 'sag','lbfgs']
    test = np.empty(xvals.size)
    for s in solvers:
        print(s)
        for i in range(xvals.size):
            model = LogisticRegression(multi_class='multinomial', 
                                       max_iter=i, solver=s, random_state=0).fit(X_train,y_train)
            test[i] = testAcc(model,X_test,y_test);
        plt.plot(xvals,test,label=s)
    plt.legend()
    plt.xlabel("iteration")
    plt.ylabel("classification accuracy")

def MLP2(X_train,y_train,X_test,y_test,layers,act,s,folds):
    model = MLPClassifier(hidden_layer_sizes=layers, activation=act, solver=s, batch_size=100, max_iter=500,
                          learning_rate_init=0.05, random_state=0).fit(X_train,y_train)
    kFoldCV(model, X_train,y_train,folds)
    testAcc(model,X_test,y_test)
    return model

def SVM2(X_train,y_train,X_test,y_test,folds,ker,deg):
    model = SVC(kernel=ker, degree=deg, gamma=1).fit(X_train,y_train)
    kFoldCV(model, X_train,y_train,folds)
    testAcc(model,X_test,y_test)
    return model

def DTC2(X_train,y_train,X_test,y_test,folds,max_f):
    model = DecisionTreeClassifier(max_features=max_f, random_state=0).fit(X_train, y_train)
    kFoldCV(model, X_train,y_train,folds)
    testAcc(model,X_test,y_test)
    return model

def RF2(X_train,y_train,X_test,y_test,folds,est):
    model = RandomForestClassifier(random_state=0, n_estimators=est).fit(X_train, y_train)
    kFoldCV(model, X_train,y_train,folds)
    testAcc(model,X_test,y_test)
    return model

### Running all the machine learning models
Predicting primary fur color

In [10]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

modelInput = np.column_stack((Xcoor,Ycoor,time,date))
X_train, X_test, y_train, y_test = train_test_split(modelInput,pColor, test_size=0.25, random_state=0) # For training

print("PRIMARY COLOR PREDICTION")

print("\nMLR")
# Multiclass Logistic Regression
pColor_MLR= MLR(X_train, y_train,4)
testAcc(pColor_MLR, X_test, y_test)

print("\nMLP")
# Multi-layer perceptron
pColor_MLP = MLP(X_train, y_train,4)
testAcc(pColor_MLP, X_test, y_test)

print("\nSVM")
# Support-vector machine
pColor_SVM = SVM(X_train, y_train,4)
testAcc(pColor_SVM, X_test, y_test)

print("\nDecision tree")
# Decision tree
pColor_DTC = DTC(X_train, y_train,4)
testAcc(pColor_DTC, X_test, y_test)

print("\nRandom Forest")
# Random Forest
pColor_RF = SVM(X_train, y_train,4)
testAcc(pColor_RF, X_test, y_test);

PRIMARY COLOR PREDICTION

MLR
Accuracy of model on training set by 4-fold cross-validation: 83.73771%
Accuracy of model on test set: 82.07547%

MLP
Accuracy of model on training set by 4-fold cross-validation: 83.73771%
Accuracy of model on test set: 82.07547%

SVM
Accuracy of model on training set by 4-fold cross-validation: 83.73771%
Accuracy of model on test set: 82.07547%

Decision tree
Accuracy of model on training set by 4-fold cross-validation: 78.61601%
Accuracy of model on test set: 75.33693%

Random Forest
Accuracy of model on training set by 4-fold cross-validation: 83.73771%
Accuracy of model on test set: 82.07547%


We see that the all the models, except the Decision tree, obtain the same accuracy on the test set. But why?

In [11]:
# Looking at the different predictions

print(pColor_MLR.predict(X_test))
print(primary_LE.inverse_transform([2]))  
# It seems like the Multiclass Logistic Regression's output is always 2 (which corresponds to 'Gray')
np.all(primary_LE.inverse_transform(pColor_MLR.predict(X_test)) == 'Gray')
# The Multiclass Logistic Regression is predicting that the sighted squirrel 
# will be gray no matter what the input is

[2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 

True

In [12]:
# Checking if the other models get the same predictions

print("All MLP predictions are 'Gray'? " + str(np.array_equiv(pColor_MLR.predict(X_test), pColor_MLP.predict(X_test))))
print("All SVM predictions are 'Gray'? " + str(np.array_equiv(pColor_MLR.predict(X_test), pColor_SVM.predict(X_test))))
print("All Random Forest predictions are 'Gray'? " + str(np.array_equiv(pColor_MLR.predict(X_test), pColor_RF.predict(X_test))))
print("All Decision Tree predictions are 'Gray'? " + str(np.array_equiv(pColor_MLR.predict(X_test), pColor_DTC.predict(X_test))))

All MLP predictions are 'Gray'? True
All SVM predictions are 'Gray'? True
All Random Forest predictions are 'Gray'? True
All Decision Tree predictions are 'Gray'? False


In [13]:
print(df['Primary Fur Color'].value_counts()) 
# We see that squirrels with a gray primary fur color make up most of the dataset
# and this could be the reason why the 4 models are only predicting gray squirrels
print("\nGray squirrels make up %.5f%% of the dataset" %(2473/df['Primary Fur Color'].count()*100))

Gray        2473
Cinnamon     392
Black        103
Name: Primary Fur Color, dtype: int64

Gray squirrels make up 83.32210% of the dataset


#### Trying to handle the imbalance of classes problem
Using Penalize Algorithms (Cost-Sensitive Training)

In [34]:
# Penalized SVM
svc_model = SVC(kernel='rbf', degree=2, gamma=1, class_weight='balanced', probability=True).fit(X_train,y_train)
testAcc(svc_model,X_test,y_test)

# Tree-based algorithm: trying to find a better random forest
for i in range(1,20):
    print(i)
    randomF = RF2(X_train,y_train,X_test,y_test,4,i)
randomF = RF2(X_train,y_train,X_test,y_test,4,13)
np.unique(randomF.predict(X_test))

Accuracy of model on test set: 38.14016%
1
Accuracy of model on training set by 4-fold cross-validation: 76.19054%
Accuracy of model on test set: 74.79784%
2
Accuracy of model on training set by 4-fold cross-validation: 69.72177%
Accuracy of model on test set: 69.40701%
3
Accuracy of model on training set by 4-fold cross-validation: 79.69475%
Accuracy of model on test set: 78.43666%
4
Accuracy of model on training set by 4-fold cross-validation: 77.22350%
Accuracy of model on test set: 76.41509%
5
Accuracy of model on training set by 4-fold cross-validation: 80.77291%
Accuracy of model on test set: 78.84097%
6
Accuracy of model on training set by 4-fold cross-validation: 79.91981%
Accuracy of model on test set: 78.97574%
7
Accuracy of model on training set by 4-fold cross-validation: 82.52465%
Accuracy of model on test set: 78.97574%
8
Accuracy of model on training set by 4-fold cross-validation: 81.35696%
Accuracy of model on test set: 78.84097%
9
Accuracy of model on training set by 

array([0, 1, 2])

Predicting highlight fur color

In [None]:
print("HIGHLIGHT COLOR PREDICTION")

X_train, X_test, y_train, y_test = train_test_split(modelInput,hColor, test_size=0.25, random_state=0) # For training

print("\nMLR")
# Multiclass Logistic Regression
hColor_MLR= MLR(X_train, y_train,4)
testAcc(hColor_MLR, X_test, y_test)

print("\nMLP")
# Multi-layer perceptron
hColor_MLP = MLP(X_train, y_train,4)
testAcc(hColor_MLP, X_test, y_test)

print("\nSVM")
# Support-vector machine
hColor_SVM = SVM(X_train, y_train,4)
testAcc(hColor_SVM, X_test, y_test)

print("\nDecision tree")
# Decision tree
hColor_DTC = DTC(X_train, y_train,4)
testAcc(hColor_DTC, X_test, y_test)


print("\nRandom Forest")
# Random Forest
hColor_RF = SVM(X_train, y_train,4)
testAcc(hColor_RF, X_test, y_test);

### Clustering approach
Plotting the figures

In [None]:
plt.figure(figsize=(10,10))
coor = np.vstack((Xcoor,Ycoor)).T
plt.scatter(*coor[pColor==2].T, c="gray", s=10, label="gray")
plt.scatter(*coor[pColor==1].T, c="orange", s=10, label="cinnamon")
plt.scatter(*coor[pColor==0].T, c="black", s=10, label="black")
plt.xlabel("X coordinate")
plt.ylabel("Y coordinate")
plt.title("Squirrels' main color")
plt.legend()

plt.figure(figsize=(10,10))
colorList=["red", "orange", "yellow", "green", "blue", "purple", "pink", "gray", "black", "cyan", "olive"]
colorLabels = df['Highlight Fur Color'].value_counts().index.tolist()

for i in range(df["Highlight Fur Color"].nunique()):
    plt.scatter(*coor[hColor==i].T, c=colorList[i], s=10, label=colorLabels[i])
plt.legend()
plt.title("Squirrels' highlight color")
plt.xlabel("X coordinate")
plt.ylabel("Y coordinate")

In [None]:
import sklearn.cluster

coor = np.vstack((Xcoor,Ycoor)).T
plt.figure(figsize=(10,10))
kmeans = sklearn.cluster.KMeans(n_clusters=10, random_state=0).fit_predict(modelInput)
plt.scatter(coor[:, 0], coor[:, 1], c=kmeans, s=10)
plt.xlabel("X coordinate")
plt.ylabel("Y coordinate")
plt.title("Squirrels' main color")
plt.legend()

In [None]:
import sklearn.mixture
from sklearn.mixture import GaussianMixture

coor = np.vstack((Xcoor,Ycoor)).T
gmm = GaussianMixture(n_components=11, random_state=0).fit_predict(modelInput)
plt.scatter(coor[:, 0], coor[:, 1], c=gmm, s=10)
plt.xlabel("X coordinate")
plt.ylabel("Y coordinate")
plt.title("Squirrels' main color")
plt.legend()

gmm
kmeans = sklearn.cluster.KMeans(n_clusters=11, random_state=0).fit_predict(modelInput)

accuracy_score(hColor, kmeans)

In [None]:
MLP_test = MLP(modelInput[0],pColor[0],4)