In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

# Load the zoo dataset from a CSV path relative to the working directory.
df = pd.read_csv('data/zoo.csv')

Exploring the data

In [2]:
# DataFrame.info() writes its summary to stdout itself and returns None, so
# wrapping it in print() only appends a stray "None" line to the output.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 101 entries, 0 to 100
Data columns (total 18 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   animal_name  101 non-null    object
 1   hair         101 non-null    int64 
 2   feathers     101 non-null    int64 
 3   eggs         101 non-null    int64 
 4   milk         101 non-null    int64 
 5   airborne     101 non-null    int64 
 6   aquatic      101 non-null    int64 
 7   predator     101 non-null    int64 
 8   toothed      101 non-null    int64 
 9   backbone     101 non-null    int64 
 10  breathes     101 non-null    int64 
 11  venomous     101 non-null    int64 
 12  fins         101 non-null    int64 
 13  legs         101 non-null    int64 
 14  tail         101 non-null    int64 
 15  domestic     101 non-null    int64 
 16  catsize      101 non-null    int64 
 17  class_type   101 non-null    int64 
dtypes: int64(17), object(1)
memory usage: 14.3+ KB
None


In [3]:
# Peek at the first five rows of the raw frame.
print(df.head())

  animal_name  hair  feathers  eggs  milk  airborne  aquatic  predator  \
0    aardvark     1         0     0     1         0        0         1   
1    antelope     1         0     0     1         0        0         0   
2        bass     0         0     1     0         0        1         1   
3        bear     1         0     0     1         0        0         1   
4        boar     1         0     0     1         0        0         1   

   toothed  backbone  breathes  venomous  fins  legs  tail  domestic  catsize  \
0        1         1         1         0     0     4     0         0        1   
1        1         1         1         0     0     4     1         0        1   
2        1         1         0         0     1     0     1         0        0   
3        1         1         1         0     0     4     0         0        1   
4        1         1         1         0     0     4     1         0        1   

   class_type  
0           1  
1           1  
2           4  
3   

In [4]:
# Features: every int64 column except class_type, which is the label.
# Most of these columns are 0/1 flags; legs holds a leg count instead
# (4 and 0 in the first rows shown above).
features = df.select_dtypes(include=['int64']).drop(columns='class_type')
features.head()

Unnamed: 0,hair,feathers,eggs,milk,airborne,aquatic,predator,toothed,backbone,breathes,venomous,fins,legs,tail,domestic,catsize
0,1,0,0,1,0,0,1,1,1,1,0,0,4,0,0,1
1,1,0,0,1,0,0,0,1,1,1,0,0,4,1,0,1
2,0,0,1,0,0,1,1,1,1,0,0,1,0,1,0,0
3,1,0,0,1,0,0,1,1,1,1,0,0,4,0,0,1
4,1,0,0,1,0,0,1,1,1,1,0,0,4,1,0,1


In [5]:
# The label column: class_type, an integer code per animal.
labels = df['class_type']
labels.head()

0    1
1    1
2    4
3    1
4    1
Name: class_type, dtype: int64

In [6]:
# The classifier below is binary, so collapse the labels to
# mammal (class_type == 1) -> 1 and everything else -> 0.
#
# A new Series is built instead of calling replace(..., inplace=True):
# `labels` comes straight from a column of `df`, so an in-place replace can
# write through to `df` and leave earlier displayed output stale. Comparing
# against 1 also avoids hard-coding the range of non-mammal class codes.
# Re-running this cell is safe: labels that are already 0/1 map to themselves.
labels = (labels == 1).astype('int64')

# Non-mammals
print((labels == 0).sum())
# Mammals
print((labels == 1).sum())

60
41


In [7]:
from sklearn import preprocessing
import sklearn.model_selection as ms
from sklearn import linear_model as lm
from sklearn.metrics import mean_squared_error as mse 

In [8]:
# Hold out 20% of the rows for testing.
# random_state pins the shuffle so the split -- and with it every coefficient,
# probability and metric printed below -- is the same on each
# "Restart Kernel -> Run All". Without it each run produces a different split.
train_features, test_features, train_labels, test_labels = ms.train_test_split(
    features, labels, test_size=0.2, random_state=42
)
print(train_features.shape)
print(test_features.shape)
print(train_labels.shape)
print(test_labels.shape)

(80, 16)
(21, 16)
(80,)
(21,)


In [9]:
# Create a logistic-regression classifier using the L-BFGS solver,
# then fit it on the training split.
logistic_mod = lm.LogisticRegression(solver='lbfgs')
logistic_mod.fit(X=train_features, y=train_labels)

In [10]:
# Show the fitted weights (one per feature column) followed by the intercept.
print(logistic_mod.coef_, logistic_mod.intercept_, sep='\n')

[[ 1.44279022 -0.48709736 -1.41031012  1.87652136 -0.38808301 -0.25844068
  -0.11043989  0.75956096  0.5323303   0.52877168 -0.48057464 -0.01190482
   0.10940594  0.2883512   0.13904888  0.81167099]]
[-3.09225947]


In [11]:
# Class-probability estimates for both splits, computed the same way.
train_probs, test_probs = (
    logistic_mod.predict_proba(split)
    for split in (train_features, test_features)
)

In [12]:
# Header line, then the probability rows for the first five training samples.
print('Class 0 and Class 1 probabilities', train_probs[:5], sep='\n')

Class 0 and Class 1 probabilities
[[0.95723193 0.04276807]
 [0.02697863 0.97302137]
 [0.96453543 0.03546457]
 [0.0333574  0.9666426 ]
 [0.04401558 0.95598442]]


In [13]:
# Turn the class-1 probability (column 1) into a hard 0/1 prediction:
# strictly above 0.5 counts as class 1.
train_predictions = (train_probs[:, 1] > 0.5).astype(int)
test_predictions = (test_probs[:, 1] > 0.5).astype(int)

In [14]:
# First five predictions next to the first five true training labels.
print(train_predictions[:5], train_labels.head())

[0 1 0 1 1] 95    0
28    1
2     0
96    1
32    1
Name: class_type, dtype: int64


In [15]:
def ClassifierMetrics(labels, predictions):
    """Compute accuracy, precision and recall for a binary (0/1) classifier.

    Class 1 is treated as the positive class.

    Args:
        labels: True class labels. Any array-like works: list, NumPy array
            or pandas Series.
        predictions: Predicted class labels, same shape as ``labels``.

    Returns:
        ``[accuracy, precision, recall]`` as Python floats. A metric whose
        denominator is zero (empty input, no positive predictions, or no
        positive labels) is returned as ``nan``.

    Raises:
        ValueError: If ``labels`` and ``predictions`` have different shapes.
    """
    # Coerce to plain arrays so the comparison is strictly positional. This
    # accepts lists, and stops two pandas Series with different indexes from
    # raising (or being aligned by index) when compared.
    labels = np.asarray(labels)
    predictions = np.asarray(predictions)
    if labels.shape != predictions.shape:
        raise ValueError(
            f"labels and predictions must have the same shape, "
            f"got {labels.shape} and {predictions.shape}"
        )

    def _ratio(numerator, denominator):
        # Guarded division: an undefined metric is nan, without a RuntimeWarning.
        return numerator / denominator if denominator else float('nan')

    total = int(labels.size)
    correct = int((labels == predictions).sum())
    predicted_positive = predictions == 1
    actual_positive = labels == 1
    true_positives = int((predicted_positive & actual_positive).sum())

    accuracy = _ratio(correct, total)

    # Precision = correct '1' predictions / all '1' predictions
    precision = _ratio(true_positives, int(predicted_positive.sum()))

    # Recall = correct '1' predictions / all actual '1's
    recall = _ratio(true_positives, int(actual_positive.sum()))

    return [accuracy, precision, recall]

In [16]:
# Accuracy / precision / recall on the training split, one per line.
train_metrics = ClassifierMetrics(train_labels, train_predictions)
print(
    f"Accuracy: {train_metrics[0]}",
    f"Precision: {train_metrics[1]}",
    f"Recall: {train_metrics[2]}",
    sep="\n",
)

Accuracy: 1.0
Precision: 1.0
Recall: 1.0


In [17]:
# Accuracy / precision / recall on the held-out test split, one per line.
test_metrics = ClassifierMetrics(test_labels, test_predictions)
print(
    f"Accuracy: {test_metrics[0]}",
    f"Precision: {test_metrics[1]}",
    f"Recall: {test_metrics[2]}",
    sep="\n",
)

Accuracy: 1.0
Precision: 1.0
Recall: 1.0


In [18]:
# Column order of the feature matrix the model was trained on.
print(features.columns)

Index(['hair', 'feathers', 'eggs', 'milk', 'airborne', 'aquatic', 'predator',
       'toothed', 'backbone', 'breathes', 'venomous', 'fins', 'legs', 'tail',
       'domestic', 'catsize'],
      dtype='object')


In [21]:
# Try your own animal:
# Give one value per feature column. The columns are 0 (no) / 1 (yes) flags,
# except `legs`, which is the number of legs (the sample rows show 4 and 0).

# Testing for bat
custom_values = {
    'hair': 1, 'feathers': 0, 'eggs': 0, 'milk': 1,
    'airborne': 1, 'aquatic': 0, 'predator': 1, 'toothed': 1,
    'backbone': 1, 'breathes': 1, 'venomous': 0, 'fins': 0,
    'legs': 2,  # a count, not a flag: a bat has two legs
    'tail': 1, 'domestic': 0, 'catsize': 0,
}

# Naming each value removes the risk of listing them in the wrong order, and
# this check catches a misspelled or missing feature before predicting.
assert set(custom_values) == set(features.columns), "feature names do not match the training columns"

# Build a one-row frame in the training column order. The model was fitted on
# a DataFrame, so passing named columns also avoids scikit-learn's
# "X does not have valid feature names" warning.
custom_animal = pd.DataFrame([custom_values], columns=features.columns)
print(custom_animal.shape)

pred = logistic_mod.predict(custom_animal)

# predict() returns one entry per row; look at the single row explicitly
# rather than comparing the whole array.
if pred[0] == 0:
    print('Your animal is not a mammal')
else:
    print('Your animal is a mammal')

(1, 16)
Your animal is a mammal


