In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
import plotly.express as py
import plotly.graph_objects as go
import plotly.figure_factory as ff
from plotly.offline import init_notebook_mode, iplot
import cufflinks as cf
init_notebook_mode(connected=True)
cf.go_offline()

In [3]:
import warnings
warnings.filterwarnings('ignore')

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, accuracy_score
# from sklearn.preprocessing import StandardScaler

In [5]:
df = pd.read_csv('Zoo.csv')
df.head()

Unnamed: 0,animal name,hair,feathers,eggs,milk,airborne,aquatic,predator,toothed,backbone,breathes,venomous,fins,legs,tail,domestic,catsize,type
0,aardvark,1,0,0,1,0,0,1,1,1,1,0,0,4,0,0,1,1
1,antelope,1,0,0,1,0,0,0,1,1,1,0,0,4,1,0,1,1
2,bass,0,0,1,0,0,1,1,1,1,0,0,1,0,1,0,0,4
3,bear,1,0,0,1,0,0,1,1,1,1,0,0,4,0,0,1,1
4,boar,1,0,0,1,0,0,1,1,1,1,0,0,4,1,0,1,1


In [6]:
df.shape

(101, 18)

In [7]:
sum(df.duplicated())

0

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 101 entries, 0 to 100
Data columns (total 18 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   animal name  101 non-null    object
 1   hair         101 non-null    int64 
 2   feathers     101 non-null    int64 
 3   eggs         101 non-null    int64 
 4   milk         101 non-null    int64 
 5   airborne     101 non-null    int64 
 6   aquatic      101 non-null    int64 
 7   predator     101 non-null    int64 
 8   toothed      101 non-null    int64 
 9   backbone     101 non-null    int64 
 10  breathes     101 non-null    int64 
 11  venomous     101 non-null    int64 
 12  fins         101 non-null    int64 
 13  legs         101 non-null    int64 
 14  tail         101 non-null    int64 
 15  domestic     101 non-null    int64 
 16  catsize      101 non-null    int64 
 17  type         101 non-null    int64 
dtypes: int64(17), object(1)
memory usage: 14.3+ KB


In [9]:
df.isnull().sum()

animal name    0
hair           0
feathers       0
eggs           0
milk           0
airborne       0
aquatic        0
predator       0
toothed        0
backbone       0
breathes       0
venomous       0
fins           0
legs           0
tail           0
domestic       0
catsize        0
type           0
dtype: int64

In [10]:
df.describe()

Unnamed: 0,hair,feathers,eggs,milk,airborne,aquatic,predator,toothed,backbone,breathes,venomous,fins,legs,tail,domestic,catsize,type
count,101.0,101.0,101.0,101.0,101.0,101.0,101.0,101.0,101.0,101.0,101.0,101.0,101.0,101.0,101.0,101.0,101.0
mean,0.425743,0.19802,0.584158,0.405941,0.237624,0.356436,0.554455,0.60396,0.821782,0.792079,0.079208,0.168317,2.841584,0.742574,0.128713,0.435644,2.831683
std,0.496921,0.400495,0.495325,0.493522,0.42775,0.481335,0.499505,0.491512,0.384605,0.407844,0.27141,0.376013,2.033385,0.439397,0.336552,0.498314,2.102709
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,2.0,0.0,0.0,0.0,1.0
50%,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,4.0,1.0,0.0,0.0,2.0
75%,1.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,4.0,1.0,0.0,1.0,4.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,8.0,1.0,1.0,1.0,7.0


In [11]:
count = df.groupby('type')['type'].count().reset_index(name='Count')
count

Unnamed: 0,type,Count
0,1,41
1,2,20
2,3,5
3,4,13
4,5,4
5,6,8
6,7,10


In [12]:
bar = py.bar(count, x='type', y='Count', title='Animal Type')
bar.show()

#### Make x value & y value

In [13]:
x = df.iloc[:, 1:17]
y = df[['type']]

In [14]:
x.head(1)

Unnamed: 0,hair,feathers,eggs,milk,airborne,aquatic,predator,toothed,backbone,breathes,venomous,fins,legs,tail,domestic,catsize
0,1,0,0,1,0,0,1,1,1,1,0,0,4,0,0,1


In [15]:
y.head(1)

Unnamed: 0,type
0,1


In [16]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=4)

In [17]:
def getNeighborsData(x_train, y_train, x_test, y_test):
    arr = []
    for k in range(1,8):
        model = KNeighborsClassifier(n_neighbors=k)
        model.fit(x_train, y_train)
        pred = model.predict(x_test)
        acc = accuracy_score(y_test, pred)

        obj = {
            'n_neighbors': k,
            'accuracy': acc
        }
        arr.append(obj)
    
    return arr

In [18]:
getNeighborsData(x_train, y_train, x_test, y_test)

[{'n_neighbors': 1, 'accuracy': 1.0},
 {'n_neighbors': 2, 'accuracy': 0.9523809523809523},
 {'n_neighbors': 3, 'accuracy': 0.9523809523809523},
 {'n_neighbors': 4, 'accuracy': 0.9047619047619048},
 {'n_neighbors': 5, 'accuracy': 0.8571428571428571},
 {'n_neighbors': 6, 'accuracy': 0.8095238095238095},
 {'n_neighbors': 7, 'accuracy': 0.8571428571428571}]

#### Neighbors 1 is much better

In [19]:
model = KNeighborsClassifier(n_neighbors=1)
model.fit(x_train, y_train)

pred = model.predict(x_test)

acc = accuracy_score(y_test, pred)
acc

1.0

In [20]:
report = classification_report(y_test, pred)
print(report)

              precision    recall  f1-score   support

           1       1.00      1.00      1.00         7
           2       1.00      1.00      1.00         5
           4       1.00      1.00      1.00         1
           5       1.00      1.00      1.00         1
           6       1.00      1.00      1.00         3
           7       1.00      1.00      1.00         4

    accuracy                           1.00        21
   macro avg       1.00      1.00      1.00        21
weighted avg       1.00      1.00      1.00        21

