In [27]:
import pandas as pd
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import mean_squared_error

Problem Statement 1: 
Utilize agglomerative clustering for actual and predicted classTasks to be performed:

•Load the file “zoo.data” and look at the info and first five rows.
The first column denotes the animal name and the last one specifies a high-level class for the corresponding animal.
•Find out the unique number of high level class.
•Use the 16-intermediate feature and perform an agglomerative clustering. 
[ Hint: Refer to the agglomerative clustering module in scikit learn and set the number of clusters appropriately]
•Compute the mean squared error by comparing the actual class and predicted high level class.

In [3]:
# Loading the dataset
data = pd.read_csv("zoo.csv")
print(data.head())

  animal_name  hair  feathers  eggs  milk  airborne  aquatic  predator  \
0    aardvark     1         0     0     1         0        0         1   
1    antelope     1         0     0     1         0        0         0   
2        bass     0         0     1     0         0        1         1   
3        bear     1         0     0     1         0        0         1   
4        boar     1         0     0     1         0        0         1   

   toothed  backbone  breathes  venomous  fins  legs  tail  domestic  catsize  \
0        1         1         1         0     0     4     0         0        1   
1        1         1         1         0     0     4     1         0        1   
2        1         1         0         0     1     0     1         0        0   
3        1         1         1         0     0     4     0         0        1   
4        1         1         1         0     0     4     1         0        1   

   class_type  
0           1  
1           1  
2           4  
3   

In [17]:
data['class_type'].value_counts()

class_type
1    41
2    20
4    13
7    10
6     8
3     5
5     4
Name: count, dtype: int64

In [35]:
data['class_type'].unique()

array([1, 4, 2, 7, 6, 5, 3], dtype=int64)

In [21]:
data['class_type'].nunique()

7

In [7]:
print(data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 101 entries, 0 to 100
Data columns (total 18 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   animal_name  101 non-null    object
 1   hair         101 non-null    int64 
 2   feathers     101 non-null    int64 
 3   eggs         101 non-null    int64 
 4   milk         101 non-null    int64 
 5   airborne     101 non-null    int64 
 6   aquatic      101 non-null    int64 
 7   predator     101 non-null    int64 
 8   toothed      101 non-null    int64 
 9   backbone     101 non-null    int64 
 10  breathes     101 non-null    int64 
 11  venomous     101 non-null    int64 
 12  fins         101 non-null    int64 
 13  legs         101 non-null    int64 
 14  tail         101 non-null    int64 
 15  domestic     101 non-null    int64 
 16  catsize      101 non-null    int64 
 17  class_type   101 non-null    int64 
dtypes: int64(17), object(1)
memory usage: 14.3+ KB
None


In [23]:
# Extracting relevant columns
animal_names = data.iloc[:, 0]  # First column: Animal names
features = data.iloc[:, 1:-1]  # Intermediate features
actual_classes = data.iloc[:, -1]  # Last column: High-level classes

In [37]:
features

Unnamed: 0,hair,feathers,eggs,milk,airborne,aquatic,predator,toothed,backbone,breathes,venomous,fins,legs,tail,domestic,catsize
0,1,0,0,1,0,0,1,1,1,1,0,0,4,0,0,1
1,1,0,0,1,0,0,0,1,1,1,0,0,4,1,0,1
2,0,0,1,0,0,1,1,1,1,0,0,1,0,1,0,0
3,1,0,0,1,0,0,1,1,1,1,0,0,4,0,0,1
4,1,0,0,1,0,0,1,1,1,1,0,0,4,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
96,1,0,0,1,0,0,0,1,1,1,0,0,2,1,0,1
97,1,0,1,0,1,0,0,0,0,1,1,0,6,0,0,0
98,1,0,0,1,0,0,1,1,1,1,0,0,4,1,0,1
99,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0


In [25]:
# Finding the number of unique classes
unique_classes = actual_classes.nunique()
print(f"Number of unique high-level classes: {unique_classes}")

Number of unique high-level classes: 7


In [31]:
# Performing Agglomerative Clustering
agg_clustering = AgglomerativeClustering(n_clusters=unique_classes, linkage='ward')
predicted_classes = agg_clustering.fit_predict(features)
predicted_classes

array([1, 1, 2, 1, 1, 1, 1, 2, 2, 1, 1, 3, 2, 6, 0, 0, 3, 1, 2, 2, 3, 3,
       1, 3, 0, 5, 5, 4, 1, 4, 0, 1, 4, 3, 2, 1, 1, 3, 2, 0, 0, 3, 0, 3,
       1, 1, 0, 1, 1, 1, 1, 0, 5, 0, 1, 1, 3, 3, 3, 3, 2, 2, 6, 5, 1, 1,
       2, 1, 1, 1, 1, 3, 0, 2, 2, 4, 6, 6, 3, 3, 6, 6, 2, 3, 4, 0, 2, 3,
       0, 5, 5, 5, 2, 4, 1, 3, 4, 0, 1, 6, 3], dtype=int64)

In [33]:
# Computing Mean Squared Error
mse = mean_squared_error(actual_classes, predicted_classes)
print(f"Mean Squared Error between actual and predicted classes: {mse:.4f}")

Mean Squared Error between actual and predicted classes: 7.6733
