<a href="https://colab.research.google.com/github/ylfoo/ERA2036/blob/main/Lab/Learn_Classification_thru_Fish.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Credit to Law Yi Yang

In [1]:
# Classification model to predict the fish species
# Load modules and packages
import pandas as pd
from sklearn.model_selection import train_test_split as split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

In [2]:
# Read the fish measurements CSV straight from its GitHub raw URL
FISH_CSV_URL = 'https://raw.githubusercontent.com/wooihaw/datasets/main/fish.csv'
df = pd.read_csv(FISH_CSV_URL)

In [3]:
# Dimensions of the dataset as (rows, columns)
df.shape

(159, 7)

In [4]:
# Preview 10 randomly sampled rows; the fixed seed keeps the preview
# identical across kernel restarts (same seed as the split and tree cells)
df.sample(10, random_state=42)

Unnamed: 0,Species,Weight,Length1,Length2,Length3,Height,Width
87,Perch,120.0,20.0,22.0,23.5,5.64,3.525
99,Perch,180.0,23.0,25.0,26.5,6.4395,3.6835
34,Bream,950.0,38.0,41.0,46.5,17.6235,6.3705
153,Smelt,9.8,11.4,12.0,13.2,2.2044,1.1484
19,Bream,650.0,31.0,33.5,38.7,14.4738,5.7276
116,Perch,900.0,36.5,39.0,41.4,11.1366,7.4934
0,Bream,242.0,23.2,25.4,30.0,11.52,4.02
72,Perch,5.9,7.5,8.4,8.8,2.112,1.408
46,Roach,140.0,21.0,22.5,25.0,6.55,3.325
16,Bream,700.0,30.4,33.0,38.3,14.8604,5.2854


In [5]:
# Descriptive statistics (count, mean, std, min, quartiles, max) of the numeric columns
df.describe()

Unnamed: 0,Weight,Length1,Length2,Length3,Height,Width
count,159.0,159.0,159.0,159.0,159.0,159.0
mean,398.326415,26.24717,28.415723,31.227044,8.970994,4.417486
std,357.978317,9.996441,10.716328,11.610246,4.286208,1.685804
min,0.0,7.5,8.4,8.8,1.7284,1.0476
25%,120.0,19.05,21.0,23.15,5.9448,3.38565
50%,273.0,25.2,27.3,29.4,7.786,4.2485
75%,650.0,32.7,35.5,39.65,12.3659,5.5845
max,1650.0,59.0,63.4,68.0,18.957,8.142


In [6]:
# Split dataset into training and testing sets
# The label column is selected by name instead of by position, and the guard
# makes this cell safe to re-run: once "Species" has been removed from df,
# a second run keeps the existing y rather than mistaking the first feature
# column for the labels.
if "Species" in df.columns:
    y = df["Species"].to_numpy()
    # Rebind df to a features-only frame (a later cell reports its shape)
    df = df.drop(columns="Species")
X = df.to_numpy()
# Hold out 25% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = split(X, y, test_size=0.25, random_state=42)

In [7]:
# Baseline k-NN classifier: a single nearest neighbour (k=1)
knn = KNeighborsClassifier(n_neighbors=1)
knn.fit(X_train, y_train)
# score() reports mean accuracy on the held-out test set
knn_accuracy = knn.score(X_test, y_test)
print(f"knn accuracy: {knn_accuracy}")

knn accuracy: 0.525


In [8]:
# Baseline decision tree: a single split (max_depth=1), seeded for repeatability
dtc = DecisionTreeClassifier(random_state=42, max_depth=1)
dtc.fit(X_train, y_train)
# score() reports mean accuracy on the held-out test set
dtc_accuracy = dtc.score(X_test, y_test)
print(f"dtc accuracy: {dtc_accuracy}")

dtc accuracy: 0.475


In [9]:
# Report the (rows, columns) shape of the frame and of each split
for caption, dims in (
    ("Dimensions of the dataset:", df.shape),
    ("Dimensions of the training set (X_train):", X_train.shape),
    ("Dimensions of the testing set (X_test):", X_test.shape),
):
    print(caption, dims)

Dimensions of the dataset: (159, 6)
Dimensions of the training set (X_train): (119, 6)
Dimensions of the testing set (X_test): (40, 6)


In [10]:
# The feature count is the size of the last (column) axis of the 2-D training matrix
num_features = X_train.shape[-1]
print("Number of features in the dataset:", num_features)

Number of features in the dataset: 6


In [11]:
# The row count of each split is the length along the first axis
num_samples_train, num_samples_test = len(X_train), len(X_test)
print("Number of data samples in the training set:", num_samples_train)
print("Number of data samples in the testing set:", num_samples_test)

Number of data samples in the training set: 119
Number of data samples in the testing set: 40


In [12]:
# Compare k-NN test accuracy across several neighbourhood sizes
k_values = [3, 5, 7, 9]
knn_scores = []

for k in k_values:
    # Fit a fresh classifier for this k, then score it on the held-out set
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, y_train)
    knn_score = knn.score(X_test, y_test)
    knn_scores += [(k, knn_score)]

# One line per (k, accuracy) pair, in the order tried
print("K-NN Performance:")
for k, score in knn_scores:
    print(f"k={k}, accuracy: {score}")


K-NN Performance:
k=3, accuracy: 0.475
k=5, accuracy: 0.6
k=7, accuracy: 0.575
k=9, accuracy: 0.6


In [13]:
# Compare decision-tree test accuracy for depths 2 through 10 inclusive
max_depth_values = [*range(2, 11)]
dtc_scores = []

for max_depth in max_depth_values:
    # Same seed for every fit so that only max_depth varies between runs
    dtc = DecisionTreeClassifier(random_state=42, max_depth=max_depth)
    dtc.fit(X_train, y_train)
    dtc_score = dtc.score(X_test, y_test)
    dtc_scores += [(max_depth, dtc_score)]

# One line per (max_depth, accuracy) pair, in the order tried
print("Decision Tree Performance:")
for max_depth, score in dtc_scores:
    print(f"max_depth={max_depth}, accuracy: {score}")


Decision Tree Performance:
max_depth=2, accuracy: 0.675
max_depth=3, accuracy: 0.7
max_depth=4, accuracy: 0.7
max_depth=5, accuracy: 0.725
max_depth=6, accuracy: 0.625
max_depth=7, accuracy: 0.75
max_depth=8, accuracy: 0.775
max_depth=9, accuracy: 0.7
max_depth=10, accuracy: 0.725


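Between these two models, the Decision Tree is more suitable for this dataset: its best test accuracy (0.775 at max_depth = 8) is higher than the best k-NN test accuracy (0.6 at k = 5 and k = 9). Among the values tried, max_depth = 8 is therefore the optimum value for the hyperparameter on this test split.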