# **Fish Species Prediction**

Build a classifier to predict the species of fish.

Using Random Forest Algorithm.

### **Data preprocessing**

In [76]:
import pandas as pd

from sklearn.preprocessing import MinMaxScaler

In [77]:
dataset = pd.read_csv("fish_dataset.csv")

# Print random datapoints.
dataset.sample(10)

Unnamed: 0,Species,Weight,Length1,Length2,Length3,Height,Width
146,Smelt,7.5,10.0,10.5,11.6,1.972,1.16
110,Perch,556.0,32.0,34.5,36.5,10.2565,6.3875
22,Bream,620.0,31.5,34.5,39.7,15.5227,5.2801
128,Pike,200.0,30.0,32.3,34.8,5.568,3.3756
158,Smelt,19.9,13.8,15.0,16.2,2.9322,1.8792
104,Perch,265.0,25.4,27.5,28.9,7.0516,4.335
1,Bream,290.0,24.0,26.3,31.2,12.48,4.3056
5,Bream,450.0,26.8,29.7,34.7,13.6024,4.9274
136,Pike,540.0,40.1,43.0,45.8,7.786,5.1296
144,Pike,1650.0,59.0,63.4,68.0,10.812,7.48


In [78]:
# Split features and target.

y = dataset["Species"]
X = dataset.drop("Species", axis=1)

X.head()

Unnamed: 0,Weight,Length1,Length2,Length3,Height,Width
0,242.0,23.2,25.4,30.0,11.52,4.02
1,290.0,24.0,26.3,31.2,12.48,4.3056
2,340.0,23.9,26.5,31.1,12.3778,4.6961
3,363.0,26.3,29.0,33.5,12.73,4.4555
4,430.0,26.5,29.0,34.0,12.444,5.134


In [79]:
y.sample(5)

70    Parkki
69    Parkki
18     Bream
71    Parkki
82     Perch
Name: Species, dtype: object

In [80]:
print(X.shape)
print(y.shape)

(159, 6)
(159,)


In [81]:
# Feature scaling.

scaler = MinMaxScaler()
 
X_scaled = scaler.fit_transform(X.to_numpy())

X_scaled = pd.DataFrame(X_scaled, columns=['Weight', 'Length1', 'Length2', 'Length3', 'Height', 'Width'])

X_scaled.sample(10)

Unnamed: 0,Weight,Length1,Length2,Length3,Height,Width
95,0.10303,0.271845,0.274545,0.273649,0.263898,0.377396
38,0.052727,0.207767,0.207273,0.226351,0.225683,0.299814
135,0.309091,0.631068,0.62,0.619932,0.295822,0.480858
127,0.606061,0.652427,0.647273,0.638514,0.624566,0.92301
104,0.160606,0.347573,0.347273,0.339527,0.308975,0.46338
140,0.575758,0.792233,0.787273,0.782095,0.417782,0.722203
18,0.369697,0.454369,0.456364,0.503378,0.807065,0.575975
12,0.30303,0.419417,0.42,0.466216,0.698304,0.468031
56,0.163636,0.32233,0.329091,0.346284,0.372462,0.451187
74,0.024242,0.12233,0.12,0.121622,0.121635,0.19514


### **Model training & evaluation**

In [82]:
import pickle

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

from sklearn.ensemble import RandomForestClassifier

In [83]:
# Spliting dataset into train and test.

X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.25, random_state=42)

print("X_train:", X_train.shape)
print("X_test:", X_test.shape)
print("y_train:", y_train.shape)
print("y_test:", y_test.shape)

X_train: (119, 6)
X_test: (40, 6)
y_train: (119,)
y_test: (40,)


In [84]:
# Model training.

clf = RandomForestClassifier(n_estimators=1000)

clf.fit(X_train, y_train)

RandomForestClassifier(n_estimators=1000)

In [85]:
# Model evaluation.

y_pred = clf.predict(X_test)

score = accuracy_score(y_test, y_pred)
print("Accuracy:", score)

Accuracy: 0.8


In [86]:
# Model saving.

with open('fish_classifier.pkl', 'wb') as f:
    pickle.dump(clf, f)

In [87]:
# Model loading.

with open('fish_classifier.pkl', 'rb') as f:
    clf_load = pickle.load(f)

In [88]:
# Custom model prediction.

output = clf_load.predict([[0.1,0.2,0.3,0.4,0.5,0.6]])

print("Output:", output[0])

Output: Parkki


  "X does not have valid feature names, but"
