In [146]:
import pandas as pd
import plotly.graph_objs as go
from sklearn.cluster import KMeans
import warnings
warnings.filterwarnings('ignore')

In [147]:
# Read the fish measurements CSV (path is relative to this notebook) and
# preview the last five rows.
fish_csv_path = '../../datasets/fish.csv'
df = pd.read_csv(fish_csv_path)
df.tail()

Unnamed: 0,species,weight,length1,length2,length3,length4,length5
80,Pike,950.0,48.3,51.7,55.1,16.2,11.2
81,Pike,1250.0,52.0,56.0,59.7,17.9,11.7
82,Pike,1600.0,56.0,60.0,64.0,15.0,9.6
83,Pike,1550.0,56.0,60.0,64.0,15.0,9.6
84,Pike,1650.0,59.0,63.4,68.0,15.9,11.0


In [148]:
# List the distinct values found in the 'species' column
df.loc[:, 'species'].unique()

array(['Bream', 'Roach', 'Smelt', 'Pike'], dtype=object)

In [149]:
from plotly.subplots import make_subplots

# One panel per length measurement (length1 .. length5), laid out in a single row
length_ids = range(1, 6)
panel_titles = tuple(f'Weight vs Length {n}' for n in length_ids)
fig = make_subplots(rows=1, cols=5, subplot_titles=panel_titles)

# Panel n plots weight on x against length<n> on y
for n in length_ids:
    points = go.Scatter(x=df['weight'], y=df[f'length{n}'], mode='markers')
    fig.add_trace(points, row=1, col=n)

# Overall title and figure size
fig.update_layout(
    title='Weight vs Length Scatterplots for Fish',
    height=400,
    width=1000,
)

# Render the figure
fig.show()

In [150]:
import plotly.express as px

# Columns to correlate, paired with the label shown for each on the heatmap.
# Selecting by name (instead of by position) keeps the plotted data and the
# axis labels in sync and matches the name-based selection used for the
# clustering features elsewhere in this notebook.
measurement_labels = {
    'weight': 'Weight',
    'length1': 'Length 1',
    'length2': 'Length 2',
    'length3': 'Length 3',
    'length4': 'Length 4',
    'length5': 'Length 5',
}
df_corr = df[list(measurement_labels)]

# Pairwise correlation between the selected measurements
corr_matrix = df_corr.corr()

# Heatmap with a diverging colour scale pinned to the full [-1, 1] range
axis_labels = list(measurement_labels.values())
fig = px.imshow(corr_matrix,
                labels=dict(x="Fish Measurements", y="Fish Measurements", color="Correlation Coefficient"),
                x=axis_labels,
                y=axis_labels,
                color_continuous_scale='RdBu',
                zmin=-1, zmax=1)

fig.update_layout(title='Correlation Matrix of Fish Measurements')
fig.show()


In [151]:
# Feature matrix for clustering: df columns at positions 1..6 as a NumPy array.
# NOTE(review): presumably weight and length1..length5 - confirm against the CSV.
X = df.iloc[:, 1:7].values

# Elbow method: fit KMeans for k = 2..9 and record each model's inertia
# (sum of squared distances of samples to their closest centroid).
# random_state is fixed so the curve is identical on every run; 42 matches
# the seed used for the k=4 model later in the notebook.
sse = []
k_values = range(2, 10)
for k in k_values:
    kmeans = KMeans(n_clusters=k, random_state=42)
    kmeans.fit(X)
    sse.append(kmeans.inertia_)

# Plot inertia against k; the "elbow" is where the curve stops dropping sharply
fig = go.Figure()
fig.add_trace(go.Scatter(x=list(k_values), y=sse, mode='lines+markers'))
fig.update_layout(title='Elbow Curve for Fish Clustering', xaxis_title='Number of Clusters', yaxis_title='Sum of Squared Distances')
fig.show()

In [152]:
from sklearn.cluster import KMeans
import plotly.graph_objects as go
import pandas as pd


# Two features used for the 2-D clustering
X = df[['weight', 'length3']]

# Fit a seeded 4-cluster KMeans model on those features
kmeans = KMeans(n_clusters=4, random_state=42)
kmeans.fit(X)

# Store each row's cluster id on the dataframe
df['cluster'] = kmeans.labels_

# Marker styling used by every trace: colour encodes the cluster id
cluster_marker = dict(
    color=df['cluster'],
    colorscale='Viridis',
    size=7,
    line=dict(width=0.5, color='white'),
)

# Draw one scatter per unordered pair of distinct feature columns
fig = go.Figure()
feature_names = list(X.columns)
for pos, feat1 in enumerate(feature_names):
    for feat2 in feature_names[pos + 1:]:
        pair_scatter = go.Scatter(
            x=X[feat1],
            y=X[feat2],
            mode='markers',
            marker=cluster_marker,
            name='',
            showlegend=False,
        )
        fig.add_trace(pair_scatter)

# Titles, axis labels and figure size, then render
layout_options = dict(
    title='KMeans Clustering (k=4) of Fish Species',
    xaxis_title='Weight',
    yaxis_title='Length',
    width=800,
    height=600,
)
fig.update_layout(**layout_options)
fig.show()


In [153]:
from sklearn.metrics import silhouette_score

# Fit a KMeans model with k=4 on the current feature matrix X.
# random_state=42 (the same seed used for the model that produced
# df['cluster']) keeps the cluster numbering reproducible; this `kmeans`
# object is the one later cells use for predictions, so unstable numbering
# would change their results from run to run.
kmeans = KMeans(n_clusters=4, random_state=42)
kmeans.fit(X)

# Silhouette score of the fitted labelling (ranges from -1 to 1; higher is better)
score = silhouette_score(X, kmeans.labels_)
print(f"Silhouette Score: {score:.3f}")


Silhouette Score: 0.612


In [154]:
# Cluster id assigned by the fitted model to every row of X
cluster_labels = kmeans.predict(X)

# Translate cluster ids into species names, using the four species that
# actually occur in the dataset (Bream, Roach, Smelt, Pike).
# NOTE(review): KMeans numbers its clusters arbitrarily, so this id -> species
# pairing is an assumption - verify it against the true 'species' column.
species_labels = {0: 'Bream', 1: 'Roach', 2: 'Smelt', 3: 'Pike'}
species_cluster_labels = [species_labels[label] for label in cluster_labels]



In [155]:
def return_specie(predicted_cluster):
    """Map a predicted cluster id to a species name.

    Parameters
    ----------
    predicted_cluster : int or single-element array-like
        A cluster id, either as a plain/NumPy scalar or wrapped in a
        one-element list, tuple or array (e.g. the result of calling
        ``predict`` on a single sample).

    Returns
    -------
    str
        "Bream" for 0, "Roach" for 1, "Smelt" for 2 and "Pike" for any
        other value.

    Raises
    ------
    ValueError
        If a list, tuple or array that does not hold exactly one label is
        passed.
    """
    label = predicted_cluster

    # NumPy arrays and NumPy scalars expose tolist(); convert them to plain
    # Python objects so the unwrapping below treats every input the same way.
    if hasattr(label, "tolist"):
        label = label.tolist()

    # Unwrap one-element containers such as [0] or [[0]]. Comparing a list to
    # an int is always False, so without this step such inputs would silently
    # fall through to the default species.
    while isinstance(label, (list, tuple)):
        if len(label) != 1:
            raise ValueError(
                "return_specie expects a single cluster label, "
                f"got {len(label)} values"
            )
        label = label[0]

    species_by_cluster = {0: "Bream", 1: "Roach", 2: "Smelt"}
    try:
        # Any id other than 0, 1 or 2 maps to "Pike"
        return species_by_cluster.get(label, "Pike")
    except TypeError:
        # Unhashable input cannot equal 0, 1 or 2, so it takes the default too
        return "Pike"

In [156]:
def generate_new_predict(new_data, model=None):
    """Predict and print the cluster and mapped species for new samples.

    Parameters
    ----------
    new_data : 2-D array-like
        One row per sample, e.g. ``[[250, 25]]``. Each row must hold the
        same features, in the same order, that the model was fitted on.
    model : object with a ``predict`` method, optional
        Fitted clustering model to use. Defaults to the notebook-level
        ``kmeans`` model.

    Returns
    -------
    None
        One line is printed per sample.
    """
    if model is None:
        model = kmeans

    # Handle each prediction separately so that every row of new_data is
    # reported and return_specie always receives a single cluster id.
    for predicted_cluster in model.predict(new_data):
        predicted_species = return_specie(predicted_cluster)
        print(f"The predicted cluster is {predicted_cluster} and the predicted species is {predicted_species}.")


In [157]:
# New data points to classify, one single-row sample per prediction call
for new_data in (
    [[250, 25 ]],
    [[800, 37.2 ]],
    [[495, 9 ]],
    [[1250, 65 ]],
):
    generate_new_predict(new_data)

The predicted cluster is 0 and the predicted species is Bream.
The predicted cluster is 3 and the predicted species is Pike.
The predicted cluster is 1 and the predicted species is Roach.
The predicted cluster is 2 and the predicted species is Smelt.


In [158]:
# Build the 3-feature clustering matrix from the df columns at positions 1, 2 and 4.
# NOTE(review): presumably weight, length1 and length3 - confirm against the CSV.
feature_positions = [1, 2, 4]
X = df.iloc[:, feature_positions].to_numpy()

# Preview rows 1 to 4 of the matrix
X[1:5]


array([[290. ,  24. ,  31.2],
       [340. ,  23.9,  31.1],
       [363. ,  26.3,  33.5],
       [430. ,  26.5,  34. ]])

In [159]:

# Perform k-means clustering with 4 clusters on the 3-feature matrix X.
# random_state=42 keeps the cluster numbering reproducible across runs,
# which the cluster -> species mapping used for predictions relies on.
kmeans3d = KMeans(n_clusters=4, random_state=42)
y_kmeans = kmeans3d.fit_predict(X)

# Create a 3D scatter plot of the samples, coloured by assigned cluster
fig = px.scatter_3d(df, x=X[:, 0], y=X[:, 1], z=X[:, 2], color=y_kmeans)
fig.show()


In [160]:
def generate_new_predict(new_data, model=None):
    """Predict and print the cluster and mapped species for new 3-feature samples.

    Running this cell replaces any ``generate_new_predict`` defined earlier
    in the kernel; this version defaults to the 3-feature ``kmeans3d`` model.

    Parameters
    ----------
    new_data : 2-D array-like
        One row per sample, e.g. ``[[900, 48, 55]]``. Each row must hold
        the same features, in the same order, that the model was fitted on.
    model : object with a ``predict`` method, optional
        Fitted clustering model to use. Defaults to the notebook-level
        ``kmeans3d`` model.

    Returns
    -------
    None
        One line is printed per sample.
    """
    if model is None:
        model = kmeans3d

    # Handle each prediction separately so that every row of new_data is
    # reported and return_specie always receives a single cluster id.
    for predicted_cluster in model.predict(new_data):
        predicted_species = return_specie(predicted_cluster)
        print(f"The predicted cluster is {predicted_cluster} and the predicted species is {predicted_species}.")


In [161]:
# New 3-feature data points to classify, one single-row sample per prediction call
for new_data in (
    [[900, 48, 55]],
    [[9, 13, 16]],
    [[90, 20, 21]],
    [[400, 38, 31]],
):
    generate_new_predict(new_data)


The predicted cluster is 2 and the predicted species is Smelt.
The predicted cluster is 1 and the predicted species is Roach.
The predicted cluster is 1 and the predicted species is Roach.
The predicted cluster is 0 and the predicted species is Bream.
