<a href="https://colab.research.google.com/github/tuntul17/project-cycling/blob/main/Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd
from pylab import *
import seaborn as sns # just in case
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.cluster import KMeans

In [2]:
# Read the summary table into df. The path assumes Google Drive is mounted
# at /content/drive (Colab) -- the notebook does not mount it itself.
summary_csv = "/content/drive/MyDrive/Colab Notebooks/Cycling-adventures/data/output_data/cleanNsummed.csv"
df = pd.read_csv(summary_csv)

In [3]:
# Keep only the rows with Usage == 1: rows with Usage == 0 have missing
# values, which may affect the results.
# reset_index(drop=True) renumbers the rows 0..n-1 and discards the old index
# directly, instead of materialising it as an "index" column and dropping it
# in place afterwards (which also fails if df already has an "index" column).
df = df[df["Usage"] == 1].reset_index(drop=True)

In [None]:
# Pairwise correlations of every column except the last two, drawn as an
# annotated heatmap and written to disk. The figure is closed right after
# saving, so nothing is rendered inline.
# NOTE(review): with pandas >= 2.0, DataFrame.corr raises on non-numeric
# columns unless numeric_only=True is passed -- confirm that all selected
# columns are numeric.
t_cor = df.iloc[:, :-2].corr()

fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(t_cor, annot=True, ax=ax)
fig.savefig("/content/drive/MyDrive/Colab Notebooks/Cycling-adventures/data/output_data/Correlation.jpg")
plt.close(fig)

In [5]:
# Min-max scale the feature columns. The first two and the last two columns
# of df are left out of the scaling.
feature_columns = df.columns[2:-2]
scaler = MinMaxScaler()
scaled_data = scaler.fit_transform(df[feature_columns])

# Wrap the scaled array in a DataFrame that reuses df's index. pd.concat with
# axis=1 aligns rows by index label, so giving scaled_df a fresh 0..n-1 index
# would silently misalign (and NaN-pad) the rows whenever df does not carry a
# default index.
scaled_df = pd.DataFrame(scaled_data, columns=feature_columns, index=df.index)

# df1 = scaled features followed by the last two (unscaled) columns of df.
df1 = pd.concat([scaled_df, df.iloc[:, -2:]], axis=1)


In [None]:
# Elbow method: fit K-Means for K = 1..9 and record the inertia
# (within-cluster sum of squares) of each fit.
#
# The feature matrix is scaled_df rather than a positional slice of df1:
# df1 gains a 'Cluster' column later in the notebook, after which
# df1.columns[:-2] no longer selects the scaled features, so re-running this
# cell would silently fit on a different set of columns.
# NOTE(review): n_init is left at the scikit-learn default, which differs
# between releases -- consider pinning it for reproducible inertia values.
k_values = range(1, 10)
inertia = []
for k in k_values:
    kmeans = KMeans(n_clusters=k, random_state=42)
    kmeans.fit(scaled_df)
    inertia.append(kmeans.inertia_)


In [14]:
# Draw inertia against K and save the elbow curve to disk. The figure is
# closed after saving, so nothing is rendered inline.
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(k_values, inertia, marker='o', linestyle='-', color='b')
ax.set_xlabel('Number of Clusters (K)')
ax.set_ylabel('Inertia')
ax.set_title('Elbow Method for Optimal K')
ax.grid(True)
fig.savefig("/content/drive/MyDrive/Colab Notebooks/Cycling-adventures/data/output_data/ElbowMethod.jpg")
plt.close(fig)

In [None]:
# Final clustering with K = 4 (presumably the value read off the elbow
# curve -- confirm against the saved plot).
kmeans = KMeans(n_clusters=4, random_state=42)

# Fit on scaled_df, i.e. the scaled feature columns only. Selecting
# df1.columns[:-2] by position stops being correct as soon as the 'Cluster'
# column assigned below exists, so a second run of this cell would have
# clustered on a different column set.
cluster_labels = kmeans.fit_predict(scaled_df)

# Store each row's cluster label in a new df1 column.
df1['Cluster'] = cluster_labels

In [15]:
# (Re)attach the cluster labels to df1; a no-op if the column is already
# there, but it keeps this cell runnable after df1 has been rebuilt.
df1['Cluster'] = cluster_labels

# Mean of every scaled feature within each cluster, transposed so that
# features are rows and clusters are columns. The features are named
# explicitly via scaled_df.columns instead of the positional slice
# df1.columns[:-3], which is only right while 'Cluster' is the last column
# and exactly two non-feature columns precede it.
cluster_means = df1.groupby('Cluster')[list(scaled_df.columns)].mean().transpose()

# Heatmap of the means, multiplied by 10 before plotting (so the annotated
# numbers are ten times the scaled means). Saved to disk, then closed.
fig, ax = plt.subplots(figsize=(14, 7))
sns.heatmap(cluster_means * 10, cmap='coolwarm', annot=True, ax=ax)
ax.set_title('Feature Means by Cluster')
fig.savefig("/content/drive/MyDrive/Colab Notebooks/Cycling-adventures/data/output_data/FeatureMeans.jpg")
plt.close(fig)


* Cluster == 0: Long Z2 Rides
* Cluster == 1: Short Z2 Rides
* Cluster == 2: Short Tempo Rides
* Cluster == 3: Coffee & Gossip Rides

In [10]:
# Attach each row's cluster label to df by joining on "id", then export the
# result as CSV.
# Any existing "Cluster" column is dropped first so the cell is idempotent:
# without that, a second run merges into a frame that already has "Cluster"
# and yields "Cluster_x"/"Cluster_y" instead of a single "Cluster" column.
df = (
    df.drop(columns="Cluster", errors="ignore")
      .merge(df1[["Cluster", "id"]], on="id", how="inner")
)
# NOTE(review): to_csv also writes the row index as an extra unnamed column;
# pass index=False if downstream readers do not expect it.
df.to_csv("/content/drive/MyDrive/Colab Notebooks/Cycling-adventures/data/output_data/clustered_summ.csv")

In [16]:
# Cluster profiles: per-cluster mean of every remaining column (x10), with
# error bars equal to the per-cluster standard deviation (x10) divided by the
# square root of the cluster's row count.
df2 = df1.drop(columns=["id", "Usage"])
by_cluster = df2.groupby('Cluster')
cluster_means = by_cluster.mean() * 10
cluster_std = by_cluster.std() * 10

# Number of rows per cluster, looked up once instead of re-filtering df1 on
# every loop iteration.
cluster_sizes = df1['Cluster'].value_counts()
feature_positions = range(len(cluster_means.columns))

fig, ax = plt.subplots(figsize=(12, 8))
for cluster in cluster_means.index:
    ax.errorbar(
        feature_positions,
        cluster_means.loc[cluster],
        yerr=cluster_std.loc[cluster] / np.sqrt(cluster_sizes.loc[cluster]),
        label=f'Cluster {cluster}',
        marker='o',
    )

# One tick per feature, labelled with the column name.
ax.set_xticks(feature_positions)
ax.set_xticklabels(cluster_means.columns, rotation=45)
ax.set_xlabel('Features')
ax.set_ylabel('Feature Values')
ax.set_title('Cluster Profiles')
ax.legend()
ax.grid(True)
fig.savefig("/content/drive/MyDrive/Colab Notebooks/Cycling-adventures/data/output_data/ClusterProfile.jpg")
plt.close(fig)