In [19]:
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import numpy as np
import matplotlib.pyplot as plt

# Load the dataset
df1 = pd.read_csv('modified.csv')
df2 = pd.read_csv('modified_1.csv')
df = pd.concat([df1, df2], ignore_index=True)

# Convert timestamp columns to datetime
df['bus_board_computer_sent_time'] = pd.to_datetime(df['bus_board_computer_sent_time'])
df['created_time'] = pd.to_datetime(df['created_time'], errors='coerce', format='ISO8601')

# Calculate bus arrival time difference in seconds
df['arrival_time_diff'] = (df['created_time'] - df['bus_board_computer_sent_time']).dt.total_seconds()

# Select features for clustering
features = ['bus_stop_id', 'bus_id', 'route_number', 'enter_sum', 'exit_sum', 'tickets_count', 'arrival_time_diff']

# Standardize features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(df[features])

# Apply KMeans clustering
kmeans = KMeans(n_clusters=3, random_state=42, n_init=10)
df['cluster'] = kmeans.fit_predict(X_scaled)

# Plot the clustering results
plt.figure(figsize=(8, 6))
plt.scatter(X_scaled[:, 0], X_scaled[:, -1], c=df['cluster'], cmap='viridis', alpha=0.6)
plt.xlabel('Bus Stop ID (Scaled)')
plt.ylabel('Arrival Time Difference (Scaled)')
plt.title('K-Means Clustering of Bus Arrival Times')
plt.colorbar(label='Cluster')
plt.show()

# Compute accuracy by checking how well clusters align with route numbers
accuracy = np.mean(df['cluster'] == df['route_number'] % 3) * 100
print(f'Clustering Accuracy: {accuracy:.2f}%')


import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.metrics import accuracy_score
from scipy.stats import mode

# Step 1: Load Data
df1 = pd.read_csv('modified.csv')
df2 = pd.read_csv('modified_1.csv')
df = pd.concat([df1, df2], ignore_index=True)

# Step 2: Data Preprocessing
df['route_number'] = pd.to_numeric(df['route_number'], errors='coerce')
df['bus_stop_id'] = pd.to_numeric(df['bus_stop_id'], errors='coerce')

# Drop NaN values if any
df.dropna(subset=['route_number', 'bus_stop_id'], inplace=True)

# Step 3: KMeans Clustering
kmeans = KMeans(n_clusters=3, random_state=42, n_init=10)
df['cluster'] = kmeans.fit_predict(df[['route_number', 'bus_stop_id']])

# Step 4: Map Clusters to True Labels (If available)
if 'true_label' in df.columns:  # Assuming we have true labels
    labels = np.zeros_like(df['cluster'])
    for i in range(3):
        mask = (df['cluster'] == i)
        labels[mask] = mode(df['true_label'][mask])[0][0]

    # Compute Accuracy
    accuracy = accuracy_score(df['true_label'], labels)
    print(f"Clustering Accuracy: {accuracy * 100:.2f}%")

# Step 5: Visualize KMeans Clustering
plt.figure(figsize=(8, 6))
plt.scatter(df['route_number'], df['bus_stop_id'], c=df['cluster'], cmap='viridis', edgecolor='k', s=50, alpha=0.7)
plt.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1], 
            c='red', marker='X', s=200, label='Centroids')
plt.title("K-Means Clustering of Bus Routes")
plt.xlabel("Route Number")
plt.ylabel("Bus Stop ID")
plt.legend()
plt.colorbar(label="Cluster")
plt.show()

ValueError: time data "2024-12-12T09:51:50" doesn't match format "%Y-%m-%dT%H:%M:%S.%f", at position 3154. You might want to try:
    - passing `format` if your strings have a consistent format;
    - passing `format='ISO8601'` if your strings are all ISO8601 but not necessarily in exactly the same format;
    - passing `format='mixed'`, and the format will be inferred for each element individually. You might want to use `dayfirst` alongside this.