In [None]:
import pandas as pd
from tabulate import tabulate

#read the raw telematics export into a DataFrame
df = pd.read_csv(filepath_or_buffer="G9WT3MTW2E7Z.csv")

#preview the first five rows as a bordered text table
preview_table = tabulate(df.head(5), headers="keys", tablefmt="fancy_grid")
print(preview_table)

In [None]:
import numpy as np

#dropping columns that the rest of the notebook does not use
#errors="ignore" keeps this cell re-runnable: df is reassigned below, so on a
#second run the columns are already gone and a plain drop would raise KeyError
columns_to_drop = ["telematics_id", "h2gen_id", "fuel_lifetime", "fuel_telematics", "engine_speed", "runtime"]
df = df.drop(columns=columns_to_drop, errors="ignore")

#converting timestamp to datetime
df["timestamp"] = pd.to_datetime(df["timestamp"])

#handling missing values: an odometer reading of -1 is treated as missing (-1 to NaN)
df["odometer"] = df["odometer"].replace(-1, np.nan)

#dropping rows with missing speed, timestamp, latitude, longitude
#(odometer may stay NaN; the per-session distance falls back to GPS coordinates)
df = df.dropna(subset=["speed", "timestamp", "latitude", "longitude"])
print(f"After dropping NaN speeds/locations: {len(df)} rows")
print(tabulate(df.head(), headers="keys", tablefmt="fancy_grid"))


In [None]:
#collapse rows that share a timestamp into a single record:
#speed is averaged, position and odometer take the first value of the group
duplicate_rules = {
    "speed": "mean",
    "latitude": "first",
    "longitude": "first",
    "odometer": "first",
}
df_avg = df.groupby("timestamp", as_index=False).agg(duplicate_rules)
print(f"After averaging same timestamps: {len(df_avg)} rows")

averaged_preview = tabulate(df_avg.head(), headers="keys", tablefmt="fancy_grid")
print(averaged_preview)


In [None]:
#split the timeline into driving sessions: a gap of more than 5 minutes
#(300 seconds) between consecutive records starts a new session
seconds_since_previous = df_avg["timestamp"].diff().dt.total_seconds()
#the first record has no predecessor, so its gap is set to 0
df_avg["time_diff"] = seconds_since_previous.fillna(0)
#each gap above the threshold increments the running session counter
df_avg["session"] = df_avg["time_diff"].gt(300).cumsum()
print(f"Number of sessions: {df_avg['session'].nunique()}")


In [None]:
!pip install haversine

In [None]:
from haversine import haversine

#build one feature row per driving session:
#[avg moving speed, max speed, avg absolute acceleration, distance, idle fraction]
features = []
for session_id, session_df in df_avg.groupby("session"):
    speed = session_df["speed"]

    #avg speed over moving samples only (speed > 0); 0 when the vehicle never moved
    moving_speeds = speed[speed > 0]
    avg_speed = moving_speeds.mean() if len(moving_speeds) > 0 else 0

    #max speed
    max_speed = speed.max()

    #avg acceleration: mean absolute speed change per second between
    #consecutive samples; pairs without a positive time step are skipped
    speeds = speed.values
    times = (session_df["timestamp"] - session_df["timestamp"].iloc[0]).dt.total_seconds().values
    speed_steps = np.diff(speeds)
    time_steps = np.diff(times)
    usable = time_steps > 0
    accel = speed_steps[usable] / time_steps[usable]
    avg_accel = np.mean(np.abs(accel)) if accel.size else 0

    #distance: odometer difference when every reading is present,
    #otherwise the summed great-circle distance between consecutive GPS fixes
    #NOTE(review): the odometer unit is not visible here while haversine defaults to km - confirm both branches agree
    odometer = session_df["odometer"]
    if odometer.notna().all():
        distance = odometer.iloc[-1] - odometer.iloc[0]
    else:
        distance = 0
        fixes = list(zip(session_df["latitude"].to_numpy(), session_df["longitude"].to_numpy()))
        for previous_fix, current_fix in zip(fixes, fixes[1:]):
            distance += haversine(previous_fix, current_fix)

    #idle fraction: share of samples with a speed of exactly 0
    idle_fraction = speed.eq(0).mean()

    features.append([avg_speed, max_speed, avg_accel, distance, idle_fraction])
    print(f"Session {session_id}: Avg_Speed={avg_speed:.2f}, Max_Speed={max_speed:.2f}, Avg_Accel={avg_accel:.2f}, Distance={distance:.2f}, Idle_Fraction={idle_fraction:.2f}")


In [None]:
#assemble the per-session feature rows into a DataFrame (one row per session)
features_df = pd.DataFrame(
    data=features,
    columns=["Avg_Speed", "Max_Speed", "Avg_Accel", "Distance", "Idle_Fraction"],
)
print(f"Features DataFrame: {len(features_df)} rows")

features_preview = tabulate(features_df.head(), headers="keys", tablefmt="fancy_grid")
print(features_preview)


In [None]:
from sklearn.preprocessing import StandardScaler

#standardizing features (StandardScaler: zero mean and unit variance per column)
#the feature columns are selected explicitly because later cells add
#"Cluster", "Cluster_Label", "PCA1" and "PCA2" to features_df; scaling the whole
#frame would then fail on the text labels or scale the wrong columns on a re-run
feature_columns = ["Avg_Speed", "Max_Speed", "Avg_Accel", "Distance", "Idle_Fraction"]
scaler = StandardScaler()
scaled_features = scaler.fit_transform(features_df[feature_columns])


In [None]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt

#computing inertia and silhouette scores for k=2 to 10
#silhouette_score needs between 2 and n_samples - 1 distinct labels, so the
#upper bound is capped when there are fewer than 11 sessions to cluster
inertia = []
silhouette_scores = []

max_k = min(10, len(scaled_features) - 1)
k_range = range(2, max_k + 1)
for k in k_range:
    #fixed random_state so the centroid initialisation is the same on every run
    kmeans = KMeans(n_clusters=k, random_state=101)
    kmeans.fit(scaled_features)
    inertia.append(kmeans.inertia_)
    silhouette_scores.append(silhouette_score(scaled_features, kmeans.labels_))

#plot inertia - elbow method
plt.figure(figsize=(8, 6))
plt.plot(k_range, inertia, marker="o", color="blue")
plt.title("Elbow Method")
plt.xlabel("Number of Clusters (k)")
plt.ylabel("Inertia")
plt.grid(True)
plt.show()


In [None]:
#silhouette score for every candidate k evaluated in the sweep above
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(k_range, silhouette_scores, marker="s", color="red")
ax.set_title("Silhouette Score vs. Number of Clusters")
ax.set_xlabel("Number of Clusters (k)")
ax.set_ylabel("Silhouette Score")
ax.grid()
plt.show()


In [None]:
#fit the final K-Means model with k=3 and store each session's cluster index
kmeans = KMeans(n_clusters=3, random_state=101).fit(scaled_features)
features_df["Cluster"] = kmeans.labels_


In [None]:
#human-readable label for each K-Means cluster index (0, 1, 2 in list order)
#NOTE(review): K-Means cluster indices are arbitrary - confirm this mapping
#against the cluster statistics of the current fit
cluster_map = dict(enumerate([
    "Fast-paced and Long-Haul Driving",
    "Slow and Idle-Prone",
    "Moderate and Mixed-Pattern",
]))

features_df["Cluster_Label"] = features_df["Cluster"].map(cluster_map)

#show the first labelled sessions
print("\nFinal Clusters:")
labelled_preview = tabulate(features_df.head(), headers="keys", tablefmt="fancy_grid")
print(labelled_preview)


In [None]:
from sklearn.decomposition import PCA
import seaborn as sns

#PCA for visualization: project the standardized features onto 2 components
pca = PCA(n_components=2)
pca_components = pca.fit_transform(scaled_features)

features_df["PCA1"] = pca_components[:, 0]
features_df["PCA2"] = pca_components[:, 1]

#cluster centroids in PCA space: the projection is affine, so the mean of the
#projected points of a cluster equals the projection of that cluster's mean
centroids_pca = features_df.groupby("Cluster_Label")[["PCA1", "PCA2"]].mean()

#scatter plot of the sessions plus the centroids promised by the title
plt.figure(figsize=(8, 6))
sns.scatterplot(data=features_df, x="PCA1", y="PCA2", hue="Cluster_Label", palette="viridis", s=100)
plt.scatter(centroids_pca["PCA1"], centroids_pca["PCA2"], marker="X", s=250, c="red", edgecolors="black", label="Centroids")
plt.title("Clusters and Centroids after PCA")
plt.xlabel("PCA Component 1")
plt.ylabel("PCA Component 2")
plt.legend(title="Cluster", loc="upper center")
plt.show()
