# 🚴 Bike-Level Predictive Maintenance Analysis
## Team: Anomaly Archers (Section B)

**Techniques Used:**
- K-Means Clustering (Unsupervised Learning)
- Isolation Forest (Anomaly Detection)
- Time Series Analysis
- Health Scoring & Maintenance Ranking

---

## 1. Setup & Imports

In [None]:
# Core libraries
import pandas as pd
import numpy as np
from datetime import datetime
from scipy import stats
import warnings
warnings.filterwarnings('ignore')

# ML libraries
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.ensemble import IsolationForest
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Apply matplotlib's 'seaborn-v0_8-whitegrid' style sheet to every figure
# drawn after this point.
plt.style.use('seaborn-v0_8-whitegrid')
# Make 'husl' the default seaborn colour palette.
sns.set_palette('husl')

# Reached only when all of the imports above succeeded.
print("✅ All libraries imported successfully!")

## 2. Load Enhanced Data

We use the **synthesized dataset** with:
- `bike_id` - Unique bike identifier
- `start_odometer_km` / `end_odometer_km` - Odometer readings (km) at the start and end of each trip
- `user_rating` - Customer feedback (nullable)
- `complaint_flag` - Customer complaints

In [None]:
# Read the trip-level dataset (each trip carries a bike_id) and parse the
# trip start time so that date arithmetic works in later cells.
df = pd.read_csv('../data/processed/enhanced_trips.csv')
df['started_at'] = pd.to_datetime(df['started_at'])

# Sanity summary: dataset size, fleet size and trip counts per bike type.
n_classic_trips = df['rideable_type'].eq('classic_bike').sum()
n_electric_trips = df['rideable_type'].eq('electric_bike').sum()

print(f"✅ Loaded {len(df):,} trips")
print(f"   Unique bikes: {df['bike_id'].nunique():,}")
print(f"   Classic bikes: {n_classic_trips:,} trips")
print(f"   Electric bikes: {n_electric_trips:,} trips")

In [None]:
# Preview the first ten trips, restricted to the identifier, usage,
# odometer and customer-feedback columns.
preview_columns = [
    'ride_id', 'bike_id', 'rideable_type', 'duration_min',
    'start_odometer_km', 'end_odometer_km', 'trip_distance_km',
    'user_rating', 'complaint_flag',
]
df.loc[:, preview_columns].head(10)

## 3. Bike-Level Feature Engineering

Aggregate trip data to create **per-bike features**.

In [None]:
# Create bike-level features: collapse the trip log into one row per bike.
bike_features = df.groupby('bike_id').agg(
    rideable_type=('rideable_type', 'first'),
    total_trips=('ride_id', 'count'),
    total_duration_min=('duration_min', 'sum'),
    avg_trip_duration=('duration_min', 'mean'),
    std_trip_duration=('duration_min', 'std'),
    total_distance_km=('trip_distance_km', 'sum'),
    avg_trip_distance=('trip_distance_km', 'mean'),
    first_trip=('started_at', 'min'),
    last_trip=('started_at', 'max'),
    unique_start_stations=('start_station_id', 'nunique'),
    unique_end_stations=('end_station_id', 'nunique'),
    member_trips=('member_casual', lambda x: (x == 'member').sum()),
    avg_user_rating=('user_rating', 'mean'),
    ratings_given=('user_rating', lambda x: x.notna().sum()),
    complaint_count=('complaint_flag', 'sum'),
    days_since_service=('days_since_service', 'first')
).reset_index()

# Derived features
# +1 so that a bike whose first and last trip fall on the same day counts as
# one active day (this also keeps the division below away from zero).
bike_features['days_active'] = (bike_features['last_trip'] - bike_features['first_trip']).dt.days + 1
bike_features['trips_per_day'] = bike_features['total_trips'] / bike_features['days_active']
bike_features['member_ratio'] = bike_features['member_trips'] / bike_features['total_trips']
bike_features['complaint_rate'] = bike_features['complaint_count'] / bike_features['total_trips']

# Share of each bike's trips that lasted under 3 minutes. The mean of a
# boolean flag equals (short trips / all trips). The result is attached by
# bike_id lookup rather than by row position, so it cannot be mis-assigned
# if the row order of bike_features ever differs from the groupby order.
short_trip_ratio = df['duration_min'].lt(3).groupby(df['bike_id']).mean()
bike_features['short_trip_ratio'] = bike_features['bike_id'].map(short_trip_ratio)

# Add cumulative mileage from odometer: the highest end-of-trip reading
# recorded for each bike.
max_odometer = df.groupby('bike_id')['end_odometer_km'].max().reset_index()
max_odometer.columns = ['bike_id', 'cumulative_mileage']
bike_features = bike_features.merge(max_odometer, on='bike_id', how='left')

# Fill NaN
# NOTE: this also turns a missing avg_user_rating (bike never rated) into 0;
# the rating-risk step later in the notebook treats a rating of 0 as
# "no rating available".
bike_features = bike_features.fillna(0)

print(f"✅ Created features for {len(bike_features):,} bikes")
bike_features.head()

## 4. Prepare Features for ML

In [None]:
# Per-bike columns that feed both the clustering and the anomaly-detection
# models below.
feature_columns = [
    'total_trips',
    'total_distance_km',
    'avg_trip_duration',
    'avg_trip_distance',
    'trips_per_day',
    'member_ratio',
    'avg_user_rating',
    'complaint_rate',
    'short_trip_ratio',
    'days_since_service',
    'cumulative_mileage',
]

# Infinite and missing values are replaced with 0 so the scaler and the
# models only ever see finite numbers.
X = (
    bike_features.loc[:, feature_columns]
    .replace([np.inf, -np.inf], 0)
    .fillna(0)
)

# Standardise every feature so no single column dominates distance-based
# models because of its units.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

print(f"✅ Feature matrix shape: {X_scaled.shape}")
print(f"\nFeatures used: {feature_columns}")

## 5. K-Means Clustering

Group bikes by usage patterns to identify different "usage profiles".

In [None]:
# Sweep K = 2..7, recording inertia (for the elbow plot) and the silhouette
# score of each fitted K-Means model.
K_range = range(2, 8)
inertias = []
silhouette_scores_list = []

for k in K_range:
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10).fit(X_scaled)
    inertias.append(kmeans.inertia_)
    silhouette_scores_list.append(silhouette_score(X_scaled, kmeans.labels_))

# Two side-by-side diagnostics that share the same x axis (K).
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
panels = [
    (inertias, 'bo-', 'Inertia', 'Elbow Method for Optimal K'),
    (silhouette_scores_list, 'go-', 'Silhouette Score', 'Silhouette Score vs K'),
]
for ax, (values, line_format, y_label, panel_title) in zip(axes, panels):
    ax.plot(K_range, values, line_format, linewidth=2, markersize=8)
    ax.set_xlabel('K')
    ax.set_ylabel(y_label)
    ax.set_title(panel_title)
    ax.grid(True)

plt.tight_layout()
plt.savefig('../outputs/figures/bike_elbow_silhouette.png', dpi=150, bbox_inches='tight')
plt.show()

# Print the scores rounded to three decimals, keyed by K.
formatted_scores = {
    k_value: f'{score:.3f}'
    for k_value, score in zip(K_range, silhouette_scores_list)
}
print(f"\nSilhouette Scores: {formatted_scores}")

In [None]:
# Fit the final segmentation with four clusters and store each bike's label.
K = 4
kmeans = KMeans(n_clusters=K, random_state=42, n_init=10)
cluster_labels = kmeans.fit_predict(X_scaled)
bike_features['cluster'] = cluster_labels

# K-Means ids carry no meaning, so rank the clusters by their mean
# trips_per_day (lowest first) and name them from light to extreme usage.
cluster_summary = bike_features.groupby('cluster')[feature_columns].mean()
usage_order = cluster_summary['trips_per_day'].sort_values().index.tolist()

usage_labels = [
    '🟢 Light Usage',
    '🟡 Moderate Usage',
    '🟠 Heavy Usage',
    '🔴 Extreme Usage',
]
cluster_names = dict(zip(usage_order, usage_labels))
bike_features['cluster_name'] = bike_features['cluster'].map(cluster_names)

print(f"✅ Clustering complete (K={K})")
print("\nCluster Distribution:")
for cluster_id, label in cluster_names.items():
    n_bikes = bike_features['cluster'].eq(cluster_id).sum()
    print(f"  {label}: {n_bikes} bikes")

In [None]:
# Visualize clusters using PCA: project the scaled features onto their first
# two principal components so the clusters can be drawn in 2-D.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)

# Palette ordered from light (green) to extreme (red) usage. The colour is
# looked up by usage label, not by raw K-Means id: cluster ids are arbitrary,
# so indexing the palette by id could paint the "Extreme Usage" cluster green.
colors = ['#2ecc71', '#f1c40f', '#e67e22', '#e74c3c']
usage_colors = dict(zip(
    ['🟢 Light Usage', '🟡 Moderate Usage', '🟠 Heavy Usage', '🔴 Extreme Usage'],
    colors,
))

plt.figure(figsize=(12, 8))
# cluster_names is keyed by cluster id and was built in light -> extreme
# order, so the legend is listed in that order as well.
for cluster_id, cluster_label in cluster_names.items():
    mask = (bike_features['cluster'] == cluster_id).to_numpy()
    plt.scatter(X_pca[mask, 0], X_pca[mask, 1], c=usage_colors[cluster_label],
                label=cluster_label, alpha=0.6, s=60, edgecolors='black', linewidth=0.5)

plt.xlabel(f'PC1 ({pca.explained_variance_ratio_[0]*100:.1f}% variance)')
plt.ylabel(f'PC2 ({pca.explained_variance_ratio_[1]*100:.1f}% variance)')
plt.title('Bike Clusters by Usage Profile (PCA)')
plt.legend(fontsize=11)
plt.grid(True, alpha=0.3)
plt.savefig('../outputs/figures/bike_cluster_visualization.png', dpi=150, bbox_inches='tight')
plt.show()

## 6. Anomaly Detection (Isolation Forest)

Identify bikes with unusual behavior patterns.

In [None]:
# Fit an Isolation Forest on the scaled bike features. fit_predict labels
# each bike 1 (normal) or -1 (anomalous); decision_function gives the
# continuous score behind that label.
iso_forest = IsolationForest(n_estimators=100, contamination=0.05, random_state=42)
anomaly_labels = iso_forest.fit_predict(X_scaled)
anomaly_scores = iso_forest.decision_function(X_scaled)

# Keep the raw label, the score and a boolean flag on the bike table.
outlier_mask = anomaly_labels == -1
bike_features['anomaly'] = anomaly_labels
bike_features['anomaly_score'] = anomaly_scores
bike_features['is_anomaly'] = outlier_mask

n_normal = (anomaly_labels == 1).sum()
n_anomalous = outlier_mask.sum()
anomaly_rate_pct = outlier_mask.mean() * 100

print("✅ Anomaly detection complete")
print(f"   Normal bikes: {n_normal}")
print(f"   Anomalous bikes: {n_anomalous}")
print(f"   Anomaly rate: {anomaly_rate_pct:.1f}%")

In [None]:
# Split the PCA projection into normal and anomalous bikes, then draw the
# anomalies on top with a larger, high-contrast marker.
outlier_mask = bike_features['is_anomaly'].to_numpy()
normal_points = X_pca[~outlier_mask]
outlier_points = X_pca[outlier_mask]

plt.figure(figsize=(12, 8))
plt.scatter(normal_points[:, 0], normal_points[:, 1],
            c='#3498db', label='Normal', alpha=0.5, s=50)
plt.scatter(outlier_points[:, 0], outlier_points[:, 1],
            c='#e74c3c', label='Anomaly', alpha=0.9, s=120, marker='X', edgecolors='black')

plt.xlabel('PC1')
plt.ylabel('PC2')
plt.title('Bike Anomaly Detection (Isolation Forest)')
plt.legend(fontsize=11)
plt.grid(True, alpha=0.3)
plt.savefig('../outputs/figures/bike_anomaly_detection.png', dpi=150, bbox_inches='tight')
plt.show()

## 7. Time Series Analysis

Track usage trends to detect degradation patterns.

In [None]:
# Bucket every trip by calendar day, then count the trips and total the
# distance for each (bike, day) pair. Days on which a bike made no trips
# produce no row.
df['date'] = df['started_at'].dt.date
daily_aggregations = {
    'daily_trips': ('ride_id', 'count'),
    'daily_distance': ('trip_distance_km', 'sum'),
}
daily_stats = (
    df.groupby(['bike_id', 'date'])
    .agg(**daily_aggregations)
    .reset_index()
)

def calculate_trend(group):
    """Return the linear-regression slope of one bike's daily trip counts.

    ``group`` holds the daily rows of a single bike and must expose a
    ``daily_trips`` column. Rows are used in their existing order and are
    treated as evenly spaced (0, 1, 2, ...), so the slope is expressed in
    trips per observed day. Groups with fewer than 7 rows are considered too
    short to fit and yield 0.
    """
    n_days = len(group)
    if n_days < 7:
        return 0

    day_index = np.arange(n_days)
    return stats.linregress(day_index, group['daily_trips'].values).slope

# One regression slope per bike, as a two-column frame (bike_id, usage_trend).
bike_trends = daily_stats.groupby('bike_id').apply(calculate_trend).reset_index()
bike_trends.columns = ['bike_id', 'usage_trend']

# Merge and categorize
# The slope is attached with a bike_id lookup instead of DataFrame.merge so
# this cell can be re-run safely: a repeated merge would create
# usage_trend_x / usage_trend_y columns and the next line would then fail
# with a KeyError. Bikes without a computed slope get a neutral trend of 0.
trend_by_bike = bike_trends.set_index('bike_id')['usage_trend']
bike_features['usage_trend'] = bike_features['bike_id'].map(trend_by_bike).fillna(0)

# Slopes at or below -0.3 count as declining and above +0.3 as increasing;
# everything in between is treated as stable.
bike_features['trend_category'] = pd.cut(
    bike_features['usage_trend'],
    bins=[-np.inf, -0.3, 0.3, np.inf],
    labels=['📉 Declining', '➡️ Stable', '📈 Increasing']
)

print("✅ Trend analysis complete")
print("\nTrend Distribution:")
print(bike_features['trend_category'].value_counts())

## 8. Health Scoring & Maintenance Ranking

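Combine all signals into a unified **health score** between 0 and 1. Despite the name, it is a risk score: higher values mean a less healthy bike and a more urgent need for maintenance.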

In [None]:
# Calculate health score components
# Every component is a risk where a larger value is worse; the composite is
# their weighted average (the weights sum to 1.0).

# 1. Cluster Risk: heavier usage profile -> higher risk.
cluster_risk_map = {usage_order[0]: 0.1, usage_order[1]: 0.3, usage_order[2]: 0.6, usage_order[3]: 0.9}
bike_features['cluster_risk'] = bike_features['cluster'].map(cluster_risk_map)

# 2. Anomaly Risk (normalized 0-1)
# Min-max scale the anomaly score and invert it, so the bike with the lowest
# score gets risk 1 and the bike with the highest score gets risk 0.
min_score = bike_features['anomaly_score'].min()
max_score = bike_features['anomaly_score'].max()
score_range = max_score - min_score
if score_range > 0:
    bike_features['normalized_anomaly'] = 1 - ((bike_features['anomaly_score'] - min_score) / score_range)
else:
    # All scores are identical (e.g. a single bike): no bike is more
    # anomalous than another, and dividing by zero would put NaN into
    # every health score.
    bike_features['normalized_anomaly'] = 0.0

# 3. Trend Risk: declining usage is treated as the riskiest trend; an
# unmapped category falls back to the "stable" risk of 0.3.
bike_features['trend_category_str'] = bike_features['trend_category'].astype(str)
trend_risk_map = {'📉 Declining': 0.7, '➡️ Stable': 0.3, '📈 Increasing': 0.1}
bike_features['trend_risk'] = bike_features['trend_category_str'].map(trend_risk_map).fillna(0.3)

# 4. Complaint Risk: saturates at 1 once 10% of a bike's trips have a complaint.
bike_features['complaint_risk'] = (bike_features['complaint_rate'] * 10).clip(0, 1)

# 5. Service Age Risk: saturates at 1 after 180 days without service.
bike_features['service_risk'] = (bike_features['days_since_service'] / 180).clip(0, 1)

# 6. Rating Risk: a low average rating means high risk; an average of 0
# (no usable rating) is scored as a neutral 0.5.
bike_features['rating_risk'] = np.where(
    bike_features['avg_user_rating'] > 0,
    1 - (bike_features['avg_user_rating'] / 5),
    0.5
)

# Composite Score (weighted average)
bike_features['health_score'] = (
    0.20 * bike_features['cluster_risk'] +
    0.20 * bike_features['normalized_anomaly'] +
    0.10 * bike_features['trend_risk'] +
    0.20 * bike_features['complaint_risk'] +
    0.15 * bike_features['service_risk'] +
    0.15 * bike_features['rating_risk']
)

# Categorize
def categorize_health(score):
    """Map a composite health score to its traffic-light category.

    Scores below 0.35 are '🟢 Stable', scores from 0.35 up to (but not
    including) 0.55 are '🟡 Warning', and everything else is '🔴 Critical'.
    """
    # Upper bounds are exclusive and checked in ascending order.
    bands = (
        (0.35, '🟢 Stable'),
        (0.55, '🟡 Warning'),
    )
    for upper_bound, label in bands:
        if score < upper_bound:
            return label
    return '🔴 Critical'

# Label every bike with its traffic-light category, then report how the
# fleet splits across the three categories.
bike_features['health_category'] = bike_features['health_score'].map(categorize_health)

print("✅ Health scoring complete")
print("\nHealth Distribution:")
fleet_size = len(bike_features)
for category in ('🟢 Stable', '🟡 Warning', '🔴 Critical'):
    n_in_category = bike_features['health_category'].eq(category).sum()
    share_pct = n_in_category / fleet_size * 100
    print(f"  {category}: {n_in_category} bikes ({share_pct:.1f}%)")

## 9. Maintenance Priority Ranking

In [None]:
# Rank bikes from most to least urgent (highest health_score first) and keep
# only the columns needed in the maintenance report.
report_columns = [
    'bike_id', 'rideable_type', 'cluster_name', 'health_score', 'health_category',
    'total_trips', 'cumulative_mileage', 'complaint_count', 'avg_user_rating',
    'days_since_service',
]
priority_ranking = (
    bike_features
    .sort_values('health_score', ascending=False)
    .loc[:, report_columns]
    .reset_index(drop=True)
)
# Use a 1-based index named 'Priority' so it reads as a rank in the CSV.
priority_ranking.index = pd.RangeIndex(1, len(priority_ranking) + 1, name='Priority')

# Save
priority_ranking.to_csv('../outputs/bike_maintenance_priority.csv')

print("📋 TOP 20 BIKES REQUIRING MAINTENANCE:")
priority_ranking.head(20)

## 10. Visualizations

In [None]:
# Left panel: number of bikes per health category.
# Right panel: histogram of the health score with the two category cut-offs.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
ax_counts, ax_scores = axes

health_counts = bike_features['health_category'].value_counts()
color_map = {'🟢 Stable': '#2ecc71', '🟡 Warning': '#f1c40f', '🔴 Critical': '#e74c3c'}
# Any unexpected category falls back to a neutral grey.
bar_colors = [color_map.get(category, '#95a5a6') for category in health_counts.index]
ax_counts.bar(health_counts.index, health_counts.values, color=bar_colors)
ax_counts.set_xlabel('Health Category')
ax_counts.set_ylabel('Number of Bikes')
ax_counts.set_title('Fleet Health Distribution')

ax_scores.hist(bike_features['health_score'], bins=30, edgecolor='black', alpha=0.7, color='#3498db')
thresholds = (
    (0.35, 'green', 'Stable threshold'),
    (0.55, 'orange', 'Warning threshold'),
)
for cutoff, line_color, legend_label in thresholds:
    ax_scores.axvline(x=cutoff, color=line_color, linestyle='--', linewidth=2, label=legend_label)
ax_scores.set_xlabel('Health Score')
ax_scores.set_ylabel('Frequency')
ax_scores.set_title('Health Score Distribution')
ax_scores.legend()

plt.tight_layout()
plt.savefig('../outputs/figures/bike_health_distribution.png', dpi=150, bbox_inches='tight')
plt.show()

In [None]:
# Top 10 priority bikes: horizontal bar chart of the ten highest health
# scores, with the most urgent bike at the top.
top10 = priority_ranking.head(10)
plt.figure(figsize=(12, 6))

# Colour each bar by its own health category, using the same palette as the
# fleet health distribution chart. Previously every non-critical bar,
# including a '🟢 Stable' one, was drawn yellow. Unknown categories fall back
# to a neutral grey.
category_colors = {'🟢 Stable': '#2ecc71', '🟡 Warning': '#f1c40f', '🔴 Critical': '#e74c3c'}
bar_colors = [category_colors.get(cat, '#95a5a6') for cat in top10['health_category']]

bar_positions = range(len(top10))
bars = plt.barh(bar_positions, top10['health_score'], color=bar_colors)
plt.yticks(bar_positions, top10['bike_id'])
plt.xlabel('Health Score (Higher = More Urgent)')
plt.title('🔧 Top 10 Bikes Requiring Maintenance')
# Row 0 of the ranking is the most urgent bike, so flip the axis to put it on top.
plt.gca().invert_yaxis()

# Annotate every bar with its score and category just past the bar's end.
for i, (score, cat) in enumerate(zip(top10['health_score'], top10['health_category'])):
    plt.text(score + 0.01, i, f'{score:.2f} {cat}', va='center', fontsize=10)

plt.tight_layout()
plt.savefig('../outputs/figures/bike_top10_priority.png', dpi=150, bbox_inches='tight')
plt.show()

## 11. Conclusions & Business Insights

### Key Findings

1. **Clustering**: Identified 4 distinct usage profiles
2. **Anomaly Detection**: Found outlier bikes with unusual patterns
3. **Time Series**: Tracked usage trends (seasonal decline expected)
4. **Health Scoring**: Combined all signals into actionable priority list

### Business Impact

- **Proactive Maintenance**: Address critical bikes before failures
- **Resource Optimization**: Focus repair teams on highest-risk bikes
- **Cost Reduction**: Prevent expensive emergency repairs
- **Customer Satisfaction**: Fewer breakdowns = happier customers

In [None]:
# Final Summary: recap fleet size, cluster sizes, anomaly count and the
# health-category split in a single printed report.
separator = "=" * 60
fleet_size = len(bike_features)
n_classic = bike_features['rideable_type'].eq('classic_bike').sum()
n_electric = bike_features['rideable_type'].eq('electric_bike').sum()

print(separator)
print("📊 BIKE-LEVEL ANALYSIS COMPLETE!")
print(separator)
print(f"\nTotal bikes analyzed: {fleet_size:,}")
print(f"  - Classic: {n_classic:,}")
print(f"  - Electric: {n_electric:,}")

print("\nCluster Distribution:")
for cluster_label in sorted(bike_features['cluster_name'].unique()):
    n_in_cluster = bike_features['cluster_name'].eq(cluster_label).sum()
    print(f"  {cluster_label}: {n_in_cluster} bikes")

print("\nAnomaly Detection:")
print(f"  Anomalous bikes: {bike_features['is_anomaly'].sum()}")

print("\nHealth Status:")
for category in ('🟢 Stable', '🟡 Warning', '🔴 Critical'):
    n_in_category = bike_features['health_category'].eq(category).sum()
    share_pct = n_in_category / fleet_size * 100
    print(f"  {category}: {n_in_category} bikes ({share_pct:.1f}%)")
print("\n" + separator)