<a href="https://colab.research.google.com/github/sunnysuuny1234-png/IEEE-Paper/blob/main/IEEE_Code.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#Step 5: Implement Supervised Learning Algorithms

"""
predictive_demand_classification.py

- Loads /mnt/data/ncr_ride_bookings.csv (edit path if different)
- Builds aggregated demand per (zone, date, hour)
- Labels demand using tertiles per zone => 'Low','Medium','High'
- Trains multiple classifiers and prints classification report + confusion matrix
- Saves trained models, label encoders and the scaler to the current working directory

Edit the heuristic name lists near the top if your CSV uses different names.
"""

import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import joblib

DATA_PATH = "/mnt/data/ncr_ride_bookings.csv"   # change if file is elsewhere

# ---------- heuristic names (edit if your CSV uses different fields) ----------
datetime_candidate_names = ['Date', 'date', 'Timestamp', 'timestamp', 'TimeStamp']
time_candidate_names = ['Time', 'time']
zone_candidate_names = ['Pickup Location', 'Pickup_Location', 'pickup_location', 'pickup', 'Origin', 'Origin Location']
# optional distance/fare columns; their per-slot median becomes an extra feature
distance_names = ['Ride Distance', 'Ride_Distance', 'distance', 'trip_distance']
fare_names = ['Fare', 'fare', 'Price', 'price', 'amount', 'total_amount']

# ------------------------------------------------------------------------------


def _first_matching_column(columns, exact_names, keywords, skip=()):
    """Return the first column listed in `exact_names` or containing a keyword.

    Keyword matching is case-insensitive substring matching on the column
    name. Columns listed in `skip` are never returned. Returns None when no
    column matches.
    """
    for col in columns:
        if col in skip:
            continue
        lowered = str(col).lower()
        if col in exact_names or any(key in lowered for key in keywords):
            return col
    return None


# Raise a real exception instead of `assert`: assertions are stripped when
# Python runs with -O, which would turn a missing file into a confusing
# read_csv failure.
if not os.path.exists(DATA_PATH):
    raise FileNotFoundError(f"File not found: {DATA_PATH}")
df = pd.read_csv(DATA_PATH, low_memory=False)
print("Loaded dataset shape:", df.shape)
print("Columns:", df.columns.tolist())

# --- pick datetime and zone columns automatically (best-effort) ---
datetime_col = _first_matching_column(df.columns, datetime_candidate_names, ('date', 'timestamp'))
# if there is explicit Date and Time separately, prefer those together
date_col = _first_matching_column(df.columns, datetime_candidate_names, ('date',))
# The time column must be a *different* column from the date/datetime one:
# a lone 'Timestamp' column contains the substring 'time' and would otherwise
# be concatenated with itself below, producing unparseable strings.
time_col = _first_matching_column(
    df.columns,
    time_candidate_names,
    ('time',),
    skip=tuple(c for c in (date_col, datetime_col) if c is not None),
)
zone_col = _first_matching_column(
    df.columns, zone_candidate_names, ('pickup', 'origin', 'location', 'zone')
)

print("Auto-detected columns -> date_col:", date_col, "time_col:", time_col, "datetime_col:", datetime_col, "zone_col:", zone_col)

# --- create parsed datetime ---
# `infer_datetime_format` is intentionally not passed: it is deprecated in
# pandas 2.x, where format inference is the default behaviour.
if date_col is not None and time_col is not None:
    # combine Date + Time
    combined = df[date_col].astype(str) + " " + df[time_col].astype(str)
    df['_parsed_dt'] = pd.to_datetime(combined, errors='coerce')
elif datetime_col is not None:
    df['_parsed_dt'] = pd.to_datetime(df[datetime_col], errors='coerce')
else:
    # fallback: try first column
    df['_parsed_dt'] = pd.to_datetime(df.iloc[:, 0], errors='coerce')

# If the chosen parse failed widely, try the single datetime column on its own
# and keep it only when it parses more rows.
if df['_parsed_dt'].isna().mean() > 0.5 and datetime_col is not None:
    alternative = pd.to_datetime(df[datetime_col], errors='coerce')
    if alternative.isna().mean() < df['_parsed_dt'].isna().mean():
        df['_parsed_dt'] = alternative

# If parsing still failed widely, try the literal Date + Time combination
if df['_parsed_dt'].isna().mean() > 0.5 and 'Date' in df.columns and 'Time' in df.columns:
    df['_parsed_dt'] = pd.to_datetime(df['Date'].astype(str) + " " + df['Time'].astype(str), errors='coerce')

# final fallback to "now" (rare)
if df['_parsed_dt'].isna().all():
    print("Warning: couldn't parse datetimes. Filling with current timestamp.")
    df['_parsed_dt'] = pd.Timestamp.now()

# Rows whose timestamp could not be parsed get hour/dayofweek -1.
df['hour'] = df['_parsed_dt'].dt.hour.fillna(-1).astype(int)
df['dayofweek'] = df['_parsed_dt'].dt.dayofweek.fillna(-1).astype(int)
df['date'] = df['_parsed_dt'].dt.date

# --- ensure zone column exists, else create coarse zone (using lat/lon if available) ---
if zone_col is None:
    lat_col = next((c for c in df.columns if 'lat' in c.lower()), None)
    lon_col = next((c for c in df.columns if 'lon' in c.lower() or 'long' in c.lower()), None)
    zone_col = 'coarse_zone'
    if lat_col and lon_col:
        # Rounding to 2 decimals buckets nearby coordinates into one zone label.
        df[zone_col] = (df[lat_col].round(2).astype(str) + "_" + df[lon_col].round(2).astype(str)).fillna('unknown')
        print("Created zone from lat/lon:", lat_col, lon_col)
    else:
        df[zone_col] = 'unknown'
        print("No zone or lat/lon found. All rows assigned to 'unknown' zone.")

# ---  aggregate to (zone, date, hour) to compute ride counts ---
agg = df.groupby([zone_col, 'date', 'hour']).size().reset_index(name='ride_count')
print("Agg rows:", agg.shape)

# --- label demand per zone using tertiles (Low/Medium/High) ---
def label_tertiles(s):
    """Label every value of `s` as 'Low', 'Medium' or 'High'.

    The cut points are the 33.33rd and 66.66th percentiles of `s` itself:
    values up to the lower cut are 'Low', values up to the upper cut are
    'Medium', everything else is 'High'. Returns a Series aligned with `s`.
    """
    lower_cut, upper_cut = np.percentile(s, [33.33, 66.66])
    return s.map(
        lambda count: 'Low' if count <= lower_cut
        else ('Medium' if count <= upper_cut else 'High')
    )

agg['demand_level'] = agg.groupby(zone_col)['ride_count'].transform(label_tertiles)
print("Label distribution:\n", agg['demand_level'].value_counts())

# `agg` is keyed by (zone, date, hour), so the per-ride 'dayofweek' column does
# not survive the aggregation. Rebuild it from the date; without this the
# feature selection below raises KeyError: 'dayofweek'.
agg['dayofweek'] = pd.to_datetime(agg['date']).dt.dayofweek

# --- add optional median features like distance/fare if present ---
slot_keys = [zone_col, 'date', 'hour']
for candidates, out_name in ((distance_names, 'med_distance'), (fare_names, 'med_fare')):
    # Use the first candidate column that actually exists in the raw data.
    source = next((name for name in candidates if name in df.columns), None)
    if source is None:
        continue
    medians = (
        df.groupby(slot_keys)[source]
        .median()
        .reset_index()
        .rename(columns={source: out_name})
    )
    agg = agg.merge(medians, on=slot_keys, how='left')

# --- prepare feature matrix X and target y ---
# NOTE(review): 'ride_count' is the quantity the demand label is derived from
# (per-zone tertiles), so keeping it as a feature leaks the target into the
# inputs and inflates the reported accuracy. Confirm whether it should stay.
feat_cols = ['hour', 'dayofweek', 'ride_count']
feat_cols += [name for name in ('med_distance', 'med_fare') if name in agg.columns]

X = agg[feat_cols + [zone_col]].copy()
y = agg['demand_level'].copy()

num_cols = [c for c in feat_cols if c != zone_col and X[c].dtype.kind in 'biufc']

# encode zone categorical
le_zone = LabelEncoder()
X[zone_col] = le_zone.fit_transform(X[zone_col].astype(str))

# target encode
le_target = LabelEncoder()
y_enc = le_target.fit_transform(y.astype(str))
print("Target classes:", le_target.classes_)

# train/test
X_train, X_test, y_train, y_test = train_test_split(X, y_enc, test_size=0.25, random_state=42, stratify=y_enc)
# Work on explicit copies so the column assignments below modify our own
# frames instead of slices of X (avoids SettingWithCopyWarning).
X_train, X_test = X_train.copy(), X_test.copy()
print("Train/Test shapes:", X_train.shape, X_test.shape)

# numeric imputation: statistics come from the training rows only, so nothing
# about the test rows leaks into preprocessing
num_imp = SimpleImputer(strategy='median')
X_train[num_cols] = num_imp.fit_transform(X_train[num_cols])
X_test[num_cols] = num_imp.transform(X_test[num_cols])

# scale numeric features (fitted on train, applied to test)
scaler = StandardScaler()
X_train[num_cols] = scaler.fit_transform(X_train[num_cols])
X_test[num_cols] = scaler.transform(X_test[num_cols])

# --- define classifiers ---
models = {
    'LogisticRegression': LogisticRegression(max_iter=1200),
    'RandomForest': RandomForestClassifier(n_estimators=150, random_state=42),
    'GradientBoosting': GradientBoostingClassifier(n_estimators=150, random_state=42),
    'SVM': SVC(probability=True),
    'KNN': KNeighborsClassifier(n_neighbors=5),
    'GaussianNB': GaussianNB()
}

# train and evaluate
results = {}
for name, model in models.items():
    print("\n---", name, "---")
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("Classification Report:")
    print(classification_report(y_test, y_pred, target_names=le_target.classes_))
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred))
    results[name] = model
    # save model
    joblib.dump(model, f"./{name}_model.joblib")

# save encoders and scalers
joblib.dump(le_zone, "./le_zone.joblib")
joblib.dump(le_target, "./le_target.joblib")
joblib.dump(scaler, "./scaler.joblib")
print("\nSaved models and encoders to current folder.")



In [None]:
#Step 6: Implement Unsupervised Learning Algorithms
"""
unsupervised_learning_ride_demand.py

Implements unsupervised learning algorithms to identify ride-demand patterns,
clusters, and anomalies in NCR Uber ride booking dataset.

Algorithms:
1. K-Means
2. Hierarchical Clustering
3. DBSCAN
4. Gaussian Mixture Model (GMM)
5. PCA (for visualization)
6. Isolation Forest (Anomaly Detection)

Dependencies:
    pip install pandas numpy scikit-learn matplotlib seaborn scipy
"""

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
from sklearn.mixture import GaussianMixture
from sklearn.decomposition import PCA
from sklearn.ensemble import IsolationForest
from scipy.cluster.hierarchy import dendrogram, linkage

# -----------------------------
# STEP 1: Load and Preprocess Dataset
# -----------------------------
data_path = "/mnt/data/ncr_ride_bookings.csv"
df = pd.read_csv(data_path, low_memory=False)
print("Data loaded:", df.shape)

# Any column whose name mentions "date" or "time" is treated as a timestamp part.
datetime_cols = [
    col for col in df.columns
    if any(tag in col.lower() for tag in ("date", "time"))
]
if not datetime_cols:
    # Nothing to parse: every row gets the current timestamp.
    df["datetime"] = pd.Timestamp.now()
elif len(datetime_cols) == 1:
    df["datetime"] = pd.to_datetime(df[datetime_cols[0]], errors="coerce")
else:
    # Two or more candidates: join the first two as "<first> <second>".
    first_part = df[datetime_cols[0]].astype(str)
    second_part = df[datetime_cols[1]].astype(str)
    df["datetime"] = pd.to_datetime(first_part + " " + second_part, errors="coerce")

stamp = df["datetime"].dt
df["hour"] = stamp.hour
df["dayofweek"] = stamp.dayofweek
df["date"] = stamp.date

# Pickup zone: first column whose name mentions pickup, zone or location.
zone_col = next(
    (
        col for col in df.columns
        if any(tag in col.lower() for tag in ("pickup", "zone", "location"))
    ),
    None,
)
if zone_col is None:
    # No usable column: put every ride into one placeholder zone.
    zone_col = "zone"
    df["zone"] = "unknown"

# One row per (zone, hour, weekday) with the number of rides in that slot.
group_keys = [zone_col, "hour", "dayofweek"]
agg = df.groupby(group_keys).size().reset_index(name="ride_count")

# Integer-encode the zone so it can sit next to the numeric features.
le_zone = LabelEncoder()
agg["zone_enc"] = le_zone.fit_transform(agg[zone_col])

# Standardised feature matrix used by every algorithm below.
feature_names = ["zone_enc", "hour", "dayofweek", "ride_count"]
X = agg[feature_names]
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# -----------------------------
# STEP 2: K-Means Clustering
# -----------------------------
print("\n=== K-Means Clustering ===")
k_values = range(2, 10)
inertias = []
for k in k_values:
    # n_init is pinned explicitly: its default differs between scikit-learn
    # releases, which would otherwise change results (and emit a FutureWarning).
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
    kmeans.fit(X_scaled)
    inertias.append(kmeans.inertia_)

plt.figure(figsize=(6,4))
plt.plot(k_values, inertias, marker='o')
plt.title("Elbow Method for Optimal k (K-Means)")
plt.xlabel("Number of clusters (k)")
plt.ylabel("Inertia")
plt.show()

kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)
agg["kmeans_cluster"] = kmeans.fit_predict(X_scaled)
print("KMeans Cluster Counts:\n", agg["kmeans_cluster"].value_counts())

# -----------------------------
# STEP 3: Hierarchical Clustering
# -----------------------------
print("\n=== Hierarchical Clustering ===")
# Draw a seeded random subset for the dendrogram. The rows of X_scaled follow
# the groupby order (sorted by zone), so taking the first 200 rows would show
# only the first few zones rather than a sample of the data.
sample_rng = np.random.default_rng(42)
sample_idx = sample_rng.choice(len(X_scaled), size=min(200, len(X_scaled)), replace=False)
linked = linkage(X_scaled[sample_idx], 'ward')
plt.figure(figsize=(8, 4))
dendrogram(linked, orientation='top', distance_sort='descending', show_leaf_counts=False)
plt.title("Hierarchical Clustering Dendrogram (sample)")
plt.show()

# The cluster assignment itself is computed on the full feature matrix.
agg_clust = AgglomerativeClustering(n_clusters=3)
agg["hier_cluster"] = agg_clust.fit_predict(X_scaled)
print("Hierarchical Cluster Counts:\n", agg["hier_cluster"].value_counts())

# -----------------------------
# STEP 4: DBSCAN
# -----------------------------
print("\n=== DBSCAN Clustering ===")
dbscan = DBSCAN(eps=1.2, min_samples=5)
# DBSCAN labels noise points as -1, so -1 may appear in the counts below.
agg["dbscan_cluster"] = dbscan.fit_predict(X_scaled)
print("DBSCAN Cluster Labels:\n", agg["dbscan_cluster"].value_counts())

# -----------------------------
# STEP 5: Gaussian Mixture Model (GMM)
# -----------------------------
print("\n=== Gaussian Mixture Model ===")
gmm = GaussianMixture(n_components=3, covariance_type='full', random_state=42)
agg["gmm_cluster"] = gmm.fit_predict(X_scaled)
print("GMM Cluster Counts:\n", agg["gmm_cluster"].value_counts())

# -----------------------------
# STEP 6: PCA (2D Visualization)
# -----------------------------
print("\n=== PCA Visualization ===")
# Project the scaled features onto two principal components for plotting.
pca = PCA(n_components=2)
pca_features = pca.fit_transform(X_scaled)
agg["PCA1"], agg["PCA2"] = pca_features[:, 0], pca_features[:, 1]

plt.figure(figsize=(7,5))
sns.scatterplot(x="PCA1", y="PCA2", hue="kmeans_cluster", data=agg, palette="tab10", s=60)
plt.title("Ride Demand Clusters (PCA + KMeans)")
plt.show()

# -----------------------------
# STEP 7: Anomaly Detection (Isolation Forest)
# -----------------------------
print("\n=== Anomaly Detection (Isolation Forest) ===")
iso = IsolationForest(contamination=0.05, random_state=42)
# NOTE: despite its name, "anomaly_score" stores the fit_predict labels
# (-1 = outlier, 1 = inlier), not a continuous score.
agg["anomaly_score"] = iso.fit_predict(X_scaled)
agg["is_anomaly"] = (agg["anomaly_score"] == -1).astype("int64")
print("Number of anomalies detected:", agg["is_anomaly"].sum())

anomaly_colors = {0: "blue", 1: "red"}
plt.figure(figsize=(7,5))
sns.scatterplot(x="PCA1", y="PCA2", hue="is_anomaly", data=agg, palette=anomaly_colors)
plt.title("Anomalous Ride Demand Zones (via Isolation Forest)")
plt.show()

# -----------------------------
# STEP 8: Summary
# -----------------------------
print("\n=== Summary ===")
print("Cluster Features Sample:")
print(agg.head())

# Persist the aggregated slots together with every cluster/anomaly column.
agg.to_csv("/mnt/data/unsupervised_clusters_output.csv", index=False)
print("\n✅ Results saved to: /mnt/data/unsupervised_clusters_output.csv")



In [None]:
#Step 7: Implement Reinforcement Learning Algorithms
"""
reinforcement_learning_ride_demand.py

Implements multiple Reinforcement Learning algorithms to optimize
ride allocation decisions based on NCR ride demand dataset.

Algorithms:
1. Q-Learning
2. SARSA
3. Deep Q-Network (DQN)
4. REINFORCE (Policy Gradient) -- planned; not implemented in this cell
5. Actor-Critic -- planned; not implemented in this cell

Dependencies:
    pip install pandas numpy gymnasium tensorflow keras matplotlib tqdm
"""

import numpy as np
import pandas as pd
import random
from tqdm import tqdm
import tensorflow as tf
from tensorflow.keras import layers, models, optimizers
import matplotlib.pyplot as plt

# -----------------------------
# STEP 1: Load and Preprocess Dataset
# -----------------------------
data_path = "/mnt/data/ncr_ride_bookings.csv"
df = pd.read_csv(data_path, low_memory=False)

# Extract time and zone columns
datetime_cols = [c for c in df.columns if "date" in c.lower() or "time" in c.lower()]
if len(datetime_cols) >= 2:
    df["datetime"] = pd.to_datetime(df[datetime_cols[0]].astype(str) + " " + df[datetime_cols[1]].astype(str), errors="coerce")
elif len(datetime_cols) == 1:
    df["datetime"] = pd.to_datetime(df[datetime_cols[0]], errors="coerce")
else:
    df["datetime"] = pd.Timestamp.now()

df["hour"] = df["datetime"].dt.hour
df["dayofweek"] = df["datetime"].dt.dayofweek

# Pick zone column
zone_col = None
for c in df.columns:
    if "pickup" in c.lower() or "zone" in c.lower() or "area" in c.lower():
        zone_col = c
        break
if zone_col is None:
    zone_col = "zone"
    df["zone"] = "unknown"

# Aggregate ride counts per (zone, hour).
# The environment's state is (zone, hour), so there must be exactly one row
# per state. Grouping by weekday as well produced several rows per state, of
# which the environment only ever read the first one.
agg = df.groupby([zone_col, "hour"]).size().reset_index(name="ride_count")

# Normalize demand into three ordered levels.
demand_labels = ["Low", "Medium", "High"]
try:
    agg["demand_level"] = pd.qcut(agg["ride_count"], q=3, labels=demand_labels)
except ValueError:
    # Heavily tied counts make the quantile edges coincide and qcut refuses
    # duplicate bin edges. Ranking first breaks ties by row order, so the
    # three bins are always distinct.
    agg["demand_level"] = pd.qcut(
        agg["ride_count"].rank(method="first"), q=3, labels=demand_labels
    )
# Plain integer index (0/1/2) so reward arithmetic does not operate on a
# categorical column.
agg["demand_index"] = agg["demand_level"].map({"Low": 0, "Medium": 1, "High": 2}).astype(int)

zones = agg[zone_col].unique()
n_zones = len(zones)

# -----------------------------
# STEP 2: Simulated Environment
# -----------------------------
class RideEnv:
    """Toy driver-allocation environment over (zone, hour) states.

    Parameters
    ----------
    df : DataFrame with the zone column, an "hour" column and a
        "demand_index" column.
    zone_column : name of the zone column in `df`. Defaults to the
        module-level `zone_col`.
    zone_values : iterable of zone identifiers used to build the state space.
        Defaults to the module-level `zones`.

    States are every (zone, hour) pair for hours 0-23. Actions are
    0 (do not allocate) and 1 (allocate a driver).
    """

    def __init__(self, df, zone_column=None, zone_values=None):
        self.df = df
        self.zone_column = zone_col if zone_column is None else zone_column
        zone_list = zones if zone_values is None else zone_values
        self.hours = list(range(24))
        self.states = [(z, h) for z in zone_list for h in self.hours]
        self.actions = [0, 1]  # 0 = no allocate, 1 = allocate driver
        self.state = None
        # Build the (zone, hour) -> demand table once so step() is a dict
        # lookup instead of a boolean scan over the whole frame. setdefault
        # keeps the first row for each pair.
        self._demand = {}
        for zone, hour, demand in zip(df[self.zone_column], df["hour"], df["demand_index"]):
            self._demand.setdefault((zone, hour), demand)

    def reset(self):
        """Move to a uniformly random state and return it."""
        self.state = random.choice(self.states)
        return self.state

    def step(self, action):
        """Apply `action` in the current state.

        Returns `(next_state, reward, done)`. The reward is `+demand` when
        allocating (action 1) and `-demand` otherwise, or -1 when the current
        state has no demand record. The next state is drawn uniformly at
        random and `done` is True with probability 0.05.

        Raises RuntimeError if called before `reset()`.
        """
        if self.state is None:
            raise RuntimeError("RideEnv.step() called before reset()")
        if self.state in self._demand:
            demand = self._demand[self.state]
            reward = demand if action == 1 else -demand
        else:
            reward = -1
        # Same draw order as before (state first, then termination) so seeded
        # runs stay reproducible.
        next_state = random.choice(self.states)
        done = np.random.rand() < 0.05
        self.state = next_state
        return next_state, reward, done

env = RideEnv(agg)

# -----------------------------
# STEP 3: Q-Learning
# -----------------------------
print("\n=== Q-Learning Training ===")

# Tabular action values keyed by (state, action); missing entries count as 0.
Q = {}
alpha = 0.1    # learning rate
gamma = 0.9    # discount factor
epsilon = 0.1  # exploration probability


def get_Q(s, a):
    """Return the stored value for (s, a), or 0.0 if the pair is unseen."""
    key = (s, a)
    return Q[key] if key in Q else 0.0


episodes = 2000
rewards_q = []

for ep in tqdm(range(episodes)):
    s = env.reset()
    total_r = 0
    for _ in range(100):
        # Epsilon-greedy selection; max() keeps the first action on ties.
        explore = random.random() < epsilon
        if explore:
            a = random.choice(env.actions)
        else:
            a = max(env.actions, key=lambda act: get_Q(s, act))

        s2, r, done = env.step(a)
        # Off-policy target: bootstrap from the best action in the next state.
        best_next = max(get_Q(s2, a2) for a2 in env.actions)
        current = get_Q(s, a)
        Q[(s, a)] = current + alpha * (r + gamma * best_next - current)
        total_r += r
        s = s2
        if done:
            break
    rewards_q.append(total_r)

print("Q-Learning average reward:", np.mean(rewards_q))

# -----------------------------
# STEP 4: SARSA
# -----------------------------
print("\n=== SARSA Training ===")
# Separate value table so SARSA does not reuse what Q-learning learned.
Q_sarsa = {}
rewards_sarsa = []

for ep in tqdm(range(episodes)):
    s = env.reset()
    a = random.choice(env.actions)
    total_r = 0
    for _ in range(100):
        s2, r, done = env.step(a)
        # Choose the next action epsilon-greedily; max() keeps the first
        # action on ties.
        if random.random() < epsilon:
            a2 = random.choice(env.actions)
        else:
            a2 = max(env.actions, key=lambda act: Q_sarsa.get((s2, act), 0.0))

        # On-policy target: bootstrap from the action that will actually be taken.
        current = Q_sarsa.get((s, a), 0.0)
        next_value = Q_sarsa.get((s2, a2), 0.0)
        Q_sarsa[(s, a)] = current + alpha * (r + gamma * next_value - current)
        total_r += r
        s, a = s2, a2
        if done:
            break
    rewards_sarsa.append(total_r)

print("SARSA average reward:", np.mean(rewards_sarsa))

# -----------------------------
# STEP 5: Deep Q-Network (DQN)
# -----------------------------
print("\n=== Deep Q-Network Training ===")

action_size = len(env.actions)
state_size = 2  # zone_id, hour


def encode_state(state):
    """Turn a (zone, hour) state into a 2-element float vector.

    The first element is the zone's position in `zones` divided by `n_zones`;
    the second is the hour divided by 24.
    """
    zone, hour = state
    zone_position = np.flatnonzero(zones == zone)[0]
    return np.array([zone_position / n_zones, hour / 24.0])

# Small fully connected Q-network: encoded state in, one Q-value per action out.
model = models.Sequential([
    layers.Input(shape=(state_size,)),
    layers.Dense(64, activation='relu'),
    layers.Dense(64, activation='relu'),
    layers.Dense(action_size, activation='linear')
])
model.compile(optimizer='adam', loss='mse')

episodes = 1000
gamma = 0.9
epsilon = 0.2
rewards_dqn = []

for ep in tqdm(range(episodes)):
    s = env.reset()
    total_r = 0
    for _ in range(50):
        s_encoded = np.reshape(encode_state(s), [1, state_size])
        # One forward pass for the current state serves both the greedy action
        # choice and the training target. The weights do not change between
        # the two uses, so predicting a second time was redundant.
        q_current = model.predict(s_encoded, verbose=0)
        if np.random.rand() < epsilon:
            a = random.choice(env.actions)
        else:
            a = int(np.argmax(q_current[0]))
        s2, r, done = env.step(a)
        s2_encoded = np.reshape(encode_state(s2), [1, state_size])
        target = r + gamma * np.amax(model.predict(s2_encoded, verbose=0))
        # Only the taken action's value moves toward the bootstrapped target;
        # the other outputs keep their current predictions.
        q_current[0][a] = target
        model.fit(s_encoded, q_current, epochs=1, verbose=0)
        s = s2
        total_r += r
        if done:
            break
    rewards_dqn.append(total_r)

print("DQN average reward:", np.mean(rewards_dqn))

# -----------------------------
# STEP 6: Plot Results
# -----------------------------
# One line per algorithm: total reward collected in each training episode.
fig, ax = plt.subplots(figsize=(8, 5))
for reward_history, legend_name in (
    (rewards_q, 'Q-Learning'),
    (rewards_sarsa, 'SARSA'),
    (rewards_dqn, 'DQN'),
):
    ax.plot(reward_history, label=legend_name)
ax.set_title("RL Algorithm Rewards over Episodes")
ax.set_xlabel("Episodes")
ax.set_ylabel("Total Reward")
ax.legend()
plt.show()



In [None]:


#Step 8: Implement Deep Learning Algorithms
"""
deep_learning_demand_prediction.py

Implements multiple deep learning algorithms to predict ride demand
levels ('Low', 'Medium', 'High') using the NCR ride booking dataset.

Models:
1. ANN (Feedforward)
2. CNN (1D)
3. RNN (LSTM)
4. BiLSTM
5. LSTM with Attention

Dependencies:
    pip install pandas numpy scikit-learn tensorflow keras matplotlib seaborn
"""

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import classification_report, accuracy_score
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Conv1D, MaxPooling1D, Flatten, LSTM, Bidirectional, Attention, Input
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.utils import to_categorical

# -------------------------------
# STEP 1: Load Dataset
# -------------------------------
data_path = "/mnt/data/ncr_ride_bookings.csv"  # adjust if needed
df = pd.read_csv(data_path, low_memory=False)
print("Data loaded:", df.shape)

# -------------------------------
# STEP 2: Preprocessing
# -------------------------------
# Candidate timestamp columns: any column name mentioning "date" or "time".
datetime_cols = [name for name in df.columns if "date" in name.lower() or "time" in name.lower()]
n_candidates = len(datetime_cols)
if n_candidates == 0:
    # No candidate at all: stamp every row with the current time.
    df["datetime"] = pd.Timestamp.now()
elif n_candidates == 1:
    df["datetime"] = pd.to_datetime(df[datetime_cols[0]], errors="coerce")
else:
    # Join the first two candidates into one "<first> <second>" string.
    joined = df[datetime_cols[0]].astype(str) + " " + df[datetime_cols[1]].astype(str)
    df["datetime"] = pd.to_datetime(joined, errors="coerce")

parsed = df["datetime"].dt
df["hour"] = parsed.hour
df["dayofweek"] = parsed.dayofweek
df["date"] = parsed.date

# Pickup zone column: first name mentioning pickup, zone or location.
zone_keywords = ("pickup", "zone", "location")
matching_zone_cols = [name for name in df.columns if any(word in name.lower() for word in zone_keywords)]
if matching_zone_cols:
    zone_col = matching_zone_cols[0]
else:
    # Fall back to a single placeholder zone.
    zone_col = "zone"
    df[zone_col] = "unknown"

# Ride counts per (zone, date, hour) slot.
slot_keys = [zone_col, "date", "hour"]
agg = df.groupby(slot_keys).size().reset_index(name="ride_count")
# Label demand into Low, Medium, High (tertiles)
def label_demand(series):
    """Map each value of `series` to "Low", "Medium" or "High".

    The thresholds are the 33rd and 66th percentiles of `series`; a value at
    or below a threshold falls into the lower class. The result keeps the
    index and name of the input.
    """
    low_cut, high_cut = np.percentile(series, [33, 66])
    values = series.to_numpy()
    # The first matching condition wins; anything matching neither is "High".
    labels = np.select([values <= low_cut, values <= high_cut], ["Low", "Medium"], default="High")
    return pd.Series(labels, index=series.index, name=series.name, dtype=object)

agg["demand_level"] = agg.groupby(zone_col)["ride_count"].transform(label_demand)

# `agg` is keyed by (zone, date, hour), so it has no "dayofweek" column.
# Derive it from the date; without this the feature selection below raises
# KeyError: 'dayofweek'.
agg["dayofweek"] = pd.to_datetime(agg["date"]).dt.dayofweek

# Encode categorical zone and target
le_zone = LabelEncoder()
agg["zone_enc"] = le_zone.fit_transform(agg[zone_col])
le_target = LabelEncoder()
agg["demand_enc"] = le_target.fit_transform(agg["demand_level"])

# Features and labels
# NOTE(review): "ride_count" is the value the demand label is computed from,
# so using it as an input leaks the target. Confirm whether it should stay.
X = agg[["zone_enc", "hour", "dayofweek", "ride_count"]]
y = agg["demand_enc"]

# One-hot encode labels for deep learning
y_categorical = to_categorical(y)

# Train/Test split (stratified on the integer labels; the split happens before
# scaling so the scaler never sees test rows)
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y_categorical, test_size=0.25, random_state=42, stratify=y
)

# Scale features: statistics are fitted on the training rows only
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)
# Kept so that code expecting the fully scaled matrix keeps working.
X_scaled = scaler.transform(X)

print("Train/Test shapes:", X_train.shape, X_test.shape)

# Reshape for CNN / RNN (samples, timesteps, features): each row becomes a
# sequence of length 1
X_train_seq = np.expand_dims(X_train, axis=1)
X_test_seq = np.expand_dims(X_test, axis=1)

# -------------------------------
# STEP 3: Define Models
# -------------------------------

def build_ann(input_dim, output_dim):
    """Build and compile a feed-forward softmax classifier.

    Architecture: Dense(128, relu) -> Dropout(0.3) -> Dense(64, relu) ->
    Dropout(0.3) -> Dense(output_dim, softmax), for flat inputs of width
    `input_dim`. Compiled with Adam and categorical cross-entropy.
    """
    net = Sequential()
    net.add(Input(shape=(input_dim,)))
    for width in (128, 64):
        net.add(Dense(width, activation='relu'))
        net.add(Dropout(0.3))
    net.add(Dense(output_dim, activation='softmax'))
    net.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return net

def build_cnn(input_shape, output_dim):
    """Build and compile a 1D-convolutional softmax classifier.

    `input_shape` is (timesteps, features) and `output_dim` is the number of
    classes. Compiled with Adam and categorical cross-entropy.
    """
    model = Sequential([
        Input(shape=input_shape),
        # padding='same' keeps the time dimension intact. With the default
        # 'valid' padding a kernel of width 2 cannot slide over the
        # single-timestep sequences this notebook feeds in, so the output
        # length would be non-positive and the model could not be built.
        Conv1D(64, 2, activation='relu', padding='same'),
        MaxPooling1D(1),
        Flatten(),
        Dense(64, activation='relu'),
        Dense(output_dim, activation='softmax')
    ])
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model

def build_lstm(input_shape, output_dim):
    """Build and compile an LSTM softmax classifier.

    Architecture: LSTM(64) -> Dense(64, relu) -> Dense(output_dim, softmax)
    on inputs of shape `input_shape` (timesteps, features). Compiled with
    Adam and categorical cross-entropy.
    """
    net = Sequential()
    net.add(Input(shape=input_shape))
    net.add(LSTM(64))
    net.add(Dense(64, activation='relu'))
    net.add(Dense(output_dim, activation='softmax'))
    net.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return net

def build_bilstm(input_shape, output_dim):
    """Build and compile a bidirectional-LSTM softmax classifier.

    Architecture: Bidirectional(LSTM(64)) -> Dense(64, relu) ->
    Dense(output_dim, softmax) on inputs of shape `input_shape`
    (timesteps, features). Compiled with Adam and categorical cross-entropy.
    """
    net = Sequential()
    net.add(Input(shape=input_shape))
    net.add(Bidirectional(LSTM(64)))
    net.add(Dense(64, activation='relu'))
    net.add(Dense(output_dim, activation='softmax'))
    net.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return net

def build_lstm_attention(input_shape, output_dim):
    """Build and compile an LSTM classifier with a self-attention layer.

    The LSTM returns its full output sequence, which is passed to `Attention`
    as both query and value. The attended sequence is flattened and fed
    through Dense(64, relu) and a softmax output of width `output_dim`.
    Compiled with Adam and categorical cross-entropy.
    """
    sequence_in = Input(shape=input_shape)
    hidden_seq = LSTM(64, return_sequences=True)(sequence_in)
    # Self-attention: the same tensor is used as query and value.
    attended = Attention()([hidden_seq, hidden_seq])
    features = Dense(64, activation='relu')(Flatten()(attended))
    class_probs = Dense(output_dim, activation='softmax')(features)

    net = tf.keras.Model(sequence_in, class_probs)
    net.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return net

# -------------------------------
# STEP 4: Train & Evaluate Models
# -------------------------------
n_features = X_train.shape[1]
n_classes = y_train.shape[1]
# Sequence models see every sample as one timestep holding all features.
seq_shape = (1, n_features)

models = {
    "ANN": build_ann(n_features, n_classes),
    "CNN": build_cnn(seq_shape, n_classes),
    "LSTM": build_lstm(seq_shape, n_classes),
    "BiLSTM": build_bilstm(seq_shape, n_classes),
    "LSTM_Attention": build_lstm_attention(seq_shape, n_classes),
}

early_stop = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

# The true class indices do not depend on the model, so decode them once.
y_true = np.argmax(y_test, axis=1)

for name, model in models.items():
    print(f"\n🔹 Training {name} model...")
    # The ANN takes flat rows; every other model takes the sequence-shaped arrays.
    X_tr, X_te = (X_train, X_test) if name == "ANN" else (X_train_seq, X_test_seq)

    # Early stopping monitors a validation slice carved out of the TRAINING
    # data. Using the test set for this would let it pick the stopping epoch
    # and the restored weights, making the test metrics below optimistic.
    history = model.fit(
        X_tr, y_train,
        validation_split=0.2,
        epochs=30,
        batch_size=32,
        verbose=1,
        callbacks=[early_stop]
    )

    y_pred = np.argmax(model.predict(X_te), axis=1)

    print(f"\n📊 {name} Classification Report:")
    print(classification_report(y_true, y_pred, target_names=le_target.classes_))
    print("Accuracy:", accuracy_score(y_true, y_pred))

print("\n✅ All deep learning models trained and evaluated successfully!")



In [None]:
#Step 9: Apply Ensemble Techniques
"""
ensemble_ride_demand.py

Implements multiple ensemble learning algorithms (Bagging, Boosting, Voting, and Stacking)
on the NCR Uber ride booking dataset for predictive ride demand modeling.

Algorithms:
1. Random Forest
2. Gradient Boosting
3. XGBoost
4. LightGBM
5. Voting Classifier
6. Stacking Classifier

Dependencies:
    pip install pandas numpy scikit-learn xgboost lightgbm matplotlib seaborn
"""

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Ensemble models
from sklearn.ensemble import (
    RandomForestClassifier,
    GradientBoostingClassifier,
    VotingClassifier,
    StackingClassifier
)
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

# ---------------------------------
# STEP 1: Load and Prepare Dataset
# ---------------------------------
data_path = "/mnt/data/ncr_ride_bookings.csv"
df = pd.read_csv(data_path, low_memory=False)
print("Dataset loaded:", df.shape)

# Detect datetime columns and create time-based features
datetime_cols = [c for c in df.columns if "date" in c.lower() or "time" in c.lower()]
if len(datetime_cols) >= 2:
    df["datetime"] = pd.to_datetime(df[datetime_cols[0]].astype(str) + " " + df[datetime_cols[1]].astype(str), errors="coerce")
elif len(datetime_cols) == 1:
    df["datetime"] = pd.to_datetime(df[datetime_cols[0]], errors="coerce")
else:
    # Same fallback the other cells use. Without it the "datetime" column is
    # never created and the next line raises KeyError.
    df["datetime"] = pd.Timestamp.now()

df["hour"] = df["datetime"].dt.hour
df["dayofweek"] = df["datetime"].dt.dayofweek
# Saturday (5) and Sunday (6) count as weekend; unparsed rows (NaN) become 0.
df["is_weekend"] = (df["dayofweek"] >= 5).astype(int)

# Detect pickup location/zone
zone_col = None
for c in df.columns:
    if "pickup" in c.lower() or "zone" in c.lower() or "location" in c.lower():
        zone_col = c
        break
if zone_col is None:
    zone_col = "zone"
    df["zone"] = "unknown"

# Aggregate demand per zone-hour-day combination
agg = df.groupby([zone_col, "hour", "dayofweek", "is_weekend"]).size().reset_index(name="ride_count")

# Create categorical demand classes (Low, Medium, High)
demand_labels = ["Low", "Medium", "High"]
try:
    agg["demand_level"] = pd.qcut(agg["ride_count"], q=3, labels=demand_labels)
except ValueError:
    # Heavily tied counts make the quantile edges coincide and qcut refuses
    # duplicate bin edges. Ranking first breaks ties by row order, so the
    # three bins are always distinct.
    agg["demand_level"] = pd.qcut(
        agg["ride_count"].rank(method="first"), q=3, labels=demand_labels
    )

# Encode categorical zone
le = LabelEncoder()
agg["zone_enc"] = le.fit_transform(agg[zone_col])

# Features and target
# NOTE(review): "ride_count" is the value the demand label is computed from,
# so using it as an input leaks the target. Confirm whether it should stay.
X = agg[["zone_enc", "hour", "dayofweek", "is_weekend", "ride_count"]]
y = agg["demand_level"]

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42, stratify=y)

# Normalize numeric features (fitted on train, applied to test)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# ---------------------------------
# STEP 2: Define Ensemble Models
# ---------------------------------
# Base learners. Every estimator is seeded so reruns give the same models.
seed = 42
rf = RandomForestClassifier(n_estimators=200, random_state=seed)
gb = GradientBoostingClassifier(n_estimators=200, learning_rate=0.1, random_state=seed)
xgb = XGBClassifier(
    n_estimators=300,
    learning_rate=0.1,
    max_depth=6,
    random_state=seed,
    eval_metric='mlogloss',
)
lgb = LGBMClassifier(n_estimators=300, learning_rate=0.1, random_state=seed)
# Defined here but not part of any ensemble below.
svm = SVC(probability=True, kernel='rbf', random_state=seed)

# ---------------------------------
# STEP 3: Voting Classifier (soft voting)
# ---------------------------------
voting_members = [('rf', rf), ('xgb', xgb), ('lgb', lgb)]
voting_clf = VotingClassifier(estimators=voting_members, voting='soft')

# ---------------------------------
# STEP 4: Stacking Classifier
# ---------------------------------
# passthrough=True hands the original features to the final estimator in
# addition to the base learners' predictions.
stacking_members = [('rf', rf), ('xgb', xgb), ('gb', gb)]
stacking_clf = StackingClassifier(
    estimators=stacking_members,
    final_estimator=LogisticRegression(max_iter=1000),
    passthrough=True,
)

# ---------------------------------
# STEP 5: Train and Evaluate Models
# ---------------------------------
models = {
    "Random Forest": rf,
    "Gradient Boosting": gb,
    "XGBoost": xgb,
    "LightGBM": lgb,
    "Voting Ensemble": voting_clf,
    "Stacking Ensemble": stacking_clf
}

# A standalone XGBClassifier rejects string class labels and expects integer
# classes 0..n-1. Encode the target once and use the integer labels for every
# model, so all predictions and metrics are directly comparable.
target_encoder = LabelEncoder().fit(np.asarray(y_train))
y_train_enc = target_encoder.transform(np.asarray(y_train))
y_test_enc = target_encoder.transform(np.asarray(y_test))
class_names = [str(label) for label in target_encoder.classes_]
class_ids = np.arange(len(class_names))

results = {}
for name, model in models.items():
    model.fit(X_train_scaled, y_train_enc)
    preds = model.predict(X_test_scaled)
    acc = accuracy_score(y_test_enc, preds)
    results[name] = acc
    print(f"\n=== {name} ===")
    print("Accuracy:", round(acc, 4))
    # Passing `labels` keeps the report and matrix aligned with `class_names`
    # even if a class is missing from the predictions.
    print("Classification Report:\n", classification_report(y_test_enc, preds, labels=class_ids, target_names=class_names))
    cm = confusion_matrix(y_test_enc, preds, labels=class_ids)
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=class_names, yticklabels=class_names)
    plt.title(f"Confusion Matrix - {name}")
    plt.show()

# ---------------------------------
# STEP 6: Compare Model Accuracies
# ---------------------------------
# Bar chart of test accuracy, one bar per trained model.
model_names = list(results.keys())
accuracies = list(results.values())
plt.figure(figsize=(8,5))
sns.barplot(x=model_names, y=accuracies)
plt.xticks(rotation=45)
plt.title("Ensemble Model Performance Comparison")
plt.ylabel("Accuracy")
plt.show()

# ---------------------------------
# STEP 7: Summary and Best Model
# ---------------------------------
# The best model is the one with the highest accuracy in `results`.
best_model_name = max(results, key=lambda model_name: results[model_name])
best_accuracy = results[best_model_name]
print(f"\n✅ Best Performing Ensemble Model: {best_model_name} with Accuracy = {best_accuracy:.4f}")



In [None]:
"""
hybrid_optimized_ensemble_ride_demand.py

Achieves maximum accuracy on NCR ride booking dataset using a Hybrid Optimized Ensemble (HOE)
approach combining XGBoost, LightGBM, and Gradient Boosting optimized with Bayesian search.

Dependencies:
    pip install pandas numpy scikit-learn xgboost lightgbm optuna seaborn matplotlib
"""

import pandas as pd
import numpy as np
import optuna
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.ensemble import GradientBoostingClassifier, StackingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.linear_model import LogisticRegression

# ---------------------------------
# STEP 1: Load and Prepare Dataset
# ---------------------------------
data_path = "/mnt/data/ncr_ride_bookings.csv"
df = pd.read_csv(data_path, low_memory=False)
print("Dataset Loaded:", df.shape)

# Build one timestamp column. Candidate columns are chosen purely by name
# (anything containing "date" or "time", case-insensitive, in file order).
# Unparseable values become NaT because of errors="coerce".
datetime_cols = [c for c in df.columns if "date" in c.lower() or "time" in c.lower()]
if len(datetime_cols) >= 2:
    # Assumes the first match holds the date and the second the time of day
    # -- TODO confirm against the CSV header.
    df["datetime"] = pd.to_datetime(
        df[datetime_cols[0]].astype(str) + " " + df[datetime_cols[1]].astype(str),
        errors="coerce",
    )
elif len(datetime_cols) == 1:
    df["datetime"] = pd.to_datetime(df[datetime_cols[0]], errors="coerce")
else:
    # Fail here with a clear message instead of an opaque
    # KeyError: 'datetime' on the first feature line below.
    raise ValueError(
        f"No date/time column found in {data_path}; columns are: {list(df.columns)}"
    )

# Temporal features. Rows whose timestamp is NaT get NaN here and are dropped
# by the groupby below (pandas groupby excludes NaN keys by default).
df["hour"] = df["datetime"].dt.hour
df["dayofweek"] = df["datetime"].dt.dayofweek
# Saturday (5) and Sunday (6) count as weekend; NaN compares False -> 0.
df["is_weekend"] = (df["dayofweek"] >= 5).astype(int)

# Zone/Location column detection: first column whose name mentions a pickup,
# zone or location; otherwise fall back to a single synthetic zone.
zone_keywords = ("pickup", "zone", "location")
zone_col = next(
    (c for c in df.columns if any(key in c.lower() for key in zone_keywords)),
    None,
)
if zone_col is None:
    zone_col = "zone"
    df["zone"] = "unknown"

# Aggregate by zone, hour, and day: one row per combination with its ride count.
agg = df.groupby([zone_col, "hour", "dayofweek", "is_weekend"]).size().reset_index(name="ride_count")

# Create demand classes (Low, Medium, High) from ride-count tertiles.
demand_labels = ["Low", "Medium", "High"]
try:
    agg["demand_level"] = pd.qcut(agg["ride_count"], q=3, labels=demand_labels)
except ValueError:
    # Heavy ties can make two tertile edges coincide, which qcut rejects.
    # Ranking first guarantees distinct edges, at the cost of splitting tied
    # counts between neighbouring classes.
    agg["demand_level"] = pd.qcut(
        agg["ride_count"].rank(method="first"), q=3, labels=demand_labels
    )

# Encode categorical zone
le = LabelEncoder()
agg["zone_enc"] = le.fit_transform(agg[zone_col])

# Features and target.
# ride_count is deliberately NOT a feature: demand_level is a direct binning
# of ride_count, so including it leaks the target and lets any model recover
# the label from that single column.
feature_cols = ["zone_enc", "hour", "dayofweek", "is_weekend"]
X = agg[feature_cols]
# Integer class codes in label order (0=Low, 1=Medium, 2=High). Current
# XGBoost releases require classes encoded as 0..n_classes-1 and reject
# string labels when XGBClassifier is fitted directly.
y = agg["demand_level"].cat.codes

# Train-test split (stratified so each demand class keeps its share).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42, stratify=y)

# Fit the scaler on the training split only, then apply it to both splits.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# ---------------------------------
# STEP 2: Bayesian Optimization for XGBoost
# ---------------------------------
def objective(trial):
    """Score one XGBoost hyperparameter configuration for Optuna.

    Draws a configuration from ``trial``, evaluates an ``XGBClassifier``
    built from it with 3-fold cross-validation on the scaled training
    split, and returns the mean accuracy across the folds.
    """
    search_space = dict(
        n_estimators=trial.suggest_int("n_estimators", 200, 500),
        max_depth=trial.suggest_int("max_depth", 3, 12),
        learning_rate=trial.suggest_float("learning_rate", 0.01, 0.3),
        subsample=trial.suggest_float("subsample", 0.5, 1.0),
        colsample_bytree=trial.suggest_float("colsample_bytree", 0.5, 1.0),
        gamma=trial.suggest_float("gamma", 0, 5),
        # Fixed (not searched) settings.
        random_state=42,
        eval_metric="mlogloss",
    )
    candidate = XGBClassifier(**search_space)
    fold_accuracy = cross_val_score(
        candidate, X_train_scaled, y_train, scoring='accuracy', cv=3
    )
    return fold_accuracy.mean()

# Seed the sampler so the search, and therefore best_xgb_params, is
# reproducible from run to run, consistent with the fixed random_state=42
# used for the split and the models. TPESampler is Optuna's default sampler,
# so only the seeding changes, not the search algorithm.
study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=25)
# Only the values drawn via trial.suggest_* are stored in best_params.
best_xgb_params = study.best_params
print("\n✅ Best XGBoost Params:", best_xgb_params)

# ---------------------------------
# STEP 3: Train Optimized Base Models
# ---------------------------------
# study.best_params holds only the sampled hyperparameters, so the settings
# that were held fixed during tuning must be re-applied here. Otherwise the
# final model is not the configuration that was actually cross-validated.
xgb_fixed_params = {"random_state": 42, "eval_metric": "mlogloss"}
xgb_opt = XGBClassifier(**{**xgb_fixed_params, **best_xgb_params})

# LightGBM and Gradient Boosting use hand-picked (not searched) settings.
# NOTE(review): LightGBM applies `subsample` only when `subsample_freq` > 0
# (its default is 0), so subsample=0.8 looks inert here -- confirm intent.
lgb_opt = LGBMClassifier(
    n_estimators=350,
    learning_rate=0.07,
    max_depth=8,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42
)
gb_opt = GradientBoostingClassifier(
    n_estimators=300,
    learning_rate=0.08,
    max_depth=5,
    random_state=42
)

# ---------------------------------
# STEP 4: Stacking (Hybrid Ensemble)
# ---------------------------------
# Level-0 learners of the hybrid ensemble, in the order they are stacked.
hoe_base_learners = [('xgb', xgb_opt), ('lgb', lgb_opt), ('gb', gb_opt)]

# Level-1 learner: logistic regression. With passthrough=True it is trained
# on the original features in addition to the level-0 predictions.
hoe_meta_learner = LogisticRegression(max_iter=1000)

stacking_model = StackingClassifier(
    estimators=hoe_base_learners,
    final_estimator=hoe_meta_learner,
    passthrough=True,
)

# ---------------------------------
# STEP 5: Train & Evaluate
# ---------------------------------
# Candidate models, keyed by the display name used in printouts and plots.
# All of them are fitted on the same scaled training split so the accuracies
# stored in `results` are directly comparable.
models = {
    "XGBoost (Tuned)": xgb_opt,
    "LightGBM (Optimized)": lgb_opt,
    "Gradient Boosting": gb_opt,
    "Hybrid Optimized Ensemble (HOE)": stacking_model
}

# name -> accuracy on the held-out test split
results = {}
for name, model in models.items():
    model.fit(X_train_scaled, y_train)
    preds = model.predict(X_test_scaled)
    acc = accuracy_score(y_test, preds)
    results[name] = acc

    print(f"\n=== {name} ===")
    print("Accuracy:", round(acc, 4))
    print("Classification Report:\n", classification_report(y_test, preds))

    # Fix the row/column order to the fitted model's class order and show it
    # on the axes; without tick labels the heatmap only displays positional
    # indices and cannot be mapped back to classes.
    class_labels = model.classes_
    cm = confusion_matrix(y_test, preds, labels=class_labels)
    # Start a fresh figure so the heatmap never lands on a figure left open
    # by earlier plotting code.
    plt.figure()
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=class_labels, yticklabels=class_labels)
    plt.xlabel("Predicted")
    plt.ylabel("Actual")
    plt.title(f"Confusion Matrix - {name}")
    plt.show()

# ---------------------------------
# STEP 6: Compare All Models
# ---------------------------------
# One bar per evaluated model, in the order the models were evaluated.
bar_labels = list(results)
bar_heights = [results[label] for label in bar_labels]

plt.figure(figsize=(8, 5))
sns.barplot(x=bar_labels, y=bar_heights, palette="coolwarm")
plt.xticks(rotation=45)
plt.ylabel("Accuracy")
plt.title("Model Accuracy Comparison (Optimized)")
plt.show()

# Highest test accuracy wins; on a tie the earliest-evaluated model is kept.
best_model = max(results.items(), key=lambda entry: entry[1])[0]
print(f"\n🏆 Best Model: {best_model} with Accuracy = {results[best_model]:.4f}")

