In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the ride dataset from a CSV in the current working directory and
# preview the first rows.
data=pd.read_csv("Synthetic_Ride_Data_6000.csv")
data.head()

In [None]:
# Number of missing values in each column.
data.isna().sum()

In [None]:
# Column dtypes and non-null counts. `info` must be *called*; the bare
# attribute only displays the bound-method repr.
data.info()

In [None]:
# drop_duplicates() returns a new frame and leaves `data` untouched, so the
# result has to be assigned back for the duplicates to actually be removed.
data = data.drop_duplicates()

In [None]:
# (rows, columns) of the loaded frame.
data.shape

In [None]:
# Summary statistics for the numeric columns.
data.describe()

In [None]:
# Column dtypes, non-null counts and memory usage.
data.info()

In [None]:
# import dask.dataframe as dd
# # The Dask library enables parallel processing and can handle large datasets
# # Load the dataset using Dask
# file_path = 'Synthetic_Ride_Data_6000.csv'
# df = dd.read_csv(file_path)

# # Display basic information
# print(df.info())

In [None]:
# Share of all rides whose status is exactly 'Cancelled'.
is_cancelled = data['Cancellation_Status'] == 'Cancelled'
cancellation_rate = int(is_cancelled.sum()) / len(data)
print(f"Cancellation Rate: {cancellation_rate:.2%}")

In [None]:
# Parse both ride timestamps (pd.to_datetime raises on unparseable values by
# default), then chart how often each reason occurs among cancelled rides.
for time_col in ('Ride_Start_Time', 'Ride_End_Time'):
    data[time_col] = pd.to_datetime(data[time_col])

cancelled_rides = data[data['Cancellation_Status'] == 'Cancelled']

fig, ax = plt.subplots(figsize=[10, 11])
sns.countplot(data=cancelled_rides, x='Cancellation_Reason', palette='coolwarm', ax=ax)
ax.set_title('cancellation by reason')
ax.set_ylabel('cancellation count')
plt.xticks(rotation=45)  # long reason labels overlap when horizontal
plt.tight_layout()
plt.show()

In [None]:
# Re-parse the ride timestamps; errors='coerce' turns unparseable values
# into NaT instead of raising.
for timestamp_column in ('Ride_Start_Time', 'Ride_End_Time'):
    data[timestamp_column] = pd.to_datetime(data[timestamp_column], errors='coerce')

# Sanity check: the printed dtype should be a datetime64 dtype.
print(data.dtypes['Ride_Start_Time'])

# Hour of day (0-23) in which each ride started.
data['Hour_of_Ride'] = data['Ride_Start_Time'].dt.hour

# How many rides fall under each cancellation status.
cancel_status_counts = data['Cancellation_Status'].value_counts()
print(cancel_status_counts)

# Ride counts per cancellation status.
fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(x='Cancellation_Status', data=data, palette='viridis', ax=ax)
ax.set_title('Cancellation Status Distribution')
ax.set_xlabel('Cancellation Status')
ax.set_ylabel('Count')
plt.show()

# Frequency of each cancellation reason (value_counts sorts descending).
cancel_reasons_counts = data['Cancellation_Reason'].value_counts()
print(cancel_reasons_counts)

# Horizontal bars, most frequent reason first.
fig, ax = plt.subplots(figsize=(8, 6))
sns.countplot(
    y='Cancellation_Reason',
    data=data,
    palette='viridis',
    order=cancel_reasons_counts.index,
    ax=ax,
)
ax.set_title('Cancellation Reasons Distribution')
ax.set_xlabel('Count')
ax.set_ylabel('Cancellation Reason')
plt.show()

# Fare spread per cancellation status.
fig, ax = plt.subplots(figsize=(8, 6))
sns.boxplot(x='Cancellation_Status', y='Ride_Fare', data=data, palette='viridis', ax=ax)
ax.set_title('Ride Fare Distribution: Cancelled vs Completed')
ax.set_xlabel('Cancellation Status')
ax.set_ylabel('Ride Fare')
plt.show()

# Number of rides starting in each hour of the day.
fig, ax = plt.subplots(figsize=(8, 6))
sns.countplot(x='Hour_of_Ride', data=data, palette='viridis', ax=ax)
ax.set_title('Distribution of Ride Start Times by Hour')
ax.set_xlabel('Hour of Ride')
ax.set_ylabel('Count')
plt.show()

# Cancellation status by time of day.
# right=False makes the bins left-closed -- [0, 6), [6, 12), [12, 18), [18, 24) --
# so rides starting in hour 0 are kept; with pd.cut's default right-closed
# bins, (0, 6] excludes 0 and those rides became NaN. Labels follow the clock:
# 00-06 night, 06-12 morning, 12-18 afternoon, 18-24 evening.
data['Time_of_Day'] = pd.cut(
    data['Hour_of_Ride'],
    bins=[0, 6, 12, 18, 24],
    right=False,
    labels=['Night', 'Morning', 'Afternoon', 'Evening'],
)

# Rows: time-of-day bucket, columns: cancellation status, values: ride counts.
# observed=False keeps empty buckets in the table (the historical default,
# stated explicitly to avoid the pandas FutureWarning for categorical keys).
time_of_day_cancellation = (
    data.groupby(['Time_of_Day', 'Cancellation_Status'], observed=False)
    .size()
    .unstack()
)

# Plot cancellation status by time of day
time_of_day_cancellation.plot(kind='bar', stacked=True, figsize=(8, 6), colormap='viridis')
plt.title('Cancellation Status by Time of Day')
plt.xlabel('Time of Day')
plt.ylabel('Count')
plt.show()


In [None]:
# Ride duration in minutes (end - start, seconds / 60).
data['Ride_Duration'] = (data['Ride_End_Time'] - data['Ride_Start_Time']).dt.total_seconds() / 60.0

# Extract time-based features
data['Hour_of_Day'] = data['Ride_Start_Time'].dt.hour
data['Day_of_Week'] = data['Ride_Start_Time'].dt.dayofweek  # Monday=0 ... Sunday=6

# Preview goes last: only a cell's final expression is displayed, so the
# earlier mid-cell `data.head()` showed nothing.
data.head()

In [None]:
# Encode categorical variables
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import matplotlib.pyplot as plt
import seaborn as sns
# Replace each categorical column with integer codes, in place.
# One encoder is re-fitted per column, so after the loop it only remembers
# the mapping of the last column ('Driver_ID').
label_encoder = LabelEncoder()
for categorical_column in ('Cancellation_Reason', 'Driver_ID'):
    data[categorical_column] = label_encoder.fit_transform(data[categorical_column])
#data['User_Type'] = label_encoder.fit_transform(data['User_Type'])

In [None]:
# Step 4: Define Features and Target
target = 'Cancellation_Status'
# 'Cancellation_Reason' is deliberately NOT a feature: a reason is a
# consequence of the cancellation itself (the EDA above plots reasons for
# cancelled rides), so feeding it to the model leaks the target and inflates
# every metric.
# NOTE(review): 'Ride_Duration' is derived from Ride_End_Time, which may also
# be unknown when a prediction is needed -- confirm it is legitimate here.
features = ['Ride_Duration', 'Hour_of_Day', 'Day_of_Week', 'Driver_ID']

In [None]:
# Feature matrix: the selected columns, in the order given by `features`.
X = data[features]

# Binary target with 1 = cancelled, 0 = anything else (same definition as the
# cancellation-rate cell). LabelEncoder assigns codes alphabetically, which
# maps 'Cancelled' to 0 ahead of labels such as 'Completed'; later cells read
# predict_proba(...)[:, 1] as the cancellation probability, so the positive
# class must be the cancelled one.
y = (data[target] == 'Cancelled').astype(int).to_numpy()

In [None]:
# Step 5: Split the Data
# 80 % train / 20 % test; random_state fixes the shuffle so the split is
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [None]:
# Standardise features. The scaler is fitted on the training split only and
# then applied to both splits, so no test-set statistics reach training.
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

In [None]:
# Step 7: Train the Model (seeded random forest on the scaled training split)
model = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Step 8: Evaluate the Model on the held-out split
y_pred = model.predict(X_test)

for heading, metric in (
    ("Classification Report:\n", classification_report),
    ("Confusion Matrix:\n", confusion_matrix),
    ("Accuracy:", accuracy_score),
):
    print(heading, metric(y_test, y_pred))


In [None]:
# Step 9: Feature Importance
# One importance value per entry of `features`, in the same order.
feature_names = features
feature_importances = model.feature_importances_

fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=feature_importances, y=feature_names, palette='viridis', ax=ax)
ax.set_title('Feature Importance')
ax.set_xlabel('Importance')
ax.set_ylabel('Feature')
plt.show()


In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from imblearn.over_sampling import SMOTE
import seaborn as sns
import matplotlib.pyplot as plt
from xgboost import XGBClassifier

In [None]:
from xgboost import XGBClassifier

# Gradient-boosted tree hyper-parameters, collected in one place.
xgb_params = {
    "n_estimators": 100,        # number of trees
    "max_depth": 6,             # maximum depth of each tree
    "learning_rate": 0.1,       # step size for weight updates
    "subsample": 0.8,           # fraction of samples used for each tree
    "colsample_bytree": 0.8,    # fraction of features used for each tree
    "random_state": 42,
}

# Build the classifier and fit it on the training split.
model = XGBClassifier(**xgb_params)
model.fit(X_train, y_train)

# Hard labels and class-1 probabilities for the test split.
y_pred = model.predict(X_test)
y_prob = model.predict_proba(X_test)[:, 1]


In [None]:
# roc_auc_score is not imported anywhere else in the notebook; without this
# import the cell fails with NameError on a fresh kernel.
from sklearn.metrics import roc_auc_score

# Classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Confusion matrix (rows: actual class, columns: predicted class)
conf_matrix = confusion_matrix(y_test, y_pred)
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap="Blues")
plt.title("Confusion Matrix")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()

# ROC-AUC score, computed from the class-1 probabilities in `y_prob`
roc_auc = roc_auc_score(y_test, y_prob)
print(f"ROC-AUC Score: {roc_auc:.2f}")


In [None]:
# Plot feature importance of the fitted model, labelled with the column
# names of the feature frame `X`.
feature_names = X.columns
importances = model.feature_importances_

fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=importances, y=feature_names, ax=ax)
ax.set_title("Feature Importance")
plt.show()


In [None]:
# Example of a new scenario.
# The scenario must carry exactly the columns listed in `features`, in that
# order, because `scaler` and `model` were fitted on them. The previous
# version used columns from a different dataset and an encoder (`le_reason`)
# that is only defined further down, so it could not run on a fresh kernel.
# Start from the median ride and override the values being explored.
new_scenario = data[features].median().to_frame().T
new_scenario["Ride_Duration"] = 30  # minutes, same unit as the training column

# Scale the new scenario with the scaler fitted on the training split
new_scenario_scaled = scaler.transform(new_scenario[features])

# Predict the cancellation likelihood.
# NOTE(review): column 1 of predict_proba is the class encoded as 1 in `y`;
# confirm that this is the cancelled class.
cancellation_prob = model.predict_proba(new_scenario_scaled)[:, 1][0]
print(f"Probability of Cancellation: {cancellation_prob:.2%}")


Model Generalisation


In [None]:
# import openai

# openai.api_key = os.environ["OPENAI_API_KEY"]  # never commit API keys; the key previously pasted here must be revoked

# def suggest_optimizations(ride_data):
#     prompt = f"""
#     Given the following ride details: {ride_data},
#     suggest ways to prevent cancellations. Consider better routes, time adjustments, or incentives.
#     """
#     response = openai.ChatCompletion.create(
#         # model="gpt-3.5-turbo",
#         messages=[{"role": "user", "content": prompt}]
#     )
#     return response['choices'][0]['message']['content']

# # Example usage
# ride_sample = {
#     "pickup_location": "Downtown",
#     "dropoff_location": "Airport",
#     "driver_rating": 4.2,
#     "passenger_rating": 4.8,
#     "time_of_request": "2025-01-28 15:00:00"
# }
# print(suggest_optimizations(ride_sample))


In [None]:
from faker import Faker

fake = Faker()


def generate_scenarios(n=1000):
    """Create `n` random ride-request scenarios.

    Each scenario has a pickup and drop-off city, a driver and a passenger
    rating drawn uniformly from [3.0, 5.0] and rounded to one decimal, and a
    request timestamp from the current month. Values come from the
    module-level `fake` generator, so results differ between runs.

    Returns:
        pandas.DataFrame with one row per scenario.
    """
    def _rating():
        return round(fake.random.uniform(3.0, 5.0), 1)

    rows = [
        {
            "pickup_location": fake.city(),
            "dropoff_location": fake.city(),
            "driver_rating": _rating(),
            "passenger_rating": _rating(),
            "time_of_request": fake.date_time_this_month(),
        }
        for _ in range(n)
    ]
    return pd.DataFrame(rows)


generated_data = generate_scenarios()
print(generated_data.head())


FAKER DATA

In [None]:
import pandas as pd
import random
from faker import Faker

fake = Faker()


def generate_ride_data(num_samples=10000):
    """Create a synthetic ride table with `num_samples` rows.

    Every field is drawn independently at random, so `cancellation_status`
    (0: completed, 1: cancelled) carries no relationship to the other columns.

    Returns:
        pandas.DataFrame with one row per synthetic ride.
    """
    reasons = ["None", "Driver Late", "Passenger No-show", "Changed Plans"]

    def _one_ride():
        return {
            "ride_id": fake.uuid4(),
            "pickup_location": fake.city(),
            "dropoff_location": fake.city(),
            "booking_time": fake.date_time_this_year(),
            "ride_time_minutes": random.randint(5, 60),
            "driver_rating": round(random.uniform(3.0, 5.0), 1),
            "passenger_rating": round(random.uniform(3.0, 5.0), 1),
            "cancellation_reason": random.choice(reasons),
            "ride_fare": round(random.uniform(5, 50), 2),
            "cancellation_status": random.choice([0, 1]),  # 0: Completed, 1: Cancelled
        }

    return pd.DataFrame([_one_ride() for _ in range(num_samples)])


# Note: this rebinds `data`, replacing the frame loaded from the original CSV.
data = generate_ride_data()
data.to_csv("ride_data.csv", index=False)


In [None]:
# Class balance of the synthetic target (0: completed, 1: cancelled).
print(data['cancellation_status'].value_counts())


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Reload the synthetic rides written by the generator cell.
data = pd.read_csv("ride_data.csv")

# The status column is 0/1, so its mean is the share of cancelled rides.
cancellation_rate = data['cancellation_status'].mean()
print(f"Cancellation Rate: {cancellation_rate:.2%}")

# Horizontal bar chart of cancellation reasons, most frequent first.
ax = sns.countplot(
    y=data['cancellation_reason'],
    order=data['cancellation_reason'].value_counts().index,
)
ax.set_title("Reasons for Cancellation")
plt.show()

# # Plot correlation heatmap
# sns.heatmap(data.corr(), annot=True, cmap="coolwarm")
# plt.title("Correlation Heatmap")
# plt.show()


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import torch
from torch.utils.data import Dataset, DataLoader

# Integer-encode the cancellation reason; `le_reason` keeps the mapping.
le_reason = LabelEncoder()
data['cancellation_reason_encoded'] = le_reason.fit(data['cancellation_reason']).transform(
    data['cancellation_reason']
)

# Model inputs and the 0/1 target.
features = ["ride_time_minutes", "driver_rating", "passenger_rating", "ride_fare", "cancellation_reason_encoded"]
X = data[features]
y = data['cancellation_status']

# Reproducible 80/20 split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


def _as_float_tensor(values):
    """Convert a pandas object to a float32 torch tensor."""
    return torch.tensor(values.to_numpy(), dtype=torch.float32)


# Targets get a trailing dimension, (N,) -> (N, 1), to match the model's
# single-unit output.
X_train_tensor = _as_float_tensor(X_train)
y_train_tensor = _as_float_tensor(y_train).unsqueeze(1)
X_test_tensor = _as_float_tensor(X_test)
y_test_tensor = _as_float_tensor(y_test).unsqueeze(1)


In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise features using statistics from the training split only.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# The input tensors were built from the unscaled frames before this cell ran,
# so the network would otherwise train and evaluate on unscaled data. Rebuild
# them from the scaled arrays.
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)

In [None]:
import torch.nn as nn
import torch.optim as optim

# Feed-forward binary classifier for ride cancellation
class RideCancellationModel(nn.Module):
    """Two hidden layers (32 and 16 units) with a sigmoid output.

    Each hidden layer is Linear -> BatchNorm1d -> ReLU -> Dropout(0.3); the
    output layer is a single unit passed through a sigmoid, so `forward`
    returns values in (0, 1) with shape (batch, 1).

    Args:
        input_size: number of input features per sample.
    """

    def __init__(self, input_size):
        super().__init__()
        self.fc1 = nn.Linear(input_size, 32)
        self.bn1 = nn.BatchNorm1d(32)
        self.fc2 = nn.Linear(32, 16)
        self.bn2 = nn.BatchNorm1d(16)
        self.fc3 = nn.Linear(16, 1)
        self.dropout = nn.Dropout(0.3)  # one module shared by both hidden layers
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Map a (batch, input_size) float tensor to (batch, 1) probabilities."""
        hidden = self.dropout(self.bn1(self.fc1(x)).relu())
        hidden = self.dropout(self.bn2(self.fc2(hidden)).relu())
        return self.sigmoid(self.fc3(hidden))


# Network sized to the number of feature columns, binary cross-entropy on
# the sigmoid outputs, Adam optimiser.
model = RideCancellationModel(input_size=X_train.shape[1])
criterion = nn.BCELoss()  # Binary Cross Entropy Loss
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Full-batch training: every epoch is one gradient step on the whole
# training tensor.
epochs = 20
for epoch in range(epochs):
    # Clear gradients left over from the previous step
    optimizer.zero_grad()

    # Forward pass and loss
    y_pred = model(X_train_tensor)
    loss = criterion(y_pred, y_train_tensor)

    # Backward pass and parameter update
    loss.backward()
    optimizer.step()

    print(f"Epoch {epoch+1}/{epochs}, Loss: {loss.item():.4f}")


In [None]:
# Re-create the optimiser for the existing model; a new Adam instance starts
# with fresh internal state. Nothing is trained in this cell.
optimizer = optim.Adam(model.parameters(), lr=0.001)  # Experiment with 0.01 or 0.0001


In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier  # Example: using Random Forest

# 5-fold cross-validation of a random-forest baseline on the feature frame
# `X` and target `y`. This does not evaluate the PyTorch model.
# random_state is fixed, matching the other models in this notebook, so the
# scores are reproducible.
rf = RandomForestClassifier(random_state=42)
scores = cross_val_score(rf, X, y, cv=5)
print(f"Cross-validation scores: {scores}")

In [None]:
from xgboost import XGBClassifier

# Default-parameter XGBoost baseline: fit on the training split, then report
# mean accuracy on the test split.
xgb = XGBClassifier()
xgb.fit(X_train, y_train)
xgb_accuracy = xgb.score(X_test, y_test)
print(f"XGBoost Accuracy: {xgb_accuracy:.2%}")


In [None]:
# Switch to inference mode first: in training mode Dropout randomly zeroes
# activations and BatchNorm normalises with the statistics of the current
# batch, which makes test predictions noisy and batch-dependent.
# The model stays in eval mode afterwards; call model.train() before any
# further training.
model.eval()
with torch.no_grad():  # no gradients needed for evaluation
    y_test_pred = model(X_test_tensor)
    y_test_pred = (y_test_pred > 0.5).float()  # threshold probabilities at 0.5
    accuracy = (y_test_pred.eq(y_test_tensor).sum() / y_test_tensor.shape[0]).item()
    print(f"Test Accuracy: {accuracy:.2%}")


In [None]:
# import openai

# openai.api_key = "your_openai_api_key"

# def generate_scenario(prompt):
#     response = openai.Completion.create(
#         model="gpt-3.5-turbo",
#         prompt=prompt,
#         max_tokens=100
#     )
#     return response['choices'][0]['text']

# scenario_prompt = "Simulate a ride where the driver cancels due to traffic congestion."
# print(generate_scenario(scenario_prompt))


In [None]:
import streamlit as st
import matplotlib.pyplot as plt

st.title("Ride Cancellation Predictor and Optimizer")

# Upload Dataset
uploaded_file = st.file_uploader("Synthetic_Ride_Data_6000.csv", type=["csv"])
if uploaded_file is not None:
    data = pd.read_csv(uploaded_file)
    st.write(data.head())

# Display Metrics
# Uses the test accuracy computed in the evaluation cell. The previous
# accuracy_score(y_test, y_pred) call received the network's training-set
# output (a tensor of probabilities with a different length than y_test) and
# could not succeed.
st.subheader("Model Metrics")
st.write(f"Accuracy: {accuracy:.2%}")

# Plot Data
# Only plot when a file was uploaded and it has the expected column; without
# an upload, `data` is the synthetic frame whose column is lowercase
# 'cancellation_status', which raised KeyError here.
st.subheader("Cancellation Analysis")
if uploaded_file is None:
    st.info("Upload a ride CSV to see the cancellation breakdown.")
elif 'Cancellation_Status' not in data.columns:
    st.warning("The uploaded file has no 'Cancellation_Status' column.")
else:
    fig, ax = plt.subplots()
    data['Cancellation_Status'].value_counts().plot(kind='bar', ax=ax)
    st.pyplot(fig)
