In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import Ridge, Lasso
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import mean_squared_error, r2_score

# Step 1: Load the dataset
data = pd.read_csv('/content/merged_data.csv')  # Replace with your file path

# Step 2: Preprocess the data
data['Date of Travel'] = pd.to_datetime(data['Date of Travel'])
data['Travel_Year'] = data['Date of Travel'].dt.year
data['Travel_Month'] = data['Date of Travel'].dt.month
data['Travel_Day'] = data['Date of Travel'].dt.day
data = data.drop(columns=['Date of Travel'])

# Encode categorical columns
label_encoders = {}
categorical_cols = ['Company', 'City', 'Payment_Mode', 'Gender', 'Month']
for col in categorical_cols:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col])
    label_encoders[col] = le

# Prepare features (X) and target (y)
exclude_cols = ['Profit', 'Price Charged', 'Cost of Trip', 'Income (USD/Month)', 'Age']
X = data.drop(columns=exclude_cols)
y = data['Profit']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale the features (important for Ridge and Lasso)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Step 3: Ridge Regression
print("\n=== Ridge Regression ===")

# Define Ridge model and hyperparameter grid
ridge = Ridge(random_state=42)
ridge_param_grid = {'alpha': [0.01, 0.1, 1.0, 10.0, 100.0]}

# Perform GridSearchCV
ridge_grid_search = GridSearchCV(
    estimator=ridge,
    param_grid=ridge_param_grid,
    scoring='neg_mean_squared_error',
    cv=3,
    verbose=1,
    n_jobs=-1
)

ridge_grid_search.fit(X_train_scaled, y_train)
ridge_best_params = ridge_grid_search.best_params_
print("\nBest Hyperparameters for Ridge:")
print(ridge_best_params)
print(f"Best MSE (negative): {ridge_grid_search.best_score_:.2f}")

# Train Ridge model with best parameters
ridge_model = Ridge(**ridge_best_params, random_state=42)
ridge_model.fit(X_train_scaled, y_train)

# Evaluate on test set
ridge_y_pred = ridge_model.predict(X_test_scaled)
ridge_mse = mean_squared_error(y_test, ridge_y_pred)
ridge_r2 = r2_score(y_test, ridge_y_pred)
ridge_rmse = np.sqrt(ridge_mse)

# Calculate tolerance-based accuracy
tolerance = 50  # Consider predictions within $50 of the true profit as "accurate"
ridge_absolute_errors = np.abs(ridge_y_pred - y_test)
ridge_accuracy_within_tolerance = np.mean(ridge_absolute_errors <= tolerance) * 100

print("\nRidge Regression Performance on Test Data:")
print(f"Mean Squared Error: {ridge_mse:.2f}")
print(f"R² Score: {ridge_r2:.2f}")
print(f"Root Mean Squared Error (per trip): ${ridge_rmse:.2f}")
print(f"Accuracy (within ±$50 tolerance): {ridge_accuracy_within_tolerance:.2f}%")

# Feature coefficients (importance)
ridge_coefficients = pd.DataFrame({
    'Feature': X.columns,
    'Coefficient': ridge_model.coef_
}).sort_values(by='Coefficient', key=abs, ascending=False)
print("\nRidge Regression Feature Coefficients:")
print(ridge_coefficients)

# Step 4: Predict future profits with Ridge
def generate_future_data(base_data, year, month, num_trips, scaler):
    historical_month_data = base_data[base_data['Travel_Month'] == month].copy()
    if historical_month_data.empty:
        historical_month_data = base_data.copy()

    future_data = historical_month_data.sample(n=num_trips, replace=True, random_state=42).copy()
    future_data['Travel_Year'] = year
    future_data['Travel_Month'] = month
    future_data['Travel_Day'] = np.random.randint(1, 31, size=num_trips)
    years_diff = year - 2018
    future_data['Price Charged'] *= (1 + 0.02) ** years_diff  # 2% inflation
    future_data['Cost of Trip'] *= (1 + 0.02) ** years_diff   # 2% inflation
    future_X = future_data.drop(columns=['Profit', 'Price Charged', 'Cost of Trip', 'Income (USD/Month)', 'Age'])
    return future_X, future_data, scaler.transform(future_X)

avg_trips_per_month = len(data) // (data['Travel_Year'].nunique() * 12)
print(f"\nAverage Trips per Month: {avg_trips_per_month}")

while True:
    try:
        user_year = int(input("Enter the year for profit prediction (e.g., 2025): "))
        user_month = int(input("Enter the month for profit prediction (1-12): "))
        if 1 <= user_month <= 12 and user_year >= 2019:
            break
        else:
            print("Invalid input. Month must be 1-12, and year must be 2019 or later.")
    except ValueError:
        print("Please enter valid numeric values.")

# Generate future data and predict profits
future_X, future_data, future_X_scaled = generate_future_data(data, user_year, user_month, avg_trips_per_month, scaler)
ridge_future_pred_profits = ridge_model.predict(future_X_scaled)

# Total profit with confidence interval
ridge_total_future_profit = ridge_future_pred_profits.sum()
ridge_total_error = ridge_rmse * avg_trips_per_month
print(f"\nRidge Predicted Total Company Profit for {user_month}/{user_year}: ${ridge_total_future_profit:,.2f}")
print(f"Estimated Range: ${ridge_total_future_profit - ridge_total_error:,.2f}–${ridge_total_future_profit + ridge_total_error:,.2f}")
print(f"(Note: Range based on RMSE ±${ridge_rmse:.2f} per trip across {avg_trips_per_month} trips)")

# Breakdown by company
future_data['Predicted_Profit'] = ridge_future_pred_profits
future_data['Company'] = label_encoders['Company'].inverse_transform(future_data['Company'])
ridge_profit_by_company = future_data.groupby('Company')['Predicted_Profit'].sum()
ridge_pink_cab_profit = ridge_profit_by_company.get('Pink Cab', 0)
ridge_yellow_cab_profit = ridge_profit_by_company.get('Yellow Cab', 0)
ridge_pink_error = ridge_total_error * (ridge_pink_cab_profit / ridge_total_future_profit) if ridge_total_future_profit != 0 else 0
ridge_yellow_error = ridge_total_error * (ridge_yellow_cab_profit / ridge_total_future_profit) if ridge_total_future_profit != 0 else 0

print(f"\nRidge Profit Breakdown by Company for {user_month}/{user_year}:")
print(f"Pink Cab: ${ridge_pink_cab_profit:,.2f} (Range: ${ridge_pink_cab_profit - ridge_pink_error:,.2f}–${ridge_pink_cab_profit + ridge_pink_error:,.2f})")
print(f"Yellow Cab: ${ridge_yellow_cab_profit:,.2f} (Range: ${ridge_yellow_cab_profit - ridge_yellow_error:,.2f}–${ridge_yellow_cab_profit + ridge_yellow_error:,.2f})")
print(f"Total (Pink Cab + Yellow Cab): ${(ridge_pink_cab_profit + ridge_yellow_cab_profit):,.2f}")


=== Ridge Regression ===
Fitting 3 folds for each of 5 candidates, totalling 15 fits

Best Hyperparameters for Ridge:
{'alpha': 10.0}
Best MSE (negative): -18072.44

Ridge Regression Performance on Test Data:
Mean Squared Error: 18190.98
R² Score: 0.30
Root Mean Squared Error (per trip): $134.87
Accuracy (within ±$50 tolerance): 37.54%

Ridge Regression Feature Coefficients:
                  Feature  Coefficient
2  Distance Travelled(KM)    73.976684
0                 Company    40.668219
8            Travel_Month   -14.058876
1                    City     9.905493
6                    Year    -4.916546
7             Travel_Year    -4.916546
5                   Month     2.608825
4                  Gender     2.392127
9              Travel_Day    -1.159043
3            Payment_Mode     0.241310

Average Trips per Month: 9983
Enter the year for profit prediction (e.g., 2025): 2025
Enter the month for profit prediction (1-12): 2

Ridge Predicted Total Company Profit for 2/2025: $1,131,

In [None]:
# Step 5: Lasso Regression
print("\n=== Lasso Regression ===")

# Define Lasso model and hyperparameter grid
lasso = Lasso(random_state=42, max_iter=10000)
lasso_param_grid = {'alpha': [0.01, 0.1, 1.0, 10.0, 100.0]}

# Perform GridSearchCV
lasso_grid_search = GridSearchCV(
    estimator=lasso,
    param_grid=lasso_param_grid,
    scoring='neg_mean_squared_error',
    cv=3,
    verbose=1,
    n_jobs=-1
)

lasso_grid_search.fit(X_train_scaled, y_train)
lasso_best_params = lasso_grid_search.best_params_
print("\nBest Hyperparameters for Lasso:")
print(lasso_best_params)
print(f"Best MSE (negative): {lasso_grid_search.best_score_:.2f}")

# Train Lasso model with best parameters
lasso_model = Lasso(**lasso_best_params, random_state=42, max_iter=10000)
lasso_model.fit(X_train_scaled, y_train)

# Evaluate on test set
lasso_y_pred = lasso_model.predict(X_test_scaled)
lasso_mse = mean_squared_error(y_test, lasso_y_pred)
lasso_r2 = r2_score(y_test, lasso_y_pred)
lasso_rmse = np.sqrt(lasso_mse)

# Calculate tolerance-based accuracy
lasso_absolute_errors = np.abs(lasso_y_pred - y_test)
lasso_accuracy_within_tolerance = np.mean(lasso_absolute_errors <= tolerance) * 100

print("\nLasso Regression Performance on Test Data:")
print(f"Mean Squared Error: {lasso_mse:.2f}")
print(f"R² Score: {lasso_r2:.2f}")
print(f"Root Mean Squared Error (per trip): ${lasso_rmse:.2f}")
print(f"Accuracy (within ±$50 tolerance): {lasso_accuracy_within_tolerance:.2f}%")

# Feature coefficients (importance)
lasso_coefficients = pd.DataFrame({
    'Feature': X.columns,
    'Coefficient': lasso_model.coef_
}).sort_values(by='Coefficient', key=abs, ascending=False)
print("\nLasso Regression Feature Coefficients:")
print(lasso_coefficients)

# Step 6: Predict future profits with Lasso
future_X, future_data, future_X_scaled = generate_future_data(data, user_year, user_month, avg_trips_per_month, scaler)
lasso_future_pred_profits = lasso_model.predict(future_X_scaled)

# Total profit with confidence interval
lasso_total_future_profit = lasso_future_pred_profits.sum()
lasso_total_error = lasso_rmse * avg_trips_per_month
print(f"\nLasso Predicted Total Company Profit for {user_month}/{user_year}: ${lasso_total_future_profit:,.2f}")
print(f"Estimated Range: ${lasso_total_future_profit - lasso_total_error:,.2f}–${lasso_total_future_profit + lasso_total_error:,.2f}")
print(f"(Note: Range based on RMSE ±${lasso_rmse:.2f} per trip across {avg_trips_per_month} trips)")

# Breakdown by company
future_data['Predicted_Profit'] = lasso_future_pred_profits
future_data['Company'] = label_encoders['Company'].inverse_transform(future_data['Company'])
lasso_profit_by_company = future_data.groupby('Company')['Predicted_Profit'].sum()
lasso_pink_cab_profit = lasso_profit_by_company.get('Pink Cab', 0)
lasso_yellow_cab_profit = lasso_profit_by_company.get('Yellow Cab', 0)
lasso_pink_error = lasso_total_error * (lasso_pink_cab_profit / lasso_total_future_profit) if lasso_total_future_profit != 0 else 0
lasso_yellow_error = lasso_total_error * (lasso_yellow_cab_profit / lasso_total_future_profit) if lasso_total_future_profit != 0 else 0

print(f"\nLasso Profit Breakdown by Company for {user_month}/{user_year}:")
print(f"Pink Cab: ${lasso_pink_cab_profit:,.2f} (Range: ${lasso_pink_cab_profit - lasso_pink_error:,.2f}–${lasso_pink_cab_profit + lasso_pink_error:,.2f})")
print(f"Yellow Cab: ${lasso_yellow_cab_profit:,.2f} (Range: ${lasso_yellow_cab_profit - lasso_yellow_error:,.2f}–${lasso_yellow_cab_profit + lasso_yellow_error:,.2f})")
print(f"Total (Pink Cab + Yellow Cab): ${(lasso_pink_cab_profit + lasso_yellow_cab_profit):,.2f}")


=== Lasso Regression ===
Fitting 3 folds for each of 5 candidates, totalling 15 fits

Best Hyperparameters for Lasso:
{'alpha': 0.01}
Best MSE (negative): -18072.44

Lasso Regression Performance on Test Data:
Mean Squared Error: 18191.03
R² Score: 0.30
Root Mean Squared Error (per trip): $134.87
Accuracy (within ±$50 tolerance): 37.53%

Lasso Regression Feature Coefficients:
                  Feature   Coefficient
2  Distance Travelled(KM)  7.396927e+01
0                 Company  4.066006e+01
8            Travel_Month -1.404699e+01
1                    City  9.895806e+00
6                    Year -9.822952e+00
5                   Month  2.595749e+00
4                  Gender  2.382457e+00
9              Travel_Day -1.149170e+00
3            Payment_Mode  2.313607e-01
7             Travel_Year -2.470550e-15

Lasso Predicted Total Company Profit for 2/2025: $1,621,046.43
Estimated Range: $274,597.87–$2,967,495.00
(Note: Range based on RMSE ±$134.87 per trip across 9983 trips)

Lasso Pro

In [None]:
# Step 7: Compare Ridge and Lasso
print("\n=== Comparison of Ridge and Lasso Regression ===")
comparison = pd.DataFrame({
    'Model': ['Ridge', 'Lasso'],
    'MSE': [ridge_mse, lasso_mse],
    'R² Score': [ridge_r2, lasso_r2],
    'RMSE': [ridge_rmse, lasso_rmse],
    'Accuracy (within ±$50)': [ridge_accuracy_within_tolerance, lasso_accuracy_within_tolerance],
    'Total Profit (Predicted)': [ridge_total_future_profit, lasso_total_future_profit],
    'Profit Range (Lower)': [ridge_total_future_profit - ridge_total_error, lasso_total_future_profit - lasso_total_error],
    'Profit Range (Upper)': [ridge_total_future_profit + ridge_total_error, lasso_total_future_profit + lasso_total_error]
})
print(comparison)


=== Comparison of Ridge and Lasso Regression ===
   Model           MSE  R² Score        RMSE  Accuracy (within ±$50)  \
0  Ridge  18190.976033  0.297028  134.873926               37.538085   
1  Lasso  18191.034386  0.297025  134.874143               37.532520   

   Total Profit (Predicted)  Profit Range (Lower)  Profit Range (Upper)  
0              1.131180e+06        -215266.572743          2.477626e+06  
1              1.621046e+06         274597.865776          2.967495e+06  


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb

# Step 1: Load the dataset
data = pd.read_csv('/content/merged_data.csv')  # Replace with your file path

# Step 2: Explore the dataset
print("=== Dataset Exploration ===")

# Number of rows and columns
print(f"Number of rows: {data.shape[0]}")
print(f"Number of columns: {data.shape[1]}")

# Column names and data types
print("\nColumn names and data types:")
print(data.dtypes)

# Check for missing values
print("\nMissing values per column:")
print(data.isnull().sum())

# Basic statistics for numerical columns
print("\nBasic statistics for numerical columns:")
print(data.describe())

# Unique values in categorical columns
categorical_cols = ['Company', 'City', 'Payment_Mode', 'Gender', 'Month']
print("\nUnique values in categorical columns:")
for col in categorical_cols:
    if col in data.columns:
        print(f"{col}: {data[col].unique()}")

# Sample of the first few rows
print("\nSample of the dataset (first 5 rows):")
print(data.head())

# Step 3: Preprocess the data
# Convert 'Date of Travel' to datetime and extract components
if 'Date of Travel' in data.columns:
    data['Date of Travel'] = pd.to_datetime(data['Date of Travel'])
    data['Travel_Year'] = data['Date of Travel'].dt.year
    data['Travel_Month'] = data['Date of Travel'].dt.month
    data['Travel_Day'] = data['Date of Travel'].dt.day
    data = data.drop(columns=['Date of Travel'])
else:
    print("Warning: 'Date of Travel' not found. Ensure 'Travel_Year', 'Travel_Month', 'Travel_Day' are present.")
    required_date_cols = ['Travel_Year', 'Travel_Month', 'Travel_Day']
    if not all(col in data.columns for col in required_date_cols):
        raise KeyError("Dataset must contain either 'Date of Travel' or preprocessed 'Travel_Year', 'Travel_Month', 'Travel_Day' columns")

# Encode categorical columns
label_encoders = {}
for col in categorical_cols:
    if col in data.columns:
        le = LabelEncoder()
        data[col] = le.fit_transform(data[col])
        label_encoders[col] = le
    else:
        print(f"Warning: '{col}' not found in dataset, skipping encoding")

# Step 4: Prepare features (X) and target (y) for LightGBM
exclude_cols = ['Profit', 'Price Charged', 'Cost of Trip', 'Income (USD/Month)', 'Age']
X = data.drop(columns=exclude_cols)
y = data['Profit']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 5: Convert to LightGBM Dataset (before training)
train_data = lgb.Dataset(X_train, label=y_train)
test_data = lgb.Dataset(X_test, label=y_test, reference=train_data)

# Step 6: Print prepared data summary
print("\n=== Prepared Data for LightGBM ===")
print(f"Training set shape: {X_train.shape}")
print(f"Test set shape: {X_test.shape}")
print(f"Target variable (y) range: min={y.min():.2f}, max={y.max():.2f}, mean={y.mean():.2f}")
print("\nFeatures prepared for LightGBM (X):")
print(X.columns.tolist())
print("\nSample of prepared features (first 5 rows of X_train):")
print(X_train.head())

=== Dataset Exploration ===
Number of rows: 359392
Number of columns: 13

Column names and data types:
Date of Travel             object
Company                    object
City                       object
Distance Travelled(KM)    float64
Price Charged             float64
Cost of Trip              float64
Payment_Mode               object
Gender                     object
Age                         int64
Income (USD/Month)          int64
Profit                    float64
Month                      object
Year                        int64
dtype: object

Missing values per column:
Date of Travel            0
Company                   0
City                      0
Distance Travelled(KM)    0
Price Charged             0
Cost of Trip              0
Payment_Mode              0
Gender                    0
Age                       0
Income (USD/Month)        0
Profit                    0
Month                     0
Year                      0
dtype: int64

Basic statistics for numerical colu

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error, r2_score
import lightgbm as lgb

# Step 1: Load and preprocess the dataset (using your output as reference)
data = pd.read_csv('/content/merged_data.csv')  # Replace with your file path

# Convert 'Date of Travel' to datetime and extract components
data['Date of Travel'] = pd.to_datetime(data['Date of Travel'])
data['Travel_Year'] = data['Date of Travel'].dt.year
data['Travel_Month'] = data['Date of Travel'].dt.month
data['Travel_Day'] = data['Date of Travel'].dt.day
data = data.drop(columns=['Date of Travel'])

# Encode categorical columns
categorical_cols = ['Company', 'City', 'Payment_Mode', 'Gender', 'Month']
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col])
    label_encoders[col] = le

# Prepare features (X) and target (y)
exclude_cols = ['Profit', 'Price Charged', 'Cost of Trip', 'Income (USD/Month)', 'Age', 'Year']  # Drop 'Year' to avoid multicollinearity
X = data.drop(columns=exclude_cols)
y = data['Profit']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 2: Convert to LightGBM Dataset with categorical features
train_data = lgb.Dataset(X_train, label=y_train, categorical_feature=categorical_cols, free_raw_data=False)
test_data = lgb.Dataset(X_test, label=y_test, reference=train_data, categorical_feature=categorical_cols, free_raw_data=False)

# Step 3: Define LightGBM parameters
params = {
    'objective': 'regression',  # Regression task
    'metric': 'mse',           # Mean Squared Error
    'boosting_type': 'gbdt',   # Traditional Gradient Boosting Decision Tree
    'num_leaves': 31,          # Number of leaves in one tree
    'learning_rate': 0.1,      # Step size shrinkage
    'feature_fraction': 0.9,   # Fraction of features to consider per tree
    'bagging_fraction': 0.8,   # Fraction of data to use for bagging
    'bagging_freq': 5,         # Frequency of bagging
    'verbose': -1,             # No verbose output during training
    'n_jobs': -1              # Use all available cores
}

# Step 4: Train the model
num_round = 100  # Number of boosting iterations
print("\nTraining LightGBM Model...")
gbm = lgb.train(params, train_data, num_boost_round=num_round, valid_sets=[test_data], callbacks=[lgb.log_evaluation(period=10)])

# Step 5: Evaluate the model
y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mse)

# Calculate tolerance-based accuracy
tolerance = 50  # Predictions within $50 of true profit
absolute_errors = np.abs(y_pred - y_test)
accuracy_within_tolerance = np.mean(absolute_errors <= tolerance) * 100

print("\nLightGBM Performance on Test Data:")
print(f"Mean Squared Error: {mse:.2f}")
print(f"R² Score: {r2:.2f}")
print(f"Root Mean Squared Error (per trip): ${rmse:.2f}")
print(f"Accuracy (within ±$50 tolerance): {accuracy_within_tolerance:.2f}%")

# Step 6: Feature importance
feature_importance = pd.DataFrame({
    'Feature': X.columns,
    'Importance': gbm.feature_importance(importance_type='gain')
}).sort_values(by='Importance', ascending=False)
print("\nLightGBM Feature Importance:")
print(feature_importance)

# Step 7: Function to generate synthetic data with 2% inflation
def generate_future_data(base_data, year, month, num_trips, scaler=None):
    historical_month_data = base_data[base_data['Travel_Month'] == month].copy()
    if historical_month_data.empty:
        historical_month_data = base_data.copy()

    future_data = historical_month_data.sample(n=num_trips, replace=True, random_state=42).copy()
    future_data['Travel_Year'] = year
    future_data['Travel_Month'] = month
    future_data['Travel_Day'] = np.random.randint(1, 31, size=num_trips)
    years_diff = year - 2018
    future_data['Price Charged'] *= (1 + 0.02) ** years_diff  # 2% inflation
    future_data['Cost of Trip'] *= (1 + 0.02) ** years_diff   # 2% inflation
    future_X = future_data.drop(columns=exclude_cols)
    return future_X, future_data

avg_trips_per_month = len(data) // (data['Travel_Year'].nunique() * 12)
print(f"\nAverage Trips per Month: {avg_trips_per_month}")

while True:
    try:
        user_year = int(input("Enter the year for profit prediction (e.g., 2025): "))
        user_month = int(input("Enter the month for profit prediction (1-12): "))
        if 1 <= user_month <= 12 and user_year >= 2019:
            break
        else:
            print("Invalid input. Month must be 1-12, and year must be 2019 or later.")
    except ValueError:
        print("Please enter valid numeric values.")

# Generate future data and predict profits
future_X, future_data = generate_future_data(data, user_year, user_month, avg_trips_per_month)
future_pred_profits = gbm.predict(future_X)

# Total profit with confidence interval
total_future_profit = future_pred_profits.sum()
total_error = rmse * avg_trips_per_month
print(f"\nPredicted Total Company Profit for {user_month}/{user_year}: ${total_future_profit:,.2f}")
print(f"Estimated Range: ${total_future_profit - total_error:,.2f}–${total_future_profit + total_error:,.2f}")
print(f"(Note: Range based on RMSE ±${rmse:.2f} per trip across {avg_trips_per_month} trips)")

# Breakdown by company
future_data['Predicted_Profit'] = future_pred_profits
future_data['Company'] = label_encoders['Company'].inverse_transform(future_data['Company'])
profit_by_company = future_data.groupby('Company')['Predicted_Profit'].sum()
pink_cab_profit = profit_by_company.get('Pink Cab', 0)
yellow_cab_profit = profit_by_company.get('Yellow Cab', 0)
pink_error = total_error * (pink_cab_profit / total_future_profit) if total_future_profit != 0 else 0
yellow_error = total_error * (yellow_cab_profit / total_future_profit) if total_future_profit != 0 else 0

print(f"\nProfit Breakdown by Company for {user_month}/{user_year}:")
print(f"Pink Cab: ${pink_cab_profit:,.2f} (Range: ${pink_cab_profit - pink_error:,.2f}–${pink_cab_profit + pink_error:,.2f})")
print(f"Yellow Cab: ${yellow_cab_profit:,.2f} (Range: ${yellow_cab_profit - yellow_error:,.2f}–${yellow_cab_profit + yellow_error:,.2f})")
print(f"Total (Pink Cab + Yellow Cab): ${(pink_cab_profit + yellow_cab_profit):,.2f}")


Training LightGBM Model...
[10]	valid_0's l2: 9947
[20]	valid_0's l2: 6222.64
[30]	valid_0's l2: 5466.64
[40]	valid_0's l2: 5224.18
[50]	valid_0's l2: 5025.66
[60]	valid_0's l2: 4920.94
[70]	valid_0's l2: 4849.04
[80]	valid_0's l2: 4788.95
[90]	valid_0's l2: 4759.51
[100]	valid_0's l2: 4702.81

LightGBM Performance on Test Data:
Mean Squared Error: 4702.81
R² Score: 0.82
Root Mean Squared Error (per trip): $68.58
Accuracy (within ±$50 tolerance): 67.28%

LightGBM Feature Importance:
                  Feature    Importance
1                    City  1.025571e+10
2  Distance Travelled(KM)  9.639423e+09
0                 Company  2.982890e+09
5                   Month  1.286400e+09
7            Travel_Month  6.944023e+08
6             Travel_Year  3.338823e+08
8              Travel_Day  2.670672e+08
3            Payment_Mode  1.002593e+06
4                  Gender  8.595656e+05

Average Trips per Month: 9983
Enter the year for profit prediction (e.g., 2025): 2025
Enter the month for prof

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error, r2_score
import lightgbm as lgb

# Step 1: Load and preprocess the dataset
data = pd.read_csv('/content/merged_data.csv')

# Convert 'Date of Travel' to datetime and extract components
data['Date of Travel'] = pd.to_datetime(data['Date of Travel'])
data['Travel_Year'] = data['Date of Travel'].dt.year
data['Travel_Month'] = data['Date of Travel'].dt.month
data['Travel_Day'] = data['Date of Travel'].dt.day
data = data.drop(columns=['Date of Travel'])

# Encode categorical columns
categorical_cols = ['Company', 'City', 'Payment_Mode', 'Gender', 'Month']
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col])
    label_encoders[col] = le

# Prepare features (X) and target (y)
exclude_cols = ['Profit', 'Price Charged', 'Cost of Trip', 'Income (USD/Month)', 'Age', 'Year', 'Month']  # Drop 'Year' and 'Month'
X = data.drop(columns=exclude_cols)
y = data['Profit']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Update categorical columns (exclude 'Month')
categorical_cols = ['Company', 'City', 'Payment_Mode', 'Gender']

# Step 2: Convert to LightGBM Dataset
train_data = lgb.Dataset(X_train, label=y_train, categorical_feature=categorical_cols, free_raw_data=False)
test_data = lgb.Dataset(X_test, label=y_test, reference=train_data, categorical_feature=categorical_cols, free_raw_data=False)

# Step 3: Define LightGBM parameters with early stopping
params = {
    'objective': 'regression',
    'metric': 'mse',
    'boosting_type': 'gbdt',
    'num_leaves': 40,          # Increased from 31 to capture more complexity
    'learning_rate': 0.05,     # Reduced from 0.1 for more stable learning
    'feature_fraction': 0.8,   # Reduced to prevent overfitting
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': -1,
    'n_jobs': -1,
    'lambda_l1': 0.1,         # L1 regularization
    'lambda_l2': 0.1          # L2 regularization
}

# Step 4: Train the model with early stopping
num_round = 1000  # Higher max iterations, but early stopping will halt
print("\nTraining LightGBM Model with Early Stopping...")
gbm = lgb.train(
    params,
    train_data,
    num_boost_round=num_round,
    valid_sets=[test_data],
    callbacks=[lgb.early_stopping(stopping_rounds=20), lgb.log_evaluation(period=10)]
)

# Step 5: Evaluate the model
y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mse)

# Calculate tolerance-based accuracy
tolerance = 50
absolute_errors = np.abs(y_pred - y_test)
accuracy_within_tolerance = np.mean(absolute_errors <= tolerance) * 100

print("\nOptimized LightGBM Performance on Test Data:")
print(f"Mean Squared Error: {mse:.2f}")
print(f"R² Score: {r2:.2f}")
print(f"Root Mean Squared Error (per trip): ${rmse:.2f}")
print(f"Accuracy (within ±$50 tolerance): {accuracy_within_tolerance:.2f}%")

# Step 6: Feature importance
feature_importance = pd.DataFrame({
    'Feature': X.columns,
    'Importance': gbm.feature_importance(importance_type='gain')
}).sort_values(by='Importance', ascending=False)
print("\nLightGBM Feature Importance:")
print(feature_importance)

# Step 7: Function to generate synthetic data with 2% inflation
def generate_future_data(base_data, year, month, num_trips):
    historical_month_data = base_data[base_data['Travel_Month'] == month].copy()
    if historical_month_data.empty:
        historical_month_data = base_data.copy()

    future_data = historical_month_data.sample(n=num_trips, replace=True, random_state=42).copy()
    future_data['Travel_Year'] = year
    future_data['Travel_Month'] = month
    future_data['Travel_Day'] = np.random.randint(1, 31, size=num_trips)
    years_diff = year - 2018
    future_data['Price Charged'] *= (1 + 0.02) ** years_diff  # 2% inflation
    future_data['Cost of Trip'] *= (1 + 0.02) ** years_diff   # 2% inflation
    future_X = future_data.drop(columns=exclude_cols)
    return future_X, future_data

avg_trips_per_month = len(data) // (data['Travel_Year'].nunique() * 12)
print(f"\nAverage Trips per Month: {avg_trips_per_month}")

while True:
    try:
        user_year = int(input("Enter the year for profit prediction (e.g., 2025): "))
        user_month = int(input("Enter the month for profit prediction (1-12): "))
        if 1 <= user_month <= 12 and user_year >= 2019:
            break
        else:
            print("Invalid input. Month must be 1-12, and year must be 2019 or later.")
    except ValueError:
        print("Please enter valid numeric values.")

# Generate future data and predict profits
future_X, future_data = generate_future_data(data, user_year, user_month, avg_trips_per_month)
future_pred_profits = gbm.predict(future_X)

# Total profit with confidence interval
total_future_profit = future_pred_profits.sum()
total_error = rmse * avg_trips_per_month
print(f"\nPredicted Total Company Profit for {user_month}/{user_year}: ${total_future_profit:,.2f}")
print(f"Estimated Range: ${total_future_profit - total_error:,.2f}–${total_future_profit + total_error:,.2f}")
print(f"(Note: Range based on RMSE ±${rmse:.2f} per trip across {avg_trips_per_month} trips)")

# Breakdown by company
future_data['Predicted_Profit'] = future_pred_profits
future_data['Company'] = label_encoders['Company'].inverse_transform(future_data['Company'])
profit_by_company = future_data.groupby('Company')['Predicted_Profit'].sum()
pink_cab_profit = profit_by_company.get('Pink Cab', 0)
yellow_cab_profit = profit_by_company.get('Yellow Cab', 0)
pink_error = total_error * (pink_cab_profit / total_future_profit) if total_future_profit != 0 else 0
yellow_error = total_error * (yellow_cab_profit / total_future_profit) if total_future_profit != 0 else 0

print(f"\nProfit Breakdown by Company for {user_month}/{user_year}:")
print(f"Pink Cab: ${pink_cab_profit:,.2f} (Range: ${pink_cab_profit - pink_error:,.2f}–${pink_cab_profit + pink_error:,.2f})")
print(f"Yellow Cab: ${yellow_cab_profit:,.2f} (Range: ${yellow_cab_profit - yellow_error:,.2f}–${yellow_cab_profit + yellow_error:,.2f})")
print(f"Total (Pink Cab + Yellow Cab): ${(pink_cab_profit + yellow_cab_profit):,.2f}")


Training LightGBM Model with Early Stopping...
Training until validation scores don't improve for 20 rounds
[10]	valid_0's l2: 16209.8
[20]	valid_0's l2: 10958.1
[30]	valid_0's l2: 8269.72
[40]	valid_0's l2: 6978.08
[50]	valid_0's l2: 6169.17
[60]	valid_0's l2: 5682.16
[70]	valid_0's l2: 5427.69
[80]	valid_0's l2: 5263.17
[90]	valid_0's l2: 5153.47
[100]	valid_0's l2: 5078.43
[110]	valid_0's l2: 5007.06
[120]	valid_0's l2: 4945.51
[130]	valid_0's l2: 4907.55
[140]	valid_0's l2: 4872.67
[150]	valid_0's l2: 4834.95
[160]	valid_0's l2: 4792.48
[170]	valid_0's l2: 4771.43
[180]	valid_0's l2: 4752.26
[190]	valid_0's l2: 4722.35
[200]	valid_0's l2: 4710.77
[210]	valid_0's l2: 4684.69
[220]	valid_0's l2: 4671.17
[230]	valid_0's l2: 4649.75
[240]	valid_0's l2: 4634.77
[250]	valid_0's l2: 4622.72
[260]	valid_0's l2: 4606.47
[270]	valid_0's l2: 4598.23
[280]	valid_0's l2: 4588.48
[290]	valid_0's l2: 4575.11
[300]	valid_0's l2: 4558.89
[310]	valid_0's l2: 4550.17
[320]	valid_0's l2: 4540.75
[330

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error, r2_score
import lightgbm as lgb
from sklearn.model_selection import GridSearchCV

# Step 1: Load and preprocess the dataset
data = pd.read_csv('/content/merged_data.csv')

# Convert 'Date of Travel' to datetime and extract components
data['Date of Travel'] = pd.to_datetime(data['Date of Travel'])
data['Travel_Year'] = data['Date of Travel'].dt.year
data['Travel_Month'] = data['Date of Travel'].dt.month
data['Travel_Day'] = data['Date of Travel'].dt.day
data = data.drop(columns=['Date of Travel'])

# Encode categorical columns
categorical_cols = ['Company', 'City', 'Payment_Mode', 'Gender', 'Month']
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col])
    label_encoders[col] = le

# Feature engineering: Add interaction term
data['Distance_Company'] = data['Distance Travelled(KM)'] * data['Company']

# Prepare features (X) and target (y)
exclude_cols = ['Profit', 'Price Charged', 'Cost of Trip', 'Income (USD/Month)', 'Age', 'Year', 'Month']
X = data.drop(columns=exclude_cols)
y = data['Profit']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Update categorical columns
categorical_cols = ['Company', 'City', 'Payment_Mode', 'Gender']

# Step 2: Hyperparameter tuning with GridSearchCV
print("\nPerforming Hyperparameter Tuning with GridSearchCV...")
gbm = lgb.LGBMRegressor(
    objective='huber',  # Robust loss function
    boosting_type='gbdt',
    feature_fraction=0.8,
    bagging_fraction=0.8,
    bagging_freq=5,
    verbose=-1,
    n_jobs=-1,
    lambda_l1=0.1,
    lambda_l2=0.1
)

param_grid = {
    'num_leaves': [31, 40, 50],
    'learning_rate': [0.03, 0.05],
    'min_child_samples': [20, 30],
    'n_estimators': [1000]
}

grid_search = GridSearchCV(
    estimator=gbm,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=3,
    verbose=1,
    n_jobs=-1
)

grid_search.fit(X_train, y_train)
best_params = grid_search.best_params_
print("\nBest Hyperparameters from Grid Search:")
print(best_params)
print(f"Best MSE (negative): {grid_search.best_score_:.2f}")

# Step 3: Train the final model with best parameters
final_params = {
    'objective': 'huber',
    'boosting_type': 'gbdt',
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': -1,
    'n_jobs': -1,
    'lambda_l1': 0.1,
    'lambda_l2': 0.1,
    **best_params
}

train_data = lgb.Dataset(X_train, label=y_train, categorical_feature=categorical_cols, free_raw_data=False)
test_data = lgb.Dataset(X_test, label=y_test, reference=train_data, categorical_feature=categorical_cols, free_raw_data=False)

print("\nTraining Final LightGBM Model with Best Parameters...")
final_gbm = lgb.train(
    final_params,
    train_data,
    num_boost_round=best_params['n_estimators'],
    valid_sets=[test_data],
    callbacks=[lgb.early_stopping(stopping_rounds=20), lgb.log_evaluation(period=10)]
)

# Step 4: Evaluate the model
y_pred = final_gbm.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mse)

# Calculate tolerance-based accuracy
tolerance = 50
absolute_errors = np.abs(y_pred - y_test)
accuracy_within_tolerance = np.mean(absolute_errors <= tolerance) * 100

print("\nFinal LightGBM Performance on Test Data:")
print(f"Mean Squared Error: {mse:.2f}")
print(f"R² Score: {r2:.2f}")
print(f"Root Mean Squared Error (per trip): ${rmse:.2f}")
print(f"Accuracy (within ±$50 tolerance): {accuracy_within_tolerance:.2f}%")

# Step 5: Feature importance
feature_importance = pd.DataFrame({
    'Feature': X.columns,
    'Importance': final_gbm.feature_importance(importance_type='gain')
}).sort_values(by='Importance', ascending=False)
print("\nFinal LightGBM Feature Importance:")
print(feature_importance)

# Step 6: Function to generate synthetic data with 2% inflation
def generate_future_data(base_data, year, month, num_trips):
    historical_month_data = base_data[base_data['Travel_Month'] == month].copy()
    if historical_month_data.empty:
        historical_month_data = base_data.copy()

    future_data = historical_month_data.sample(n=num_trips, replace=True, random_state=42).copy()
    future_data['Travel_Year'] = year
    future_data['Travel_Month'] = month
    future_data['Travel_Day'] = np.random.randint(1, 31, size=num_trips)
    years_diff = year - 2018
    future_data['Price Charged'] *= (1 + 0.02) ** years_diff  # 2% inflation
    future_data['Cost of Trip'] *= (1 + 0.02) ** years_diff   # 2% inflation
    future_data['Distance_Company'] = future_data['Distance Travelled(KM)'] * future_data['Company']
    future_X = future_data.drop(columns=exclude_cols)
    return future_X, future_data

avg_trips_per_month = len(data) // (data['Travel_Year'].nunique() * 12)
print(f"\nAverage Trips per Month: {avg_trips_per_month}")

while True:
    try:
        user_year = int(input("Enter the year for profit prediction (e.g., 2025): "))
        user_month = int(input("Enter the month for profit prediction (1-12): "))
        if 1 <= user_month <= 12 and user_year >= 2019:
            break
        else:
            print("Invalid input. Month must be 1-12, and year must be 2019 or later.")
    except ValueError:
        print("Please enter valid numeric values.")

# Generate future data and predict profits
future_X, future_data = generate_future_data(data, user_year, user_month, avg_trips_per_month)
future_pred_profits = final_gbm.predict(future_X)

# Total profit with confidence interval
total_future_profit = future_pred_profits.sum()
total_error = rmse * avg_trips_per_month
print(f"\nPredicted Total Company Profit for {user_month}/{user_year}: ${total_future_profit:,.2f}")
print(f"Estimated Range: ${total_future_profit - total_error:,.2f}–${total_future_profit + total_error:,.2f}")
print(f"(Note: Range based on RMSE ±${rmse:.2f} per trip across {avg_trips_per_month} trips)")

# Breakdown by company
future_data['Predicted_Profit'] = future_pred_profits
future_data['Company'] = label_encoders['Company'].inverse_transform(future_data['Company'])
profit_by_company = future_data.groupby('Company')['Predicted_Profit'].sum()
pink_cab_profit = profit_by_company.get('Pink Cab', 0)
yellow_cab_profit = profit_by_company.get('Yellow Cab', 0)
pink_error = total_error * (pink_cab_profit / total_future_profit) if total_future_profit != 0 else 0
yellow_error = total_error * (yellow_cab_profit / total_future_profit) if total_future_profit != 0 else 0

print(f"\nProfit Breakdown by Company for {user_month}/{user_year}:")
print(f"Pink Cab: ${pink_cab_profit:,.2f} (Range: ${pink_cab_profit - pink_error:,.2f}–${pink_cab_profit + pink_error:,.2f})")
print(f"Yellow Cab: ${yellow_cab_profit:,.2f} (Range: ${yellow_cab_profit - yellow_error:,.2f}–${yellow_cab_profit + yellow_error:,.2f})")
print(f"Total (Pink Cab + Yellow Cab): ${(pink_cab_profit + yellow_cab_profit):,.2f}")


Performing Hyperparameter Tuning with GridSearchCV...
Fitting 3 folds for each of 12 candidates, totalling 36 fits

Best Hyperparameters from Grid Search:
{'learning_rate': 0.05, 'min_child_samples': 30, 'n_estimators': 1000, 'num_leaves': 50}
Best MSE (negative): -19762.69

Training Final LightGBM Model with Best Parameters...




Training until validation scores don't improve for 20 rounds
[10]	valid_0's huber: 105.504
[20]	valid_0's huber: 105.313
[30]	valid_0's huber: 105.096
[40]	valid_0's huber: 104.868
[50]	valid_0's huber: 104.653
[60]	valid_0's huber: 104.419
[70]	valid_0's huber: 104.195
[80]	valid_0's huber: 103.98
[90]	valid_0's huber: 103.745
[100]	valid_0's huber: 103.521
[110]	valid_0's huber: 103.299
[120]	valid_0's huber: 103.075
[130]	valid_0's huber: 102.871
[140]	valid_0's huber: 102.649
[150]	valid_0's huber: 102.446
[160]	valid_0's huber: 102.263
[170]	valid_0's huber: 102.041
[180]	valid_0's huber: 101.82
[190]	valid_0's huber: 101.608
[200]	valid_0's huber: 101.395
[210]	valid_0's huber: 101.156
[220]	valid_0's huber: 100.967
[230]	valid_0's huber: 100.778
[240]	valid_0's huber: 100.551
[250]	valid_0's huber: 100.326
[260]	valid_0's huber: 100.095
[270]	valid_0's huber: 99.8773
[280]	valid_0's huber: 99.6794
[290]	valid_0's huber: 99.4461
[300]	valid_0's huber: 99.22
[310]	valid_0's huber: