In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error, r2_score
import lightgbm as lgb
from sklearn.model_selection import RandomizedSearchCV

# Step 1: Load and preprocess the dataset
data = pd.read_csv('/content/merged_data.csv')

# Convert 'Date of Travel' to datetime and split it into numeric
# year/month/day features; the raw datetime column is then dropped so only
# the numeric components remain.
data['Date of Travel'] = pd.to_datetime(data['Date of Travel'])
data['Travel_Year'] = data['Date of Travel'].dt.year
data['Travel_Month'] = data['Date of Travel'].dt.month
data['Travel_Day'] = data['Date of Travel'].dt.day
data = data.drop(columns=['Date of Travel'])

# Label-encode categorical columns. The fitted encoders are stored per column
# so the integer codes can be mapped back to the original labels later.
categorical_cols = ['Company', 'City', 'Payment_Mode', 'Gender']
label_encoders = {}
for col in categorical_cols:
    if col in data.columns:  # Ensure column exists before encoding
        le = LabelEncoder()
        data[col] = le.fit_transform(data[col])
        label_encoders[col] = le

# Verify and drop 'Month' if it exists
if 'Month' in data.columns:
    print("Warning: 'Month' column found and will be dropped as it’s redundant with 'Travel_Month'.")
    data = data.drop(columns=['Month'])

# Feature engineering: interaction between the encoded city code and distance
data['City_Distance'] = data['City'] * data['Distance Travelled(KM)']

# Prepare features (X) and target (y).
# 'Price Charged' and 'Cost of Trip' are excluded together with the target
# 'Profit'; the remaining entries are columns deliberately left out of the model.
exclude_cols = ['Profit', 'Price Charged', 'Cost of Trip', 'Income (USD/Month)', 'Age', 'Year', 'Payment_Mode', 'Gender']
X = data.drop(columns=exclude_cols)
y = data['Profit']

# Check data types: every feature must be numeric/boolean (or a pandas
# categorical). Checking only for `object` would let datetime or pandas
# string-dtype columns through, so test for "not numeric" instead and report
# all offending columns at once.
print("\nData types of features:")
print(X.dtypes)
unsupported_cols = [
    col for col in X.columns
    if not (
        pd.api.types.is_numeric_dtype(X[col])
        or isinstance(X[col].dtype, pd.CategoricalDtype)
    )
]
if unsupported_cols:
    dtype_report = ", ".join(f"{col} ({X[col].dtype})" for col in unsupported_cols)
    raise ValueError(
        f"Unsupported non-numeric feature columns: {dtype_report}. "
        "Please encode or drop them."
    )

# Split data into train, validation, and test sets.
# First hold out 20% for test, then 12.5% of the remaining 80% (= 10% of the
# total) for validation.
X_temp, X_test, y_temp, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, test_size=0.125, random_state=42)  # 70% train, 10% val, 20% test

# From here on only these encoded columns are treated as categorical features
categorical_cols = ['Company', 'City']

# Step 2: Hyperparameter tuning with RandomizedSearchCV for speed
print("\nPerforming Hyperparameter Tuning with RandomizedSearchCV...")
gbm = lgb.LGBMRegressor(
    objective='regression',
    boosting_type='gbdt',
    # Row subsampling: LightGBM documents `subsample` as an alias of
    # `bagging_fraction`, so passing both (0.8 and 0.9) gave one setting two
    # conflicting values. Only the main parameter name is kept.
    # NOTE(review): the main name is expected to be the one LightGBM honoured
    # when both were given; verify against the installed version.
    bagging_fraction=0.9,
    feature_fraction=0.85,
    bagging_freq=5,
    verbose=-1,
    n_jobs=-1,
    lambda_l1=0.1,
    lambda_l2=0.1
)

# Candidate values sampled by the randomized search
param_dist = {
    'num_leaves': [30, 40, 50],
    'learning_rate': [0.01, 0.03, 0.05],
    'min_child_samples': [20, 30, 40],
    'n_estimators': [500, 1000]
}

rand_search = RandomizedSearchCV(
    estimator=gbm,
    param_distributions=param_dist,
    n_iter=10,  # Reduced combinations for speed
    # Without an explicit `scoring`, candidates are ranked by the estimator's
    # default score (R^2 for regressors), which contradicts the
    # "MSE (negative)" figure reported below.
    scoring='neg_mean_squared_error',
    cv=5,
    verbose=1,
    n_jobs=-1,
    error_score='raise',  # To debug errors
    random_state=42
)

rand_search.fit(X_train, y_train)
best_params = rand_search.best_params_
print("\nBest Hyperparameters from Randomized Search:")
print(best_params)
# best_score_ is the mean cross-validated negative MSE of the best candidate
print(f"Best MSE (negative): {rand_search.best_score_:.2f}")

# Step 3: Train the final model with best parameters and validation
# `n_estimators` is kept out of the params dict: it is a LightGBM alias for the
# number of boosting iterations, which is already given to lgb.train through
# `num_boost_round` below. `subsample` is likewise omitted because it is an
# alias of `bagging_fraction` and the two carried conflicting values.
tuned_params = {name: value for name, value in best_params.items() if name != 'n_estimators'}
final_params = {
    'objective': 'regression',
    'boosting_type': 'gbdt',
    'bagging_fraction': 0.9,
    'feature_fraction': 0.85,
    'bagging_freq': 5,
    'verbose': -1,
    'n_jobs': -1,
    'lambda_l1': 0.1,
    'lambda_l2': 0.1,
    **tuned_params
}

# Validation and test datasets reference the training dataset so they are
# built consistently with it. `test_data` is not passed to the training call.
train_data = lgb.Dataset(X_train, label=y_train, categorical_feature=categorical_cols, free_raw_data=False)
val_data = lgb.Dataset(X_val, label=y_val, reference=train_data, categorical_feature=categorical_cols, free_raw_data=False)
test_data = lgb.Dataset(X_test, label=y_test, reference=train_data, categorical_feature=categorical_cols, free_raw_data=False)

print("\nTraining Final LightGBM Model with Best Parameters...")
final_gbm = lgb.train(
    final_params,
    train_data,
    # Upper bound on boosting rounds; early stopping may end training sooner
    num_boost_round=best_params['n_estimators'],
    valid_sets=[val_data],
    # Stop after 20 rounds without improvement on the validation set and log
    # the evaluation every 10 rounds.
    callbacks=[lgb.early_stopping(stopping_rounds=20), lgb.log_evaluation(period=10)]
)

# Step 4: Score the trained booster on the held-out test split
y_pred = final_gbm.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

# Share of test trips whose absolute prediction error is within the tolerance
tolerance = 50
absolute_errors = np.abs(y_pred - y_test)
within_tolerance = absolute_errors <= tolerance
accuracy_within_tolerance = np.mean(within_tolerance) * 100

for report_line in (
    "\nFinal LightGBM Performance on Test Data:",
    f"Mean Squared Error: {mse:.2f}",
    f"R² Score: {r2:.2f}",
    f"Root Mean Squared Error (per trip): ${rmse:.2f}",
    f"Accuracy (within ±$50 tolerance): {accuracy_within_tolerance:.2f}%",
):
    print(report_line)

# Step 5: Gain-based feature importance, most important feature first
gain_by_feature = final_gbm.feature_importance(importance_type='gain')
feature_importance = pd.DataFrame({'Feature': X.columns, 'Importance': gain_by_feature})
feature_importance = feature_importance.sort_values(by='Importance', ascending=False)
print("\nFinal LightGBM Feature Importance:")
print(feature_importance)

# Step 6: Function to generate synthetic data with 2% inflation
def generate_future_data(base_data, year, month, num_trips, exclude=None,
                         base_year=2018, inflation_rate=0.02, random_state=42):
    """Build a synthetic month of trips by resampling historical rows.

    Rows are drawn with replacement from the historical trips of the same
    calendar month (or from all rows when that month has no history), then
    re-dated to ``year``/``month`` with a random valid day of that month.

    Args:
        base_data: Historical DataFrame. Must contain 'Travel_Month', 'City',
            'Distance Travelled(KM)', 'Price Charged' and 'Cost of Trip'.
        year: Target calendar year.
        month: Target calendar month (1-12).
        num_trips: Number of synthetic trips to generate.
        exclude: Columns to drop when building the feature frame. Defaults to
            the module-level ``exclude_cols`` list.
        base_year: Year the historical prices are expressed in.
        inflation_rate: Annual rate compounded from ``base_year`` to ``year``.
        random_state: Seed used for both the row sample and the day draw, so
            repeated calls return identical data.

    Returns:
        Tuple ``(future_X, future_data)``: the model feature frame and the full
        synthetic frame it was derived from.

    Raises:
        ValueError: If ``month`` is not a valid calendar month.

    Note:
        Inflation is applied to 'Price Charged' and 'Cost of Trip' in
        ``future_data`` only. When those columns are listed in ``exclude``
        (as they are in the module-level ``exclude_cols``) they are absent
        from ``future_X`` and therefore do not affect model predictions.
    """
    import calendar  # local import keeps the block self-contained

    if exclude is None:
        exclude = exclude_cols  # module-level list of non-feature columns

    historical_month_data = base_data[base_data['Travel_Month'] == month]
    if historical_month_data.empty:
        # No history for this month: fall back to sampling from every row
        historical_month_data = base_data

    future_data = historical_month_data.sample(
        n=num_trips, replace=True, random_state=random_state
    ).copy()
    future_data['Travel_Year'] = year
    future_data['Travel_Month'] = month

    # Draw days from the real length of the target month. The previous
    # randint(1, 31) excluded day 31 and allowed 29/30 in February.
    days_in_month = calendar.monthrange(int(year), int(month))[1]
    rng = np.random.default_rng(random_state)
    future_data['Travel_Day'] = rng.integers(1, days_in_month + 1, size=num_trips)

    # Compound inflation from the base year to the target year
    growth = (1 + inflation_rate) ** (year - base_year)
    future_data['Price Charged'] = future_data['Price Charged'] * growth
    future_data['Cost of Trip'] = future_data['Cost of Trip'] * growth

    # Recompute the interaction feature for the sampled rows
    future_data['City_Distance'] = future_data['City'] * future_data['Distance Travelled(KM)']
    future_X = future_data.drop(columns=exclude)
    return future_X, future_data

# Average trips per month, using the number of distinct (year, month) pairs
# actually present in the data. Multiplying the year count by 12 understated
# the average whenever a year was only partially covered.
months_with_data = len(data[['Travel_Year', 'Travel_Month']].drop_duplicates())
avg_trips_per_month = len(data) // months_with_data
print(f"\nAverage Trips per Month: {avg_trips_per_month}")

# Prompt until a valid (year, month) pair is entered
while True:
    try:
        user_year = int(input("Enter the year for profit prediction (e.g., 2025): "))
        user_month = int(input("Enter the month for profit prediction (1-12): "))
        if 1 <= user_month <= 12 and user_year >= 2019:
            break
        else:
            print("Invalid input. Month must be 1-12, and year must be 2019 or later.")
    except ValueError:
        print("Please enter valid numeric values.")

# Generate future data and predict profits.
# NOTE: future_X omits 'Price Charged' and 'Cost of Trip' (both are listed in
# exclude_cols), so the inflation applied inside generate_future_data does not
# influence these predictions.
future_X, future_data = generate_future_data(data, user_year, user_month, avg_trips_per_month)
future_pred_profits = final_gbm.predict(future_X)

# Total profit with an error range of +/- (RMSE x number of trips). This treats
# every trip as being off by the full RMSE in the same direction, so it is a
# rough outer bound rather than a statistical confidence interval.
total_future_profit = future_pred_profits.sum()
total_error = rmse * avg_trips_per_month
print(f"\nPredicted Total Company Profit for {user_month}/{user_year}: ${total_future_profit:,.2f}")
print(f"Estimated Range: ${total_future_profit - total_error:,.2f}–${total_future_profit + total_error:,.2f}")
print(f"(Note: Range based on RMSE ±${rmse:.2f} per trip across {avg_trips_per_month} trips)")

# Breakdown by company: decode the company labels and sum predictions per company
future_data['Predicted_Profit'] = future_pred_profits
future_data['Company'] = label_encoders['Company'].inverse_transform(future_data['Company'])
profit_by_company = future_data.groupby('Company')['Predicted_Profit'].sum()
pink_cab_profit = profit_by_company.get('Pink Cab', 0)
yellow_cab_profit = profit_by_company.get('Yellow Cab', 0)

# Split the total error in proportion to each company's share of the total.
# The share's magnitude is used so the error stays non-negative; otherwise a
# company whose profit has the opposite sign to the total would print an
# inverted range (lower bound above upper bound).
if total_future_profit != 0:
    pink_error = total_error * abs(pink_cab_profit / total_future_profit)
    yellow_error = total_error * abs(yellow_cab_profit / total_future_profit)
else:
    pink_error = 0
    yellow_error = 0

print(f"\nProfit Breakdown by Company for {user_month}/{user_year}:")
print(f"Pink Cab: ${pink_cab_profit:,.2f} (Range: ${pink_cab_profit - pink_error:,.2f}–${pink_cab_profit + pink_error:,.2f})")
print(f"Yellow Cab: ${yellow_cab_profit:,.2f} (Range: ${yellow_cab_profit - yellow_error:,.2f}–${yellow_cab_profit + yellow_error:,.2f})")
print(f"Total (Pink Cab + Yellow Cab): ${(pink_cab_profit + yellow_cab_profit):,.2f}")