In [2]:
# üì¶ Essential Imports
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score
import joblib  # ‚úÖ For efficient saving/loading of models and transformers

# --- Configuration ---
DATA_PATH = "traffic volume.csv"
MODEL_SAVE_PATH = 'traffic_model.joblib'
ENCODERS_SAVE_PATH = 'encoders.joblib'
SCALER_SAVE_PATH = 'scaler.joblib'
TEST_SIZE_RATIO = 0.2
RANDOM_STATE_SEED = 42

print("--- Starting Traffic Volume Prediction Script ---")

# üì• Step 1: Load Data
try:
    data = pd.read_csv(DATA_PATH)
    print(f"‚úÖ Data loaded successfully from '{DATA_PATH}'. Shape: {data.shape}")
    print("First 5 rows of the dataset:")
    print(data.head())
except FileNotFoundError:
    print(f"‚ùå Error: The file '{DATA_PATH}' was not found. Please ensure it's in the correct directory.")
    exit() # Exit if data cannot be loaded

# üßπ Step 2: Clean & Preprocess Missing Values
print("\n--- Handling Missing Values ---")
# Fill numerical missing values with their mean
numerical_cols = ['temp', 'rain', 'snow']
for col in numerical_cols:
    if data[col].isnull().any():
        data[col].fillna(data[col].mean(), inplace=True)
        print(f"  - Filled missing values in '{col}' with its mean.")

# Fill categorical missing values in 'weather' with the most frequent value (mode)
if data['weather'].isnull().any():
    mode_weather = data['weather'].mode()[0]
    data['weather'].fillna(mode_weather, inplace=True)
    print(f"  - Filled missing values in 'weather' with its mode: '{mode_weather}'.")

print("\nMissing values after preprocessing:")
print(data.isnull().sum())

# üïê Step 3: Feature Engineering - Extract Date & Time Components
print("\n--- Feature Engineering: Extracting Date & Time Components ---")
# Split 'date' into day, month, year
data[['day', 'month', 'year']] = data['date'].str.split('-', expand=True)
# Split 'Time' into hours, minutes, seconds
data[['hours', 'minutes', 'seconds']] = data['Time'].str.split(':', expand=True)

# Drop original 'date' and 'Time' columns as they've been transformed
data.drop(columns=['date', 'Time'], inplace=True)

# Convert newly created time columns to integer type
cols_to_convert_to_int = ['day', 'month', 'year', 'hours', 'minutes', 'seconds']
for col in cols_to_convert_to_int:
    data[col] = pd.to_numeric(data[col], errors='coerce') # Coerce errors to NaN then fill
    if data[col].isnull().any():
        # Handle cases where conversion might result in NaN (e.g., if original string wasn't a valid number)
        data[col].fillna(data[col].mode()[0], inplace=True) # Fill with mode for safety
        print(f"  - Converted '{col}' to int and handled potential NaNs from conversion.")
    else:
        print(f"  - Converted '{col}' to int.")

print("\nData after feature engineering (first 5 rows):")
print(data.head())

# üè∑Ô∏è Step 4: Encode Categorical Features
print("\n--- Encoding Categorical Features ---")
# Initialize LabelEncoders for 'holiday' and 'weather'
le_holiday = LabelEncoder()
le_weather = LabelEncoder()

# Apply Label Encoding
data['holiday_encoded'] = le_holiday.fit_transform(data['holiday'])
data['weather_encoded'] = le_weather.fit_transform(data['weather'])

# Store encoders in a dictionary for saving
encoders = {
    'holiday_encoder': le_holiday,
    'weather_encoder': le_weather,
    'holiday_original_map': dict(zip(le_holiday.classes_, le_holiday.transform(le_holiday.classes_))),
    'weather_original_map': dict(zip(le_weather.classes_, le_weather.transform(le_weather.classes_)))
}
print("  - 'holiday' and 'weather' columns have been label encoded.")
print(f"  - Holiday mapping: {encoders['holiday_original_map']}")
print(f"  - Weather mapping: {encoders['weather_original_map']}")

# Drop original categorical columns
data.drop(columns=['holiday', 'weather'], inplace=True)
print("  - Original 'holiday' and 'weather' columns dropped.")

# üß™ Step 5: Define Features (X) and Target (y)
y = data['traffic_volume']
X = data.drop('traffic_volume', axis=1) # Drop the original target column

print(f"\n‚úÖ Features (X) shape: {X.shape}, Target (y) shape: {y.shape}")
print("Features used for training:")
print(X.columns.tolist())

# ‚öñÔ∏è Step 6: Scale Numerical Features
print("\n--- Scaling Features ---")
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X) # Fit and transform X
X_scaled_df = pd.DataFrame(X_scaled, columns=X.columns) # Convert back to DataFrame for consistency

print("  - Features scaled using StandardScaler.")
print("Scaled features (first 5 rows):")
print(X_scaled_df.head())

# üîÄ Step 7: Train-Test Split
print(f"\n--- Splitting Data into Training and Testing Sets ({TEST_SIZE_RATIO*100}% test size) ---")
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled_df, y, test_size=TEST_SIZE_RATIO, random_state=RANDOM_STATE_SEED
)
print(f"  - X_train shape: {X_train.shape}, y_train shape: {y_train.shape}")
print(f"  - X_test shape: {X_test.shape}, y_test shape: {y_test.shape}")

# ‚úÖ Step 8: Train the Model (Using GradientBoostingRegressor as in your lightweight model section)
print("\n--- Training Gradient Boosting Regressor Model ---")
model = GradientBoostingRegressor(n_estimators=100, max_depth=3, random_state=RANDOM_STATE_SEED)
model.fit(X_train, y_train)
print("‚úÖ Model training complete.")

# üìä Step 9: Evaluate the Model
print("\n--- Evaluating Model Performance ---")
y_pred = model.predict(X_test)
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print(f"  - R2 Score (Test Set): {r2:.4f}")
print(f"  - Mean Squared Error (Test Set): {mse:.4f}")
print(f"  - Root Mean Squared Error (Test Set): {rmse:.4f}")

# --- Optional: Model Comparison (as per your original code) ---
# If you want to compare multiple models, uncomment and adapt this section:
# print("\n--- Optional: Comparing Multiple Models (using original code logic) ---")
# from sklearn import linear_model, tree, ensemble, svm
# import xgboost
# lin_reg = linear_model.LinearRegression()
# Dtree = tree.DecisionTreeRegressor(random_state=RANDOM_STATE_SEED)
# Rand = ensemble.RandomForestRegressor(random_state=RANDOM_STATE_SEED)
# svr = svm.SVR()
# XGB = xgboost.XGBRegressor(random_state=RANDOM_STATE_SEED)

# models_list = {'Linear': lin_reg, 'Decision Tree': Dtree, 'Random Forest': Rand, 'SVR': svr, 'XGBoost': XGB}
# for name, algo in models_list.items():
#     print(f"  - Training {name}...")
#     algo.fit(X_train, y_train)
#     preds = algo.predict(X_test)
#     r2_val = r2_score(y_test, preds)
#     mse_val = mean_squared_error(y_test, preds)
#     print(f"    - {name} R2 Score: {r2_val:.4f}, MSE: {mse_val:.4f}")

# üíæ Step 10: Save Trained Model, Scaler, and Encoders
print("\n--- Saving Trained Model and Preprocessing Components ---")
joblib.dump(model, MODEL_SAVE_PATH, compress=3)
joblib.dump(scaler, SCALER_SAVE_PATH)
joblib.dump(encoders, ENCODERS_SAVE_PATH) # Save the dictionary of encoders

print(f"‚úÖ Model saved to '{MODEL_SAVE_PATH}'")
print(f"‚úÖ Scaler saved to '{SCALER_SAVE_PATH}'")
print(f"‚úÖ Encoders saved to '{ENCODERS_SAVE_PATH}'")


# --- Demonstration of Loading and Predicting ---
print("\n--- Demonstration: Loading Saved Components and Making a Prediction ---")

def predict_traffic_volume(
    input_data: dict,
    model_path: str = MODEL_SAVE_PATH,
    scaler_path: str = SCALER_SAVE_PATH,
    encoders_path: str = ENCODERS_SAVE_PATH
) -> float:
    """
    Loads the saved model, scaler, and encoders, and predicts traffic volume
    for a single new input data point.

    Args:
        input_data (dict): A dictionary containing input features, e.g.:
                           {
                               'temp': 20.5, 'rain': 0.0, 'snow': 0.0,
                               'day': 15, 'month': 6, 'year': 2024,
                               'hours': 10, 'minutes': 30, 'seconds': 0,
                               'holiday': 'None', 'weather': 'Clouds'
                           }
        model_path (str): Path to the saved model.
        scaler_path (str): Path to the saved scaler.
        encoders_path (str): Path to the saved encoders.

    Returns:
        float: Predicted traffic volume.
    """
    try:
        # Load components
        loaded_model = joblib.load(model_path)
        loaded_scaler = joblib.load(scaler_path)
        loaded_encoders = joblib.load(encoders_path)
        print("  - Model, scaler, and encoders loaded successfully for prediction.")

        # Convert input_data to DataFrame
        input_df = pd.DataFrame([input_data])

        # Apply Label Encoding using the loaded encoders
        input_df['holiday_encoded'] = loaded_encoders['holiday_encoder'].transform(input_df['holiday'])
        input_df['weather_encoded'] = loaded_encoders['weather_encoder'].transform(input_df['weather'])
        input_df.drop(columns=['holiday', 'weather'], inplace=True) # Drop original columns

        # Ensure columns are in the same order as during training
        # Get column names from the scaler (or from X.columns directly if you save them explicitly)
        # Assuming X_scaled_df had the correct order
        # For a robust solution, you might save X.columns during training.
        # For now, let's derive it from the scaler's features_in_ or reconstruct
        # For simplicity, let's assume the columns are in the same order as X_train
        # from the training script. This is a common point of failure if not handled precisely.
        # A safer way: `loaded_model.feature_names_in_` if available or save `X.columns`
        # during training.

        # Reorder columns to match the training data's feature order
        # This is a crucial step to avoid prediction errors due to misaligned features
        training_features_order = X.columns.tolist() # X from training script has the original order before scaling

        # Create a DataFrame for the single input, ensuring column order matches training
        processed_input_df = input_df[['temp', 'rain', 'snow', 'day', 'month', 'year',
                                       'hours', 'minutes', 'seconds',
                                       'holiday_encoded', 'weather_encoded']]

        # Scale the input data using the loaded scaler
        scaled_input = loaded_scaler.transform(processed_input_df)

        # Make prediction
        prediction = loaded_model.predict(scaled_input)
        return prediction[0]

    except Exception as e:
        print(f"‚ùå An error occurred during prediction: {e}")
        return -1 # Return a sentinel value for error

# Example usage of the prediction function
example_new_data = {
    'temp': 22.5,
    'rain': 0.1,
    'snow': 0.0,
    'day': 20,
    'month': 7,
    'year': 2025,
    'hours': 17,
    'minutes': 45,
    'seconds': 0,
    'holiday': 'None',
    'weather': 'Clear'
}

print("\nPredicting for example new data:")
print(example_new_data)
predicted_volume = predict_traffic_volume(example_new_data)
if predicted_volume != -1:
    print(f"Predicted Traffic Volume: {predicted_volume:.2f}")

print("\n--- Script Finished ---")


--- Starting Traffic Volume Prediction Script ---
‚úÖ Data loaded successfully from 'traffic volume.csv'. Shape: (48204, 8)
First 5 rows of the dataset:
  holiday    temp  rain  snow weather        date      Time  traffic_volume
0     NaN  288.28   0.0   0.0  Clouds  02-10-2012  09:00:00            5545
1     NaN  289.36   0.0   0.0  Clouds  02-10-2012  10:00:00            4516
2     NaN  289.58   0.0   0.0  Clouds  02-10-2012  11:00:00            4767
3     NaN  290.13   0.0   0.0  Clouds  02-10-2012  12:00:00            5026
4     NaN  291.14   0.0   0.0  Clouds  02-10-2012  13:00:00            4918

--- Handling Missing Values ---
  - Filled missing values in 'temp' with its mean.
  - Filled missing values in 'rain' with its mean.
  - Filled missing values in 'snow' with its mean.
  - Filled missing values in 'weather' with its mode: 'Clouds'.

Missing values after preprocessing:
holiday           48143
temp                  0
rain                  0
snow                  0
weather 

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data[col].fillna(data[col].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['weather'].fillna(mode_weather, inplace=True)


  - Converted 'month' to int.
  - Converted 'year' to int.
  - Converted 'hours' to int.
  - Converted 'minutes' to int.
  - Converted 'seconds' to int.

Data after feature engineering (first 5 rows):
  holiday    temp  rain  snow weather  traffic_volume  day  month  year  \
0     NaN  288.28   0.0   0.0  Clouds            5545    2     10  2012   
1     NaN  289.36   0.0   0.0  Clouds            4516    2     10  2012   
2     NaN  289.58   0.0   0.0  Clouds            4767    2     10  2012   
3     NaN  290.13   0.0   0.0  Clouds            5026    2     10  2012   
4     NaN  291.14   0.0   0.0  Clouds            4918    2     10  2012   

   hours  minutes  seconds  
0      9        0        0  
1     10        0        0  
2     11        0        0  
3     12        0        0  
4     13        0        0  

--- Encoding Categorical Features ---
  - 'holiday' and 'weather' columns have been label encoded.
  - Holiday mapping: {'Christmas Day': np.int64(0), 'Columbus Day': np.int