## DriveGreen Model 

In [1]:
#Importing the libraries needed for EDA
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np


In [2]:
# Loading the dataset (the path is resolved relative to the current working directory)
DATASET_PATH = "Emissions_Canada.csv"
emissions_data = pd.read_csv(DATASET_PATH)


In [3]:
# Normalise the column headers in one pass: trim leading/trailing whitespace,
# lowercase everything, then turn the remaining internal spaces into underscores.
normalised_columns = (
    emissions_data.columns
    .str.strip()
    .str.lower()
    .str.replace(' ', '_')
)
emissions_data.columns = normalised_columns

# Show the resulting names so later cells can be checked against them
print(list(emissions_data.columns))


['make', 'model', 'vehicle_class', 'engine_size(l)', 'cylinders', 'transmission', 'fuel_type', 'fuel_consumption_city_(l/100_km)', 'fuel_consumption_hwy_(l/100_km)', 'fuel_consumption_comb_(l/100_km)', 'fuel_consumption_comb_(mpg)', 'co2_emissions(g/km)']


In [4]:

# Inspect the observed range of the two numeric predictors.
engine_sizes = emissions_data['engine_size(l)']
cylinder_counts = emissions_data['cylinders']

# Engine size bounds
min_val, max_val = engine_sizes.min(), engine_sizes.max()
# Cylinder count bounds
mine_val, maxe_val = cylinder_counts.min(), cylinder_counts.max()

# One value per line: engine min, engine max, cylinders min, cylinders max
print(min_val, max_val, mine_val, maxe_val, sep="\n")

0.9
8.4
3
16


In [5]:
# Give the target column a plain name without the unit suffix.
# Rebinding to the returned frame (instead of inplace=True) keeps the cell
# free of in-place mutation; re-running it is a no-op once the column is renamed.
emissions_data = emissions_data.rename(
    columns={'co2_emissions(g/km)': 'co2_emissions'}
)


In [6]:
# Columns used for modelling.
# Target: the column the model is trained to predict.
target_variable = 'co2_emissions'

# Predictors: two numeric columns plus a single categorical column.
categorical_feature = 'fuel_type'
numerical_features = ['engine_size(l)', 'cylinders']


In [7]:
# Transforming the data
# Apply log(1 + x) to the numeric predictors and to the target. Anything fitted
# on these values works in log1p space, so np.expm1 is the inverse needed to get
# back to the original units.

# np.log1p on a DataFrame keeps the frame's index and column labels
transformed_numerical_features = np.log1p(emissions_data[numerical_features])

transformed_target_variable = np.log1p(emissions_data[target_variable])


### Model Training and Evaluation

In [9]:
# Import ML tools
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

#Import XGBoost Regressor
from xgboost import XGBRegressor
import joblib, os

The code below trains four different models and includes two additional models for prediction.

In [10]:
# Build the design matrix, train the XGBoost regressor, check it on the
# held-out split, and save everything needed to reproduce predictions.

# Combine log-transformed numerical + categorical (inputs before encoding)
X = pd.concat(
    [transformed_numerical_features, emissions_data[[categorical_feature]]],
    axis=1
)

y = transformed_target_variable.copy()

# One-hot encode the categorical column. The encoder is fitted on all rows so
# the saved column layout covers every category present in the dataset.
encoder = OneHotEncoder(sparse_output=False, handle_unknown="ignore")
encoded_cat = encoder.fit_transform(emissions_data[[categorical_feature]])

encoded_cat_df = pd.DataFrame(
    encoded_cat,
    columns=encoder.get_feature_names_out([categorical_feature]),
    index=X.index
)

# Final training matrix: numeric columns first, then the one-hot columns.
# Built straight from the numeric frame rather than adding the raw categorical
# column to X and dropping it again.
X_encoded = pd.concat(
    [transformed_numerical_features, encoded_cat_df],
    axis=1
)

# Train/test split (80/20, fixed seed so the split is reproducible)
X_train, X_test, y_train, y_test = train_test_split(
    X_encoded, y, test_size=0.2, random_state=42
)

# Train XGBoost
xgb_model = XGBRegressor(
    n_estimators=100,
    learning_rate=0.1,
    random_state=42
)
xgb_model.fit(X_train, y_train)

# Evaluate on the held-out rows before persisting anything.
# The model predicts log1p(target), so R^2 is reported on that scale and the
# RMSE is computed after mapping both sides back with expm1.
y_test_pred = xgb_model.predict(X_test)
test_r2 = xgb_model.score(X_test, y_test)
test_rmse = float(
    np.sqrt(np.mean((np.expm1(y_test) - np.expm1(y_test_pred)) ** 2))
)
print(f"Test R^2 (log1p scale): {test_r2:.4f}")
print(f"Test RMSE (original target units): {test_rmse:.2f}")

# Save artifacts: the model, the fitted encoder, and the exact column order
# the model was trained on (needed to rebuild inputs at prediction time).
os.makedirs("model", exist_ok=True)
joblib.dump(xgb_model, "model/xgboost_model.pkl")
joblib.dump(encoder, "model/encoder.pkl")
joblib.dump(X_encoded.columns.tolist(), "model/feature_names.pkl")

print(" Model, encoder, and feature names saved correctly")

 Model, encoder, and feature names saved correctly
