In [9]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score

# Load the dataset
data = pd.read_csv('IEA-EV-dataEV_salesCarsHistorical.csv')

df=data.drop(['category','parameter','mode','unit'], axis=1)

# Preprocess the data
label_encoder = LabelEncoder()
df['region'] = label_encoder.fit_transform(df['region'])
df['powertrain'] = label_encoder.fit_transform(df['powertrain'])

# Split the data into features and target
X = df.drop('value', axis=1)
y = df['value']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)



In [26]:
X

Unnamed: 0,region,powertrain,year
0,0,0,2011
1,0,0,2012
2,0,1,2012
3,0,1,2013
4,0,0,2013
...,...,...,...
829,35,1,2020
830,35,1,2021
831,35,0,2021
832,35,0,2022


In [10]:
# Define a pipeline with preprocessing steps and the XGBoost model
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('model', XGBRegressor())
])

# Define the hyperparameters to tune
hyperparameters = {
    'model__n_estimators': [100, 200, 300],
    'model__learning_rate': [0.01, 0.1, 0.2],
    'model__max_depth': [3, 4, 5],
    'model__subsample': [0.8, 0.9, 1.0],
    'model__reg_alpha': [0, 0.1, 0.5],
    'model__reg_lambda': [0, 0.1, 0.5]
}



In [11]:
# Perform grid search cross-validation to find the best hyperparameters
grid_search = GridSearchCV(pipeline, hyperparameters, scoring='neg_mean_squared_error', cv=5)
grid_search.fit(X_train, y_train)

# Get the best model and its hyperparameters
best_model = grid_search.best_estimator_
best_params = grid_search.best_params_

# Train the best model on the entire training data
best_model.fit(X_train, y_train)



Pipeline(steps=[('scaler', StandardScaler()),
                ('model',
                 XGBRegressor(base_score=None, booster=None, callbacks=None,
                              colsample_bylevel=None, colsample_bynode=None,
                              colsample_bytree=None, early_stopping_rounds=None,
                              enable_categorical=False, eval_metric=None,
                              feature_types=None, gamma=None, gpu_id=None,
                              grow_policy=None, importance_type=None,
                              interaction_constraints=None, learning_rate=0.2,
                              max_bin=None, max_cat_threshold=None,
                              max_cat_to_onehot=None, max_delta_step=None,
                              max_depth=3, max_leaves=None,
                              min_child_weight=None, missing=nan,
                              monotone_constraints=None, n_estimators=300,
                              n_jobs=None, num_para

In [12]:
# Evaluate the model on the testing data
y_pred = best_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print('Mean Squared Error:', mse)



Mean Squared Error: 14585337694.328066


In [13]:
# Extract the latest year from the dataset
latest_year = df['year'].max()

# Create future data for prediction
future_data = pd.DataFrame({
    'year': [latest_year + 1, latest_year + 2, latest_year + 3],
    'region': [df['region'].unique()[0]] * 3,
    'powertrain': [df['powertrain'].unique()[0]] * 3
})



In [21]:
# Preprocess the future data
future_data['region'] = label_encoder.transform(future_data['region'])
future_data['powertrain'] = label_encoder.transform(future_data['powertrain'])



In [22]:
# Perform prediction on the future data
future_predictions = best_model.predict(future_data)

# Create a DataFrame to store the future predictions
future_dataset = pd.DataFrame({
    'year': future_data['year'],
    'region': label_encoder.inverse_transform(future_data['region']),
    'powertrain': label_encoder.inverse_transform(future_data['powertrain']),
    'value': future_predictions
})

print(future_dataset)

   year  region  powertrain        value
0  2023       0           0  8818.576172
1  2024       0           0  8818.576172
2  2025       0           0  8818.576172


Feature names must be in the same order as they were in fit.



In [24]:
future_data

Unnamed: 0,year,region,powertrain
0,2023,0,0
1,2024,0,0
2,2025,0,0
