In [2]:
import pandas as pd
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error
import numpy as np

# Load the dataset
df = pd.read_csv('IEA-EV-dataEV_salesCarsHistorical.csv')

data = df.drop(['category', 'parameter', 'mode', 'unit'], axis=1)
encoded_data = pd.get_dummies(data, columns=['region', 'powertrain'])

feat = encoded_data.drop('value', axis=1)
y = encoded_data['value']

X_train, X_test, y_train, y_test = train_test_split(feat, y, test_size=0.2, random_state=42)

# Define the parameter grid for hyperparameter tuning
param_grid = {
    'C': [0.1, 1, 10],
    'epsilon': [0.01, 0.1, 0.2],
    'kernel': ['linear', 'rbf']
}

# Create an SVR model
model = SVR()

# Perform grid search to find the best hyperparameters
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, scoring='neg_mean_squared_error', cv=5)
grid_search.fit(X_train, y_train)

# Get the best SVR model from the grid search
best_model = grid_search.best_estimator_

# Predict the values for future years
future_predictions = best_model.predict(X_test)

# Calculate the Mean Squared Error (MSE)
mse = mean_squared_error(y_test, future_predictions)
print("Mean Squared Error (MSE):", mse)

# Extract the latest year from the dataset
latest_year = data['year'].max()

# Create a new DataFrame with future year, region, and powertrain
future_data = pd.DataFrame({
    'year': [latest_year + 1, latest_year + 2, latest_year + 3],
    'region': [data['region'].unique()[0]] * 3,
    'powertrain': [data['powertrain'].unique()[0]] * 3
})

# Perform one-hot encoding on the future data using the original encoder
encoded_future_data = pd.get_dummies(future_data, columns=['region', 'powertrain'])

# Remove the 'value' column from the encoded future data
encoded_future_data = encoded_future_data.reindex(columns=encoded_data.columns, fill_value=0)
encoded_future_data = encoded_future_data.drop('value', axis=1)
# Predict the values for future years
future_predictions = best_model.predict(encoded_future_data)

# Create a DataFrame to store the future predictions
future_dataset = pd.DataFrame({
    'year': future_data['year'],
    'region': future_data['region'],
    'powertrain': future_data['powertrain'],
    'value': future_predictions
})

# Print the future dataset
print(future_dataset)


Mean Squared Error (MSE): 44699932382.76291
   year     region powertrain         value
0  2023  Australia        BEV  12269.695882
1  2024  Australia        BEV  13343.225294
2  2025  Australia        BEV  14416.754706
