In [10]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import numpy as np

# Read the EV sales CSV into a DataFrame (path is relative to the working directory)
df = pd.read_csv(filepath_or_buffer='IEA-EV-dataEV_salesCarsHistorical.csv')



In [11]:
# Discard the four descriptor columns; the remaining columns are kept as-is.
dropped_columns = ['category', 'parameter', 'mode', 'unit']
data = df.drop(columns=dropped_columns)
data

Unnamed: 0,region,powertrain,year,value
0,Australia,BEV,2011,49
1,Australia,BEV,2012,170
2,Australia,PHEV,2012,80
3,Australia,PHEV,2013,100
4,Australia,BEV,2013,190
...,...,...,...,...
829,World,PHEV,2020,970000
830,World,PHEV,2021,1900000
831,World,BEV,2021,4600000
832,World,BEV,2022,7300000


In [12]:
# One-hot encode the two categorical columns; all other columns pass through unchanged.
encoded_data = pd.get_dummies(data=data, columns=['region', 'powertrain'])
encoded_data

Unnamed: 0,year,value,region_Australia,region_Austria,region_Belgium,region_Brazil,region_Canada,region_Chile,region_China,region_Denmark,...,region_South Africa,region_Spain,region_Sweden,region_Switzerland,region_Turkiye,region_USA,region_United Kingdom,region_World,powertrain_BEV,powertrain_PHEV
0,2011,49,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1,2012,170,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,2012,80,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,2013,100,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,2013,190,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
829,2020,970000,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,1
830,2021,1900000,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,1
831,2021,4600000,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,1,0
832,2022,7300000,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,1,0


In [20]:
# Separate the regression target ('value') from the predictor columns.
y = encoded_data['value']
feat = encoded_data.drop(columns='value')
feat

Unnamed: 0,year,region_Australia,region_Austria,region_Belgium,region_Brazil,region_Canada,region_Chile,region_China,region_Denmark,region_EU27,...,region_South Africa,region_Spain,region_Sweden,region_Switzerland,region_Turkiye,region_USA,region_United Kingdom,region_World,powertrain_BEV,powertrain_PHEV
0,2011,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1,2012,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,2012,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,2013,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,2013,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
829,2020,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,1
830,2021,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,1
831,2021,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,1,0
832,2022,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,1,0


In [21]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    feat,
    y,
    test_size=0.2,
    random_state=42,
)


In [22]:
# Fit a random forest regressor on the training split.
# The seed is pinned (same value as the train/test split) because the forest is
# built with randomness; without it every re-run of the notebook produces a
# different model, and therefore different predictions and a different error.
model = RandomForestRegressor(random_state=42)
model.fit(X_train, y_train)


RandomForestRegressor()

In [26]:
# Most recent year present in the dataset
latest_year = data['year'].max()

# Build the forecast rows: the three years following the last observed year,
# all for the first region and the first powertrain that appear in `data`.
first_region = data['region'].unique()[0]
first_powertrain = data['powertrain'].unique()[0]
future_years = [latest_year + offset for offset in (1, 2, 3)]
future_data = pd.DataFrame({
    'year': future_years,
    'region': [first_region] * 3,
    'powertrain': [first_powertrain] * 3,
})

# One-hot encode the future rows with a fresh get_dummies call (no fitted
# encoder is involved), then align them to the training feature layout:
# every column of `encoded_data` except the target, with dummy columns that
# do not occur in the future rows filled with 0.
feature_columns = encoded_data.columns.drop('value')
encoded_future_data = pd.get_dummies(
    future_data, columns=['region', 'powertrain']
).reindex(columns=feature_columns, fill_value=0)

# Predict the values for the future years.
# NOTE(review): the saved output shows the same prediction for all three
# years. Presumably the tree ensemble cannot distinguish years beyond the
# range it was trained on -- confirm, and consider a model that can
# extrapolate a trend if real forecasts are needed.
future_predictions = model.predict(encoded_future_data)






In [27]:
# Inspect the encoded feature rows that are passed to model.predict
encoded_future_data

Unnamed: 0,year,region_Australia,region_Austria,region_Belgium,region_Brazil,region_Canada,region_Chile,region_China,region_Denmark,region_EU27,...,region_South Africa,region_Spain,region_Sweden,region_Switzerland,region_Turkiye,region_USA,region_United Kingdom,region_World,powertrain_BEV,powertrain_PHEV
0,2023,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1,2024,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,2025,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [28]:
# Show the raw array of predictions, one entry per future row
future_predictions

array([24909.4, 24909.4, 24909.4])

In [36]:
# Rebuild the three forecast rows (years after `latest_year`) for the first
# region and first powertrain found in `data`.
future_data = pd.DataFrame({
    'year': [latest_year + step for step in range(1, 4)],
    'region': [data['region'].unique()[0]] * 3,
    'powertrain': [data['powertrain'].unique()[0]] * 3
})

# One-hot encode the forecast rows with get_dummies, then align them to the
# training feature layout: all columns of `encoded_data` except the target,
# with dummy columns missing from the forecast rows filled with 0.
encoded_future_data = pd.get_dummies(future_data, columns=['region', 'powertrain'])
encoded_future_data = encoded_future_data.reindex(
    columns=encoded_data.columns.drop('value'), fill_value=0
)

# Predict the values for the future years
future_predictions = model.predict(encoded_future_data)

# Human-readable result table: the un-encoded forecast rows plus the
# predicted value (`future_data` itself is left unmodified).
future_dataset = future_data.assign(value=future_predictions)

# Print the future dataset
print(future_dataset)


   year     region powertrain    value
0  2023  Australia        BEV  24909.4
1  2024  Australia        BEV  24909.4
2  2025  Australia        BEV  24909.4


In [37]:
# Rebuild the three forecast rows (years after `latest_year`) for the first
# region and first powertrain found in `data`.
future_data = pd.DataFrame({
    'year': [latest_year + 1, latest_year + 2, latest_year + 3],
    'region': [data['region'].unique()[0]] * 3,
    'powertrain': [data['powertrain'].unique()[0]] * 3
})

# One-hot encode the forecast rows with get_dummies (no fitted encoder is
# involved), then align them to the columns of `encoded_data`; dummy columns
# that do not occur in the forecast rows are filled with 0.
encoded_future_data = pd.get_dummies(future_data, columns=['region', 'powertrain'])
encoded_future_data = encoded_future_data.reindex(columns=encoded_data.columns, fill_value=0)

# The target column is not a model input, so remove it again.
encoded_future_data = encoded_future_data.drop('value', axis=1)

# Predict all future rows in a single call: predict() already accepts the
# whole frame, so a Python loop issuing one call per row is unnecessary.
# list(...) keeps `future_predictions` a plain list of scalars as before.
future_predictions = list(model.predict(encoded_future_data))

# Add the predicted values to the future_data DataFrame
future_data['value'] = future_predictions
# Print the future dataset
print(future_data)


   year     region powertrain    value
0  2023  Australia        BEV  24909.4
1  2024  Australia        BEV  24909.4
2  2025  Australia        BEV  24909.4


In [38]:
# Predict the target for the held-out test rows
y_pred = model.predict(X=X_test)

In [40]:
# Mean squared error between the held-out targets and the model's predictions
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
print('Mean Squared Error:', mse)

Mean Squared Error: 16172699667.471136
