### Libraries and Importing

In [3]:
# The latest versions of the following libraries should be installed
!pip install plotnine
!pip install pmdarima



In [4]:
# Mounting google drive to google collab notebook (Only necessary in google collab environment)
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
# Import relevant libraries (These should be installed if not already)
import pandas as pd
from pmdarima.arima import auto_arima
from pmdarima.arima import ARIMA as arima_order
import numpy as np
from numpy import set_printoptions
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
import warnings
warnings.filterwarnings("ignore")
import pickle

# Read csv files from data preparation
state_year = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/state_year.csv')
li_price = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/li_price.csv')
miso_load = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/miso_load.csv')
model_sales = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/model_sales.csv')
state_year = state_year.sort_values(['Year', 'State'])
us_ev_sales = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/US_EV_SalesData.csv')
state_year

Unnamed: 0,State,Year,Gasoline Price,Median Income,Population,Renewable Energy Use,Total Energy Use,Transportation Energy Use,Stations Opened
2652,AK,1960,,,,6800.0,296.0,27139.0,
2653,AL,1960,,,,112809.0,15485.0,176015.0,
2654,AR,1960,,,,48104.0,5662.0,104652.0,
2655,AZ,1960,,,,36181.0,6138.0,116689.0,
2656,CA,1960,,,,270161.0,57270.0,1224448.0,
...,...,...,...,...,...,...,...,...,...
3202,VA,2022,,,,,,,50.0
3204,VT,2022,,,,,,,6.0
3206,WA,2022,,,,,,,33.0
3208,WI,2022,,,,,,,14.0


### EV Sales vs Socioeconomic Factors

In [4]:
# Clean dataframes
states = ['AL', 'AR', 'IL', 'IN', 'IA', 'KY', 'LA', 'MI', 'MN', 'MS', 'MO','ND', 'SD', 'TX', 'WI']
state_year_copy = state_year.loc[state_year['State'].isin(states)]
state_year_copy.loc[state_year_copy['Stations Opened'].isna(), 'Stations Opened'] = 0
state_year_copy = state_year_copy.dropna()
state_year_copy = state_year_copy.loc[(state_year_copy['Year'] >= 2011)]
state_year_copy = state_year_copy.reset_index().drop('index', axis=1)

us_ev_sales_copy = us_ev_sales.sort_values(['Year', 'State'])
us_ev_sales_copy = us_ev_sales_copy.loc[(us_ev_sales_copy['Year'] <= 2019)]
us_ev_sales_copy = us_ev_sales_copy.reset_index().drop('index', axis=1)

extracted_col = us_ev_sales_copy["Total"]
state_year_copy = state_year_copy.join(extracted_col)
state_year_copy['States_Cat'] = pd.factorize(state_year_copy['State'])[0]
state_year_copy = state_year_copy[['States_Cat', 'Year', 'Gasoline Price', 'Median Income', 'Population', 'Renewable Energy Use', 
                                   'Transportation Energy Use', 'Stations Opened', 'Total', 'State']]
state_year_copy


Unnamed: 0,States_Cat,Year,Gasoline Price,Median Income,Population,Renewable Energy Use,Transportation Energy Use,Stations Opened,Total,State
0,0,2011,27.53,25082.0,4799642.0,255849.0,474771.0,8.0,73,AL
1,1,2011,27.86,23039.0,2941038.0,124987.0,284546.0,18.0,22,AR
2,2,2011,28.21,28663.0,3066772.0,668883.0,307194.0,18.0,49,IA
3,3,2011,28.71,32857.0,12867783.0,313512.0,985537.0,11.0,328,IL
4,4,2011,27.74,29475.0,6517250.0,222942.0,581270.0,5.0,94,IN
...,...,...,...,...,...,...,...,...,...,...
130,10,2019,19.03,16413.0,2978227.0,66398.0,342707.0,12.0,273,MS
131,11,2019,21.60,21205.0,763724.0,210222.0,131355.0,10.0,114,ND
132,12,2019,21.05,18142.0,887127.0,261949.0,98484.0,10.0,186,SD
133,13,2019,19.07,23743.0,28986794.0,962899.0,3334081.0,242.0,5780,TX


In [5]:
# Perform feature extraction to see how well each variable x predicts for Y
array = state_year_copy.values
X = array[:,0:8]
Y = array[:,8]

test = SelectKBest(score_func=f_classif, k=5)
fit = test.fit(X, Y)
# summarize scores
for i in range(len(fit.scores_)):
  print(state_year_copy.columns[i] + ": " + str(fit.scores_[i]))

States_Cat: 12.38733552631579
Year: 1.6750637755102042
Gasoline Price: 0.8847555164713398
Median Income: 3.214787784399947
Population: 31.80435584690869
Renewable Energy Use: 0.8497372786675979
Transportation Energy Use: 32.36896829539239
Stations Opened: 28.584300255490326


### Socioeconomic Factors Forecasting

In [14]:
# Filter for MISO states and drop useless columns
states = ['AL', 'AR', 'IL', 'IN', 'IA', 'KY', 'LA', 'MI', 'MN', 'MS', 'MO','ND', 'SD', 'TX', 'WI']
miso_states_year = state_year.loc[(state_year['State'].isin(states)) & (state_year['Year'] <= 2019) \
                                  & (state_year['Year'] >= 1984)]
miso_states_year = miso_states_year.drop('Total Energy Use', axis=1)
miso_states_year.loc[miso_states_year['Stations Opened'].isna(), 'Stations Opened'] = 0
miso_states_year

Unnamed: 0,State,Year,Gasoline Price,Median Income,Population,Renewable Energy Use,Transportation Energy Use,Stations Opened
715,AL,1984,8.98,54393.0,3951824.0,287856.0,337698.0,0.0
716,AR,1984,8.55,50540.0,2319767.0,91421.0,207483.0,0.0
726,IA,1984,9.34,68469.0,2858615.0,77577.0,228259.0,0.0
728,IL,1984,8.82,73753.0,11412128.0,145656.0,707471.0,0.0
729,IN,1984,8.74,66360.0,5458322.0,60540.0,513196.0,0.0
...,...,...,...,...,...,...,...,...
2524,MS,2019,19.03,16413.0,2978227.0,66398.0,342707.0,12.0
2527,ND,2019,21.60,21205.0,763724.0,210222.0,131355.0,10.0
2540,SD,2019,21.05,18142.0,887127.0,261949.0,98484.0,10.0
2542,TX,2019,19.07,23743.0,28986794.0,962899.0,3334081.0,242.0


In [None]:
# Setup dataframe columns
df_dict = dict()
df_dict['Year'] = []
df_dict['State'] = []
df_dict["Attribute"] = []
df_dict["Model Parameters"] = []
df_dict["Prediction Years Out"] = []
df_dict["Prediction"] = []
df_dict["Confidence Interval"] = []
df_dict["AIC"] = []
df_dict["AICc"] = []
df_dict["BIC"] = []

# Setup p, d, q hyperparameters
p_vals = [0, 1, 2]
d_vals = [1, 2]
q_vals = [0, 1, 2]
socioeconomic_models = []
"""
Loop 11 times to predict 10 years into the future for each of
2010 to 2019, training on past 26 years
"""

for i in range(11):
  print("Predicting for", str(2010 + i))
  # Predict value for each relevant attribute
  for col in miso_states_year.columns[2:]:
    # Predict value for each of MISOs states
    for state in states:
      train = miso_states_year.loc[(miso_states_year['State'] == state) &\
                                  (miso_states_year['Year'] < 2010+i) &\
                                  (miso_states_year['Year'] >= 1984+i), 
                                  col]
      # Predict for each combination of p, d, q
      for p in p_vals:
        for d in d_vals:
          for q in q_vals:
            # Handle exceptions (linear algebra errors or lack of normality)
            try:
              model = arima_order(order=(p, d, q))
              model.fit(train)
              # socioeconomic_models.append([2010+i, col, state, p, d, q, pickle.dumps(model)])
              prediction, confint = model.predict(n_periods=10, return_conf_int=True)
            except Exception:
              for ii in range(10):
                df_dict["Year"].append(2010 + i + ii)
                df_dict["State"].append(state)
                df_dict["Attribute"].append(col)
                df_dict["Model Parameters"].append((p, d, q))
                df_dict["Prediction Years Out"].append(ii+1)
                df_dict["Prediction"].append(None)
                df_dict["Confidence Interval"].append(None)
                df_dict["AIC"].append(None)
                df_dict["AICc"].append(None)
                df_dict["BIC"].append(None)
              continue

            # Store results
            for ii in range(len(prediction)):
              df_dict["Year"].append(2010 + i + ii)
              df_dict["State"].append(state)
              df_dict["Attribute"].append(col)
              df_dict["Model Parameters"].append((p, d, q))
              df_dict["Prediction Years Out"].append(ii+1)
              df_dict["Prediction"].append(prediction[ii])
              df_dict["Confidence Interval"].append(confint[ii])
              try:
                aic = False
                aicc = False
                bic = False
                aic = model.aic()
                aicc = model.aicc()
                bic = model.bic()
                df_dict["AIC"].append(aic)
                df_dict["AICc"].append(aicc)
                df_dict["BIC"].append(bic)
              except Exception:
                if aic:
                  df_dict["AIC"].append(aic)
                else:
                  df_dict["AIC"].append(None)
                if aicc:
                  df_dict["AICc"].append(aicc)
                else:
                  df_dict["AICc"].append(None)
                if bic:
                  df_dict["BIC"].append(bic)
                else:
                  df_dict["BIC"].append(None)
                continue
# Convert dictionary to pandas dataframe
df = pd.DataFrame(df_dict)

Predicting for 2010
Predicting for 2011
Predicting for 2012
Predicting for 2013
Predicting for 2014
Predicting for 2015
Predicting for 2016
Predicting for 2017
Predicting for 2018
Predicting for 2019
Predicting for 2020


Adding errors for years with data

In [None]:
# Actual values for attributes
df["Actual"] = "NA"
for year in range(2010, 2020):
  for state in states:
    for attribute in list(set(df['Attribute'])):
      df.loc[(df["Year"] == year) & \
             (df["State"] == state) & \
             (df["Attribute"] == attribute), \
             "Actual"] = miso_states_year.loc\
             [(miso_states_year['State'] == state) &\
              (miso_states_year['Year'] == year), attribute].values[0]

# Generate predicted error
df['Error'] = None
for year in range(2010, 2020):
  for state in states:
    for attribute in list(set(df['Attribute'])):
      df.loc[(df["Year"] == year) & \
             (df["State"] == state) & \
             (df["Attribute"] == attribute), \
             "Error"] = df.loc[(df["Year"] == year) & \
             (df["State"] == state) & \
             (df["Attribute"] == attribute), \
             "Prediction"] - df.loc[(df["Year"] == year) & \
             (df["State"] == state) & \
             (df["Attribute"] == attribute), \
             "Actual"]

# Generate absolute predicted error
df['Absolute Error'] = None
for year in range(2010, 2020):
  for state in states:
    for attribute in list(set(df['Attribute'])):
      df.loc[(df["Year"] == year) & \
             (df["State"] == state) & \
             (df["Attribute"] == attribute), \
             "Absolute Error"] = abs(df.loc[(df["Year"] == year) & \
             (df["State"] == state) & \
             (df["Attribute"] == attribute), \
             "Error"])



In [None]:
# Save CSV file
df.to_csv('socioeconomic_final.csv', index=False)

### EV Sales Forecasting

In [7]:
# Setup dataframe columns
df_dict = dict()
df_dict['Year'] = []
df_dict['State'] = []
df_dict["Attribute"] = []
df_dict["Model Parameters"] = []
df_dict["Prediction Years Out"] = []
df_dict["Prediction"] = []
df_dict["Confidence Interval"] = []
df_dict["AIC"] = []
df_dict["AICc"] = []
df_dict["BIC"] = []
# ev_sales_models = []

# Setup p, d, q hyperparameters
p_vals = [0, 1, 2]
d_vals = [1, 2]
q_vals = [0, 1, 2]

# Repeat below loops 5 times to predict 5 years into the future for each of
# 2018 to 2021
for i in range(5):
  print("Predicting for", str(2018 + i))
  # Predict value for each relevant attribute
  for col in us_ev_sales.columns[2:]:
    # Predict value for each of MISOs states
    for state in states:
      train = us_ev_sales.loc[(us_ev_sales['State'] == state) &\
                                  (us_ev_sales['Year'] < 2018+i) &\
                                  (us_ev_sales['Year'] >= 2011+i), 
                                  col]
      # Predict for each combination of p, d, q
      for p in p_vals:
        for d in d_vals:
          for q in q_vals:
            # Error handling (linear algebra or normality issues)
            try:
              model = arima_order(order=(p, d, q))
              model.fit(train)
              # ev_sales_models.append([2018+i, col, state, p, d, q, pickle.dumps(model)])
              prediction, confint = model.predict(n_periods=4, return_conf_int=True)
            except Exception:
              # If model crashes (Linear algebra error most likely or not normalized)
              # Store results
              for ii in range(4):
                df_dict["Year"].append(2018 + i + ii)
                df_dict["State"].append(state)
                df_dict["Attribute"].append(col)
                df_dict["Model Parameters"].append((p, d, q))
                df_dict["Prediction Years Out"].append(ii+1)
                df_dict["Prediction"].append(None)
                df_dict["Confidence Interval"].append(None)
                df_dict["AIC"].append(None)
                df_dict["AICc"].append(None)
                df_dict["BIC"].append(None)
              continue

            # Store results
            for ii in range(len(prediction)):
              df_dict["Year"].append(2018 + i + ii)
              df_dict["State"].append(state)
              df_dict["Attribute"].append(col)
              df_dict["Model Parameters"].append((p, d, q))
              df_dict["Prediction Years Out"].append(ii+1)
              df_dict["Prediction"].append(prediction[ii])
              df_dict["Confidence Interval"].append(confint[ii])
              try:
                aic = False
                aicc = False
                bic = False
                aic = model.aic()
                aicc = model.aicc()
                bic = model.bic()
                df_dict["AIC"].append(aic)
                df_dict["AICc"].append(aicc)
                df_dict["BIC"].append(bic)
              except Exception:
                if aic:
                  df_dict["AIC"].append(aic)
                else:
                  df_dict["AIC"].append(None)
                if aicc:
                  df_dict["AICc"].append(aicc)
                else:
                  df_dict["AICc"].append(None)
                if bic:
                  df_dict["BIC"].append(bic)
                else:
                  df_dict["BIC"].append(None)
                continue
                
# Convert dictionary to pandas dataframe
df2 = pd.DataFrame(df_dict)

Predicting for 2018
Predicting for 2019
Predicting for 2020
Predicting for 2021
Predicting for 2022


KeyboardInterrupt: ignored

In [None]:
# Actual values for attributes
df2["Actual"] = "NA"
for year in range(2018, 2022):
  for state in states:
    for attribute in list(set(df2['Attribute'])):
      df2.loc[(df2["Year"] == year) & \
             (df2["State"] == state) & \
             (df2["Attribute"] == attribute), \
             "Actual"] = us_ev_sales.loc\
             [(us_ev_sales['State'] == state) &\
              (us_ev_sales['Year'] == year), attribute].values[0]

# Generate predicted error
df2['Error'] = None
for year in range(2018, 2022):
  for state in states:
    for attribute in list(set(df2['Attribute'])):
      df2.loc[(df2["Year"] == year) & \
             (df2["State"] == state) & \
             (df2["Attribute"] == attribute), \
             "Error"] = df2.loc[(df2["Year"] == year) & \
             (df2["State"] == state) & \
             (df2["Attribute"] == attribute), \
             "Prediction"] - df2.loc[(df2["Year"] == year) & \
             (df2["State"] == state) & \
             (df2["Attribute"] == attribute), \
             "Actual"]

# Generate absolute predicted error
df2['Absolute Error'] = None
for year in range(2018, 2022):
  for state in states:
    for attribute in list(set(df2['Attribute'])):
      df2.loc[(df2["Year"] == year) & \
             (df2["State"] == state) & \
             (df2["Attribute"] == attribute), \
             "Absolute Error"] = abs(df2.loc[(df2["Year"] == year) & \
             (df2["State"] == state) & \
             (df2["Attribute"] == attribute), \
             "Error"])

In [None]:
# Save CSV
df2.to_csv('ev_sales_final.csv', index=False)

### Evaluation

In [6]:
# Read in results dataframes
socio_results = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/socioeconomic_final.csv')
ev_sales_results = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/ev_sales_final.csv')

##### Socio

In [8]:
socio_results_min_aicc = socio_results.loc[socio_results.loc[socio_results['Prediction Years Out'] < 11].groupby(['Year', 'State', 'Attribute', 'Prediction Years Out']).AICc.idxmin()]
socio_results_min_aicc.to_csv('ev_sales_min_aicc_results.csv', index=False)

When using the best model (according to AICc), this is our error

Developed other models because AICc may not be the best attribute to determine how good a model is, could use a linear combination of AIC, AICc, BIC, Confidence Interval, past years error or absolute error, etc.

In [9]:
# Average error for each state and attribute when predicting 1 to 5 years out
mean_socio_errors = socio_results_min_aicc.loc[socio_results_min_aicc['Year'] < 2020].groupby(['State', 'Attribute', 'Prediction Years Out']).mean()
mean_socio_errors

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Year,Prediction,AIC,AICc,BIC,Actual,Error,Absolute Error
State,Attribute,Prediction Years Out,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
AL,Gasoline Price,1,2014.5,22.730720,122.896485,123.418224,125.334236,22.264000,0.466720,2.863840
AL,Gasoline Price,2,2015.0,23.504089,122.044129,122.565868,124.481880,22.324444,1.179644,5.063556
AL,Gasoline Price,3,2015.5,24.619450,121.005560,121.527299,123.443311,21.673750,2.945700,6.853500
AL,Gasoline Price,4,2016.0,26.304514,119.721284,120.243024,122.159036,20.771429,5.533086,8.331486
AL,Gasoline Price,5,2016.5,28.086667,118.091888,118.613627,120.529640,19.706667,8.380000,10.380667
...,...,...,...,...,...,...,...,...,...,...
WI,Transportation Energy Use,6,2017.0,393885.803322,526.407066,527.042639,528.998784,443735.000000,-49849.196678,83963.746678
WI,Transportation Energy Use,7,2017.5,386556.523629,526.270253,526.934284,528.920874,444511.000000,-57954.476371,111612.393038
WI,Transportation Energy Use,8,2018.0,380875.672528,525.378920,526.090382,528.127712,446816.000000,-65940.327472,131062.660805
WI,Transportation Energy Use,9,2018.5,433281.750000,523.816135,524.337874,526.172243,454581.500000,-21299.750000,140079.125000


##### EV Sales

In [10]:
ev_sales_results_min_aicc = ev_sales_results.loc[ev_sales_results.loc[ev_sales_results['Prediction Years Out'] < 6].groupby(['Year', 'State', 'Attribute', 'Prediction Years Out']).AICc.idxmin()]
ev_sales_results_min_aicc.to_csv('ev_sales_min_aicc_results.csv', index=False)

In [11]:
# Generate average errors 
mean_socio_errors = ev_sales_results_min_aicc.loc[(ev_sales_results_min_aicc['Year'] < 2022) & (ev_sales_results_min_aicc['Attribute'] == 'Total')].groupby(['State', 'Attribute', 'Prediction Years Out']).mean()
mean_socio_errors

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Year,Prediction,AIC,AICc,BIC,Actual,Error,Absolute Error
State,Attribute,Prediction Years Out,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
AL,Total,1,2019.5,1000.6,71.599645,74.599645,70.818521,1318.0,-317.4,634.8
AL,Total,2,2020.0,1308.066667,70.739666,73.739666,69.958542,1468.666667,-160.6,947.266667
AL,Total,3,2020.5,1687.3,69.180095,72.180095,68.398971,1699.5,-12.2,580.2
AL,Total,4,2021.0,351.0,67.777777,70.777777,66.996653,2413.0,-2062.0,2062.0
AR,Total,1,2019.5,484.05,64.944743,67.944743,64.163619,693.5,-209.45,421.05
AR,Total,2,2020.0,713.933333,63.801068,66.801068,63.019944,779.666667,-65.733333,505.333333
AR,Total,3,2020.5,889.1,62.58173,65.58173,61.800606,900.5,-11.4,145.2
AR,Total,4,2021.0,287.0,61.772424,64.772424,60.9913,1368.0,-1081.0,1081.0
IA,Total,1,2019.5,837.1,71.967719,74.967719,71.186595,1146.5,-309.4,684.0
IA,Total,2,2020.0,1074.133333,70.504188,73.504188,69.723064,1223.0,-148.866667,1117.933333


### ARIMAX

In [12]:
# MISO States and create exogenous variable
states = ['AL', 'AR', 'IL', 'IN', 'IA', 'KY', 'LA', 'MI', 'MN', 'MS', 'MO','ND', 'SD', 'TX', 'WI']
test_exogenous = socio_results_min_aicc[['State', 'Year', 'Attribute', \
                                      'Prediction', \
                                      'Prediction Years Out']].pivot\
                                      (index=["State", "Year", \
                                      "Prediction Years Out"], \
                                       columns='Attribute', values="Prediction")\
                                       .reset_index()[['State', 'Year', \
                                      'Prediction Years Out', 'Gasoline Price', \
                                      'Median Income', 'Population', \
                                      "Renewable Energy Use", \
                                      "Transportation Energy Use", "Stations Opened"]]
test_exogenous

Attribute,State,Year,Prediction Years Out,Gasoline Price,Median Income,Population,Renewable Energy Use,Transportation Energy Use,Stations Opened
0,AL,2010,1,18.4336,24388.906317,4.798463e+06,259149.531742,463910.320000,-0.000005
1,AL,2011,1,22.2228,25803.680709,4.813445e+06,220362.321836,480951.720000,-0.000005
2,AL,2011,2,18.7972,24225.245029,4.839782e+06,242449.786154,468764.640000,-0.000010
3,AL,2012,1,28.3516,24087.840000,4.813371e+06,267474.384389,479794.080000,16.333333
4,AL,2012,2,22.7256,24714.054106,4.841732e+06,196278.316271,486572.440000,-0.000010
...,...,...,...,...,...,...,...,...,...
1645,WI,2027,9,27.3304,17203.920000,5.893710e+06,246109.934693,664634.861239,-50.297478
1646,WI,2027,10,24.7320,15685.000000,5.900375e+06,236496.901893,365133.627432,-35.578192
1647,WI,2028,9,25.7764,13488.200000,5.894634e+06,246264.053900,478389.560000,50.004852
1648,WI,2028,10,27.8760,16178.800000,5.896292e+06,245770.912720,688655.477443,-61.606808


In [15]:
# Setup dataframe columns
df_dict = dict()
df_dict['Year'] = []
df_dict['State'] = []
df_dict["Attribute"] = []
df_dict["Model Parameters"] = []
df_dict["Prediction Years Out"] = []
df_dict["Prediction"] = []
df_dict["Confidence Interval"] = []
df_dict["AIC"] = []
df_dict["AICc"] = []
df_dict["BIC"] = []
exogenous_features = ["Gasoline Price", "Population", "Median Income", "Renewable Energy Use", "Transportation Energy Use", "Stations Opened"]
arimax_ev_sales_models = []

# Setup p, d, q hyperparameters
p_vals = [0, 1, 2]
d_vals = [1, 2]
q_vals = [0, 1, 2]

# Repeat below loops 4 times to predict 4 years into the future for each of
# 2016 to 2020
for i in range(4):
  print("Predicting for", str(2016 + i))
  # Predict value for each relevant attribute
  for col in ['Total']:
    # Predict value for each of MISOs states
    for state in states:
      train = us_ev_sales.loc[(us_ev_sales['State'] == state) &\
                                  (us_ev_sales['Year'] < 2016+i) &\
                                  (us_ev_sales['Year'] >= 2011+i), 
                                  'Total']
      train_exogenous = miso_states_year.loc[(miso_states_year['State'] == state) &\
                                  (miso_states_year['Year'] < 2016+i) &\
                                  (miso_states_year['Year'] >= 2011+i), ]

      test_exogenous = socio_results_min_aicc[['State', 'Year', 'Attribute', \
                                      'Prediction',  \
                                      'Prediction Years Out']].pivot\
                                      (index=["State", "Year", \
                                      "Prediction Years Out"], \
                                       columns='Attribute', values="Prediction")\
                                       .reset_index()[['State', 'Year', \
                                      'Prediction Years Out', 'Gasoline Price', \
                                      'Median Income', 'Population', \
                                      "Renewable Energy Use", \
                                      "Transportation Energy Use", "Stations Opened"]]
 
      # Collect exogenous attributes from socio economic predictions
      test_exog = pd.DataFrame()
      row = None
      for ii in range(4):
        row = test_exogenous.loc[(test_exogenous['Year'] == 2016+i+ii) & \
                           (test_exogenous['State'] == state) &\
                           (test_exogenous['Prediction Years Out'] == ii+1)]
        test_exog = test_exog.append(row[['State', 'Year', 'Gasoline Price', 'Median Income'\
                              , 'Population', "Renewable Energy Use", \
                              "Transportation Energy Use", "Stations Opened"]])

      # Predict for each combination of p, d, q
      for p in p_vals:
        for d in d_vals:
          for q in q_vals:
            try:
              model = arima_order(order=(p, d, q))
              model.fit(train, exogenous=train_exogenous[exogenous_features])
              arimax_ev_sales_models.append([2016+i, col, state, p, d, q, pickle.dumps(model)])
              prediction, confint = model.predict(n_periods=4, return_conf_int=True, exogenous=test_exog[exogenous_features])
            except Exception as e:
              # If model crashes (Linear algebra error or not normalized)
              # Store results
              for ii in range(4):
                df_dict["Year"].append(2016 + i + ii)
                df_dict["State"].append(state)
                df_dict["Attribute"].append(col)
                df_dict["Model Parameters"].append((p, d, q))
                df_dict["Prediction Years Out"].append(ii+1)
                df_dict["Prediction"].append(None)
                df_dict["Confidence Interval"].append(None)
                df_dict["AIC"].append(None)
                df_dict["AICc"].append(None)
                df_dict["BIC"].append(None)
              continue

            # Store results
            for ii in range(len(prediction)):
              df_dict["Year"].append(2016 + i + ii)
              df_dict["State"].append(state)
              df_dict["Attribute"].append(col)
              df_dict["Model Parameters"].append((p, d, q))
              df_dict["Prediction Years Out"].append(ii+1)
              df_dict["Prediction"].append(prediction[ii])
              df_dict["Confidence Interval"].append(confint[ii])
              try:
                aic = False
                aicc = False
                bic = False
                aic = model.aic()
                aicc = model.aicc()
                bic = model.bic()
                df_dict["AIC"].append(aic)
                df_dict["AICc"].append(aicc)
                df_dict["BIC"].append(bic)
              except Exception:
                if aic:
                  df_dict["AIC"].append(aic)
                else:
                  df_dict["AIC"].append(None)
                if aicc:
                  df_dict["AICc"].append(aicc)
                else:
                  df_dict["AICc"].append(None)
                if bic: 
                  df_dict["BIC"].append(bic)
                else:
                  df_dict["BIC"].append(None)
                continue
# Convert dictionary to pandas dataframe
df3 = pd.DataFrame(df_dict)
df3

Predicting for 2016
Predicting for 2017
Predicting for 2018
Predicting for 2019


Unnamed: 0,Year,State,Attribute,Model Parameters,Prediction Years Out,Prediction,Confidence Interval,AIC,AICc,BIC
0,2016,AL,Total,"(0, 1, 0)",1,196.416661,"[196.41664158939247, 196.41668078867215]",-47.563922,-83.563922,-52.473567
1,2017,AL,Total,"(0, 1, 0)",2,200.128917,"[200.1288892756098, 200.12894471176278]",-47.563922,-83.563922,-52.473567
2,2018,AL,Total,"(0, 1, 0)",3,195.697072,"[195.69703773905513, 195.6971056341992]",-47.563922,-83.563922,-52.473567
3,2019,AL,Total,"(0, 1, 0)",4,183.121125,"[183.12108606857527, 183.12116446713463]",-47.563922,-83.563922,-52.473567
4,2016,AL,Total,"(0, 1, 1)",1,196.416661,"[196.41664158939247, 196.41668078867215]",-45.563922,-81.563922,-51.087272
...,...,...,...,...,...,...,...,...,...,...
4315,2022,WI,Total,"(2, 2, 1)",4,5708.045928,"[5708.045820558951, 5708.046035262249]",-40.718780,-78.433066,-50.634045
4316,2019,WI,Total,"(2, 2, 2)",1,2602.527719,"[2602.52769908325, 2602.5277382825298]",-38.718780,-77.718780,-49.535433
4317,2020,WI,Total,"(2, 2, 2)",2,3488.707379,"[3488.707335426639, 3488.7074230788935]",-38.718780,-77.718780,-49.535433
4318,2021,WI,Total,"(2, 2, 2)",3,4480.569049,"[4480.568975216007, 4480.569121886281]",-38.718780,-77.718780,-49.535433


In [20]:
a = [1, 2, 3]
str(a)

'[1, 2, 3]'

In [21]:
textfile = open("/content/us_ev_sales_models.txt", "w")
for element in arimax_ev_sales_models:
  textfile.write(str(element) + "\n")
textfile.close()

In [32]:
# Store actual values for EV Sales
df3["Actual"] = "NA"
for year in range(2016, 2022):
  for state in states:
    for attribute in list(set(df3['Attribute'])):
      df3.loc[(df3["Year"] == year) & \
             (df3["State"] == state) & \
             (df3["Attribute"] == attribute), \
             "Actual"] = us_ev_sales.loc\
             [(us_ev_sales['State'] == state) &\
              (us_ev_sales['Year'] == year), attribute].values[0]

# Store prediction error
df3['Error'] = None
for year in range(2016, 2022):
  for state in states:
    for attribute in list(set(df3['Attribute'])):
      df3.loc[(df3["Year"] == year) & \
             (df3["State"] == state) & \
             (df3["Attribute"] == attribute), \
             "Error"] = df3.loc[(df3["Year"] == year) & \
             (df3["State"] == state) & \
             (df3["Attribute"] == attribute), \
             "Prediction"] - df3.loc[(df3["Year"] == year) & \
             (df3["State"] == state) & \
             (df3["Attribute"] == attribute), \
             "Actual"]

# Store absolute prediction error
df3['Absolute Error'] = None
for year in range(2016, 2022):
  for state in states:
    for attribute in list(set(df3['Attribute'])):
      df3.loc[(df3["Year"] == year) & \
             (df3["State"] == state) & \
             (df3["Attribute"] == attribute), \
             "Absolute Error"] = abs(df3.loc[(df3["Year"] == year) & \
             (df3["State"] == state) & \
             (df3["Attribute"] == attribute), \
             "Error"])
df3.to_csv('arimax_results_final.csv', index=False)


In [33]:
# Read arimax results
arimax_sales_results = pd.read_csv('arimax_results.csv')
arimax_results_min_aicc = arimax_sales_results.loc[arimax_sales_results.loc[arimax_sales_results['Prediction Years Out'] < 6].groupby(['Year', 'State', 'Attribute', 'Prediction Years Out']).AICc.idxmin()]
arimax_results_min_aicc.to_csv('arimax_min_aicc_results.csv')

In [None]:
mean_arimax_results_min_aicc = arimax_results_min_aicc.loc[arimax_results_min_aicc['Year'] < 2022].groupby(['State', 'Attribute', 'Prediction Years Out']).mean()
mean_arimax_results_min_aicc

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Year,Prediction,AIC,AICc,BIC,Actual,Error,Absolute Error
State,Attribute,Prediction Years Out,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
AL,Total,1,2017.5,531.002703,-51.656003,-87.822669,-56.872501,646.0,-114.997297,202.64189
AL,Total,2,2018.5,642.782142,-51.656003,-87.822669,-56.872501,810.0,-167.217858,494.016999
AL,Total,3,2019.5,702.850234,-51.656003,-87.822669,-56.872501,1318.0,-615.149766,615.149766
AL,Total,4,2020.0,255.43693,-53.02003,-89.242252,-58.338812,1468.666667,-1213.229736,1213.229736
AR,Total,1,2017.5,206.316419,-51.562638,-87.729305,-56.779136,324.5,-118.183581,118.183581
AR,Total,2,2018.5,180.13067,-51.562638,-87.729305,-56.779136,398.25,-218.11933,218.11933
AR,Total,3,2019.5,167.625837,-51.562638,-87.729305,-56.779136,693.5,-525.874163,525.874163
AR,Total,4,2020.0,103.52762,-52.895544,-89.117766,-58.214326,779.666667,-676.139047,676.139047
IA,Total,1,2017.5,663.742608,-52.118609,-88.118609,-57.335107,595.75,67.992608,331.450064
IA,Total,2,2018.5,898.690135,-52.118609,-88.118609,-57.335107,709.5,189.190135,617.595783
