In [1]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import numpy as np

# --- Load inputs -----------------------------------------------------------
# EV sales figures and the state/year covariates, both read from the
# processed-data directory relative to this notebook.
sales = pd.read_csv('../data/processed/US_EV_SalesData.csv')
socioeconomic = pd.read_csv('../data/processed/state_year.csv')

# A missing 'Stations Opened' value is treated as zero stations opened.
socioeconomic['Stations Opened'] = socioeconomic['Stations Opened'].fillna(0)

# --- Build the modelling table ---------------------------------------------
# With no `on=` argument the merge joins on every column name the two frames
# share (presumably State and Year -- TODO confirm against the CSV headers).
# Only rows with Year earlier than 2020 are kept.
data = sales.merge(socioeconomic).loc[lambda merged: merged['Year'] < 2020]

# --- Model configuration ---------------------------------------------------
# Feature columns fed to every model.
PREDICTORS = [
    'Gasoline Price',
    'Median Income',
    'Population',
    'Renewable Energy Use',
    'Total Energy Use',
    'Transportation Energy Use',
    'Stations Opened',
]
# Target columns; a separate model is fitted for each one.
RESPONSES = ['BEV sales', 'PHEV sales', 'Total']

# Result accumulator: one entry per state (in order of first appearance in
# `data`) for each model family; the later cells fill in the score lists.
all_scores = {
    'State': data['State'].unique(),
    'OLS': [],
    'Log-log': [],
    'Random forest': [],
}

In [2]:
# Rolling-window evaluation of ordinary least squares (OLS): one score per state.
#
# For each state, every 5-year window [lo, lo + 5) whose exclusive upper bound
# does not exceed the latest year in `data` is split in row order
# (shuffle=False): the leading rows train the model and the trailing 40% test
# it. (The split is chronological only if rows are ordered by Year -- TODO
# confirm.) A separate model is fitted per response; the state's score is the
# mean RMSE over all windows and responses.
latest_year = data['Year'].max()  # loop-invariant, so computed once

# Select the window starts by filtering instead of breaking out of the loop at
# the first too-late start: the result is the same for ascending years, but no
# longer silently drops windows if the years appear in another order.
# NOTE(review): because `hi` is exclusive, the window that ends on the latest
# year itself (lo == latest_year - 4) is never evaluated -- confirm intended.
window_starts = [lo for lo in data['Year'].unique() if lo + 5 <= latest_year]

ols_scores = []
for state in data['State'].unique():
    scores = []

    for lo in window_starts:
        hi = lo + 5

        subset = data[(data['State'] == state) & (lo <= data['Year']) & (data['Year'] < hi)]
        train, test = train_test_split(subset, test_size=0.4, shuffle=False)

        for response in RESPONSES:
            mod = LinearRegression().fit(train[PREDICTORS], train[[response]])
            predict = mod.predict(test[PREDICTORS])

            # Root-mean-squared error on the held-out rows of this window.
            scores.append(np.sqrt(mean_squared_error(test[[response]], predict)))

    ols_scores.append(sum(scores) / len(scores))

# Assign (not append) so that re-running this cell replaces the column instead
# of growing it past the length of all_scores['State'].
all_scores['OLS'] = ols_scores

In [3]:
# Rolling-window evaluation of a log-log linear model: one score per state.
#
# Every predictor and response column is transformed with log(1 + x); all other
# columns (e.g. State, Year) are carried over unchanged. The evaluation scheme
# is: every 5-year window [lo, lo + 5) whose exclusive upper bound does not
# exceed the latest year, split in row order (shuffle=False) with the trailing
# 40% of rows held out, one model per response, mean RMSE per state.
log_columns = [c for c in data.columns if c in PREDICTORS or c in RESPONSES]
# `assign` overwrites the listed columns in place (column order and index are
# preserved) and returns a new frame, so `data` itself is left untouched.
lndata = data.assign(**{c: np.log1p(data[c]) for c in log_columns})

latest_year = lndata['Year'].max()  # loop-invariant, so computed once

# Filter the window starts instead of breaking at the first too-late start, so
# the window set does not depend on the order in which years appear.
# NOTE(review): because `hi` is exclusive, the window that ends on the latest
# year itself (lo == latest_year - 4) is never evaluated -- confirm intended.
window_starts = [lo for lo in lndata['Year'].unique() if lo + 5 <= latest_year]

loglog_scores = []
for state in lndata['State'].unique():
    scores = []

    for lo in window_starts:
        hi = lo + 5

        subset = lndata[(lndata['State'] == state) & (lo <= lndata['Year']) & (lndata['Year'] < hi)]
        train, test = train_test_split(subset, test_size=0.4, shuffle=False)

        for response in RESPONSES:
            mod = LinearRegression().fit(train[PREDICTORS], train[[response]])
            predict = mod.predict(test[PREDICTORS])

            # Undo the log(1 + x) transform with its exact inverse, exp(y) - 1,
            # so the RMSE is computed on the original scale and is comparable
            # with the other models. (Plain exp() leaves every value off by
            # one; that offset cancels in RMSE but not in other metrics.)
            scores.append(np.sqrt(mean_squared_error(np.expm1(test[[response]]), np.expm1(predict))))

    loglog_scores.append(sum(scores) / len(scores))

# Assign (not append) so that re-running this cell replaces the column instead
# of growing it past the length of all_scores['State'].
all_scores['Log-log'] = loglog_scores

In [4]:
# Rolling-window evaluation of a random forest regressor: one score per state.
#
# For each state, every 5-year window [lo, lo + 5) whose exclusive upper bound
# does not exceed the latest year in `data` is split in row order
# (shuffle=False) with the trailing 40% of rows held out. A separate forest is
# fitted per response; the state's score is the mean RMSE over all windows and
# responses.
latest_year = data['Year'].max()  # loop-invariant, so computed once

# Filter the window starts instead of breaking at the first too-late start, so
# the window set does not depend on the order in which years appear.
# NOTE(review): because `hi` is exclusive, the window that ends on the latest
# year itself (lo == latest_year - 4) is never evaluated -- confirm intended.
window_starts = [lo for lo in data['Year'].unique() if lo + 5 <= latest_year]

forest_scores = []
for state in data['State'].unique():
    scores = []

    for lo in window_starts:
        hi = lo + 5

        subset = data[(data['State'] == state) & (lo <= data['Year']) & (data['Year'] < hi)]
        train, test = train_test_split(subset, test_size=0.4, shuffle=False)

        for response in RESPONSES:
            # A fixed random_state makes the bootstrap sampling and feature
            # selection deterministic, so the reported scores are reproducible
            # on Restart & Run All. The target is passed as a 1-D array, which
            # is the shape the forest expects.
            mod = RandomForestRegressor(random_state=0).fit(train[PREDICTORS], train[response].to_numpy())
            predict = mod.predict(test[PREDICTORS])

            # Root-mean-squared error on the held-out rows of this window.
            scores.append(np.sqrt(mean_squared_error(test[[response]], predict)))

    forest_scores.append(sum(scores) / len(scores))

# Assign (not append) so that re-running this cell replaces the column instead
# of growing it past the length of all_scores['State'].
all_scores['Random forest'] = forest_scores

In [5]:
# Comparison table built from the accumulated scores: each key of
# `all_scores` becomes a column, one row per state.
pd.DataFrame(all_scores)

Unnamed: 0,State,OLS,Log-log,Random forest
0,AR,64.255108,3404.030519,62.471818
1,IL,1374.66053,1768.247111,1005.995133
2,IN,362.189262,669.529191,258.560132
3,IA,175.064717,176.375701,118.264527
4,LA,89.103259,289.526923,91.106953
5,MI,1352.237636,1917.817098,534.181909
6,MN,1132.458482,12716.302472,481.133473
7,MS,44.414951,60.793449,36.921054
8,MO,349.320394,442.888912,340.461332
9,ND,24.23297,18.926241,12.238954
