In [23]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, GradientBoostingRegressor
from sklearn.metrics import r2_score, mean_absolute_percentage_error, median_absolute_error

from tpot import TPOTRegressor

In [2]:
df = pd.read_csv("../data/raw/Demand_History.csv")

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4096 entries, 0 to 4095
Data columns (total 12 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   demand_point_index  4096 non-null   int64  
 1   x_coordinate        4096 non-null   float64
 2   y_coordinate        4096 non-null   float64
 3   2010                4096 non-null   float64
 4   2011                4096 non-null   float64
 5   2012                4096 non-null   float64
 6   2013                4096 non-null   float64
 7   2014                4096 non-null   float64
 8   2015                4096 non-null   float64
 9   2016                4096 non-null   float64
 10  2017                4096 non-null   float64
 11  2018                4096 non-null   float64
dtypes: float64(11), int64(1)
memory usage: 384.1 KB


In [4]:
X = df.drop("2018", axis=1)

In [5]:
y = df["2018"]

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Linear Regression

In [7]:
reg = LinearRegression().fit(X_train, y_train)

In [8]:
y_pred = reg.predict(X_test)

In [9]:
print(r2_score(y_test, y_pred))

0.9856454627807976


In [10]:
# To compute the MAPE, we need to make sure we have no zero values in test and prediction

# convert y_test series to a dataframe
df_compare = y_test.to_frame()
# add y_pred numpy array as a new column to the dataframe
df_compare["2018_pred"] = y_pred.tolist()
# see https://stackoverflow.com/questions/22649693/drop-rows-with-all-zeros-in-pandas-data-frame
df_compare_no_zeros = df_compare.loc[(df_compare!=0).all(axis=1)]

In [11]:
print(mean_absolute_percentage_error(df_compare_no_zeros["2018"], df_compare_no_zeros["2018_pred"]))

0.3028254141383469


With 30% MAPE we have reached an OK value, see https://stephenallwright.com/good-mape-score/

# Random Forest Regression

In [12]:
reg = RandomForestRegressor().fit(X_train, y_train)

In [13]:
y_pred = reg.predict(X_test)

In [14]:
print(r2_score(y_test, y_pred))

0.9900077641186898


In [15]:
# To compute the MAPE, we need to make sure we have no zero values in test and prediction

# convert y_test series to a dataframe
df_compare = y_test.to_frame()
# add y_pred numpy array as a new column to the dataframe
df_compare["2018_pred"] = y_pred.tolist()
# see https://stackoverflow.com/questions/22649693/drop-rows-with-all-zeros-in-pandas-data-frame
df_compare_no_zeros = df_compare.loc[(df_compare!=0).all(axis=1)]

In [16]:
print(mean_absolute_percentage_error(df_compare_no_zeros["2018"], df_compare_no_zeros["2018_pred"]))

0.0618275901549264


With 6% MAPE we have reached an Very Good value, see https://stephenallwright.com/good-mape-score/

# Gradient Boosting Regression

In [17]:
reg = GradientBoostingRegressor().fit(X_train, y_train)

In [18]:
y_pred = reg.predict(X_test)

In [19]:
print(r2_score(y_test, y_pred))

0.9902135460445505


The R2 score is best for gradient boosting

In [20]:
# To compute the MAPE, we need to make sure we have no zero values in test and prediction

# convert y_test series to a dataframe
df_compare = y_test.to_frame()
# add y_pred numpy array as a new column to the dataframe
df_compare["2018_pred"] = y_pred.tolist()
# see https://stackoverflow.com/questions/22649693/drop-rows-with-all-zeros-in-pandas-data-frame
df_compare_no_zeros = df_compare.loc[(df_compare!=0).all(axis=1)]

In [21]:
print(mean_absolute_percentage_error(df_compare_no_zeros["2018"], df_compare_no_zeros["2018_pred"]))

0.07982698937632556


With 8% MAPE we have reached an Very good value, see https://stephenallwright.com/good-mape-score/

# TPOT

In [22]:
tpot = TPOTRegressor(generations=5, population_size=50, verbosity=2, random_state=42)
tpot.fit(X_train, y_train)
print(tpot.score(X_test, y_test))

Optimization Progress:   0%|          | 0/300 [00:00<?, ?pipeline/s]


Generation 1 - Current best internal CV score: -64.79475259929676

Generation 2 - Current best internal CV score: -64.79475259929676

Generation 3 - Current best internal CV score: -64.79475259929676

Generation 4 - Current best internal CV score: -64.1900459034964

Generation 5 - Current best internal CV score: -64.14568266690753

Best pipeline: ExtraTreesRegressor(MaxAbsScaler(input_matrix), bootstrap=False, max_features=0.9500000000000001, min_samples_leaf=3, min_samples_split=4, n_estimators=100)
-58.06679430637211




# ExtraTrees Regression

In [25]:
reg = ExtraTreesRegressor().fit(X_train, y_train)

In [26]:
y_pred = reg.predict(X_test)

In [27]:
print(r2_score(y_test, y_pred))

0.9902236866936038


The R2 score is slightly better for extra trees than for gradient boosting.

In [29]:
# To compute the MAPE, we need to make sure we have no zero values in test and prediction

# convert y_test series to a dataframe
df_compare = y_test.to_frame()
# add y_pred numpy array as a new column to the dataframe
df_compare["2018_pred"] = y_pred.tolist()
# see https://stackoverflow.com/questions/22649693/drop-rows-with-all-zeros-in-pandas-data-frame
df_compare_no_zeros = df_compare.loc[(df_compare!=0).all(axis=1)]

In [30]:
print(mean_absolute_percentage_error(df_compare_no_zeros["2018"], df_compare_no_zeros["2018_pred"]))

0.05982923485645168


With < 6% MAPE we have reached an Very good value, see https://stephenallwright.com/good-mape-score/.