In [16]:
%%capture
!pip install pandas
!pip install plotly-express
!pip install pmdarima
!pip install statsmodels

In [13]:
import pandas as pd
import pmdarima as pm
from sklearn.model_selection import train_test_split
import statsmodel as sm 

In [25]:
def parser(x):
    return pd.datetime(x, "%Y")

In [94]:
df = pd.read_csv('../Data/global_average_yearly_temp_clean.csv')

# Preparing Data for Arima
We have three task to prepare data for ARIMA
1. We have to cast our Year into a datetime object
2. We then have to set it as an index of the array
3. Finally, we have to set the period of the data to Yearly frequency


In [154]:
global_df = df.groupby('Year').mean()
global_df = global_df.reset_index()
global_df["Year"] = pd.to_datetime(global_df.Year, format="%Y")
global_df = global_df.set_index("Year")
global_df.index = global_df.index.to_period("Y")

In [155]:
global_df

Unnamed: 0_level_0,AvgYearlyTemp,AvgTempUncertainty
Year,Unnamed: 1_level_1,Unnamed: 2_level_1
1743,4.997946,2.210000
1744,9.893368,2.164530
1745,1.349419,2.002500
1750,9.154983,2.153221
1751,9.235767,2.111132
...,...,...
2009,20.076165,0.344206
2010,20.171044,0.343326
2011,19.971308,0.361486
2012,19.966209,0.471523


# Split data into training and testing
Since this is forecasting, we will not be randomly splitting data. Instead we must split the data with a consecutive date range. We initally split the data from 1743 - 1980 as training and 1980-2013.

Still need to do parameter tuning (order) and excog features

In [170]:
train_df = global_df.loc['1743':'1980']
test_df = global_df.loc['1980':]

In [157]:
train = train_df.sort_index()["AvgYearlyTemp"]
test = test_df.sort_index()["AvgYearlyTemp"]

In [158]:
model = sm.tsa.arima.model.ARIMA(train, order=(1, 0, 0))

In [159]:
model_fit = model.fit()
print(model_fit.summary())


                               SARIMAX Results                                
Dep. Variable:          AvgYearlyTemp   No. Observations:                  234
Model:                 ARIMA(1, 0, 0)   Log Likelihood                -359.601
Date:                Sun, 05 Dec 2021   AIC                            725.203
Time:                        05:06:55   BIC                            735.569
Sample:                    12-31-1743   HQIC                           729.382
                         - 12-31-1980                                         
Covariance Type:                  opg                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const         14.4743      3.750      3.860      0.000       7.124      21.825
ar.L1          0.9728      0.018     53.731      0.000       0.937       1.008
sigma2         1.2500      0.037     33.660      0.0

In [138]:
from sklearn.metrics import r2_score

In [147]:
history = [x for x in train]
predictions = list()
# walk-forward validation
for t in range(len(test)):
    model = sm.tsa.arima.model.ARIMA(history, order=(5,1,0))
    model_fit = model.fit()
    output = model_fit.forecast()
    yhat = output[0]
    predictions.append(yhat)
    obs = test[t]
    history.append(obs)
# evaluate forecasts

In [177]:
%%capture --no-display
test_df["Year"] = test_df.index.values
test_df["Year"] = test_df["Year"].apply(str)
test_df["Predictions"] = predictions

In [178]:
print("Test R2 Score", r2_score(test_df["AvgYearlyTemp"], test_df["Predictions"]))

Test R2 Score 0.547381385474828


In [176]:
import plotly.express as px
px.line(test_df, x = "Year", y = ['AvgYearlyTemp', 'Predictions'] )