In [1]:
# Dependencies:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from statsmodels.tsa.arima.model import ARIMA
from matplotlib import pyplot

In [2]:
# Read the emissions table from disk and preview its first rows:
emissions_csv = '../Resources/emissions_since1961_total_data.csv'
df = pd.read_csv(emissions_csv)
df.head()

Unnamed: 0,country,country_code,year,total,coal,oil,gas,cement,flaring,other,population,per_capita,temp_change
0,Australia,AUS,1961,90.503896,59.97478,29.115323,0.0,1.413794,0.0,,10483000,8.614547,0.151
1,Australia,AUS,1962,94.823455,61.485584,31.884128,0.003664,1.450079,0.0,,10742000,8.852126,0.118
2,Australia,AUS,1963,100.934264,63.727952,35.658048,0.007328,1.540936,0.0,,10950000,9.245287,-0.111
3,Australia,AUS,1964,108.875767,67.133213,39.943584,0.007328,1.791642,0.0,,11167000,9.780775,-0.029
4,Australia,AUS,1965,120.852669,71.059616,47.9068,0.007328,1.878925,0.0,,11388000,10.638962,0.121


In [3]:
# Check each column's dtype and non-null count (shows which columns have missing values):
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2806 entries, 0 to 2805
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   country       2806 non-null   object 
 1   country_code  2806 non-null   object 
 2   year          2806 non-null   int64  
 3   total         2806 non-null   float64
 4   coal          2777 non-null   float64
 5   oil           2777 non-null   float64
 6   gas           2777 non-null   float64
 7   cement        2776 non-null   float64
 8   flaring       2777 non-null   float64
 9   other         1530 non-null   float64
 10  population    2806 non-null   int64  
 11  per_capita    2806 non-null   float64
 12  temp_change   2385 non-null   float64
dtypes: float64(9), int64(2), object(2)
memory usage: 285.1+ KB


In [4]:
# Names of the text-valued columns that are one-hot encoded further down:
df_cat = [
    'country',
    'country_code',
]

In [5]:
# OneHotEncoder instance. The default (sparse) output is kept and densified
# below instead of passing `sparse=False`: that keyword was deprecated in
# scikit-learn 1.2 and removed in 1.4, so this form runs on old and new releases.
enc = OneHotEncoder()

# Fit/transform the categorical columns and convert the sparse result to a dense array
encoded_values = enc.fit_transform(df[df_cat]).toarray()

# Build the frame with the encoded variable names as columns, reusing df's row
# index so that a later index-based merge lines the rows up correctly.
encode_df = pd.DataFrame(
    encoded_values,
    index=df.index,
    columns=enc.get_feature_names_out(df_cat),
)
encode_df

Unnamed: 0,country_Australia,country_Austria,country_Belarus,country_Belgium,country_Brazil,country_Bulgaria,country_Canada,country_China,country_Croatia,country_Cyprus,...,country_code_PRT,country_code_ROU,country_code_RUS,country_code_SVK,country_code_SVN,country_code_SWE,country_code_TUR,country_code_UKR,country_code_USA,country_code_WLD
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2801,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2802,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2803,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2804,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [6]:
# Attach the one-hot columns (matched on the row index) and then discard the
# original text columns they replace. This overwrites `df`, so the cell is
# meant to be run once per kernel session.
df = (
    df.merge(encode_df, left_index=True, right_index=True)
      .drop(columns=df_cat)
)
df

Unnamed: 0,year,total,coal,oil,gas,cement,flaring,other,population,per_capita,...,country_code_PRT,country_code_ROU,country_code_RUS,country_code_SVK,country_code_SVN,country_code_SWE,country_code_TUR,country_code_UKR,country_code_USA,country_code_WLD
0,1961,90.503896,59.974780,29.115323,0.000000,1.413794,0.000000,,10483000,8.614547,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1962,94.823455,61.485584,31.884128,0.003664,1.450079,0.000000,,10742000,8.852126,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1963,100.934264,63.727952,35.658048,0.007328,1.540936,0.000000,,10950000,9.245287,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1964,108.875767,67.133213,39.943584,0.007328,1.791642,0.000000,,11167000,9.780775,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1965,120.852669,71.059616,47.906800,0.007328,1.878925,0.000000,,11388000,10.638962,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2801,2017,36096.739276,14506.973805,12242.627935,7144.928128,1507.923185,391.992176,302.294047,7578157615,4.749682,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2802,2018,36826.506600,14746.830688,12266.016285,7529.846784,1569.218392,412.115746,302.478706,7661776338,4.792753,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2803,2019,37082.558969,14725.978025,12345.653374,7647.528220,1617.506786,439.253991,306.638573,7742681934,4.775633,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2804,2020,35264.085734,14174.564010,11191.808551,7556.290283,1637.537532,407.583673,296.301685,7820981524,4.497423,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [8]:
# Fit an ARIMA model with order (p=5, d=1, q=0) to the temperature-change column.
# The column is selected by name rather than by position so the cell keeps
# modelling the same variable if columns are added, removed or reordered upstream.
# NOTE(review): df holds the rows of every country stacked in file order, so the
# whole frame is treated as one continuous series and lags run across country
# boundaries; temp_change also contains missing values. Confirm this is intended,
# or fit one model per country instead.
model = ARIMA(df['temp_change'], order=(5,1,0))
model_fit = model.fit()
model_fit.summary()

0,1,2,3
Dep. Variable:,temp_change,No. Observations:,2806.0
Model:,"ARIMA(5, 1, 0)",Log Likelihood,-2301.793
Date:,"Sat, 28 Jan 2023",AIC,4615.586
Time:,18:06:18,BIC,4651.221
Sample:,0,HQIC,4628.448
,- 2806,,
Covariance Type:,opg,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
ar.L1,-0.5198,0.017,-29.792,0.000,-0.554,-0.486
ar.L2,-0.2780,0.020,-13.657,0.000,-0.318,-0.238
ar.L3,-0.2356,0.020,-11.813,0.000,-0.275,-0.197
ar.L4,-0.2614,0.019,-13.911,0.000,-0.298,-0.225
ar.L5,-0.1627,0.019,-8.680,0.000,-0.199,-0.126
sigma2,0.3986,0.010,39.897,0.000,0.379,0.418

0,1,2,3
Ljung-Box (L1) (Q):,0.16,Jarque-Bera (JB):,274.22
Prob(Q):,0.69,Prob(JB):,0.0
Heteroskedasticity (H):,0.99,Skew:,-0.31
Prob(H) (two-sided):,0.85,Kurtosis:,4.4
