In [7]:
# Load the passenger CSV and convert the 'Date' column from text to timestamps
import pandas as pd

data = pd.read_csv('AirPassengers.csv').assign(
    Date=lambda frame: pd.to_datetime(frame['Date'])
)
data.head()

Unnamed: 0,Date,#Passengers
0,1949-01-01,112
1,1949-02-01,118
2,1949-03-01,132
3,1949-04-01,129
4,1949-05-01,121


In [6]:
# Sanity check: re-read the raw file and show the name of its first column
a = pd.read_csv('AirPassengers.csv')
a.columns.tolist()[0]

'Date'

In [8]:
import numpy as np

# Derive calendar features from the parsed 'Date' column using the vectorized
# `.dt` accessor (no Python-level loop), and add a 1-based running index.
# `assign` returns a new frame, so re-running this cell gives the same result.
# NOTE(review): `.dt.month` / `.dt.year` may come back as int32 rather than
# int64 depending on the pandas version — confirm nothing downstream cares.
data = data.assign(
    Month=data['Date'].dt.month,
    Year=data['Date'].dt.year,
    Series=np.arange(1, len(data) + 1),
)

# check the head of the dataset
data.head()

Unnamed: 0,Date,#Passengers,Month,Year,Series
0,1949-01-01,112,1,1949,1
1,1949-02-01,118,2,1949,2
2,1949-03-01,132,3,1949,3
3,1949-04-01,129,4,1949,4
4,1949-05-01,121,5,1949,5
5,1949-06-01,135,6,1949,6
6,1949-07-01,148,7,1949,7
7,1949-08-01,148,8,1949,8
8,1949-09-01,136,9,1949,9
9,1949-10-01,119,10,1949,10


In [9]:
# Chronological split: rows before 1953 are used for training,
# rows from 1953 onward are held out for testing
train = data.loc[data['Year'].lt(1953)]
test = data.loc[data['Year'].ge(1953)]

# (rows, columns) of each partition
train.shape, test.shape

((48, 5), (11, 5))

In [4]:
# Bring the PyCaret regression API into the notebook namespace
# NOTE(review): star import kept as-is; later cells rely on the names it exports
from pycaret.regression import *

# Initialize the experiment: `train` is the training data, `test` the hold-out,
# '#Passengers' the target; 3 time-series folds and a fixed session_id.
s = setup(
    data=train,
    test_data=test,
    target='#Passengers',
    fold_strategy='timeseries',
    numeric_features=['Year', 'Series'],
    fold=3,
    transform_target=True,
    session_id=123,
)

Unnamed: 0,Description,Value
0,session_id,123
1,Target,#Passengers
2,Original Data,"(48, 5)"
3,Missing Values,False
4,Numeric Features,2
5,Categorical Features,1
6,Ordinal Features,False
7,High Cardinality Features,False
8,High Cardinality Method,
9,Transformed Train Set,"(48, 23)"


In [5]:
# Compare the candidate regressors and keep the one ranked first by MAE
best = compare_models(sort='MAE')

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
ridge,Ridge Regression,13.905,337.1035,17.0331,-0.0187,0.1078,0.0833,1.1
lr,Linear Regression,14.4255,330.9731,17.6843,0.0276,0.1133,0.087,0.9067
huber,Huber Regressor,14.4987,342.5884,17.5367,-0.0137,0.1127,0.0879,0.0333
br,Bayesian Ridge,14.6013,339.1276,17.4796,-0.0092,0.1137,0.0892,0.48
omp,Orthogonal Matching Pursuit,17.4371,465.6863,21.5443,-0.2893,0.129,0.1007,0.4933
en,Elastic Net,20.0104,676.052,25.7575,-0.9187,0.1531,0.1128,0.0133
lasso,Lasso Regression,21.3887,768.4556,27.3271,-1.1829,0.1631,0.1202,0.77
knn,K Neighbors Regressor,25.2112,997.4612,31.1687,-1.6998,0.1862,0.1393,0.05
et,Extra Trees Regressor,25.7282,847.5944,28.8462,-1.2876,0.1764,0.1468,0.09
rf,Random Forest Regressor,27.1376,1035.4707,31.9862,-1.8163,0.1945,0.152,0.13


In [6]:
# Score the selected model; with no `data` argument this presumably uses the
# hold-out set passed to setup() as `test_data` — confirm against PyCaret docs
prediction_holdout = predict_model(best)

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,Ridge Regression,8.8416,119.9202,10.9508,0.8408,0.0512,0.0412


In [7]:
import plotly.express as px

# generate predictions on the original dataset
predictions = predict_model(best, data=data)

# Attach the dates from the source frame instead of a hard-coded date_range, so
# the column length always matches the number of rows in `data`.
# `.to_numpy()` assigns positionally and sidesteps index alignment.
# Assumes predict_model keeps the row order of `data` (the original positional
# assignment relied on the same thing).
predictions['Date'] = data['Date'].to_numpy()

# line plot of actual ('#Passengers') vs. predicted ('Label') values
fig = px.line(predictions, x='Date', y=["#Passengers", "Label"], template='plotly_dark')

# add a vertical rectangle marking the test period; bounds come from the
# `test` split so they stay in sync with the train/test cutoff
test_start = test['Date'].min().strftime('%Y-%m-%d')
test_end = test['Date'].max().strftime('%Y-%m-%d')
fig.add_vrect(x0=test_start, x1=test_end, fillcolor="grey", opacity=0.25, line_width=0)
fig.show()

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,Ridge Regression,5.4687,48.8899,6.9921,0.9717,0.0407,0.0329
