In [1]:
import pandas as pd

import catboost as cb

import matplotlib.pyplot as plt

from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error
from sklearn.model_selection import TimeSeriesSplit 
from sklearn.preprocessing import StandardScaler

In [2]:
# Location of the raw CSV inside the Kaggle input directory.
DATA_PATH = "/kaggle/input/case-ih-time-series-data/data.csv"
df = pd.read_csv(DATA_PATH)

In [3]:
# Preview the first rows of the raw frame (columns: date, work, income).
df.head()

Unnamed: 0,date,work,income
0,"Thursday, September 30, 2021",2,2100
1,"Sunday, October 03, 2021",1,3600
2,"Monday, October 04, 2021",1,4000
3,"Tuesday, October 05, 2021",1,7900
4,"Wednesday, October 06, 2021",1,7500


In [4]:
# Convert the textual dates (e.g. "Thursday, September 30, 2021") to datetimes.
parsed_dates = pd.to_datetime(df["date"])
df["date"] = parsed_dates

In [5]:
# Start the modelling frame from an independent copy of the income column.
data = df["income"].copy().to_frame()

In [6]:
# Relabel the single column as the target "y".
data = data.set_axis(["y"], axis="columns")

In [7]:
# Preview the single-column target frame.
data.head()

Unnamed: 0,y
0,2100
1,3600
2,4000
3,7900
4,7500


In [8]:
# Add one feature column per lag from 6 to 24: lag_k holds y shifted down by
# k rows, so the first k rows of that column are NaN.
for lag in range(6, 25):
    data[f"lag_{lag}"] = data["y"].shift(lag)

In [9]:
# Preview the frame after adding the lag columns; the earliest rows show
# missing lag values because shift() has no earlier rows to pull from.
data.head()

Unnamed: 0,y,lag_6,lag_7,lag_8,lag_9,lag_10,lag_11,lag_12,lag_13,lag_14,lag_15,lag_16,lag_17,lag_18,lag_19,lag_20,lag_21,lag_22,lag_23,lag_24
0,2100,,,,,,,,,,,,,,,,,,,
1,3600,,,,,,,,,,,,,,,,,,,
2,4000,,,,,,,,,,,,,,,,,,,
3,7900,,,,,,,,,,,,,,,,,,,
4,7500,,,,,,,,,,,,,,,,,,,


In [10]:
# Keep only rows where every lag is available, then separate the lag
# features from the target.
complete_rows = data.dropna()

x = complete_rows.drop(columns=["y"])
y = complete_rows["y"]

In [11]:
# Preview the feature matrix; rows with any missing lag were dropped above,
# which is why the index no longer starts at 0.
x.head()

Unnamed: 0,lag_6,lag_7,lag_8,lag_9,lag_10,lag_11,lag_12,lag_13,lag_14,lag_15,lag_16,lag_17,lag_18,lag_19,lag_20,lag_21,lag_22,lag_23,lag_24
24,5600.0,6000.0,12500.0,9000.0,5600.0,10000.0,4200.0,5500.0,2600.0,8000.0,6800.0,5800.0,6700.0,10600.0,7500.0,7900.0,4000.0,3600.0,2100.0
25,2100.0,5600.0,6000.0,12500.0,9000.0,5600.0,10000.0,4200.0,5500.0,2600.0,8000.0,6800.0,5800.0,6700.0,10600.0,7500.0,7900.0,4000.0,3600.0
26,6400.0,2100.0,5600.0,6000.0,12500.0,9000.0,5600.0,10000.0,4200.0,5500.0,2600.0,8000.0,6800.0,5800.0,6700.0,10600.0,7500.0,7900.0,4000.0
27,2400.0,6400.0,2100.0,5600.0,6000.0,12500.0,9000.0,5600.0,10000.0,4200.0,5500.0,2600.0,8000.0,6800.0,5800.0,6700.0,10600.0,7500.0,7900.0
28,2000.0,2400.0,6400.0,2100.0,5600.0,6000.0,12500.0,9000.0,5600.0,10000.0,4200.0,5500.0,2600.0,8000.0,6800.0,5800.0,6700.0,10600.0,7500.0


In [12]:
# Time-ordered cross-validation splitter; the number of splits is named so it
# is easy to tune.
N_SPLITS = 5
tscv = TimeSeriesSplit(n_splits=N_SPLITS)

In [13]:
def timeseries_train_test_split(x, y, test_size):
    """Split ordered features and target into a leading train part and a trailing test part.

    No shuffling is performed: the first ``int(len(x) * (1 - test_size))`` rows
    become the training set and all remaining rows become the test set, so the
    test rows always come after the training rows in the input order.

    Parameters
    ----------
    x : pandas.DataFrame or pandas.Series
        Feature rows, sliced positionally with ``.iloc``. The rows are assumed
        to already be in chronological order; this function does not sort.
    y : pandas.Series or pandas.DataFrame
        Target values, one per row of ``x``, sliced at the same position.
    test_size : float
        Fraction of rows (between 0 and 1 inclusive) held out at the end.

    Returns
    -------
    tuple
        ``(x_train, x_test, y_train, y_test)``.

    Raises
    ------
    ValueError
        If ``test_size`` is outside ``[0, 1]`` or if ``x`` and ``y`` do not
        have the same number of rows.
    """
    # A fraction outside [0, 1] would yield a negative or oversized split
    # index and silently return wrongly sized partitions.
    if not 0 <= test_size <= 1:
        raise ValueError(
            "test_size must be between 0 and 1, got {!r}".format(test_size)
        )

    # Positional slicing only keeps x and y aligned when they are equally long.
    if len(x) != len(y):
        raise ValueError(
            "x and y must have the same length, got {} and {}".format(len(x), len(y))
        )

    # Truncation (not rounding) is intentional: it preserves the split sizes
    # this function has always produced.
    test_index = int(len(x) * (1 - test_size))

    x_train, x_test = x.iloc[:test_index], x.iloc[test_index:]
    y_train, y_test = y.iloc[:test_index], y.iloc[test_index:]

    return x_train, x_test, y_train, y_test

In [14]:
# Hold out the trailing 30% of the rows as the test set.
x_train, x_test, y_train, y_test = timeseries_train_test_split(
    x,
    y,
    test_size=0.3,
)

In [15]:
# Report the shape of each split, one per line, in the order
# x_train, x_test, y_train, y_test.
for split in (x_train, x_test, y_train, y_test):
    print(split.shape)

(117, 19)
(51, 19)
(117,)
(51,)


In [16]:
# Wrap each split in a CatBoost Pool, pairing the lag features with their labels.
train_dataset = cb.Pool(data=x_train, label=y_train)
test_dataset = cb.Pool(data=x_test, label=y_test)