# Convert series to supervised data

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from autokeras import StructuredDataRegressor
import sys
sys.path.append('../scripts')
from feature_transform import dates_encoder, merge_path
from sklearn.preprocessing import FunctionTransformer
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OneHotEncoder
from tensorflow.keras.models import Sequential
from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_error

In [120]:
# Both CSVs carry a `flight_date` column that should be parsed as datetimes.
date_columns = ['flight_date']
train = pd.read_csv('../data/flights_train.csv', parse_dates=date_columns)
X_test = pd.read_csv('../data/flights_Xtest.csv', parse_dates=date_columns)

In [121]:
# Build the model-ready frame in one idempotent chain starting from `train`:
#   1. merge_path(...)  - project helper from ../scripts/feature_transform
#                         (presumably merges origin/destination into one path
#                         column - TODO confirm against the helper).
#   2. get_dummies      - one-hot encode the remaining categorical columns.
#   3. sort_values      - order rows chronologically so that row shifts applied
#                         later move along the time axis.
#   4. drop             - remove the date column once ordering is fixed.
# `columns=` is used instead of a positional axis (`drop([...], 1)`), which
# newer pandas releases reject.
train_processed = (
    pd.get_dummies(merge_path(train))
    .sort_values(by='flight_date')
    .drop(columns=['flight_date'])
)



In [122]:
def series_to_supervised(data, n_in=4, n_out=4, dropnan=True):
    """Frame a (multivariate) series as a supervised-learning table.

    Each output row holds the `n_in` preceding rows of `data` (lags),
    followed by the current row and the `n_out - 1` following rows (leads),
    laid out side by side.

    Parameters
    ----------
    data : DataFrame, Series, list or array-like
        Observations ordered by time, one row per step. Anything accepted by
        ``pd.DataFrame(...)`` works; a flat list is treated as one variable.
    n_in : int, default 4
        Number of lagged steps (t-n_in ... t-1) to include.
    n_out : int, default 4
        Number of forward steps (t ... t+n_out-1) to include.
    dropnan : bool, default True
        Drop rows containing any NaN. This removes the edge rows created by
        shifting, and also any row that already had a NaN in `data`.

    Returns
    -------
    DataFrame
        Columns are named ``var<j>(t-<i>)``, ``var<j>(t)`` and
        ``var<j>(t+<i>)`` where ``j`` is the 1-based column position in
        `data`. The original index is preserved.
    """
    # Normalise first: a plain list has no .shift, and a Series has no
    # shape[1], so both must become a DataFrame before anything else.
    df = pd.DataFrame(data)
    n_vars = df.shape[1]

    frames, names = [], []
    # input sequence (t-n, ... t-1)
    for lag in range(n_in, 0, -1):
        frames.append(df.shift(lag))
        names.extend('var%d(t-%d)' % (j + 1, lag) for j in range(n_vars))
    # forecast sequence (t, t+1, ... t+n_out-1)
    for lead in range(n_out):
        frames.append(df.shift(-lead))
        if lead == 0:
            names.extend('var%d(t)' % (j + 1) for j in range(n_vars))
        else:
            names.extend('var%d(t+%d)' % (j + 1, lead) for j in range(n_vars))

    # put it all together
    agg = pd.concat(frames, axis=1)
    agg.columns = names
    # drop rows with NaN values
    if dropnan:
        agg = agg.dropna()
    return agg

In [123]:
# Re-frame the table with the function's defaults (4 lagged steps, 4 forward steps).
# Note: this rebinds `train_processed`, so re-running the cell shifts the already-shifted frame.
train_processed = series_to_supervised(train_processed)

In [124]:
# Show the re-framed table: lagged copies (t-4 ... t-1) followed by the current and forward steps.
train_processed

Unnamed: 0,var1(t-4),var2(t-4),var3(t-4),var4(t-4),var5(t-4),var6(t-4),var7(t-4),var8(t-4),var9(t-4),var10(t-4),...,var120(t+3),var121(t+3),var122(t+3),var123(t+3),var124(t+3),var125(t+3),var126(t+3),var127(t+3),var128(t+3),var129(t+3)
6844,9.055556,11.251560,6.053412,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6312,12.000000,11.426462,8.129632,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8115,11.350000,11.159896,8.330508,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4175,11.454545,11.321779,7.920602,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5734,12.607143,11.634324,8.288052,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3394,15.366667,11.478661,10.283845,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8774,12.086957,11.287000,9.064983,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5896,10.529412,11.215433,9.618457,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3997,11.333333,11.164588,10.465630,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [116]:
# Target is the current-step value of variable 2; every other column is a feature.
# NOTE(review): X still contains the forward-step columns (e.g. 'var2(t+1)'), and
# train_test_split shuffles rows by default - confirm this is intended for
# time-ordered data, since both can leak future information into training.
target_col = 'var2(t)'
y = train_processed[[target_col]]
X = train_processed.drop(columns=target_col)

train_X, test_X, train_y, test_y = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=30,
)

In [117]:
# Fit a CatBoost regressor with library defaults; `fit` hands back the fitted model,
# so construction and training can be bound in a single expression.
regressor_cb = CatBoostRegressor().fit(train_X, train_y)

Learning rate set to 0.055817
0:	learn: 0.9337431	total: 9.08ms	remaining: 9.07s
1:	learn: 0.9211252	total: 18ms	remaining: 9s
2:	learn: 0.9112093	total: 26.8ms	remaining: 8.9s
3:	learn: 0.8995477	total: 35.4ms	remaining: 8.81s
4:	learn: 0.8883052	total: 43.5ms	remaining: 8.66s
5:	learn: 0.8770074	total: 52.2ms	remaining: 8.65s
6:	learn: 0.8668482	total: 60.8ms	remaining: 8.62s
7:	learn: 0.8571938	total: 69.8ms	remaining: 8.66s
8:	learn: 0.8479704	total: 78.9ms	remaining: 8.69s
9:	learn: 0.8398400	total: 87.2ms	remaining: 8.64s
10:	learn: 0.8326522	total: 95.7ms	remaining: 8.6s
11:	learn: 0.8260625	total: 104ms	remaining: 8.53s
12:	learn: 0.8193732	total: 112ms	remaining: 8.48s
13:	learn: 0.8134882	total: 120ms	remaining: 8.45s
14:	learn: 0.8076327	total: 128ms	remaining: 8.41s
15:	learn: 0.8024552	total: 136ms	remaining: 8.39s
16:	learn: 0.7971008	total: 145ms	remaining: 8.36s
17:	learn: 0.7911988	total: 152ms	remaining: 8.31s
18:	learn: 0.7860651	total: 160ms	remaining: 8.28s
19:	lea

In [118]:
# Score the hold-out split with RMSE.
# The root is taken explicitly instead of passing `squared=False`, a keyword
# that newer scikit-learn releases deprecate and then remove. For this
# single-column target the two forms give the same value.
pred = regressor_cb.predict(test_X)
mean_squared_error(test_y, pred) ** 0.5

0.4741999836918089