In [20]:
import dataset as ds

from dataset.util import load_clustered_data, train_valid_test, prepare_time_series
from dataset import SlidingWinDataset, ClusterDataset

import pandas as pd
import numpy as np

from torch.utils.data import Dataset
from sklearn.svm import SVR
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, root_mean_squared_error

from model.conventional import individual_model_pipeline

In [21]:
# Per-cluster series counts, indexed by cluster id. The evaluation cell below
# uses cluster_num_series[c] as the row count when reshaping the flat test
# targets/predictions of cluster c.
# NOTE(review): hard-coded — presumably must match the cluster assignment in
# cluster_info_a_s.csv; confirm and update if the clustering changes.
cluster_num_series = [57, 37, 10, 45]



In [22]:
# Load the clustered data once at notebook level; later cells index the result
# by cluster id (e.g. data[0]) and read the selected frame's columns.
data = load_clustered_data(
	cluster_info='../data/cluster_info/cluster_info_a_s.csv',
	processed_data='../data/processed/data.csv'
)


def load_data(_cluster_id, _sliding_win) -> tuple[Dataset, Dataset, Dataset]:
	"""Build the train/valid/test datasets for a single cluster.

	The clustered data is read from disk and the frame stored under
	``_cluster_id`` is selected. Every column of that frame is prepared with
	``prepare_time_series``, split with ``train_valid_test``
	(train_ratio=0.7, test_ratio=0.2) and each split is wrapped in a
	``SlidingWinDataset`` using ``_sliding_win`` as the window size.

	Returns:
		A ``(train, valid, test)`` tuple of ``ClusterDataset`` objects, each
		built from the per-column sliding-window datasets in column order.
	"""
	cluster_frame = load_clustered_data(
		# cluster_info='../data/cluster_info/cluster_info_f_a.csv',
		cluster_info='../data/cluster_info/cluster_info_a_s.csv',
		processed_data='../data/processed/data.csv',
	)[_cluster_id]

	split_names = ('train', 'valid', 'test')
	windows_by_split = {name: [] for name in split_names}

	for column in cluster_frame.columns:
		# The second return value of prepare_time_series is not needed here.
		prepared, _ = prepare_time_series(cluster_frame, column)
		# Move the index into a regular column; a default-named 'index'
		# column is exposed as 'time_ticks'.
		prepared = prepared.reset_index().rename(columns={'index': 'time_ticks'})

		train_part, valid_part, test_part = train_valid_test(
			prepared,
			train_ratio=0.7,
			test_ratio=0.2,
		)
		for name, part in zip(split_names, (train_part, valid_part, test_part)):
			windows_by_split[name].append(SlidingWinDataset(part, sliding_win=_sliding_win))

	return tuple(ClusterDataset(windows_by_split[name]) for name in split_names)

In [23]:
def process_dataset(dataset):
	"""Flatten a cluster dataset into arrays usable by scikit-learn.

	Every item is unpacked as ``cid, (_y, _x)``. The target is ``_y[1]`` and
	the feature vector is column 1 of the window ``_x``.
	# NOTE(review): presumably index 1 is the value channel (index 0 being
	# the time tick) — confirm against SlidingWinDataset.

	Returns:
		``(x_array, y_array)`` as NumPy arrays, one row / entry per item.
	"""
	y_array, x_array = [], []
	for i in range(len(dataset)):
		cid, (_y, _x) = dataset[i]
		y_array.append(_y[1].item())
		x_array.append(_x[:, 1].numpy())

	return np.array(x_array), np.array(y_array)


# Fit and evaluate one SVR per cluster; the cluster count follows the
# configured per-cluster series counts instead of a hard-coded 4.
for c, num_series in enumerate(cluster_num_series):
	train_set, valid_set, test_set = load_data(c, 20)

	# Convert each split exactly once (the original cell did this twice).
	train_x, train_y = process_dataset(train_set)
	valid_x, valid_y = process_dataset(valid_set)
	test_x, test_y = process_dataset(test_set)

	model = SVR()
	model.fit(train_x, train_y)

	# Pooled MSE / RMSE / MAE on the validation split.
	valid_y_pred = model.predict(valid_x)
	print(
		mean_squared_error(valid_y, valid_y_pred),
		root_mean_squared_error(valid_y, valid_y_pred),
		mean_absolute_error(valid_y, valid_y_pred),
	)

	# Pooled MSE / RMSE / MAE on the test split.
	test_y_pred = model.predict(test_x)
	print(
		mean_squared_error(test_y, test_y_pred),
		root_mean_squared_error(test_y, test_y_pred),
		mean_absolute_error(test_y, test_y_pred),
	)

	# Reshape the flat test vectors to num_series rows.
	# NOTE(review): every reduction below (and the dstat difference) runs
	# along axis 0. If rows are series and columns are time steps, the
	# per-series metrics and the directional statistic should probably use
	# axis 1 instead — confirm against the ordering of ClusterDataset.
	y_true2 = test_y.reshape(num_series, -1)
	y_pred2 = test_y_pred.reshape(num_series, -1)

	error = y_true2 - y_pred2
	mse = np.square(error).mean(axis=0)
	mae = np.abs(error).mean(axis=0)
	rmse = np.sqrt(mse)

	smape = (2 * np.abs(y_pred2 - y_true2) / (np.abs(y_pred2) + np.abs(y_true2))).mean(axis=0)
	# Directional statistic: share of steps where the true and predicted
	# changes have the same sign. Both differences must use the [:-1] slice;
	# the previous `y_pred2[:1]` subtracted the first row from every row.
	dstat = ((y_true2[1:] - y_true2[:-1]) * (y_pred2[1:] - y_pred2[:-1]) > 0).mean(axis=0)
	# Scale-normalised errors: divide by the mean of the true values.
	s_rmse = rmse / y_true2.mean(axis=0)
	s_mae = mae / y_true2.mean(axis=0)

	metric_names = ['s-rmse', 's-mae', 'dstat', 'smape']
	metric_values = [s_rmse, s_mae, dstat, smape]
	# Persist the mean and the median of each metric to separate CSV files.
	for agg_name, aggregate in (('mean', np.mean), ('median', np.median)):
		pd.Series(
			[aggregate(values) for values in metric_values],
			index=metric_names,
		).rename(f'cluster {c}').to_csv(f'./SVR_metrics({agg_name})_cluster_{c}.csv')

11524866459.346952 107353.93080528981 85651.39796925348
12180683714.538296 110366.1348174262 85928.60947320386
19226248610.2898 138658.74876937913 113628.56601626823
23733056150.564598 154055.36715922816 122740.49561587359
37091545841.45325 192591.65569009798 157980.2969695526
40932269703.547485 202317.2501383594 168077.65097410182
5462744579.145 73910.38207954956 56819.451483748504
8110993758.605544 90061.05572668769 66007.68419553844


In [24]:
# NOTE(review): this always raises AssertionError. Presumably an intentional
# barrier that stops "Run All" before the exploratory cells below — confirm,
# and consider removing it or moving the scratch cells to another notebook.
assert False

AssertionError: 

In [77]:
cluster_df = data[0]

# Reshape the wide cluster frame into long format: each column contributes its
# values as 'atm_balance' together with the column label as 'atm_name'.
per_atm_frames = [
	pd.concat(
		[
			cluster_df.loc[:, atm_name].rename('atm_balance'),
			pd.Series(
				[atm_name] * len(cluster_df.index),
				index=cluster_df.index, name='atm_name'
			),
		],
		axis=1,
	)
	for atm_name in cluster_df.columns
]

# Stack the per-column frames and replace the original index with 0..n-1.
new_cluster_df = pd.concat(per_atm_frames, axis=0, ignore_index=True)

# Assign consecutive integer ids to the names in order of first appearance.
atm_id_by_name = {
	_name: _id
	for _id, _name in enumerate(new_cluster_df['atm_name'].unique())
}
new_cluster_df['atm_id'] = new_cluster_df['atm_name'].map(atm_id_by_name)

new_cluster_df

Unnamed: 0,atm_balance,atm_name,atm_id
0,412100,9254,0
1,391500,9254,0
2,530200,9254,0
3,413800,9254,0
4,470000,9254,0
...,...,...,...
2188,28500,9192,2
2189,2500,9192,2
2190,7600,9192,2
2191,32200,9192,2
