In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import networkx as nx
from torch_geometric.data import Data
from torch_geometric.utils import from_networkx
import torch
from torch_geometric.nn import GCNConv
from torch_geometric.nn import GATConv
import torch.nn.functional as F
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from scipy.stats import norm
from scipy.stats import ttest_ind
import itertools
from tqdm import tqdm

### **1. 데이터 불러오기**

- yahoo finance를 통해 글로벌 인덱스 지수의 종가 데이터 크롤링 하였음
- 기간 : 2005.01.01 ~ 2024.12.31 (20년치)
- 전처리 : 결측치는 drop과 backfill로 처리
- 최종 데이터 : 248 * 26


In [2]:

index_close = pd.read_csv('data/global_indices.csv')
index_volume = pd.read_csv('data/global_indices_volume.csv')

index_close = index_close.iloc[1003:6288]
index_volume = index_volume.iloc[1003:6288]

# 삭제하려는 컬럼 리스트
columns_to_drop = [ 'MSCI World', 'Euro Stoxx 50', 'Nifty 50', 'MOEX Russia',
    'South Africa Top 40', 'Saudi Tadawul', 'Israel TA-125', 'Vietnam VN-Index', 'Philippine PSEi',
    'Colombia COLCAP', 'Chile IPSA', 'Peru S&P/BVL', 'Pakistan KSE 100', 'Bangladesh DSE General',
    'UAE ADX General', 'Qatar QE Index'
]

index_close = index_close.drop(columns=columns_to_drop, errors='ignore')
index_volume = index_volume.drop(columns=columns_to_drop, errors='ignore')

index_close = index_close.fillna(method='bfill')
index_volume = index_volume.fillna(method = 'bfill')


date_column = index_close.iloc[:, 0]  # 첫 번째 열

close_data = index_close.iloc[:, 1:]  # 두 번째 열부터
volume_data = index_volume.iloc[:, 1:]  # 두 번째 열부터

market_cap_data = close_data * volume_data
market_cap = pd.concat([date_column, market_cap_data], axis=1)

total_market_cap = market_cap.iloc[:, 1:].sum()
top_15_columns = total_market_cap.nlargest(15).index
top_15_market_cap = market_cap[["Date"] + list(top_15_columns)]

top_15_columns = top_15_market_cap.columns[1:]  # 날짜 열 제외

index = index_close[["Date"] + list(top_15_columns)]
index



Unnamed: 0,Date,Turkey BIST 100,Hang Seng,NASDAQ 100,S&P 500,Mexican IPC,FTSE 100,Dow Jones,Russell 2000,S&P/TSX,Nikkei 225,IBEX 35,DAX,Bovespa,Straits Times,CAC 40
1003,2003-12-31 00:00:00,18625.000000,12575.940430,1467.920044,1111.920044,8795.280273,4476.899902,10453.919922,556.909973,8220.900391,10825.169922,7879.192383,4018.500000,22445.0,1764.520020,3557.899902
1004,2004-01-02 00:00:00,19147.699219,12801.480469,1463.569946,1108.479980,8818.190430,4510.200195,10409.849609,560.849976,8293.700195,10825.169922,7879.192383,4018.500000,22445.0,1791.349976,3596.800049
1005,2004-01-05 00:00:00,19696.599609,13005.330078,1496.579956,1122.219971,9054.110352,4513.299805,10544.070312,568.919983,8381.700195,10825.169922,7911.392090,4035.899902,23532.0,1828.660034,3608.290039
1006,2004-01-06 00:00:00,19013.800781,13036.320312,1501.260010,1123.670044,9038.459961,4505.200195,10538.660156,569.890015,8405.099609,10813.990234,7913.692383,4035.439941,23576.0,1828.719971,3595.820068
1007,2004-01-07 00:00:00,19382.800781,13157.679688,1514.260010,1126.329956,9098.219727,4473.000000,10529.030273,574.619995,8388.500000,10757.820312,7913.692383,4004.399902,23320.0,1835.959961,3563.510010
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6283,2024-12-23 00:00:00,9626.599609,19883.130859,21503.169922,5974.069824,49450.890625,8102.700195,42906.949219,2237.439941,24749.000000,39161.339844,11435.700195,19848.769531,120767.0,3752.330078,7272.319824
6284,2024-12-24 00:00:00,9672.799805,20098.289062,21797.650391,6040.040039,49316.621094,8137.000000,43297.031250,2259.850098,24846.800781,39036.851562,11473.900391,19984.320312,121078.0,3769.550049,7282.689941
6285,2024-12-26 00:00:00,9949.000000,20090.460938,21768.310547,6037.589844,49535.578125,8149.799805,43325.800781,2280.189941,24796.400391,39568.058594,11531.599609,19984.320312,121078.0,3761.449951,7355.370117
6286,2024-12-27 00:00:00,10025.500000,20090.460938,21473.019531,5970.839844,49290.578125,8149.799805,42992.210938,2244.590088,24796.400391,40281.160156,11531.599609,19984.320312,120269.0,3771.629883,7355.370117


### **2. 수익률 데이터로 변환**

- 종가 데이터를 1일 수익률 데이터로 변환

In [3]:
# 날짜를 인덱스에서 칼럼으로 변환
prices = index.copy()
prices = prices.set_index('Date')

# 숫자로 변환
prices = prices.apply(pd.to_numeric, errors='coerce')

prices = prices.reset_index()
prices['Date'] = pd.to_datetime(prices['Date']).dt.strftime('%Y-%m-%d')

returns = prices.set_index('Date').pct_change().reset_index()

returns = returns.dropna()
returns.set_index('Date', inplace = True)
returns


Unnamed: 0_level_0,Turkey BIST 100,Hang Seng,NASDAQ 100,S&P 500,Mexican IPC,FTSE 100,Dow Jones,Russell 2000,S&P/TSX,Nikkei 225,IBEX 35,DAX,Bovespa,Straits Times,CAC 40
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2004-01-02,0.028064,0.017934,-0.002963,-0.003094,0.002605,0.007438,-0.004216,0.007075,0.008855,0.000000,0.000000,0.000000,0.000000,0.015205,0.010933
2004-01-05,0.028667,0.015924,0.022554,0.012395,0.026754,0.000687,0.012894,0.014389,0.010610,0.000000,0.004087,0.004330,0.048429,0.020828,0.003195
2004-01-06,-0.034666,0.002383,0.003127,0.001292,-0.001729,-0.001795,-0.000513,0.001705,0.002792,-0.001033,0.000291,-0.000114,0.001870,0.000033,-0.003456
2004-01-07,0.019407,0.009309,0.008659,0.002367,0.006612,-0.007147,-0.000914,0.008300,-0.001975,-0.005194,0.000000,-0.007692,-0.010859,0.003959,-0.008985
2004-01-08,0.001140,0.003489,0.010824,0.004963,0.007869,0.004740,0.006022,0.008701,-0.000250,0.007421,0.003753,0.010246,0.017024,0.003116,0.008200
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-12-23,-0.010067,0.008237,0.010053,0.007287,-0.002729,0.002239,0.001557,-0.002199,0.006077,0.011871,-0.002756,-0.001809,-0.010933,0.008710,-0.000297
2024-12-24,0.004799,0.010821,0.013695,0.011043,-0.002715,0.004233,0.009091,0.010016,0.003952,-0.003179,0.003340,0.006829,0.002575,0.004589,0.001426
2024-12-26,0.028554,-0.000389,-0.001346,-0.000406,0.004440,0.001573,0.000664,0.009001,-0.002028,0.013608,0.005029,0.000000,0.000000,-0.002149,0.009980
2024-12-27,0.007689,0.000000,-0.013565,-0.011056,-0.004946,0.000000,-0.007700,-0.015613,0.000000,0.018022,0.000000,0.000000,-0.006682,0.002706,0.000000


In [5]:
# returns.to_csv("returns.csv")

### **3. 각 지수를 노드로, 지수간 상관관계를 엣지로 설정**

- 상관관계를 계산해서 절댓값이 0.3 이상인 것들에 대해서만 노드와 엣지로 구성
- 총 노드 수 : 26, 엣지 수 : 216

In [4]:
test_periods = [
    ("2004-01-01", "2004-06-30", "2004-07-01", "2004-12-31"),
    ("2004-07-01", "2004-12-31", "2005-01-01", "2005-06-30"),
    ("2005-01-01", "2005-06-30", "2005-07-01", "2005-12-31"),
    ("2005-07-01", "2005-12-31", "2006-01-01", "2006-06-30"),
    ("2006-01-01", "2006-06-30", "2006-07-01", "2006-12-31"),
    ("2006-07-01", "2006-12-31", "2007-01-01", "2007-06-30"),
    ("2007-01-01", "2007-06-30", "2007-07-01", "2007-12-31"),
    ("2007-07-01", "2007-12-31", "2008-01-01", "2008-06-30"),
    ("2008-01-01", "2008-06-30", "2008-07-01", "2008-12-31"),
    ("2008-07-01", "2008-12-31", "2009-01-01", "2009-06-30"),
    ("2009-01-01", "2009-06-30", "2009-07-01", "2009-12-31"),
    ("2009-07-01", "2009-12-31", "2010-01-01", "2010-06-30"),
    ("2010-01-01", "2010-06-30", "2010-07-01", "2010-12-31"),
    ("2010-07-01", "2010-12-31", "2011-01-01", "2011-06-30"),
    ("2011-01-01", "2011-06-30", "2011-07-01", "2011-12-31"),
    ("2011-07-01", "2011-12-31", "2012-01-01", "2012-06-30"),
    ("2012-01-01", "2012-06-30", "2012-07-01", "2012-12-31"),
    ("2012-07-01", "2012-12-31", "2013-01-01", "2013-06-30"),
    ("2013-01-01", "2013-06-30", "2013-07-01", "2013-12-31"),
    ("2013-07-01", "2013-12-31", "2014-01-01", "2014-06-30"),
    ("2014-01-01", "2014-06-30", "2014-07-01", "2014-12-31"),
    ("2014-07-01", "2014-12-31", "2015-01-01", "2015-06-30"),
    ("2015-01-01", "2015-06-30", "2015-07-01", "2015-12-31"),
    ("2015-07-01", "2015-12-31", "2016-01-01", "2016-06-30"),
    ("2016-01-01", "2016-06-30", "2016-07-01", "2016-12-31"),
    ("2016-07-01", "2016-12-31", "2017-01-01", "2017-06-30"),
    ("2017-01-01", "2017-06-30", "2017-07-01", "2017-12-31"),
    ("2017-07-01", "2017-12-31", "2018-01-01", "2018-06-30"),
    ("2018-01-01", "2018-06-30", "2018-07-01", "2018-12-31"),
    ("2018-07-01", "2018-12-31", "2019-01-01", "2019-06-30"),
    ("2019-01-01", "2019-06-30", "2019-07-01", "2019-12-31"),
    ("2019-07-01", "2019-12-31", "2020-01-01", "2020-06-30"),
    ("2020-01-01", "2020-06-30", "2020-07-01", "2020-12-31"),
    ("2020-07-01", "2020-12-31", "2021-01-01", "2021-06-30"),
    ("2021-01-01", "2021-06-30", "2021-07-01", "2021-12-31"),
    ("2021-07-01", "2021-12-31", "2022-01-01", "2022-06-30"),
    ("2022-01-01", "2022-06-30", "2022-07-01", "2022-12-31"),
    ("2022-07-01", "2022-12-31", "2023-01-01", "2023-06-30"),
    ("2023-01-01", "2023-06-30", "2023-07-01", "2023-12-31"),
    ("2023-07-01", "2023-12-31", "2024-01-01", "2024-06-30"),
    ("2024-01-01", "2024-06-30", "2024-07-01", "2024-12-31")
]

In [5]:
import networkx as nx
import matplotlib.pyplot as plt
import pandas as pd

def create_graph(train_data):
    """
    Create a graph based on the correlation matrix of the train_data.
    """
    correlation_matrix = train_data.corr()
    G = nx.Graph()
    for i in correlation_matrix.columns:
        for j in correlation_matrix.columns:
            if i != j:
                weight = correlation_matrix.loc[i, j]
                if abs(weight) > 0.3:  # correlation threshold
                    G.add_edge(i, j, weight=weight)
    return G

def plot_and_save_graph(G, title, filename):
    """
    Plot the graph using networkx and matplotlib, and save it to a file.
    """
    plt.figure(figsize=(20, 12))
    pos = nx.kamada_kawai_layout(G)
    weights = nx.get_edge_attributes(G, 'weight')
    edge_colors = ['blue' if w > 0 else 'red' for w in weights.values()]
    edge_widths = [abs(w) * 5 for w in weights.values()]  # 상관계수에 비례
    node_sizes = [len(list(G.neighbors(node))) * 200 for node in G.nodes()]
    node_colors = ['skyblue' if len(list(G.neighbors(node))) > 5 else 'lightgray' for node in G.nodes()]

    nx.draw_networkx_nodes(G, pos, node_size=node_sizes, node_color=node_colors, alpha=0.9)
    nx.draw_networkx_edges(G, pos, edgelist=weights.keys(), edge_color=edge_colors, width=edge_widths, alpha=0.7)
    nx.draw_networkx_labels(G, pos, font_size=10, font_color='black')

    plt.title(title, fontsize=20, fontweight='bold')
    plt.axis('off')
    plt.savefig(filename, format='png', bbox_inches='tight')
    plt.close()

# Process each test period
for i, (train_start, train_end, test_start, test_end) in enumerate(test_periods):
    print(f"Processing Test {i + 1}: Train ({train_start} to {train_end}), Test ({test_start} to {test_end})")
    
    # Split data into Train and Test
    train_data = returns.loc[train_start:train_end]
    
    # Create graph based on Train data
    G = create_graph(train_data)
    
    # Save the graph with a unique filename
    filename = f"graph_test{i + 1}.png"
    title = f"Spillover Effect Graph (Train: {train_start} to {train_end})"
    plot_and_save_graph(G, title, filename)

print("All graphs have been saved.")


Processing Test 1: Train (2004-01-01 to 2004-06-30), Test (2004-07-01 to 2004-12-31)
Processing Test 2: Train (2004-07-01 to 2004-12-31), Test (2005-01-01 to 2005-06-30)
Processing Test 3: Train (2005-01-01 to 2005-06-30), Test (2005-07-01 to 2005-12-31)
Processing Test 4: Train (2005-07-01 to 2005-12-31), Test (2006-01-01 to 2006-06-30)
Processing Test 5: Train (2006-01-01 to 2006-06-30), Test (2006-07-01 to 2006-12-31)
Processing Test 6: Train (2006-07-01 to 2006-12-31), Test (2007-01-01 to 2007-06-30)
Processing Test 7: Train (2007-01-01 to 2007-06-30), Test (2007-07-01 to 2007-12-31)
Processing Test 8: Train (2007-07-01 to 2007-12-31), Test (2008-01-01 to 2008-06-30)
Processing Test 9: Train (2008-01-01 to 2008-06-30), Test (2008-07-01 to 2008-12-31)
Processing Test 10: Train (2008-07-01 to 2008-12-31), Test (2009-01-01 to 2009-06-30)
Processing Test 11: Train (2009-01-01 to 2009-06-30), Test (2009-07-01 to 2009-12-31)
Processing Test 12: Train (2009-07-01 to 2009-12-31), Test (201

### **4. 기본 데이터를 Tabular Form으로 활용했을 때 성능 평가**

In [6]:
# Pearson Correlation 기반 그래프 생성 함수
n_experiments = 30

def create_correlation_graph(data, threshold=0.3):
    correlation_matrix = data.corr()
    G = nx.Graph()
    for i in correlation_matrix.columns:
        for j in correlation_matrix.columns:
            if i != j:
                weight = correlation_matrix.loc[i, j]
                if abs(weight) > threshold:
                    G.add_edge(i, j, weight=weight)
    return G

def prepare_time_series_data(data_matrix):
    X = data_matrix.iloc[:-1].values
    y = data_matrix.iloc[1:].values
    return X, y

#### 4.1 Random Forest Model 사용

In [11]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np
import pandas as pd
from tqdm import tqdm

# Initialize Results
final_results = []

# Random Forest Regressor Evaluation
for test_id, (train_start, train_end, test_start, test_end) in enumerate(test_periods):
    print(f"\n=== Processing Test {test_id + 1}: Train ({train_start} to {train_end}), Test ({test_start} to {test_end}) ===")
    
    # Train/Test 데이터 분리
    train_data = returns.loc[train_start:train_end]
    test_data = returns.loc[test_start:test_end]
    
    # 그래프 생성 (Train 데이터 기준)
    G = create_correlation_graph(train_data)
    nodes_in_graph = list(G.nodes)
    
    # Train/Test 데이터 준비
    X_train = train_data[nodes_in_graph].iloc[:-1].values
    y_train_set = train_data[nodes_in_graph].iloc[1:].values
    X_test = test_data[nodes_in_graph].iloc[:-1].values
    y_test_set = test_data[nodes_in_graph].iloc[1:].values
    
    # Target Index Loop
    for target_idx in tqdm(range(len(nodes_in_graph)), desc=f"Processing Target Nodes (Test {test_id + 1})"):
        rmse_scores = []
        rmae_scores = []
        
        for _ in range(n_experiments):  # 반복 실험
            y_train = y_train_set[:, target_idx]
            y_test = y_test_set[:, target_idx]
            
            # Random Forest Regressor 초기화 및 학습
            random_seed = np.random.randint(0, 10000)
            model = RandomForestRegressor(
                n_estimators=100,
                max_depth=None,
                min_samples_split=5,
                min_samples_leaf=3,
                random_state=random_seed
            )
            model.fit(X_train, y_train)
            
            # 예측 및 평가
            y_pred = model.predict(X_test)
            rmse = np.sqrt(mean_squared_error(y_test, y_pred))
            rmae = mean_absolute_error(y_test, y_pred)
            
            # 결과 수집
            r2_scores.append(r2)
            rmse_scores.append(rmse)
            rmae_scores.append(rmae)
        
        # 결과 집계
        final_results.append({
            'Test ID': test_id + 1,
            'Target Index': nodes_in_graph[target_idx],
            'RMSE': np.mean(rmse_scores),
            'RMAE': np.mean(rmae_scores)
        })

# 결과를 DataFrame으로 정리
rf_results = pd.DataFrame(final_results)

# Display results
print("\nFinal Results with Random Forest Regressor:")
rf_results



=== Processing Test 1: Train (2004-01-01 to 2004-06-30), Test (2004-07-01 to 2004-12-31) ===


Processing Target Nodes (Test 1): 100%|██████████| 15/15 [00:21<00:00,  1.43s/it]



=== Processing Test 2: Train (2004-07-01 to 2004-12-31), Test (2005-01-01 to 2005-06-30) ===


Processing Target Nodes (Test 2): 100%|██████████| 14/14 [00:19<00:00,  1.40s/it]



=== Processing Test 3: Train (2005-01-01 to 2005-06-30), Test (2005-07-01 to 2005-12-31) ===


Processing Target Nodes (Test 3): 100%|██████████| 15/15 [00:21<00:00,  1.41s/it]



=== Processing Test 4: Train (2005-07-01 to 2005-12-31), Test (2006-01-01 to 2006-06-30) ===


Processing Target Nodes (Test 4): 100%|██████████| 15/15 [00:21<00:00,  1.46s/it]



=== Processing Test 5: Train (2006-01-01 to 2006-06-30), Test (2006-07-01 to 2006-12-31) ===


Processing Target Nodes (Test 5): 100%|██████████| 15/15 [00:21<00:00,  1.46s/it]



=== Processing Test 6: Train (2006-07-01 to 2006-12-31), Test (2007-01-01 to 2007-06-30) ===


Processing Target Nodes (Test 6): 100%|██████████| 15/15 [00:21<00:00,  1.44s/it]



=== Processing Test 7: Train (2007-01-01 to 2007-06-30), Test (2007-07-01 to 2007-12-31) ===


Processing Target Nodes (Test 7): 100%|██████████| 15/15 [00:21<00:00,  1.46s/it]



=== Processing Test 8: Train (2007-07-01 to 2007-12-31), Test (2008-01-01 to 2008-06-30) ===


Processing Target Nodes (Test 8): 100%|██████████| 15/15 [00:21<00:00,  1.41s/it]



=== Processing Test 9: Train (2008-01-01 to 2008-06-30), Test (2008-07-01 to 2008-12-31) ===


Processing Target Nodes (Test 9): 100%|██████████| 15/15 [00:21<00:00,  1.42s/it]



=== Processing Test 10: Train (2008-07-01 to 2008-12-31), Test (2009-01-01 to 2009-06-30) ===


Processing Target Nodes (Test 10): 100%|██████████| 15/15 [00:22<00:00,  1.52s/it]



=== Processing Test 11: Train (2009-01-01 to 2009-06-30), Test (2009-07-01 to 2009-12-31) ===


Processing Target Nodes (Test 11): 100%|██████████| 15/15 [00:21<00:00,  1.40s/it]



=== Processing Test 12: Train (2009-07-01 to 2009-12-31), Test (2010-01-01 to 2010-06-30) ===


Processing Target Nodes (Test 12): 100%|██████████| 15/15 [00:21<00:00,  1.43s/it]



=== Processing Test 13: Train (2010-01-01 to 2010-06-30), Test (2010-07-01 to 2010-12-31) ===


Processing Target Nodes (Test 13): 100%|██████████| 15/15 [00:21<00:00,  1.44s/it]



=== Processing Test 14: Train (2010-07-01 to 2010-12-31), Test (2011-01-01 to 2011-06-30) ===


Processing Target Nodes (Test 14): 100%|██████████| 15/15 [00:21<00:00,  1.45s/it]



=== Processing Test 15: Train (2011-01-01 to 2011-06-30), Test (2011-07-01 to 2011-12-31) ===


Processing Target Nodes (Test 15): 100%|██████████| 15/15 [00:20<00:00,  1.37s/it]



=== Processing Test 16: Train (2011-07-01 to 2011-12-31), Test (2012-01-01 to 2012-06-30) ===


Processing Target Nodes (Test 16): 100%|██████████| 15/15 [00:21<00:00,  1.43s/it]



=== Processing Test 17: Train (2012-01-01 to 2012-06-30), Test (2012-07-01 to 2012-12-31) ===


Processing Target Nodes (Test 17): 100%|██████████| 15/15 [00:20<00:00,  1.39s/it]



=== Processing Test 18: Train (2012-07-01 to 2012-12-31), Test (2013-01-01 to 2013-06-30) ===


Processing Target Nodes (Test 18): 100%|██████████| 15/15 [00:21<00:00,  1.42s/it]



=== Processing Test 19: Train (2013-01-01 to 2013-06-30), Test (2013-07-01 to 2013-12-31) ===


Processing Target Nodes (Test 19): 100%|██████████| 15/15 [00:21<00:00,  1.45s/it]



=== Processing Test 20: Train (2013-07-01 to 2013-12-31), Test (2014-01-01 to 2014-06-30) ===


Processing Target Nodes (Test 20): 100%|██████████| 15/15 [00:21<00:00,  1.40s/it]



=== Processing Test 21: Train (2014-01-01 to 2014-06-30), Test (2014-07-01 to 2014-12-31) ===


Processing Target Nodes (Test 21): 100%|██████████| 15/15 [00:21<00:00,  1.42s/it]



=== Processing Test 22: Train (2014-07-01 to 2014-12-31), Test (2015-01-01 to 2015-06-30) ===


Processing Target Nodes (Test 22): 100%|██████████| 14/14 [00:20<00:00,  1.45s/it]



=== Processing Test 23: Train (2015-01-01 to 2015-06-30), Test (2015-07-01 to 2015-12-31) ===


Processing Target Nodes (Test 23): 100%|██████████| 15/15 [00:21<00:00,  1.45s/it]



=== Processing Test 24: Train (2015-07-01 to 2015-12-31), Test (2016-01-01 to 2016-06-30) ===


Processing Target Nodes (Test 24): 100%|██████████| 15/15 [00:22<00:00,  1.48s/it]



=== Processing Test 25: Train (2016-01-01 to 2016-06-30), Test (2016-07-01 to 2016-12-31) ===


Processing Target Nodes (Test 25): 100%|██████████| 15/15 [00:21<00:00,  1.44s/it]



=== Processing Test 26: Train (2016-07-01 to 2016-12-31), Test (2017-01-01 to 2017-06-30) ===


Processing Target Nodes (Test 26): 100%|██████████| 15/15 [00:21<00:00,  1.46s/it]



=== Processing Test 27: Train (2017-01-01 to 2017-06-30), Test (2017-07-01 to 2017-12-31) ===


Processing Target Nodes (Test 27): 100%|██████████| 14/14 [00:19<00:00,  1.40s/it]



=== Processing Test 28: Train (2017-07-01 to 2017-12-31), Test (2018-01-01 to 2018-06-30) ===


Processing Target Nodes (Test 28): 100%|██████████| 13/13 [00:17<00:00,  1.37s/it]



=== Processing Test 29: Train (2018-01-01 to 2018-06-30), Test (2018-07-01 to 2018-12-31) ===


Processing Target Nodes (Test 29): 100%|██████████| 14/14 [00:20<00:00,  1.43s/it]



=== Processing Test 30: Train (2018-07-01 to 2018-12-31), Test (2019-01-01 to 2019-06-30) ===


Processing Target Nodes (Test 30): 100%|██████████| 15/15 [00:22<00:00,  1.49s/it]



=== Processing Test 31: Train (2019-01-01 to 2019-06-30), Test (2019-07-01 to 2019-12-31) ===


Processing Target Nodes (Test 31): 100%|██████████| 15/15 [00:21<00:00,  1.42s/it]



=== Processing Test 32: Train (2019-07-01 to 2019-12-31), Test (2020-01-01 to 2020-06-30) ===


Processing Target Nodes (Test 32): 100%|██████████| 14/14 [00:20<00:00,  1.45s/it]



=== Processing Test 33: Train (2020-01-01 to 2020-06-30), Test (2020-07-01 to 2020-12-31) ===


Processing Target Nodes (Test 33): 100%|██████████| 15/15 [00:23<00:00,  1.57s/it]



=== Processing Test 34: Train (2020-07-01 to 2020-12-31), Test (2021-01-01 to 2021-06-30) ===


Processing Target Nodes (Test 34): 100%|██████████| 14/14 [00:19<00:00,  1.39s/it]



=== Processing Test 35: Train (2021-01-01 to 2021-06-30), Test (2021-07-01 to 2021-12-31) ===


Processing Target Nodes (Test 35): 100%|██████████| 15/15 [00:21<00:00,  1.46s/it]



=== Processing Test 36: Train (2021-07-01 to 2021-12-31), Test (2022-01-01 to 2022-06-30) ===


Processing Target Nodes (Test 36): 100%|██████████| 14/14 [00:19<00:00,  1.40s/it]



=== Processing Test 37: Train (2022-01-01 to 2022-06-30), Test (2022-07-01 to 2022-12-31) ===


Processing Target Nodes (Test 37): 100%|██████████| 15/15 [00:20<00:00,  1.40s/it]



=== Processing Test 38: Train (2022-07-01 to 2022-12-31), Test (2023-01-01 to 2023-06-30) ===


Processing Target Nodes (Test 38): 100%|██████████| 15/15 [00:21<00:00,  1.45s/it]



=== Processing Test 39: Train (2023-01-01 to 2023-06-30), Test (2023-07-01 to 2023-12-31) ===


Processing Target Nodes (Test 39): 100%|██████████| 14/14 [00:19<00:00,  1.39s/it]



=== Processing Test 40: Train (2023-07-01 to 2023-12-31), Test (2024-01-01 to 2024-06-30) ===


Processing Target Nodes (Test 40): 100%|██████████| 14/14 [00:19<00:00,  1.38s/it]



=== Processing Test 41: Train (2024-01-01 to 2024-06-30), Test (2024-07-01 to 2024-12-31) ===


Processing Target Nodes (Test 41): 100%|██████████| 14/14 [00:18<00:00,  1.35s/it]


Final Results with Random Forest Regressor:





Unnamed: 0,Test ID,Target Index,RMSE,RMAE
0,1,Turkey BIST 100,0.016237,0.012988
1,1,Hang Seng,0.009278,0.007400
2,1,Straits Times,0.006887,0.005546
3,1,Mexican IPC,0.007730,0.006170
4,1,FTSE 100,0.005883,0.004709
...,...,...,...,...
598,41,DAX,0.008936,0.007178
599,41,Mexican IPC,0.009555,0.007673
600,41,IBEX 35,0.008921,0.006886
601,41,CAC 40,0.009316,0.007310


#### 4.2 Gradient Boost Classifier (XGBoost) Model 사용

In [12]:
final_results = []

# Gradient Boosting Regressor Evaluation
for test_id, (train_start, train_end, test_start, test_end) in enumerate(test_periods):
    print(f"\n=== Processing Test {test_id + 1}: Train ({train_start} to {train_end}), Test ({test_start} to {test_end}) ===")
    
    # Train/Test 데이터 분리
    train_data = returns.loc[train_start:train_end]
    test_data = returns.loc[test_start:test_end]
    
    # 그래프 생성 (Train 데이터 기준)
    G = create_correlation_graph(train_data)
    nodes_in_graph = list(G.nodes)
    
    # Train/Test 데이터 준비
    X_train = train_data[nodes_in_graph].iloc[:-1].values
    y_train_set = train_data[nodes_in_graph].iloc[1:].values
    X_test = test_data[nodes_in_graph].iloc[:-1].values
    y_test_set = test_data[nodes_in_graph].iloc[1:].values
    
    # Target Index Loop
    for target_idx in tqdm(range(len(nodes_in_graph)), desc=f"Processing Target Nodes (Test {test_id + 1})"):
        rmse_scores = []
        rmae_scores = []
        
        for _ in range(n_experiments):  # 반복 실험
            y_train = y_train_set[:, target_idx]
            y_test = y_test_set[:, target_idx]
            
            # Gradient Boosting Regressor 초기화 및 학습
            random_seed = np.random.randint(0, 10000)
            model = GradientBoostingRegressor(
                n_estimators=100,
                min_samples_split=5,
                min_samples_leaf=3,
                random_state=random_seed
            )
            model.fit(X_train, y_train)
            
            # 예측 및 평가
            y_pred = model.predict(X_test)
            rmse = np.sqrt(mean_squared_error(y_test, y_pred))
            rmae = mean_absolute_error(y_test, y_pred)
            
            # 결과 수집
            rmse_scores.append(rmse)
            rmae_scores.append(rmae)
        
        # 결과 집계
        final_results.append({
            'Test ID': test_id + 1,
            'Target Index': nodes_in_graph[target_idx],
            'RMSE': np.mean(rmse_scores),
            'RMAE': np.mean(rmae_scores)
        })

# 결과를 DataFrame으로 정리
gbc_results = pd.DataFrame(final_results)

# Display results
print("\nFinal Results with Gradient Boosting Regressor:")
gbc_results


=== Processing Test 1: Train (2004-01-01 to 2004-06-30), Test (2004-07-01 to 2004-12-31) ===


Processing Target Nodes (Test 1): 100%|██████████| 15/15 [00:11<00:00,  1.29it/s]



=== Processing Test 2: Train (2004-07-01 to 2004-12-31), Test (2005-01-01 to 2005-06-30) ===


Processing Target Nodes (Test 2): 100%|██████████| 14/14 [00:10<00:00,  1.34it/s]



=== Processing Test 3: Train (2005-01-01 to 2005-06-30), Test (2005-07-01 to 2005-12-31) ===


Processing Target Nodes (Test 3): 100%|██████████| 15/15 [00:11<00:00,  1.30it/s]



=== Processing Test 4: Train (2005-07-01 to 2005-12-31), Test (2006-01-01 to 2006-06-30) ===


Processing Target Nodes (Test 4): 100%|██████████| 15/15 [00:11<00:00,  1.27it/s]



=== Processing Test 5: Train (2006-01-01 to 2006-06-30), Test (2006-07-01 to 2006-12-31) ===


Processing Target Nodes (Test 5): 100%|██████████| 15/15 [00:11<00:00,  1.31it/s]



=== Processing Test 6: Train (2006-07-01 to 2006-12-31), Test (2007-01-01 to 2007-06-30) ===


Processing Target Nodes (Test 6): 100%|██████████| 15/15 [00:11<00:00,  1.29it/s]



=== Processing Test 7: Train (2007-01-01 to 2007-06-30), Test (2007-07-01 to 2007-12-31) ===


Processing Target Nodes (Test 7): 100%|██████████| 15/15 [00:11<00:00,  1.32it/s]



=== Processing Test 8: Train (2007-07-01 to 2007-12-31), Test (2008-01-01 to 2008-06-30) ===


Processing Target Nodes (Test 8): 100%|██████████| 15/15 [00:11<00:00,  1.29it/s]



=== Processing Test 9: Train (2008-01-01 to 2008-06-30), Test (2008-07-01 to 2008-12-31) ===


Processing Target Nodes (Test 9): 100%|██████████| 15/15 [00:11<00:00,  1.31it/s]



=== Processing Test 10: Train (2008-07-01 to 2008-12-31), Test (2009-01-01 to 2009-06-30) ===


Processing Target Nodes (Test 10): 100%|██████████| 15/15 [00:11<00:00,  1.29it/s]



=== Processing Test 11: Train (2009-01-01 to 2009-06-30), Test (2009-07-01 to 2009-12-31) ===


Processing Target Nodes (Test 11): 100%|██████████| 15/15 [00:11<00:00,  1.32it/s]



=== Processing Test 12: Train (2009-07-01 to 2009-12-31), Test (2010-01-01 to 2010-06-30) ===


Processing Target Nodes (Test 12): 100%|██████████| 15/15 [00:11<00:00,  1.27it/s]



=== Processing Test 13: Train (2010-01-01 to 2010-06-30), Test (2010-07-01 to 2010-12-31) ===


Processing Target Nodes (Test 13): 100%|██████████| 15/15 [00:11<00:00,  1.32it/s]



=== Processing Test 14: Train (2010-07-01 to 2010-12-31), Test (2011-01-01 to 2011-06-30) ===


Processing Target Nodes (Test 14): 100%|██████████| 15/15 [00:11<00:00,  1.28it/s]



=== Processing Test 15: Train (2011-01-01 to 2011-06-30), Test (2011-07-01 to 2011-12-31) ===


Processing Target Nodes (Test 15): 100%|██████████| 15/15 [00:11<00:00,  1.30it/s]



=== Processing Test 16: Train (2011-07-01 to 2011-12-31), Test (2012-01-01 to 2012-06-30) ===


Processing Target Nodes (Test 16): 100%|██████████| 15/15 [00:11<00:00,  1.30it/s]



=== Processing Test 17: Train (2012-01-01 to 2012-06-30), Test (2012-07-01 to 2012-12-31) ===


Processing Target Nodes (Test 17): 100%|██████████| 15/15 [00:11<00:00,  1.27it/s]



=== Processing Test 18: Train (2012-07-01 to 2012-12-31), Test (2013-01-01 to 2013-06-30) ===


Processing Target Nodes (Test 18): 100%|██████████| 15/15 [00:11<00:00,  1.30it/s]



=== Processing Test 19: Train (2013-01-01 to 2013-06-30), Test (2013-07-01 to 2013-12-31) ===


Processing Target Nodes (Test 19): 100%|██████████| 15/15 [00:11<00:00,  1.31it/s]



=== Processing Test 20: Train (2013-07-01 to 2013-12-31), Test (2014-01-01 to 2014-06-30) ===


Processing Target Nodes (Test 20): 100%|██████████| 15/15 [00:11<00:00,  1.27it/s]



=== Processing Test 21: Train (2014-01-01 to 2014-06-30), Test (2014-07-01 to 2014-12-31) ===


Processing Target Nodes (Test 21): 100%|██████████| 15/15 [00:11<00:00,  1.30it/s]



=== Processing Test 22: Train (2014-07-01 to 2014-12-31), Test (2015-01-01 to 2015-06-30) ===


Processing Target Nodes (Test 22): 100%|██████████| 14/14 [00:10<00:00,  1.33it/s]



=== Processing Test 23: Train (2015-01-01 to 2015-06-30), Test (2015-07-01 to 2015-12-31) ===


Processing Target Nodes (Test 23): 100%|██████████| 15/15 [00:11<00:00,  1.29it/s]



=== Processing Test 24: Train (2015-07-01 to 2015-12-31), Test (2016-01-01 to 2016-06-30) ===


Processing Target Nodes (Test 24): 100%|██████████| 15/15 [00:11<00:00,  1.28it/s]



=== Processing Test 25: Train (2016-01-01 to 2016-06-30), Test (2016-07-01 to 2016-12-31) ===


Processing Target Nodes (Test 25): 100%|██████████| 15/15 [00:11<00:00,  1.30it/s]



=== Processing Test 26: Train (2016-07-01 to 2016-12-31), Test (2017-01-01 to 2017-06-30) ===


Processing Target Nodes (Test 26): 100%|██████████| 15/15 [00:11<00:00,  1.28it/s]



=== Processing Test 27: Train (2017-01-01 to 2017-06-30), Test (2017-07-01 to 2017-12-31) ===


Processing Target Nodes (Test 27): 100%|██████████| 14/14 [00:10<00:00,  1.36it/s]



=== Processing Test 28: Train (2017-07-01 to 2017-12-31), Test (2018-01-01 to 2018-06-30) ===


Processing Target Nodes (Test 28): 100%|██████████| 13/13 [00:09<00:00,  1.42it/s]



=== Processing Test 29: Train (2018-01-01 to 2018-06-30), Test (2018-07-01 to 2018-12-31) ===


Processing Target Nodes (Test 29): 100%|██████████| 14/14 [00:10<00:00,  1.35it/s]



=== Processing Test 30: Train (2018-07-01 to 2018-12-31), Test (2019-01-01 to 2019-06-30) ===


Processing Target Nodes (Test 30): 100%|██████████| 15/15 [00:11<00:00,  1.29it/s]



=== Processing Test 31: Train (2019-01-01 to 2019-06-30), Test (2019-07-01 to 2019-12-31) ===


Processing Target Nodes (Test 31): 100%|██████████| 15/15 [00:11<00:00,  1.32it/s]



=== Processing Test 32: Train (2019-07-01 to 2019-12-31), Test (2020-01-01 to 2020-06-30) ===


Processing Target Nodes (Test 32): 100%|██████████| 14/14 [00:10<00:00,  1.34it/s]



=== Processing Test 33: Train (2020-01-01 to 2020-06-30), Test (2020-07-01 to 2020-12-31) ===


Processing Target Nodes (Test 33): 100%|██████████| 15/15 [00:11<00:00,  1.32it/s]



=== Processing Test 34: Train (2020-07-01 to 2020-12-31), Test (2021-01-01 to 2021-06-30) ===


Processing Target Nodes (Test 34): 100%|██████████| 14/14 [00:10<00:00,  1.34it/s]



=== Processing Test 35: Train (2021-01-01 to 2021-06-30), Test (2021-07-01 to 2021-12-31) ===


Processing Target Nodes (Test 35): 100%|██████████| 15/15 [00:11<00:00,  1.31it/s]



=== Processing Test 36: Train (2021-07-01 to 2021-12-31), Test (2022-01-01 to 2022-06-30) ===


Processing Target Nodes (Test 36): 100%|██████████| 14/14 [00:10<00:00,  1.35it/s]



=== Processing Test 37: Train (2022-01-01 to 2022-06-30), Test (2022-07-01 to 2022-12-31) ===


Processing Target Nodes (Test 37): 100%|██████████| 15/15 [00:11<00:00,  1.30it/s]



=== Processing Test 38: Train (2022-07-01 to 2022-12-31), Test (2023-01-01 to 2023-06-30) ===


Processing Target Nodes (Test 38): 100%|██████████| 15/15 [00:11<00:00,  1.29it/s]



=== Processing Test 39: Train (2023-01-01 to 2023-06-30), Test (2023-07-01 to 2023-12-31) ===


Processing Target Nodes (Test 39): 100%|██████████| 14/14 [00:10<00:00,  1.38it/s]



=== Processing Test 40: Train (2023-07-01 to 2023-12-31), Test (2024-01-01 to 2024-06-30) ===


Processing Target Nodes (Test 40): 100%|██████████| 14/14 [00:10<00:00,  1.34it/s]



=== Processing Test 41: Train (2024-01-01 to 2024-06-30), Test (2024-07-01 to 2024-12-31) ===


Processing Target Nodes (Test 41): 100%|██████████| 14/14 [00:10<00:00,  1.37it/s]


Final Results with Gradient Boosting Regressor:





Unnamed: 0,Test ID,Target Index,RMSE,RMAE
0,1,Turkey BIST 100,0.017771,0.013886
1,1,Hang Seng,0.010120,0.007971
2,1,Straits Times,0.007203,0.005855
3,1,Mexican IPC,0.008758,0.006782
4,1,FTSE 100,0.006462,0.005254
...,...,...,...,...
598,41,DAX,0.009777,0.007771
599,41,Mexican IPC,0.011156,0.008800
600,41,IBEX 35,0.009610,0.007594
601,41,CAC 40,0.010456,0.008057


#### 4.3. Multi-Layer Perceptron Model 사용

In [13]:
# Initialize Results
final_results = []

# MLP Regressor Evaluation
for test_id, (train_start, train_end, test_start, test_end) in enumerate(test_periods):
    print(f"\n=== Processing Test {test_id + 1}: Train ({train_start} to {train_end}), Test ({test_start} to {test_end}) ===")
    
    # Train/Test 데이터 분리
    train_data = returns.loc[train_start:train_end]
    test_data = returns.loc[test_start:test_end]
    
    # 그래프 생성 (Train 데이터 기준)
    G = create_correlation_graph(train_data)
    nodes_in_graph = list(G.nodes)
    
    # Train/Test 데이터 준비
    X_train = train_data[nodes_in_graph].iloc[:-1].values
    y_train_set = train_data[nodes_in_graph].iloc[1:].values
    X_test = test_data[nodes_in_graph].iloc[:-1].values
    y_test_set = test_data[nodes_in_graph].iloc[1:].values
    
    # Target Index Loop
    for target_idx in tqdm(range(len(nodes_in_graph)), desc=f"Processing Target Nodes (Test {test_id + 1})"):
        rmse_scores = []
        rmae_scores = []
        
        for _ in range(n_experiments):  # 반복 실험
            y_train = y_train_set[:, target_idx]
            y_test = y_test_set[:, target_idx]
            
            # MLP Regressor 초기화 및 학습
            random_seed = np.random.randint(0, 10000)
            model = MLPRegressor(
                hidden_layer_sizes=(100,),
                max_iter=2000,
                activation='relu',
                learning_rate_init=0.001,
                early_stopping=True,
                random_state=random_seed
            )
            model.fit(X_train, y_train)
            
            # 예측 및 평가
            y_pred = model.predict(X_test)
            rmse = np.sqrt(mean_squared_error(y_test, y_pred))
            rmae = mean_absolute_error(y_test, y_pred)
            
            # 결과 수집
            rmse_scores.append(rmse)
            rmae_scores.append(rmae)
        
        # 결과 집계
        final_results.append({
            'Test ID': test_id + 1,
            'Target Index': nodes_in_graph[target_idx],
            'RMSE': np.mean(rmse_scores),
            'RMAE': np.mean(rmae_scores)
        })

# 결과를 DataFrame으로 정리
mlp_results = pd.DataFrame(final_results)

# Display results
print("\nFinal Results with MLP Regressor:")
mlp_results


=== Processing Test 1: Train (2004-01-01 to 2004-06-30), Test (2004-07-01 to 2004-12-31) ===


Processing Target Nodes (Test 1): 100%|██████████| 15/15 [00:07<00:00,  1.93it/s]



=== Processing Test 2: Train (2004-07-01 to 2004-12-31), Test (2005-01-01 to 2005-06-30) ===


Processing Target Nodes (Test 2): 100%|██████████| 14/14 [00:09<00:00,  1.48it/s]



=== Processing Test 3: Train (2005-01-01 to 2005-06-30), Test (2005-07-01 to 2005-12-31) ===


Processing Target Nodes (Test 3): 100%|██████████| 15/15 [00:06<00:00,  2.26it/s]



=== Processing Test 4: Train (2005-07-01 to 2005-12-31), Test (2006-01-01 to 2006-06-30) ===


Processing Target Nodes (Test 4): 100%|██████████| 15/15 [00:05<00:00,  2.53it/s]



=== Processing Test 5: Train (2006-01-01 to 2006-06-30), Test (2006-07-01 to 2006-12-31) ===


Processing Target Nodes (Test 5): 100%|██████████| 15/15 [00:05<00:00,  2.61it/s]



=== Processing Test 6: Train (2006-07-01 to 2006-12-31), Test (2007-01-01 to 2007-06-30) ===


Processing Target Nodes (Test 6): 100%|██████████| 15/15 [00:06<00:00,  2.34it/s]



=== Processing Test 7: Train (2007-01-01 to 2007-06-30), Test (2007-07-01 to 2007-12-31) ===


Processing Target Nodes (Test 7): 100%|██████████| 15/15 [00:05<00:00,  2.69it/s]



=== Processing Test 8: Train (2007-07-01 to 2007-12-31), Test (2008-01-01 to 2008-06-30) ===


Processing Target Nodes (Test 8): 100%|██████████| 15/15 [00:05<00:00,  2.65it/s]



=== Processing Test 9: Train (2008-01-01 to 2008-06-30), Test (2008-07-01 to 2008-12-31) ===


Processing Target Nodes (Test 9): 100%|██████████| 15/15 [00:06<00:00,  2.41it/s]



=== Processing Test 10: Train (2008-07-01 to 2008-12-31), Test (2009-01-01 to 2009-06-30) ===


Processing Target Nodes (Test 10): 100%|██████████| 15/15 [00:06<00:00,  2.46it/s]



=== Processing Test 11: Train (2009-01-01 to 2009-06-30), Test (2009-07-01 to 2009-12-31) ===


Processing Target Nodes (Test 11): 100%|██████████| 15/15 [00:05<00:00,  2.91it/s]



=== Processing Test 12: Train (2009-07-01 to 2009-12-31), Test (2010-01-01 to 2010-06-30) ===


Processing Target Nodes (Test 12): 100%|██████████| 15/15 [00:06<00:00,  2.43it/s]



=== Processing Test 13: Train (2010-01-01 to 2010-06-30), Test (2010-07-01 to 2010-12-31) ===


Processing Target Nodes (Test 13): 100%|██████████| 15/15 [00:05<00:00,  2.54it/s]



=== Processing Test 14: Train (2010-07-01 to 2010-12-31), Test (2011-01-01 to 2011-06-30) ===


Processing Target Nodes (Test 14): 100%|██████████| 15/15 [00:05<00:00,  2.64it/s]



=== Processing Test 15: Train (2011-01-01 to 2011-06-30), Test (2011-07-01 to 2011-12-31) ===


Processing Target Nodes (Test 15): 100%|██████████| 15/15 [00:06<00:00,  2.44it/s]



=== Processing Test 16: Train (2011-07-01 to 2011-12-31), Test (2012-01-01 to 2012-06-30) ===


Processing Target Nodes (Test 16): 100%|██████████| 15/15 [00:05<00:00,  2.51it/s]



=== Processing Test 17: Train (2012-01-01 to 2012-06-30), Test (2012-07-01 to 2012-12-31) ===


Processing Target Nodes (Test 17): 100%|██████████| 15/15 [00:05<00:00,  2.51it/s]



=== Processing Test 18: Train (2012-07-01 to 2012-12-31), Test (2013-01-01 to 2013-06-30) ===


Processing Target Nodes (Test 18): 100%|██████████| 15/15 [00:07<00:00,  2.10it/s]



=== Processing Test 19: Train (2013-01-01 to 2013-06-30), Test (2013-07-01 to 2013-12-31) ===


Processing Target Nodes (Test 19): 100%|██████████| 15/15 [00:06<00:00,  2.42it/s]



=== Processing Test 20: Train (2013-07-01 to 2013-12-31), Test (2014-01-01 to 2014-06-30) ===


Processing Target Nodes (Test 20): 100%|██████████| 15/15 [00:06<00:00,  2.30it/s]



=== Processing Test 21: Train (2014-01-01 to 2014-06-30), Test (2014-07-01 to 2014-12-31) ===


Processing Target Nodes (Test 21): 100%|██████████| 15/15 [00:06<00:00,  2.47it/s]



=== Processing Test 22: Train (2014-07-01 to 2014-12-31), Test (2015-01-01 to 2015-06-30) ===


Processing Target Nodes (Test 22): 100%|██████████| 14/14 [00:05<00:00,  2.46it/s]



=== Processing Test 23: Train (2015-01-01 to 2015-06-30), Test (2015-07-01 to 2015-12-31) ===


Processing Target Nodes (Test 23): 100%|██████████| 15/15 [00:06<00:00,  2.38it/s]



=== Processing Test 24: Train (2015-07-01 to 2015-12-31), Test (2016-01-01 to 2016-06-30) ===


Processing Target Nodes (Test 24): 100%|██████████| 15/15 [00:05<00:00,  2.55it/s]



=== Processing Test 25: Train (2016-01-01 to 2016-06-30), Test (2016-07-01 to 2016-12-31) ===


Processing Target Nodes (Test 25): 100%|██████████| 15/15 [00:05<00:00,  2.56it/s]



=== Processing Test 26: Train (2016-07-01 to 2016-12-31), Test (2017-01-01 to 2017-06-30) ===


Processing Target Nodes (Test 26): 100%|██████████| 15/15 [00:06<00:00,  2.45it/s]



=== Processing Test 27: Train (2017-01-01 to 2017-06-30), Test (2017-07-01 to 2017-12-31) ===


Processing Target Nodes (Test 27): 100%|██████████| 14/14 [00:05<00:00,  2.52it/s]



=== Processing Test 28: Train (2017-07-01 to 2017-12-31), Test (2018-01-01 to 2018-06-30) ===


Processing Target Nodes (Test 28): 100%|██████████| 13/13 [00:05<00:00,  2.49it/s]



=== Processing Test 29: Train (2018-01-01 to 2018-06-30), Test (2018-07-01 to 2018-12-31) ===


Processing Target Nodes (Test 29): 100%|██████████| 14/14 [00:05<00:00,  2.60it/s]



=== Processing Test 30: Train (2018-07-01 to 2018-12-31), Test (2019-01-01 to 2019-06-30) ===


Processing Target Nodes (Test 30): 100%|██████████| 15/15 [00:06<00:00,  2.48it/s]



=== Processing Test 31: Train (2019-01-01 to 2019-06-30), Test (2019-07-01 to 2019-12-31) ===


Processing Target Nodes (Test 31): 100%|██████████| 15/15 [00:05<00:00,  2.60it/s]



=== Processing Test 32: Train (2019-07-01 to 2019-12-31), Test (2020-01-01 to 2020-06-30) ===


Processing Target Nodes (Test 32): 100%|██████████| 14/14 [00:05<00:00,  2.45it/s]



=== Processing Test 33: Train (2020-01-01 to 2020-06-30), Test (2020-07-01 to 2020-12-31) ===


Processing Target Nodes (Test 33): 100%|██████████| 15/15 [00:06<00:00,  2.26it/s]



=== Processing Test 34: Train (2020-07-01 to 2020-12-31), Test (2021-01-01 to 2021-06-30) ===


Processing Target Nodes (Test 34): 100%|██████████| 14/14 [00:05<00:00,  2.41it/s]



=== Processing Test 35: Train (2021-01-01 to 2021-06-30), Test (2021-07-01 to 2021-12-31) ===


Processing Target Nodes (Test 35): 100%|██████████| 15/15 [00:06<00:00,  2.35it/s]



=== Processing Test 36: Train (2021-07-01 to 2021-12-31), Test (2022-01-01 to 2022-06-30) ===


Processing Target Nodes (Test 36): 100%|██████████| 14/14 [00:06<00:00,  2.28it/s]



=== Processing Test 37: Train (2022-01-01 to 2022-06-30), Test (2022-07-01 to 2022-12-31) ===


Processing Target Nodes (Test 37): 100%|██████████| 15/15 [00:06<00:00,  2.46it/s]



=== Processing Test 38: Train (2022-07-01 to 2022-12-31), Test (2023-01-01 to 2023-06-30) ===


Processing Target Nodes (Test 38): 100%|██████████| 15/15 [00:07<00:00,  2.04it/s]



=== Processing Test 39: Train (2023-01-01 to 2023-06-30), Test (2023-07-01 to 2023-12-31) ===


Processing Target Nodes (Test 39): 100%|██████████| 14/14 [00:06<00:00,  2.07it/s]



=== Processing Test 40: Train (2023-07-01 to 2023-12-31), Test (2024-01-01 to 2024-06-30) ===


Processing Target Nodes (Test 40): 100%|██████████| 14/14 [00:06<00:00,  2.28it/s]



=== Processing Test 41: Train (2024-01-01 to 2024-06-30), Test (2024-07-01 to 2024-12-31) ===


Processing Target Nodes (Test 41): 100%|██████████| 14/14 [00:05<00:00,  2.72it/s]


Final Results with MLP Regressor:





Unnamed: 0,Test ID,Target Index,RMSE,RMAE
0,1,Turkey BIST 100,0.016176,0.012905
1,1,Hang Seng,0.009535,0.007652
2,1,Straits Times,0.007929,0.006387
3,1,Mexican IPC,0.009357,0.007488
4,1,FTSE 100,0.007327,0.005877
...,...,...,...,...
598,41,DAX,0.010474,0.008135
599,41,Mexican IPC,0.011818,0.009425
600,41,IBEX 35,0.010554,0.008284
601,41,CAC 40,0.010819,0.008517


#### 4.4. K-Nearest Neighbors Model

In [14]:
final_results = []

# KNeighbors Regressor Evaluation
for test_id, (train_start, train_end, test_start, test_end) in enumerate(test_periods):
    print(f"\n=== Processing Test {test_id + 1}: Train ({train_start} to {train_end}), Test ({test_start} to {test_end}) ===")
    
    # Train/Test 데이터 분리
    train_data = returns.loc[train_start:train_end]
    test_data = returns.loc[test_start:test_end]
    
    # 그래프 생성 (Train 데이터 기준)
    G = create_correlation_graph(train_data)
    nodes_in_graph = list(G.nodes)
    
    # Train/Test 데이터 준비
    X_train = train_data[nodes_in_graph].iloc[:-1].values
    y_train_set = train_data[nodes_in_graph].iloc[1:].values
    X_test = test_data[nodes_in_graph].iloc[:-1].values
    y_test_set = test_data[nodes_in_graph].iloc[1:].values
    
    # Target Index Loop
    for target_idx in tqdm(range(len(nodes_in_graph)), desc=f"Processing Target Nodes (Test {test_id + 1})"):
        rmse_scores = []
        rmae_scores = []
        
        for _ in range(n_experiments):  # 반복 실험
            y_train = y_train_set[:, target_idx]
            y_test = y_test_set[:, target_idx]
            
            # Randomize n_neighbors
            random_n_neighbors = np.random.randint(3, 8)
            
            # KNeighbors Regressor 초기화 및 학습
            model = KNeighborsRegressor(n_neighbors=random_n_neighbors, leaf_size=30, n_jobs=None)
            model.fit(X_train, y_train)
            
            # 예측 및 평가
            y_pred = model.predict(X_test)
            rmse = np.sqrt(mean_squared_error(y_test, y_pred))
            rmae = mean_absolute_error(y_test, y_pred)
            
            # 결과 수집
            rmse_scores.append(rmse)
            rmae_scores.append(rmae)
        
        # 결과 집계
        final_results.append({
            'Test ID': test_id + 1,
            'Target Index': nodes_in_graph[target_idx],
            'RMSE': np.mean(rmse_scores),
            'RMAE': np.mean(rmae_scores)
        })

# 결과를 DataFrame으로 정리
knn_results = pd.DataFrame(final_results)

# Display results
print("\nFinal Results with KNeighbors Regressor:")
knn_results


=== Processing Test 1: Train (2004-01-01 to 2004-06-30), Test (2004-07-01 to 2004-12-31) ===


Processing Target Nodes (Test 1): 100%|██████████| 15/15 [00:00<00:00, 83.52it/s]



=== Processing Test 2: Train (2004-07-01 to 2004-12-31), Test (2005-01-01 to 2005-06-30) ===


Processing Target Nodes (Test 2): 100%|██████████| 14/14 [00:00<00:00, 89.83it/s]



=== Processing Test 3: Train (2005-01-01 to 2005-06-30), Test (2005-07-01 to 2005-12-31) ===


Processing Target Nodes (Test 3): 100%|██████████| 15/15 [00:00<00:00, 83.51it/s]



=== Processing Test 4: Train (2005-07-01 to 2005-12-31), Test (2006-01-01 to 2006-06-30) ===


Processing Target Nodes (Test 4): 100%|██████████| 15/15 [00:00<00:00, 54.38it/s]



=== Processing Test 5: Train (2006-01-01 to 2006-06-30), Test (2006-07-01 to 2006-12-31) ===


Processing Target Nodes (Test 5): 100%|██████████| 15/15 [00:00<00:00, 87.05it/s]



=== Processing Test 6: Train (2006-07-01 to 2006-12-31), Test (2007-01-01 to 2007-06-30) ===


Processing Target Nodes (Test 6): 100%|██████████| 15/15 [00:00<00:00, 85.07it/s]



=== Processing Test 7: Train (2007-01-01 to 2007-06-30), Test (2007-07-01 to 2007-12-31) ===


Processing Target Nodes (Test 7): 100%|██████████| 15/15 [00:00<00:00, 82.41it/s]



=== Processing Test 8: Train (2007-07-01 to 2007-12-31), Test (2008-01-01 to 2008-06-30) ===


Processing Target Nodes (Test 8): 100%|██████████| 15/15 [00:00<00:00, 83.74it/s]



=== Processing Test 9: Train (2008-01-01 to 2008-06-30), Test (2008-07-01 to 2008-12-31) ===


Processing Target Nodes (Test 9): 100%|██████████| 15/15 [00:00<00:00, 80.76it/s]



=== Processing Test 10: Train (2008-07-01 to 2008-12-31), Test (2009-01-01 to 2009-06-30) ===


Processing Target Nodes (Test 10): 100%|██████████| 15/15 [00:00<00:00, 84.34it/s]



=== Processing Test 11: Train (2009-01-01 to 2009-06-30), Test (2009-07-01 to 2009-12-31) ===


Processing Target Nodes (Test 11): 100%|██████████| 15/15 [00:00<00:00, 83.51it/s]



=== Processing Test 12: Train (2009-07-01 to 2009-12-31), Test (2010-01-01 to 2010-06-30) ===


Processing Target Nodes (Test 12): 100%|██████████| 15/15 [00:00<00:00, 84.87it/s]



=== Processing Test 13: Train (2010-01-01 to 2010-06-30), Test (2010-07-01 to 2010-12-31) ===


Processing Target Nodes (Test 13): 100%|██████████| 15/15 [00:00<00:00, 85.39it/s]



=== Processing Test 14: Train (2010-07-01 to 2010-12-31), Test (2011-01-01 to 2011-06-30) ===


Processing Target Nodes (Test 14): 100%|██████████| 15/15 [00:00<00:00, 83.94it/s]



=== Processing Test 15: Train (2011-01-01 to 2011-06-30), Test (2011-07-01 to 2011-12-31) ===


Processing Target Nodes (Test 15): 100%|██████████| 15/15 [00:00<00:00, 54.80it/s]



=== Processing Test 16: Train (2011-07-01 to 2011-12-31), Test (2012-01-01 to 2012-06-30) ===


Processing Target Nodes (Test 16): 100%|██████████| 15/15 [00:00<00:00, 86.21it/s]



=== Processing Test 17: Train (2012-01-01 to 2012-06-30), Test (2012-07-01 to 2012-12-31) ===


Processing Target Nodes (Test 17): 100%|██████████| 15/15 [00:00<00:00, 87.33it/s]



=== Processing Test 18: Train (2012-07-01 to 2012-12-31), Test (2013-01-01 to 2013-06-30) ===


Processing Target Nodes (Test 18): 100%|██████████| 15/15 [00:00<00:00, 86.05it/s]



=== Processing Test 19: Train (2013-01-01 to 2013-06-30), Test (2013-07-01 to 2013-12-31) ===


Processing Target Nodes (Test 19): 100%|██████████| 15/15 [00:00<00:00, 86.08it/s]



=== Processing Test 20: Train (2013-07-01 to 2013-12-31), Test (2014-01-01 to 2014-06-30) ===


Processing Target Nodes (Test 20): 100%|██████████| 15/15 [00:00<00:00, 86.74it/s]



=== Processing Test 21: Train (2014-01-01 to 2014-06-30), Test (2014-07-01 to 2014-12-31) ===


Processing Target Nodes (Test 21): 100%|██████████| 15/15 [00:00<00:00, 84.17it/s]



=== Processing Test 22: Train (2014-07-01 to 2014-12-31), Test (2015-01-01 to 2015-06-30) ===


Processing Target Nodes (Test 22): 100%|██████████| 14/14 [00:00<00:00, 88.78it/s]



=== Processing Test 23: Train (2015-01-01 to 2015-06-30), Test (2015-07-01 to 2015-12-31) ===


Processing Target Nodes (Test 23): 100%|██████████| 15/15 [00:00<00:00, 83.65it/s]



=== Processing Test 24: Train (2015-07-01 to 2015-12-31), Test (2016-01-01 to 2016-06-30) ===


Processing Target Nodes (Test 24): 100%|██████████| 15/15 [00:00<00:00, 81.96it/s]



=== Processing Test 25: Train (2016-01-01 to 2016-06-30), Test (2016-07-01 to 2016-12-31) ===


Processing Target Nodes (Test 25): 100%|██████████| 15/15 [00:00<00:00, 83.78it/s]



=== Processing Test 26: Train (2016-07-01 to 2016-12-31), Test (2017-01-01 to 2017-06-30) ===


Processing Target Nodes (Test 26): 100%|██████████| 15/15 [00:00<00:00, 53.69it/s]



=== Processing Test 27: Train (2017-01-01 to 2017-06-30), Test (2017-07-01 to 2017-12-31) ===


Processing Target Nodes (Test 27): 100%|██████████| 14/14 [00:00<00:00, 85.36it/s]



=== Processing Test 28: Train (2017-07-01 to 2017-12-31), Test (2018-01-01 to 2018-06-30) ===


Processing Target Nodes (Test 28): 100%|██████████| 13/13 [00:00<00:00, 86.10it/s]



=== Processing Test 29: Train (2018-01-01 to 2018-06-30), Test (2018-07-01 to 2018-12-31) ===


Processing Target Nodes (Test 29): 100%|██████████| 14/14 [00:00<00:00, 85.53it/s]



=== Processing Test 30: Train (2018-07-01 to 2018-12-31), Test (2019-01-01 to 2019-06-30) ===


Processing Target Nodes (Test 30): 100%|██████████| 15/15 [00:00<00:00, 79.12it/s]



=== Processing Test 31: Train (2019-01-01 to 2019-06-30), Test (2019-07-01 to 2019-12-31) ===


Processing Target Nodes (Test 31): 100%|██████████| 15/15 [00:00<00:00, 83.80it/s]



=== Processing Test 32: Train (2019-07-01 to 2019-12-31), Test (2020-01-01 to 2020-06-30) ===


Processing Target Nodes (Test 32): 100%|██████████| 14/14 [00:00<00:00, 85.00it/s]



=== Processing Test 33: Train (2020-01-01 to 2020-06-30), Test (2020-07-01 to 2020-12-31) ===


Processing Target Nodes (Test 33): 100%|██████████| 15/15 [00:00<00:00, 84.94it/s]



=== Processing Test 34: Train (2020-07-01 to 2020-12-31), Test (2021-01-01 to 2021-06-30) ===


Processing Target Nodes (Test 34): 100%|██████████| 14/14 [00:00<00:00, 87.71it/s]



=== Processing Test 35: Train (2021-01-01 to 2021-06-30), Test (2021-07-01 to 2021-12-31) ===


Processing Target Nodes (Test 35): 100%|██████████| 15/15 [00:00<00:00, 85.10it/s]



=== Processing Test 36: Train (2021-07-01 to 2021-12-31), Test (2022-01-01 to 2022-06-30) ===


Processing Target Nodes (Test 36): 100%|██████████| 14/14 [00:00<00:00, 86.91it/s]



=== Processing Test 37: Train (2022-01-01 to 2022-06-30), Test (2022-07-01 to 2022-12-31) ===


Processing Target Nodes (Test 37): 100%|██████████| 15/15 [00:00<00:00, 56.05it/s]



=== Processing Test 38: Train (2022-07-01 to 2022-12-31), Test (2023-01-01 to 2023-06-30) ===


Processing Target Nodes (Test 38): 100%|██████████| 15/15 [00:00<00:00, 87.09it/s]



=== Processing Test 39: Train (2023-01-01 to 2023-06-30), Test (2023-07-01 to 2023-12-31) ===


Processing Target Nodes (Test 39): 100%|██████████| 14/14 [00:00<00:00, 87.88it/s]



=== Processing Test 40: Train (2023-07-01 to 2023-12-31), Test (2024-01-01 to 2024-06-30) ===


Processing Target Nodes (Test 40): 100%|██████████| 14/14 [00:00<00:00, 89.46it/s]



=== Processing Test 41: Train (2024-01-01 to 2024-06-30), Test (2024-07-01 to 2024-12-31) ===


Processing Target Nodes (Test 41): 100%|██████████| 14/14 [00:00<00:00, 87.55it/s]


Final Results with KNeighbors Regressor:





Unnamed: 0,Test ID,Target Index,RMSE,RMAE
0,1,Turkey BIST 100,0.017258,0.013331
1,1,Hang Seng,0.009962,0.008056
2,1,Straits Times,0.007277,0.005904
3,1,Mexican IPC,0.008695,0.006975
4,1,FTSE 100,0.006657,0.005219
...,...,...,...,...
598,41,DAX,0.008816,0.006969
599,41,Mexican IPC,0.010269,0.008125
600,41,IBEX 35,0.009139,0.006972
601,41,CAC 40,0.009267,0.007326


#### 4.5. Support Vector Machine

In [16]:
# Initialize Results
final_results = []

# SVR Evaluation
for test_id, (train_start, train_end, test_start, test_end) in enumerate(test_periods):
    print(f"\n=== Processing Test {test_id + 1}: Train ({train_start} to {train_end}), Test ({test_start} to {test_end}) ===")
    
    # Train/Test 데이터 분리
    train_data = returns.loc[train_start:train_end]
    test_data = returns.loc[test_start:test_end]
    
    # 그래프 생성 (Train 데이터 기준)
    G = create_correlation_graph(train_data)
    nodes_in_graph = list(G.nodes)
    
    # Train/Test 데이터 준비
    X_train = train_data[nodes_in_graph].iloc[:-1].values
    y_train_set = train_data[nodes_in_graph].iloc[1:].values
    X_test = test_data[nodes_in_graph].iloc[:-1].values
    y_test_set = test_data[nodes_in_graph].iloc[1:].values
    
    # Target Index Loop
    for target_idx in tqdm(range(len(nodes_in_graph)), desc=f"Processing Target Nodes (Test {test_id + 1})"):
        rmse_scores = []
        rmae_scores = []
        
        for _ in range(n_experiments):  # 반복 실험
            y_train = y_train_set[:, target_idx]
            y_test = y_test_set[:, target_idx]
            
            # Randomize SVR hyperparameter C
            random_c = np.random.uniform(0.1, 10)
            
            # SVR 초기화 및 학습
            model = SVR(kernel='rbf', C=random_c, epsilon=0.1)
            model.fit(X_train, y_train)
            
            # 예측 및 평가
            y_pred = model.predict(X_test)
            rmse = np.sqrt(mean_squared_error(y_test, y_pred))
            rmae = mean_absolute_error(y_test, y_pred)
            
            # 결과 수집
            rmse_scores.append(rmse)
            rmae_scores.append(rmae)
        
        # 결과 집계
        final_results.append({
            'Test ID': test_id + 1,
            'Target Index': nodes_in_graph[target_idx],
            'RMSE': np.mean(rmse_scores),
            'RMAE': np.mean(rmae_scores)
        })

# 결과를 DataFrame으로 정리
svr_results = pd.DataFrame(final_results)

# Display results
print("\nFinal Results with SVR:")
svr_results


=== Processing Test 1: Train (2004-01-01 to 2004-06-30), Test (2004-07-01 to 2004-12-31) ===


Processing Target Nodes (Test 1): 100%|██████████| 15/15 [00:00<00:00, 152.31it/s]



=== Processing Test 2: Train (2004-07-01 to 2004-12-31), Test (2005-01-01 to 2005-06-30) ===


Processing Target Nodes (Test 2): 100%|██████████| 14/14 [00:00<00:00, 172.12it/s]



=== Processing Test 3: Train (2005-01-01 to 2005-06-30), Test (2005-07-01 to 2005-12-31) ===


Processing Target Nodes (Test 3): 100%|██████████| 15/15 [00:00<00:00, 76.09it/s]



=== Processing Test 4: Train (2005-07-01 to 2005-12-31), Test (2006-01-01 to 2006-06-30) ===


Processing Target Nodes (Test 4): 100%|██████████| 15/15 [00:00<00:00, 163.55it/s]



=== Processing Test 5: Train (2006-01-01 to 2006-06-30), Test (2006-07-01 to 2006-12-31) ===


Processing Target Nodes (Test 5): 100%|██████████| 15/15 [00:00<00:00, 167.34it/s]



=== Processing Test 6: Train (2006-07-01 to 2006-12-31), Test (2007-01-01 to 2007-06-30) ===


Processing Target Nodes (Test 6): 100%|██████████| 15/15 [00:00<00:00, 169.35it/s]



=== Processing Test 7: Train (2007-01-01 to 2007-06-30), Test (2007-07-01 to 2007-12-31) ===


Processing Target Nodes (Test 7): 100%|██████████| 15/15 [00:00<00:00, 160.45it/s]



=== Processing Test 8: Train (2007-07-01 to 2007-12-31), Test (2008-01-01 to 2008-06-30) ===


Processing Target Nodes (Test 8): 100%|██████████| 15/15 [00:00<00:00, 160.72it/s]



=== Processing Test 9: Train (2008-01-01 to 2008-06-30), Test (2008-07-01 to 2008-12-31) ===


Processing Target Nodes (Test 9): 100%|██████████| 15/15 [00:00<00:00, 158.66it/s]



=== Processing Test 10: Train (2008-07-01 to 2008-12-31), Test (2009-01-01 to 2009-06-30) ===


Processing Target Nodes (Test 10): 100%|██████████| 15/15 [00:00<00:00, 153.30it/s]



=== Processing Test 11: Train (2009-01-01 to 2009-06-30), Test (2009-07-01 to 2009-12-31) ===


Processing Target Nodes (Test 11): 100%|██████████| 15/15 [00:00<00:00, 161.56it/s]



=== Processing Test 12: Train (2009-07-01 to 2009-12-31), Test (2010-01-01 to 2010-06-30) ===


Processing Target Nodes (Test 12): 100%|██████████| 15/15 [00:00<00:00, 163.02it/s]



=== Processing Test 13: Train (2010-01-01 to 2010-06-30), Test (2010-07-01 to 2010-12-31) ===


Processing Target Nodes (Test 13): 100%|██████████| 15/15 [00:00<00:00, 162.67it/s]



=== Processing Test 14: Train (2010-07-01 to 2010-12-31), Test (2011-01-01 to 2011-06-30) ===


Processing Target Nodes (Test 14): 100%|██████████| 15/15 [00:00<00:00, 160.05it/s]



=== Processing Test 15: Train (2011-01-01 to 2011-06-30), Test (2011-07-01 to 2011-12-31) ===


Processing Target Nodes (Test 15): 100%|██████████| 15/15 [00:00<00:00, 76.04it/s]



=== Processing Test 16: Train (2011-07-01 to 2011-12-31), Test (2012-01-01 to 2012-06-30) ===


Processing Target Nodes (Test 16): 100%|██████████| 15/15 [00:00<00:00, 159.73it/s]



=== Processing Test 17: Train (2012-01-01 to 2012-06-30), Test (2012-07-01 to 2012-12-31) ===


Processing Target Nodes (Test 17): 100%|██████████| 15/15 [00:00<00:00, 162.74it/s]



=== Processing Test 18: Train (2012-07-01 to 2012-12-31), Test (2013-01-01 to 2013-06-30) ===


Processing Target Nodes (Test 18): 100%|██████████| 15/15 [00:00<00:00, 161.50it/s]



=== Processing Test 19: Train (2013-01-01 to 2013-06-30), Test (2013-07-01 to 2013-12-31) ===


Processing Target Nodes (Test 19): 100%|██████████| 15/15 [00:00<00:00, 163.25it/s]



=== Processing Test 20: Train (2013-07-01 to 2013-12-31), Test (2014-01-01 to 2014-06-30) ===


Processing Target Nodes (Test 20): 100%|██████████| 15/15 [00:00<00:00, 162.05it/s]



=== Processing Test 21: Train (2014-01-01 to 2014-06-30), Test (2014-07-01 to 2014-12-31) ===


Processing Target Nodes (Test 21): 100%|██████████| 15/15 [00:00<00:00, 161.49it/s]



=== Processing Test 22: Train (2014-07-01 to 2014-12-31), Test (2015-01-01 to 2015-06-30) ===


Processing Target Nodes (Test 22): 100%|██████████| 14/14 [00:00<00:00, 159.80it/s]



=== Processing Test 23: Train (2015-01-01 to 2015-06-30), Test (2015-07-01 to 2015-12-31) ===


Processing Target Nodes (Test 23): 100%|██████████| 15/15 [00:00<00:00, 167.11it/s]



=== Processing Test 24: Train (2015-07-01 to 2015-12-31), Test (2016-01-01 to 2016-06-30) ===


Processing Target Nodes (Test 24): 100%|██████████| 15/15 [00:00<00:00, 165.98it/s]



=== Processing Test 25: Train (2016-01-01 to 2016-06-30), Test (2016-07-01 to 2016-12-31) ===


Processing Target Nodes (Test 25): 100%|██████████| 15/15 [00:00<00:00, 164.91it/s]



=== Processing Test 26: Train (2016-07-01 to 2016-12-31), Test (2017-01-01 to 2017-06-30) ===


Processing Target Nodes (Test 26): 100%|██████████| 15/15 [00:00<00:00, 164.00it/s]



=== Processing Test 27: Train (2017-01-01 to 2017-06-30), Test (2017-07-01 to 2017-12-31) ===


Processing Target Nodes (Test 27): 100%|██████████| 14/14 [00:00<00:00, 164.79it/s]



=== Processing Test 28: Train (2017-07-01 to 2017-12-31), Test (2018-01-01 to 2018-06-30) ===


Processing Target Nodes (Test 28): 100%|██████████| 13/13 [00:00<00:00, 77.29it/s]



=== Processing Test 29: Train (2018-01-01 to 2018-06-30), Test (2018-07-01 to 2018-12-31) ===


Processing Target Nodes (Test 29): 100%|██████████| 14/14 [00:00<00:00, 156.60it/s]



=== Processing Test 30: Train (2018-07-01 to 2018-12-31), Test (2019-01-01 to 2019-06-30) ===


Processing Target Nodes (Test 30): 100%|██████████| 15/15 [00:00<00:00, 165.59it/s]



=== Processing Test 31: Train (2019-01-01 to 2019-06-30), Test (2019-07-01 to 2019-12-31) ===


Processing Target Nodes (Test 31): 100%|██████████| 15/15 [00:00<00:00, 162.53it/s]



=== Processing Test 32: Train (2019-07-01 to 2019-12-31), Test (2020-01-01 to 2020-06-30) ===


Processing Target Nodes (Test 32): 100%|██████████| 14/14 [00:00<00:00, 160.12it/s]



=== Processing Test 33: Train (2020-01-01 to 2020-06-30), Test (2020-07-01 to 2020-12-31) ===


Processing Target Nodes (Test 33): 100%|██████████| 15/15 [00:00<00:00, 148.94it/s]



=== Processing Test 34: Train (2020-07-01 to 2020-12-31), Test (2021-01-01 to 2021-06-30) ===


Processing Target Nodes (Test 34): 100%|██████████| 14/14 [00:00<00:00, 160.30it/s]



=== Processing Test 35: Train (2021-01-01 to 2021-06-30), Test (2021-07-01 to 2021-12-31) ===


Processing Target Nodes (Test 35): 100%|██████████| 15/15 [00:00<00:00, 159.05it/s]



=== Processing Test 36: Train (2021-07-01 to 2021-12-31), Test (2022-01-01 to 2022-06-30) ===


Processing Target Nodes (Test 36): 100%|██████████| 14/14 [00:00<00:00, 160.19it/s]



=== Processing Test 37: Train (2022-01-01 to 2022-06-30), Test (2022-07-01 to 2022-12-31) ===


Processing Target Nodes (Test 37): 100%|██████████| 15/15 [00:00<00:00, 157.98it/s]



=== Processing Test 38: Train (2022-07-01 to 2022-12-31), Test (2023-01-01 to 2023-06-30) ===


Processing Target Nodes (Test 38): 100%|██████████| 15/15 [00:00<00:00, 160.34it/s]



=== Processing Test 39: Train (2023-01-01 to 2023-06-30), Test (2023-07-01 to 2023-12-31) ===


Processing Target Nodes (Test 39): 100%|██████████| 14/14 [00:00<00:00, 159.41it/s]



=== Processing Test 40: Train (2023-07-01 to 2023-12-31), Test (2024-01-01 to 2024-06-30) ===


Processing Target Nodes (Test 40): 100%|██████████| 14/14 [00:00<00:00, 144.55it/s]



=== Processing Test 41: Train (2024-01-01 to 2024-06-30), Test (2024-07-01 to 2024-12-31) ===


Processing Target Nodes (Test 41): 100%|██████████| 14/14 [00:00<00:00, 161.64it/s]


Final Results with SVR:





Unnamed: 0,Test ID,Target Index,RMSE,RMAE
0,1,Turkey BIST 100,0.014364,0.011347
1,1,Hang Seng,0.008040,0.006154
2,1,Straits Times,0.007603,0.006071
3,1,Mexican IPC,0.008731,0.007105
4,1,FTSE 100,0.007154,0.005867
...,...,...,...,...
598,41,DAX,0.008809,0.006957
599,41,Mexican IPC,0.016347,0.014028
600,41,IBEX 35,0.008925,0.007154
601,41,CAC 40,0.008949,0.007166


### **5. 그래프 임베딩 후 성능 변화 관찰**

#### 5.1. GCN Model 임베딩

In [20]:
class GCN(torch.nn.Module):
    def __init__(self, num_features, hidden_dim1, hidden_dim2, embedding_dim):
        super(GCN, self).__init__()
        self.conv1 = GCNConv(int(num_features), hidden_dim1)
        self.conv2 = GCNConv(hidden_dim1, hidden_dim2)
        self.conv3 = GCNConv(hidden_dim2, embedding_dim)
        self.fc = torch.nn.Linear(embedding_dim, 1)  # Output layer for regression

    def forward(self, data):
        x, edge_index = data.x, data.edge_index
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = self.conv2(x, edge_index)
        x = F.relu(x)
        x = self.conv3(x, edge_index)
        x = F.relu(x)
        x = self.fc(x)  # Project to a single output for regression
        return x

# 결과 저장용 변수
final_results = []

# 고정된 하이퍼파라미터 설정
hidden_dim1 = 64
hidden_dim2 = 64
embedding_dim = 32
lr = 0.01

# GCN 평가
for test_id, (train_start, train_end, test_start, test_end) in enumerate(test_periods):
    print(f"\n=== Processing Test {test_id + 1}: Train ({train_start} to {train_end}), Test ({test_start} to {test_end}) ===")

    # Train/Test 데이터 분리
    train_data = returns.loc[train_start:train_end]
    test_data = returns.loc[test_start:test_end]

    # 그래프 생성 (Train 데이터 기준)
    G = create_correlation_graph(train_data)
    nodes_in_graph = list(G.nodes)
    node_to_idx = {node: idx for idx, node in enumerate(nodes_in_graph)}
    edge_index = torch.tensor(
        [[node_to_idx[edge[0]], node_to_idx[edge[1]]] for edge in G.edges],
        dtype=torch.long
    ).t()

    # Train/Test 데이터 준비
    X_train, y_train_set = prepare_time_series_data(train_data[nodes_in_graph])
    X_test, y_test_set = prepare_time_series_data(test_data[nodes_in_graph])

    # GCN 모델 평가
    for target_idx in tqdm(range(len(nodes_in_graph)), desc=f"Processing Target Nodes (Test {test_id + 1})"):
        rmse_for_target = []
        rmae_for_target = []

        for _ in range(n_experiments):  # 30번 반복 실험
            y_train = y_train_set[:, target_idx]
            y_test = y_test_set[:, target_idx]

            data_train = Data(
                x=torch.tensor(X_train, dtype=torch.float32),
                y=torch.tensor(y_train, dtype=torch.float32).unsqueeze(1),  # Add dimension for matching
                edge_index=edge_index
            )
            data_test = Data(
                x=torch.tensor(X_test, dtype=torch.float32),
                y=torch.tensor(y_test, dtype=torch.float32).unsqueeze(1),  # Add dimension for matching
                edge_index=edge_index
            )

            # Initialize GCN
            model = GCN(
                num_features=X_train.shape[1],
                hidden_dim1=hidden_dim1,
                hidden_dim2=hidden_dim2,
                embedding_dim=embedding_dim
            )
            optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=5e-4)
            criterion = torch.nn.MSELoss()

            # Train the model
            model.train()
            for epoch in range(100):  # 100 epoch로 학습
                optimizer.zero_grad()
                out = model(data_train).squeeze()  # Ensure output shape matches target
                loss = criterion(out, data_train.y.squeeze())
                loss.backward()
                optimizer.step()

            # Evaluate on Test Data
            model.eval()
            with torch.no_grad():
                y_pred = model(data_test).squeeze().cpu().numpy()
                y_true = data_test.y.squeeze().cpu().numpy()

                rmse = np.sqrt(mean_squared_error(y_true, y_pred))
                rmae = mean_absolute_error(y_true, y_pred)

                rmse_for_target.append(rmse)
                rmae_for_target.append(rmae)

        # Aggregate results for this target
        final_results.append({
            'Test ID': test_id + 1,
            'Target Index': nodes_in_graph[target_idx],
            'RMSE': np.mean(rmse_for_target),
            'RMAE': np.mean(rmae_for_target)
        })

# 결과를 DataFrame으로 정리
gcn_results = pd.DataFrame(final_results)

# Display results
print("\nFinal Results with GCN Regression:")
gcn_results



=== Processing Test 1: Train (2004-01-01 to 2004-06-30), Test (2004-07-01 to 2004-12-31) ===


Processing Target Nodes (Test 1): 100%|██████████| 15/15 [00:30<00:00,  2.01s/it]



=== Processing Test 2: Train (2004-07-01 to 2004-12-31), Test (2005-01-01 to 2005-06-30) ===


Processing Target Nodes (Test 2): 100%|██████████| 14/14 [00:38<00:00,  2.73s/it]



=== Processing Test 3: Train (2005-01-01 to 2005-06-30), Test (2005-07-01 to 2005-12-31) ===


Processing Target Nodes (Test 3): 100%|██████████| 15/15 [00:28<00:00,  1.92s/it]



=== Processing Test 4: Train (2005-07-01 to 2005-12-31), Test (2006-01-01 to 2006-06-30) ===


Processing Target Nodes (Test 4): 100%|██████████| 15/15 [00:29<00:00,  1.94s/it]



=== Processing Test 5: Train (2006-01-01 to 2006-06-30), Test (2006-07-01 to 2006-12-31) ===


Processing Target Nodes (Test 5): 100%|██████████| 15/15 [00:29<00:00,  1.96s/it]



=== Processing Test 6: Train (2006-07-01 to 2006-12-31), Test (2007-01-01 to 2007-06-30) ===


Processing Target Nodes (Test 6): 100%|██████████| 15/15 [00:29<00:00,  1.96s/it]



=== Processing Test 7: Train (2007-01-01 to 2007-06-30), Test (2007-07-01 to 2007-12-31) ===


Processing Target Nodes (Test 7): 100%|██████████| 15/15 [00:31<00:00,  2.07s/it]



=== Processing Test 8: Train (2007-07-01 to 2007-12-31), Test (2008-01-01 to 2008-06-30) ===


Processing Target Nodes (Test 8): 100%|██████████| 15/15 [00:30<00:00,  2.06s/it]



=== Processing Test 9: Train (2008-01-01 to 2008-06-30), Test (2008-07-01 to 2008-12-31) ===


Processing Target Nodes (Test 9): 100%|██████████| 15/15 [00:31<00:00,  2.09s/it]



=== Processing Test 10: Train (2008-07-01 to 2008-12-31), Test (2009-01-01 to 2009-06-30) ===


Processing Target Nodes (Test 10): 100%|██████████| 15/15 [00:35<00:00,  2.37s/it]



=== Processing Test 11: Train (2009-01-01 to 2009-06-30), Test (2009-07-01 to 2009-12-31) ===


Processing Target Nodes (Test 11): 100%|██████████| 15/15 [00:36<00:00,  2.43s/it]



=== Processing Test 12: Train (2009-07-01 to 2009-12-31), Test (2010-01-01 to 2010-06-30) ===


Processing Target Nodes (Test 12): 100%|██████████| 15/15 [00:34<00:00,  2.30s/it]



=== Processing Test 13: Train (2010-01-01 to 2010-06-30), Test (2010-07-01 to 2010-12-31) ===


Processing Target Nodes (Test 13): 100%|██████████| 15/15 [00:34<00:00,  2.31s/it]



=== Processing Test 14: Train (2010-07-01 to 2010-12-31), Test (2011-01-01 to 2011-06-30) ===


Processing Target Nodes (Test 14): 100%|██████████| 15/15 [00:34<00:00,  2.33s/it]



=== Processing Test 15: Train (2011-01-01 to 2011-06-30), Test (2011-07-01 to 2011-12-31) ===


Processing Target Nodes (Test 15): 100%|██████████| 15/15 [00:35<00:00,  2.37s/it]



=== Processing Test 16: Train (2011-07-01 to 2011-12-31), Test (2012-01-01 to 2012-06-30) ===


Processing Target Nodes (Test 16): 100%|██████████| 15/15 [00:36<00:00,  2.42s/it]



=== Processing Test 17: Train (2012-01-01 to 2012-06-30), Test (2012-07-01 to 2012-12-31) ===


Processing Target Nodes (Test 17): 100%|██████████| 15/15 [00:35<00:00,  2.34s/it]



=== Processing Test 18: Train (2012-07-01 to 2012-12-31), Test (2013-01-01 to 2013-06-30) ===


Processing Target Nodes (Test 18): 100%|██████████| 15/15 [00:35<00:00,  2.34s/it]



=== Processing Test 19: Train (2013-01-01 to 2013-06-30), Test (2013-07-01 to 2013-12-31) ===


Processing Target Nodes (Test 19): 100%|██████████| 15/15 [00:34<00:00,  2.33s/it]



=== Processing Test 20: Train (2013-07-01 to 2013-12-31), Test (2014-01-01 to 2014-06-30) ===


Processing Target Nodes (Test 20): 100%|██████████| 15/15 [00:35<00:00,  2.34s/it]



=== Processing Test 21: Train (2014-01-01 to 2014-06-30), Test (2014-07-01 to 2014-12-31) ===


Processing Target Nodes (Test 21): 100%|██████████| 15/15 [00:34<00:00,  2.32s/it]



=== Processing Test 22: Train (2014-07-01 to 2014-12-31), Test (2015-01-01 to 2015-06-30) ===


Processing Target Nodes (Test 22): 100%|██████████| 14/14 [00:32<00:00,  2.30s/it]



=== Processing Test 23: Train (2015-01-01 to 2015-06-30), Test (2015-07-01 to 2015-12-31) ===


Processing Target Nodes (Test 23): 100%|██████████| 15/15 [00:35<00:00,  2.34s/it]



=== Processing Test 24: Train (2015-07-01 to 2015-12-31), Test (2016-01-01 to 2016-06-30) ===


Processing Target Nodes (Test 24): 100%|██████████| 15/15 [00:36<00:00,  2.40s/it]



=== Processing Test 25: Train (2016-01-01 to 2016-06-30), Test (2016-07-01 to 2016-12-31) ===


Processing Target Nodes (Test 25): 100%|██████████| 15/15 [00:36<00:00,  2.42s/it]



=== Processing Test 26: Train (2016-07-01 to 2016-12-31), Test (2017-01-01 to 2017-06-30) ===


Processing Target Nodes (Test 26): 100%|██████████| 15/15 [00:35<00:00,  2.36s/it]



=== Processing Test 27: Train (2017-01-01 to 2017-06-30), Test (2017-07-01 to 2017-12-31) ===


Processing Target Nodes (Test 27): 100%|██████████| 14/14 [00:32<00:00,  2.33s/it]



=== Processing Test 28: Train (2017-07-01 to 2017-12-31), Test (2018-01-01 to 2018-06-30) ===


Processing Target Nodes (Test 28): 100%|██████████| 13/13 [00:29<00:00,  2.30s/it]



=== Processing Test 29: Train (2018-01-01 to 2018-06-30), Test (2018-07-01 to 2018-12-31) ===


Processing Target Nodes (Test 29): 100%|██████████| 14/14 [00:33<00:00,  2.36s/it]



=== Processing Test 30: Train (2018-07-01 to 2018-12-31), Test (2019-01-01 to 2019-06-30) ===


Processing Target Nodes (Test 30): 100%|██████████| 15/15 [00:35<00:00,  2.36s/it]



=== Processing Test 31: Train (2019-01-01 to 2019-06-30), Test (2019-07-01 to 2019-12-31) ===


Processing Target Nodes (Test 31): 100%|██████████| 15/15 [00:35<00:00,  2.37s/it]



=== Processing Test 32: Train (2019-07-01 to 2019-12-31), Test (2020-01-01 to 2020-06-30) ===


Processing Target Nodes (Test 32): 100%|██████████| 14/14 [00:33<00:00,  2.41s/it]



=== Processing Test 33: Train (2020-01-01 to 2020-06-30), Test (2020-07-01 to 2020-12-31) ===


Processing Target Nodes (Test 33): 100%|██████████| 15/15 [00:37<00:00,  2.49s/it]



=== Processing Test 34: Train (2020-07-01 to 2020-12-31), Test (2021-01-01 to 2021-06-30) ===


Processing Target Nodes (Test 34): 100%|██████████| 14/14 [00:33<00:00,  2.42s/it]



=== Processing Test 35: Train (2021-01-01 to 2021-06-30), Test (2021-07-01 to 2021-12-31) ===


Processing Target Nodes (Test 35): 100%|██████████| 15/15 [00:36<00:00,  2.41s/it]



=== Processing Test 36: Train (2021-07-01 to 2021-12-31), Test (2022-01-01 to 2022-06-30) ===


Processing Target Nodes (Test 36): 100%|██████████| 14/14 [00:34<00:00,  2.46s/it]



=== Processing Test 37: Train (2022-01-01 to 2022-06-30), Test (2022-07-01 to 2022-12-31) ===


Processing Target Nodes (Test 37): 100%|██████████| 15/15 [00:37<00:00,  2.48s/it]



=== Processing Test 38: Train (2022-07-01 to 2022-12-31), Test (2023-01-01 to 2023-06-30) ===


Processing Target Nodes (Test 38): 100%|██████████| 15/15 [00:36<00:00,  2.45s/it]



=== Processing Test 39: Train (2023-01-01 to 2023-06-30), Test (2023-07-01 to 2023-12-31) ===


Processing Target Nodes (Test 39): 100%|██████████| 14/14 [00:34<00:00,  2.43s/it]



=== Processing Test 40: Train (2023-07-01 to 2023-12-31), Test (2024-01-01 to 2024-06-30) ===


Processing Target Nodes (Test 40): 100%|██████████| 14/14 [00:34<00:00,  2.48s/it]



=== Processing Test 41: Train (2024-01-01 to 2024-06-30), Test (2024-07-01 to 2024-12-31) ===


Processing Target Nodes (Test 41): 100%|██████████| 14/14 [00:33<00:00,  2.41s/it]


Final Results with GCN Regression:





Unnamed: 0,Test ID,Target Index,RMSE,RMAE
0,1,Turkey BIST 100,0.014673,0.011610
1,1,Hang Seng,0.008098,0.006151
2,1,Straits Times,0.006408,0.005126
3,1,Mexican IPC,0.007655,0.006096
4,1,FTSE 100,0.006028,0.004721
...,...,...,...,...
598,41,DAX,0.008427,0.006736
599,41,Mexican IPC,0.008836,0.007041
600,41,IBEX 35,0.008480,0.006518
601,41,CAC 40,0.008797,0.006972


#### 5.2. GAT Model 임베딩

In [21]:
class GAT(torch.nn.Module):
    def __init__(self, num_features, hidden_dim1, hidden_dim2, embedding_dim, heads=1):
        super(GAT, self).__init__()
        self.conv1 = GATConv(num_features, hidden_dim1, heads=heads)
        self.conv2 = GATConv(hidden_dim1 * heads, hidden_dim2, heads=heads)
        self.conv3 = GATConv(hidden_dim2 * heads, embedding_dim, heads=heads, concat=False)
        self.fc = torch.nn.Linear(embedding_dim, 1)  # Output a single value for regression

    def forward(self, data):
        x, edge_index = data.x, data.edge_index
        x = self.conv1(x, edge_index)
        x = F.elu(x)
        x = self.conv2(x, edge_index)
        x = F.elu(x)
        x = self.conv3(x, edge_index)
        x = self.fc(x)  # Linear layer for regression
        return x.squeeze()  # Remove unnecessary dimensions


# 결과 저장용 변수
final_results = []

# 고정된 하이퍼파라미터 설정
hidden_dim1 = 64
hidden_dim2 = 64
embedding_dim = 32
gat_heads = 4
lr = 0.01

# GAT 평가
for test_id, (train_start, train_end, test_start, test_end) in enumerate(test_periods):
    print(f"\n=== Processing Test {test_id + 1}: Train ({train_start} to {train_end}), Test ({test_start} to {test_end}) ===")

    # Train/Test 데이터 분리
    train_data = returns.loc[train_start:train_end]
    test_data = returns.loc[test_start:test_end]

    # 그래프 생성 (Train 데이터 기준)
    G = create_correlation_graph(train_data)
    nodes_in_graph = list(G.nodes)
    node_to_idx = {node: idx for idx, node in enumerate(nodes_in_graph)}
    edge_index = torch.tensor(
        [[node_to_idx[edge[0]], node_to_idx[edge[1]]] for edge in G.edges],
        dtype=torch.long
    ).t()

    # Train/Test 데이터 준비
    X_train, y_train_set = prepare_time_series_data(train_data[nodes_in_graph])
    X_test, y_test_set = prepare_time_series_data(test_data[nodes_in_graph])

    # GAT 모델 평가
    for target_idx in tqdm(range(len(nodes_in_graph)), desc=f"Processing Target Nodes (Test {test_id + 1})"):
        rmse_for_target = []
        rmae_for_target = []

        for _ in range(30):  # 30번 반복 실험
            y_train = y_train_set[:, target_idx]
            y_test = y_test_set[:, target_idx]

            data_train = Data(
                x=torch.tensor(X_train, dtype=torch.float32),
                y=torch.tensor(y_train, dtype=torch.float32),
                edge_index=edge_index
            )
            data_test = Data(
                x=torch.tensor(X_test, dtype=torch.float32),
                y=torch.tensor(y_test, dtype=torch.float32),
                edge_index=edge_index
            )

            # Initialize GAT
            model = GAT(
                num_features=X_train.shape[1],
                hidden_dim1=hidden_dim1,
                hidden_dim2=hidden_dim2,
                embedding_dim=embedding_dim,
                heads=gat_heads
            )
            optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=5e-4)
            criterion = torch.nn.MSELoss()

            # Train the model
            model.train()
            for epoch in range(100):  # 100 epoch로 학습
                optimizer.zero_grad()
                out = model(data_train)
                loss = criterion(out, data_train.y)
                loss.backward()
                optimizer.step()

            # Evaluate on Test Data
            model.eval()
            with torch.no_grad():
                y_pred = model(data_test).cpu().numpy()
                y_true = data_test.y.cpu().numpy()

                rmse = np.sqrt(mean_squared_error(y_true, y_pred))
                rmae = mean_absolute_error(y_true, y_pred)

                rmse_for_target.append(rmse)
                rmae_for_target.append(rmae)

        # Aggregate results for this target
        final_results.append({
            'Test ID': test_id + 1,
            'Target Index': nodes_in_graph[target_idx],
            'RMSE': np.mean(rmse_for_target),
            'RMAE': np.mean(rmae_for_target)
        })

# 결과를 DataFrame으로 정리
gat_results = pd.DataFrame(final_results)

# Display results
print("\nFinal Results with GAT Regression:")
gat_results



=== Processing Test 1: Train (2004-01-01 to 2004-06-30), Test (2004-07-01 to 2004-12-31) ===


Processing Target Nodes (Test 1): 100%|██████████| 15/15 [01:39<00:00,  6.65s/it]



=== Processing Test 2: Train (2004-07-01 to 2004-12-31), Test (2005-01-01 to 2005-06-30) ===


Processing Target Nodes (Test 2): 100%|██████████| 14/14 [01:32<00:00,  6.58s/it]



=== Processing Test 3: Train (2005-01-01 to 2005-06-30), Test (2005-07-01 to 2005-12-31) ===


Processing Target Nodes (Test 3): 100%|██████████| 15/15 [01:52<00:00,  7.52s/it]



=== Processing Test 4: Train (2005-07-01 to 2005-12-31), Test (2006-01-01 to 2006-06-30) ===


Processing Target Nodes (Test 4): 100%|██████████| 15/15 [01:56<00:00,  7.77s/it]



=== Processing Test 5: Train (2006-01-01 to 2006-06-30), Test (2006-07-01 to 2006-12-31) ===


Processing Target Nodes (Test 5): 100%|██████████| 15/15 [02:00<00:00,  8.01s/it]



=== Processing Test 6: Train (2006-07-01 to 2006-12-31), Test (2007-01-01 to 2007-06-30) ===


Processing Target Nodes (Test 6): 100%|██████████| 15/15 [02:01<00:00,  8.08s/it]



=== Processing Test 7: Train (2007-01-01 to 2007-06-30), Test (2007-07-01 to 2007-12-31) ===


Processing Target Nodes (Test 7): 100%|██████████| 15/15 [02:03<00:00,  8.23s/it]



=== Processing Test 8: Train (2007-07-01 to 2007-12-31), Test (2008-01-01 to 2008-06-30) ===


Processing Target Nodes (Test 8): 100%|██████████| 15/15 [02:05<00:00,  8.37s/it]



=== Processing Test 9: Train (2008-01-01 to 2008-06-30), Test (2008-07-01 to 2008-12-31) ===


Processing Target Nodes (Test 9): 100%|██████████| 15/15 [02:04<00:00,  8.31s/it]



=== Processing Test 10: Train (2008-07-01 to 2008-12-31), Test (2009-01-01 to 2009-06-30) ===


Processing Target Nodes (Test 10): 100%|██████████| 15/15 [02:08<00:00,  8.56s/it]



=== Processing Test 11: Train (2009-01-01 to 2009-06-30), Test (2009-07-01 to 2009-12-31) ===


Processing Target Nodes (Test 11): 100%|██████████| 15/15 [02:06<00:00,  8.42s/it]



=== Processing Test 12: Train (2009-07-01 to 2009-12-31), Test (2010-01-01 to 2010-06-30) ===


Processing Target Nodes (Test 12): 100%|██████████| 15/15 [02:06<00:00,  8.44s/it]



=== Processing Test 13: Train (2010-01-01 to 2010-06-30), Test (2010-07-01 to 2010-12-31) ===


Processing Target Nodes (Test 13): 100%|██████████| 15/15 [02:08<00:00,  8.57s/it]



=== Processing Test 14: Train (2010-07-01 to 2010-12-31), Test (2011-01-01 to 2011-06-30) ===


Processing Target Nodes (Test 14): 100%|██████████| 15/15 [02:07<00:00,  8.52s/it]



=== Processing Test 15: Train (2011-01-01 to 2011-06-30), Test (2011-07-01 to 2011-12-31) ===


Processing Target Nodes (Test 15): 100%|██████████| 15/15 [02:06<00:00,  8.40s/it]



=== Processing Test 16: Train (2011-07-01 to 2011-12-31), Test (2012-01-01 to 2012-06-30) ===


Processing Target Nodes (Test 16): 100%|██████████| 15/15 [02:09<00:00,  8.61s/it]



=== Processing Test 17: Train (2012-01-01 to 2012-06-30), Test (2012-07-01 to 2012-12-31) ===


Processing Target Nodes (Test 17): 100%|██████████| 15/15 [02:07<00:00,  8.49s/it]



=== Processing Test 18: Train (2012-07-01 to 2012-12-31), Test (2013-01-01 to 2013-06-30) ===


Processing Target Nodes (Test 18): 100%|██████████| 15/15 [02:06<00:00,  8.46s/it]



=== Processing Test 19: Train (2013-01-01 to 2013-06-30), Test (2013-07-01 to 2013-12-31) ===


Processing Target Nodes (Test 19): 100%|██████████| 15/15 [02:06<00:00,  8.45s/it]



=== Processing Test 20: Train (2013-07-01 to 2013-12-31), Test (2014-01-01 to 2014-06-30) ===


Processing Target Nodes (Test 20): 100%|██████████| 15/15 [02:08<00:00,  8.54s/it]



=== Processing Test 21: Train (2014-01-01 to 2014-06-30), Test (2014-07-01 to 2014-12-31) ===


Processing Target Nodes (Test 21): 100%|██████████| 15/15 [02:04<00:00,  8.31s/it]



=== Processing Test 22: Train (2014-07-01 to 2014-12-31), Test (2015-01-01 to 2015-06-30) ===


Processing Target Nodes (Test 22): 100%|██████████| 14/14 [01:57<00:00,  8.38s/it]



=== Processing Test 23: Train (2015-01-01 to 2015-06-30), Test (2015-07-01 to 2015-12-31) ===


Processing Target Nodes (Test 23): 100%|██████████| 15/15 [02:06<00:00,  8.42s/it]



=== Processing Test 24: Train (2015-07-01 to 2015-12-31), Test (2016-01-01 to 2016-06-30) ===


Processing Target Nodes (Test 24): 100%|██████████| 15/15 [02:11<00:00,  8.76s/it]



=== Processing Test 25: Train (2016-01-01 to 2016-06-30), Test (2016-07-01 to 2016-12-31) ===


Processing Target Nodes (Test 25): 100%|██████████| 15/15 [02:16<00:00,  9.08s/it]



=== Processing Test 26: Train (2016-07-01 to 2016-12-31), Test (2017-01-01 to 2017-06-30) ===


Processing Target Nodes (Test 26): 100%|██████████| 15/15 [02:08<00:00,  8.55s/it]



=== Processing Test 27: Train (2017-01-01 to 2017-06-30), Test (2017-07-01 to 2017-12-31) ===


Processing Target Nodes (Test 27): 100%|██████████| 14/14 [01:57<00:00,  8.40s/it]



=== Processing Test 28: Train (2017-07-01 to 2017-12-31), Test (2018-01-01 to 2018-06-30) ===


Processing Target Nodes (Test 28): 100%|██████████| 13/13 [01:48<00:00,  8.37s/it]



=== Processing Test 29: Train (2018-01-01 to 2018-06-30), Test (2018-07-01 to 2018-12-31) ===


Processing Target Nodes (Test 29): 100%|██████████| 14/14 [01:58<00:00,  8.46s/it]



=== Processing Test 30: Train (2018-07-01 to 2018-12-31), Test (2019-01-01 to 2019-06-30) ===


Processing Target Nodes (Test 30): 100%|██████████| 15/15 [02:07<00:00,  8.47s/it]



=== Processing Test 31: Train (2019-01-01 to 2019-06-30), Test (2019-07-01 to 2019-12-31) ===


Processing Target Nodes (Test 31): 100%|██████████| 15/15 [02:09<00:00,  8.63s/it]



=== Processing Test 32: Train (2019-07-01 to 2019-12-31), Test (2020-01-01 to 2020-06-30) ===


Processing Target Nodes (Test 32): 100%|██████████| 14/14 [02:00<00:00,  8.63s/it]



=== Processing Test 33: Train (2020-01-01 to 2020-06-30), Test (2020-07-01 to 2020-12-31) ===


Processing Target Nodes (Test 33): 100%|██████████| 15/15 [02:13<00:00,  8.89s/it]



=== Processing Test 34: Train (2020-07-01 to 2020-12-31), Test (2021-01-01 to 2021-06-30) ===


Processing Target Nodes (Test 34): 100%|██████████| 14/14 [02:02<00:00,  8.75s/it]



=== Processing Test 35: Train (2021-01-01 to 2021-06-30), Test (2021-07-01 to 2021-12-31) ===


Processing Target Nodes (Test 35): 100%|██████████| 15/15 [02:06<00:00,  8.43s/it]



=== Processing Test 36: Train (2021-07-01 to 2021-12-31), Test (2022-01-01 to 2022-06-30) ===


Processing Target Nodes (Test 36): 100%|██████████| 14/14 [02:00<00:00,  8.62s/it]



=== Processing Test 37: Train (2022-01-01 to 2022-06-30), Test (2022-07-01 to 2022-12-31) ===


Processing Target Nodes (Test 37): 100%|██████████| 15/15 [02:08<00:00,  8.58s/it]



=== Processing Test 38: Train (2022-07-01 to 2022-12-31), Test (2023-01-01 to 2023-06-30) ===


Processing Target Nodes (Test 38): 100%|██████████| 15/15 [02:07<00:00,  8.51s/it]



=== Processing Test 39: Train (2023-01-01 to 2023-06-30), Test (2023-07-01 to 2023-12-31) ===


Processing Target Nodes (Test 39): 100%|██████████| 14/14 [01:58<00:00,  8.47s/it]



=== Processing Test 40: Train (2023-07-01 to 2023-12-31), Test (2024-01-01 to 2024-06-30) ===


Processing Target Nodes (Test 40): 100%|██████████| 14/14 [01:59<00:00,  8.52s/it]



=== Processing Test 41: Train (2024-01-01 to 2024-06-30), Test (2024-07-01 to 2024-12-31) ===


Processing Target Nodes (Test 41): 100%|██████████| 14/14 [01:55<00:00,  8.22s/it]


Final Results with GAT Regression:





Unnamed: 0,Test ID,Target Index,RMSE,RMAE
0,1,Turkey BIST 100,0.014672,0.011611
1,1,Hang Seng,0.008098,0.006153
2,1,Straits Times,0.006407,0.005124
3,1,Mexican IPC,0.007635,0.006081
4,1,FTSE 100,0.006001,0.004696
...,...,...,...,...
598,41,DAX,0.008417,0.006737
599,41,Mexican IPC,0.008822,0.007031
600,41,IBEX 35,0.008479,0.006510
601,41,CAC 40,0.008795,0.006972


### 6. 통계 검정

#### 6.1. 모델별로 Accuracy와 F1 Score에 대해 기초통계량 추출

In [22]:
# rf_results = results_df_rf_time_based
# gbc_results = results_df_gbc_time_based
# mlp_results = results_df_mlp_time_based
# knn_results = results_df_knn_time_based
# sv_results = results_df_svc_time_based
# gcn_results = results_df_gcn_time_based
# gat_results = results_df_gat_time_based

In [25]:
# Function to calculate statistics for a single model and test_id
def calculate_statistics(model_name, results_df):
    rmse_values = results_df['RMSE']
    rmae_values = results_df['RMAE']

    # Calculate mean and standard deviation
    rmse_mean = np.mean(rmse_values)
    rmse_std = np.std(rmse_values, ddof=1)
    rmae_mean = np.mean(rmae_values)
    rmae_std = np.std(rmae_values, ddof=1)

    # Combine mean and std into one string
    rmse_summary = f"{rmse_mean:.4f}±{rmse_std:.4f}"
    rmae_summary = f"{rmae_mean:.4f}±{rmae_std:.4f}"

    return {
        "Model": model_name,
        "Mean RMSE": rmse_mean,
        "RMSE Summary": rmse_summary,
        "Mean RMAE": rmae_mean,
        "RMAE Summary": rmae_summary
    }

# Ensure models list contains DataFrames
models = [
    ("Random Forest", rf_results),
    ("XGBoost", gbc_results),
    ("MLP", mlp_results),
    ("KNN", knn_results),
    ("SVM", svr_results),
    ("GCN", gcn_results),
    ("GAT", gat_results)
]

# Initialize empty list to hold all results
all_test_results = []

# Iterate over all test_ids
for test_id in range(1, 42):  # Assuming test IDs range from 1 to 41
    test_results = []
    for model_name, results_df in models:
        # Ensure results_df is a DataFrame
        if not isinstance(results_df, pd.DataFrame):
            raise TypeError(f"Results for {model_name} must be a DataFrame, got {type(results_df)}")

        # Filter results for the current test_id
        test_df = results_df[results_df['Test ID'] == test_id]
        
        # Calculate statistics and append
        stats = calculate_statistics(model_name, test_df)
        stats["Test ID"] = test_id  # Add test_id to the stats dictionary
        test_results.append(stats)

    # Append all stats for the current test_id to the final results
    all_test_results.extend(test_results)

# Convert to a DataFrame
final_stats_df = pd.DataFrame(all_test_results)

# Display the combined DataFrame
final_stats_df = final_stats_df[['Test ID', 'Model', 'Mean RMSE', 'RMSE Summary', 'Mean RMAE', 'RMAE Summary']]

# Save to CSV for later analysis
final_stats_df.to_csv("reg_model_statistics_summary.csv", index=False)
final_stats_df


Unnamed: 0,Test ID,Model,Mean RMSE,RMSE Summary,Mean RMAE,RMAE Summary
0,1,Random Forest,0.008907,0.0089±0.0032,0.007061,0.0071±0.0025
1,1,XGBoost,0.009779,0.0098±0.0035,0.007715,0.0077±0.0027
2,1,MLP,0.010148,0.0101±0.0028,0.008121,0.0081±0.0022
3,1,KNN,0.009753,0.0098±0.0033,0.007671,0.0077±0.0027
4,1,SVM,0.009988,0.0100±0.0028,0.008044,0.0080±0.0023
...,...,...,...,...,...,...
282,41,MLP,0.012188,0.0122±0.0031,0.009358,0.0094±0.0022
283,41,KNN,0.010823,0.0108±0.0040,0.008255,0.0083±0.0029
284,41,SVM,0.011298,0.0113±0.0045,0.008723,0.0087±0.0036
285,41,GCN,0.010022,0.0100±0.0035,0.007538,0.0075±0.0024


#### 6.2. GCN과 Benchmark Accuracy 성능 비교

##### 6.2.1. RMSE

In [29]:
# Initialize final results list
final_ttest_results = []

# Loop over all test IDs
for test_id in range(1, 42):  # Assuming test IDs range from 1 to 41
    # Filter data for the current test ID
    rf_rmse_test = rf_results[rf_results['Test ID'] == test_id]['RMSE']
    gbc_rmse_test = gbc_results[gbc_results['Test ID'] == test_id]['RMSE']
    mlp_rmse_test = mlp_results[mlp_results['Test ID'] == test_id]['RMSE']
    knn_rmse_test = knn_results[knn_results['Test ID'] == test_id]['RMSE']
    svr_rmse_test = svr_results[svr_results['Test ID'] == test_id]['RMSE']
    gcn_rmse_test = gcn_results[gcn_results['Test ID'] == test_id]['RMSE']

    # Combine RMSEs into a single DataFrame for easier processing
    rmse_stats = pd.concat(
        [rf_rmse_test, gbc_rmse_test, mlp_rmse_test, knn_rmse_test, svr_rmse_test, gcn_rmse_test],
        axis=1,
        ignore_index=True
    )
    rmse_stats.columns = ['RF', 'XGB', 'MLP', 'KNN', 'SVM', 'GCN']

    # Extract GCN column
    gcn_values = rmse_stats['GCN']

    # Perform t-tests comparing GCN with each of the other models
    t_test_results = []
    models = ['RF', 'XGB', 'MLP', 'KNN', 'SVM']

    for model in models:
        other_values = rmse_stats[model]

        # Perform t-test (양측 검정)
        t_stat, p_value_two_sided = ttest_ind(other_values, gcn_values, equal_var=False)

        # 단측 검정으로 p-value 변환 (GCN이 RMSE가 작다는 방향으로 가정)
        p_value_one_sided = p_value_two_sided / 2 if t_stat > 0 else 1 - (p_value_two_sided / 2)

        # Store results
        t_test_results.append({
            'Test ID': test_id,
            'Comparison Model': model,
            'T-Statistic': t_stat,
            'Significance (10%)': p_value_one_sided < 0.1,
            'Significance (5%)': p_value_one_sided < 0.05,
            'Significance (1%)': p_value_one_sided < 0.01,
            'P-Value (One-Sided)': p_value_one_sided
        })

    # Append t-test results for this test ID
    final_ttest_results.extend(t_test_results)

# Convert final results to a DataFrame
final_gcn_rmse_ttest_results_df = pd.DataFrame(final_ttest_results)

# Save to CSV for later analysis
final_gcn_rmse_ttest_results_df.to_csv("reg_gcn_rmse_ttest_results.csv", index=False)

# Display the final results
print("T-Test Results Across All Test Periods:")
final_gcn_rmse_ttest_results_df

T-Test Results Across All Test Periods:


Unnamed: 0,Test ID,Comparison Model,T-Statistic,Significance (10%),Significance (5%),Significance (1%),P-Value (One-Sided)
0,1,RF,0.267122,False,False,False,0.395686
1,1,XGB,1.005736,False,False,False,0.161854
2,1,MLP,1.522545,True,False,False,0.069546
3,1,KNN,1.017248,False,False,False,0.159049
4,1,SVM,1.365394,True,False,False,0.091501
...,...,...,...,...,...,...,...
200,41,RF,0.255652,False,False,False,0.400117
201,41,XGB,0.897497,False,False,False,0.188878
202,41,MLP,1.735267,True,True,False,0.047330
203,41,KNN,0.567363,False,False,False,0.287713


##### 6.2.2. RMAE

In [31]:
from scipy.stats import ttest_ind

# Initialize final results list
final_ttest_results = []

# Loop over all test IDs
for test_id in range(1, 42):  # Assuming test IDs range from 1 to 41
    # Filter data for the current test ID
    rf_rmae_test = rf_results[rf_results['Test ID'] == test_id]['RMAE']
    gbc_rmae_test = gbc_results[gbc_results['Test ID'] == test_id]['RMAE']
    mlp_rmae_test = mlp_results[mlp_results['Test ID'] == test_id]['RMAE']
    knn_rmae_test = knn_results[knn_results['Test ID'] == test_id]['RMAE']
    svr_rmae_test = svr_results[svr_results['Test ID'] == test_id]['RMAE']
    gcn_rmae_test = gcn_results[gcn_results['Test ID'] == test_id]['RMAE']

    # Combine RMAEs into a single DataFrame for easier processing
    rmae_stats = pd.concat(
        [rf_rmae_test, gbc_rmae_test, mlp_rmae_test, knn_rmae_test, svr_rmae_test, gcn_rmae_test],
        axis=1,
        ignore_index=True
    )
    rmae_stats.columns = ['RF', 'XGB', 'MLP', 'KNN', 'SVM', 'GCN']

    # Extract GCN column
    gcn_values = rmae_stats['GCN']

    # Perform t-tests comparing GCN with each of the other models
    t_test_results = []
    models = ['RF', 'XGB', 'MLP', 'KNN', 'SVM']

    for model in models:
        other_values = rmae_stats[model]

        # Perform t-test (양측 검정)
        t_stat, p_value_two_sided = ttest_ind(other_values, gcn_values, equal_var=False)

        # 단측 검정으로 p-value 변환 (GCN이 RMAE가 작다는 방향으로 가정)
        p_value_one_sided = p_value_two_sided / 2 if t_stat > 0 else 1 - (p_value_two_sided / 2)

        # Store results
        t_test_results.append({
            'Test ID': test_id,
            'Comparison Model': model,
            'T-Statistic': t_stat,
            'Significance (10%)': p_value_one_sided < 0.1,
            'Significance (5%)': p_value_one_sided < 0.05,
            'Significance (1%)': p_value_one_sided < 0.01,
            'P-Value (One-Sided)': p_value_one_sided
        })

    # Append t-test results for this test ID
    final_ttest_results.extend(t_test_results)

# Convert final results to a DataFrame
final_gcn_rmae_ttest_results_df = pd.DataFrame(final_ttest_results)

# Save to CSV for later analysis
final_gcn_rmae_ttest_results_df.to_csv("reg_gcn_rmae_ttest_results.csv", index=False)

# Display the final results
print("T-Test Results Across All Test Periods (RMAE):")
final_gcn_rmae_ttest_results_df

T-Test Results Across All Test Periods (RMAE):


Unnamed: 0,Test ID,Comparison Model,T-Statistic,Significance (10%),Significance (5%),Significance (1%),P-Value (One-Sided)
0,1,RF,0.397990,False,False,False,0.346857
1,1,XGB,1.107216,False,False,False,0.139019
2,1,MLP,1.737736,True,True,False,0.046625
3,1,KNN,1.070953,False,False,False,0.146835
4,1,SVM,1.601321,True,False,False,0.060290
...,...,...,...,...,...,...,...
200,41,RF,0.291998,False,False,False,0.386304
201,41,XGB,1.011017,False,False,False,0.160727
202,41,MLP,2.064607,True,True,False,0.024571
203,41,KNN,0.710009,False,False,False,0.242103


#### 6.3. GAT와 Benchmark Accuracy 성능 비교

##### 6.3.1. RMSE

In [32]:
# Initialize final results list
final_ttest_results = []

# Loop over all test IDs
for test_id in range(1, 42):  # Assuming test IDs range from 1 to 41
    # Filter data for the current test ID
    rf_rmse_test = rf_results[rf_results['Test ID'] == test_id]['RMSE']
    gbc_rmse_test = gbc_results[gbc_results['Test ID'] == test_id]['RMSE']
    mlp_rmse_test = mlp_results[mlp_results['Test ID'] == test_id]['RMSE']
    knn_rmse_test = knn_results[knn_results['Test ID'] == test_id]['RMSE']
    svr_rmse_test = svr_results[svr_results['Test ID'] == test_id]['RMSE']
    gat_rmse_test = gat_results[gat_results['Test ID'] == test_id]['RMSE']

    # Combine RMSEs into a single DataFrame for easier processing
    rmse_stats = pd.concat(
        [rf_rmse_test, gbc_rmse_test, mlp_rmse_test, knn_rmse_test, svr_rmse_test, gat_rmse_test],
        axis=1,
        ignore_index=True
    )
    rmse_stats.columns = ['RF', 'XGB', 'MLP', 'KNN', 'SVM', 'GAT']

    # Extract GCN column
    gat_values = rmse_stats['GAT']

    # Perform t-tests comparing GCN with each of the other models
    t_test_results = []
    models = ['RF', 'XGB', 'MLP', 'KNN', 'SVM']

    for model in models:
        other_values = rmse_stats[model]

        # Perform t-test (양측 검정)
        t_stat, p_value_two_sided = ttest_ind(other_values, gat_values, equal_var=False)

        # 단측 검정으로 p-value 변환 (GCN이 RMSE가 작다는 방향으로 가정)
        p_value_one_sided = p_value_two_sided / 2 if t_stat > 0 else 1 - (p_value_two_sided / 2)

        # Store results
        t_test_results.append({
            'Test ID': test_id,
            'Comparison Model': model,
            'T-Statistic': t_stat,
            'Significance (10%)': p_value_one_sided < 0.1,
            'Significance (5%)': p_value_one_sided < 0.05,
            'Significance (1%)': p_value_one_sided < 0.01,
            'P-Value (One-Sided)': p_value_one_sided
        })

    # Append t-test results for this test ID
    final_ttest_results.extend(t_test_results)

# Convert final results to a DataFrame
final_gat_rmse_ttest_results_df = pd.DataFrame(final_ttest_results)

# Save to CSV for later analysis
final_gat_rmse_ttest_results_df.to_csv("reg_gat_rmse_ttest_results.csv", index=False)

# Display the final results
print("T-Test Results Across All Test Periods:")
final_gat_rmse_ttest_results_df

T-Test Results Across All Test Periods:


Unnamed: 0,Test ID,Comparison Model,T-Statistic,Significance (10%),Significance (5%),Significance (1%),P-Value (One-Sided)
0,1,RF,0.278628,False,False,False,0.391309
1,1,XGB,1.015954,False,False,False,0.159451
2,1,MLP,1.533658,True,False,False,0.068170
3,1,KNN,1.027783,False,False,False,0.156600
4,1,SVM,1.376674,True,False,False,0.089762
...,...,...,...,...,...,...,...
200,41,RF,0.258153,False,False,False,0.399162
201,41,XGB,0.899501,False,False,False,0.188353
202,41,MLP,1.736720,True,True,False,0.047201
203,41,KNN,0.569541,False,False,False,0.286984


##### 6.3.2. RMAE

In [33]:
from scipy.stats import ttest_ind

# Initialize final results list
final_ttest_results = []

# Loop over all test IDs
for test_id in range(1, 42):  # Assuming test IDs range from 1 to 41
    # Filter data for the current test ID
    rf_rmae_test = rf_results[rf_results['Test ID'] == test_id]['RMAE']
    gbc_rmae_test = gbc_results[gbc_results['Test ID'] == test_id]['RMAE']
    mlp_rmae_test = mlp_results[mlp_results['Test ID'] == test_id]['RMAE']
    knn_rmae_test = knn_results[knn_results['Test ID'] == test_id]['RMAE']
    svr_rmae_test = svr_results[svr_results['Test ID'] == test_id]['RMAE']
    gat_rmae_test = gat_results[gat_results['Test ID'] == test_id]['RMAE']

    # Combine RMAEs into a single DataFrame for easier processing
    rmae_stats = pd.concat(
        [rf_rmae_test, gbc_rmae_test, mlp_rmae_test, knn_rmae_test, svr_rmae_test, gat_rmae_test],
        axis=1,
        ignore_index=True
    )
    rmae_stats.columns = ['RF', 'XGB', 'MLP', 'KNN', 'SVM', 'GAT']

    # Extract GCN column
    gat_values = rmae_stats['GAT']

    # Perform t-tests comparing GCN with each of the other models
    t_test_results = []
    models = ['RF', 'XGB', 'MLP', 'KNN', 'SVM']

    for model in models:
        other_values = rmae_stats[model]

        # Perform t-test (양측 검정)
        t_stat, p_value_two_sided = ttest_ind(other_values, gat_values, equal_var=False)

        # 단측 검정으로 p-value 변환 (GCN이 RMAE가 작다는 방향으로 가정)
        p_value_one_sided = p_value_two_sided / 2 if t_stat > 0 else 1 - (p_value_two_sided / 2)

        # Store results
        t_test_results.append({
            'Test ID': test_id,
            'Comparison Model': model,
            'T-Statistic': t_stat,
            'Significance (10%)': p_value_one_sided < 0.1,
            'Significance (5%)': p_value_one_sided < 0.05,
            'Significance (1%)': p_value_one_sided < 0.01,
            'P-Value (One-Sided)': p_value_one_sided
        })

    # Append t-test results for this test ID
    final_ttest_results.extend(t_test_results)

# Convert final results to a DataFrame
final_gat_rmae_ttest_results_df = pd.DataFrame(final_ttest_results)

# Save to CSV for later analysis
final_gat_rmae_ttest_results_df.to_csv("reg_gat_rmae_ttest_results.csv", index=False)

# Display the final results
print("T-Test Results Across All Test Periods (RMAE):")
final_gat_rmae_ttest_results_df

T-Test Results Across All Test Periods (RMAE):


Unnamed: 0,Test ID,Comparison Model,T-Statistic,Significance (10%),Significance (5%),Significance (1%),P-Value (One-Sided)
0,1,RF,0.410765,False,False,False,0.342215
1,1,XGB,1.118785,False,False,False,0.136574
2,1,MLP,1.749756,True,True,False,0.045559
3,1,KNN,1.082681,False,False,False,0.144258
4,1,SVM,1.613246,True,False,False,0.058976
...,...,...,...,...,...,...,...
200,41,RF,0.293302,False,False,False,0.385811
201,41,XGB,1.011619,False,False,False,0.160584
202,41,MLP,2.063978,True,True,False,0.024605
203,41,KNN,0.710879,False,False,False,0.241836
