In [1]:
import torch
import os
import torch.nn as nn
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler, MaxAbsScaler
from torch.utils.data import DataLoader, TensorDataset
import torch.optim as optim
import warnings 
warnings.filterwarnings('ignore')

In [2]:
# Folder holding the training CSV files.
dummy_csv_folder_path = '/astrago_test_case/xgboost/data/train'
# Folder holding the test CSV files.
test_csv_folder_path = '/astrago_test_case/xgboost/data/test'

# Seed for the train/validation split so that Restart & Run All reproduces
# the same partition (and therefore comparable metrics) on every run.
SPLIT_SEED = 42

# os.listdir returns entries in arbitrary order; sorting makes the row order
# of the concatenated frame deterministic.
dummy_csv_file_paths = sorted(
    os.path.join(dummy_csv_folder_path, file)
    for file in os.listdir(dummy_csv_folder_path)
    if file.endswith('.csv')
)
if not dummy_csv_file_paths:
    # Fail with a clear message instead of a KeyError on 'data_num' further down.
    raise FileNotFoundError(f'No .csv files found in {dummy_csv_folder_path}')

# Read every file, then concatenate once: calling pd.concat inside the loop
# re-copies all previously loaded rows on each iteration.
df = pd.concat(
    [pd.read_csv(file_path) for file_path in dummy_csv_file_paths],
    ignore_index=True,
)

# Discard rows whose data_num is 0 and renumber the remaining rows from 0.
df = df[df['data_num'] != 0].reset_index(drop=True)

# Features are every column except the target; the target is the total
# inference time.
X = df.drop('total_data_inference_time', axis=1).values
y = df['total_data_inference_time'].values

# 70 % train / 30 % validation split.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.3, random_state=SPLIT_SEED
)

# Feature scaling is currently disabled. If it is re-enabled, fit the scaler
# on the training split only and merely transform the validation split,
# otherwise validation statistics leak into preprocessing:
# scaler = StandardScaler()  # or MinMaxScaler() / RobustScaler()
# X_train = scaler.fit_transform(X_train)
# X_val = scaler.transform(X_val)

# Convert both splits to float32 torch tensors.
X_train = torch.tensor(X_train, dtype=torch.float32)
y_train = torch.tensor(y_train, dtype=torch.float32)
X_val = torch.tensor(X_val, dtype=torch.float32)
y_val = torch.tensor(y_val, dtype=torch.float32)

# Batched loaders (batch size 16); only the training loader shuffles.
train_data = TensorDataset(X_train, y_train)
val_data = TensorDataset(X_val, y_val)
train_loader = DataLoader(train_data, batch_size=16, shuffle=True)
val_loader = DataLoader(val_data, batch_size=16)

In [3]:
# Value written into every row of the test set's FLOPS column below.
# NOTE(review): presumably the FLOPS figure of the model under test - confirm.
TEST_FLOPS = 14

# os.listdir returns entries in arbitrary order; sorting keeps the row order
# of the test frame deterministic across runs.
test_csv_file_paths = sorted(
    os.path.join(test_csv_folder_path, file)
    for file in os.listdir(test_csv_folder_path)
    if file.endswith('.csv')
)
if not test_csv_file_paths:
    # Fail with a clear message instead of a KeyError on 'inference_time' below.
    raise FileNotFoundError(f'No .csv files found in {test_csv_folder_path}')

# Collect the per-file frames and concatenate once; concatenating inside the
# loop re-copies all previously loaded rows on each iteration.
test_frames = []
for file_path in test_csv_file_paths:
    df_each = pd.read_csv(file_path)
    # Running total of the per-sample inference time, computed per file so the
    # sum restarts at the top of every CSV.
    df_each['total_inference_time'] = df_each['single_data_inference_time'].cumsum()
    test_frames.append(df_each)
df = pd.concat(test_frames, ignore_index=True)

# Build the test frame in a single chain so that no step mutates a filtered
# slice of `df` (the previous inplace drops / column assignment on the slice
# are the pattern pandas reports as SettingWithCopyWarning).
df_test = (
    df[df['inference_time'] != 0]  # drop rows with zero inference_time (the "epoch 0" rows)
    .drop(
        columns=[
            'model_name',
            'data_num',  # removed first so that 'num' can take over this name
            'gpu_usage',
            'cpu_usage',
            'inference_time',
            'save_time',
            'single_data_inference_time',
            'gpu',
        ]
    )
    .rename(columns={'num': 'data_num'})
    [['FLOPS', 'data_num', 'imgsz', 'param', 'total_inference_time']]
    .assign(FLOPS=TEST_FLOPS)  # overwrite FLOPS with the constant, keeping its column position
)

# Features are the four descriptor columns; the target is the cumulative
# inference time.
X_test = df_test.drop('total_inference_time', axis=1).values
y_test = df_test['total_inference_time'].values

# Feature scaling is currently disabled. If it is re-enabled, reuse the scaler
# that was fitted on the training features and only call transform() here;
# fitting a fresh scaler on the test set would put it on a different scale.

# Convert to float32 torch tensors.
X_test = torch.tensor(X_test, dtype=torch.float32)
y_test = torch.tensor(y_test, dtype=torch.float32)

In [4]:
# Define, fit and evaluate the XGBoost regression model.
import xgboost as xgb
from sklearn.metrics import mean_squared_error

# Hyper-parameters, gathered in one place for readability.
# NOTE(review): tree_method='gpu_hist' and gpu_id are deprecated in XGBoost 2.x
# in favour of tree_method='hist' with device='cuda' - confirm against the
# installed version before changing.
xgb_params = dict(
    objective='reg:squarederror',
    learning_rate=0.05,
    min_child_weight=5,
    subsample=0.45,
    max_depth=9,
    n_estimators=10000,
    tree_method='gpu_hist',
    gpu_id=0,
)
model = xgb.XGBRegressor(**xgb_params)

# Fit on the training split.
model.fit(X_train, y_train)


def _predict_and_report(split_name, features, targets):
    """Predict `features` with the fitted model and print the split's MSE.

    Args:
        split_name: Label placed at the start of the printed line.
        features: Inputs handed to ``model.predict``.
        targets: Ground-truth values the predictions are scored against.

    Returns:
        Tuple ``(predictions, mse)``.
    """
    predictions = model.predict(features)
    mse = mean_squared_error(targets, predictions)
    print(f'{split_name} MSE: {mse:.4f}')
    return predictions, mse


# Score the training, validation and test sets, in that order.
train_preds, train_mse = _predict_and_report('Train', X_train, y_train)
val_preds, val_mse = _predict_and_report('Validation', X_val, y_val)
y_pred, test_mse = _predict_and_report('Test', X_test, y_test)

Train MSE: 0.0000
Validation MSE: 0.0000
Test MSE: 6.1487
