In [None]:
import pandas as pd#기본 simple MLP 모델 사용 
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import LabelEncoder, StandardScaler
import os
import warnings

# Silence every warning category for the rest of the run
# (NOTE(review): this also hides deprecation/convergence warnings).
warnings.filterwarnings('ignore')

print("--- [PyTorch Version] Episode-Based Prediction ---")

# ==============================================================================
# [1] Settings and hyperparameters
# ==============================================================================
# Input/output paths, all relative to the current working directory.
TRAIN_FILE = 'train.csv'
TEST_FOLDER = 'test/'
SAMPLE_FILE = 'sample_submission.csv'
OUTPUT_FILE = 'submission_torch.csv'

# Hyperparameters
BATCH_SIZE = 64
LEARNING_RATE = 0.001
EPOCHS = 30 # number of training epochs
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

print(f" -> 사용 장치: {DEVICE}")

# Fail fast: the sample submission is read at the very end of the script, so
# check for it before spending time on training.
if not os.path.exists(SAMPLE_FILE):
    raise FileNotFoundError(f"'{SAMPLE_FILE}' 파일이 없습니다.")

# ==============================================================================
# [2] 데이터 전처리
# ==============================================================================
def process_data(df, encoders=None, is_train=True):
    """Aggregate event rows into one feature row per (game_id, episode_id).

    Args:
        df: Event-level DataFrame. The code reads the columns ``start_x``,
            ``start_y``, ``time_seconds``, ``type_name``, ``result_name``,
            ``game_id`` and ``episode_id``; with ``is_train=True`` it also
            reads ``end_x`` and ``end_y``. The input is copied, not mutated.
        encoders: Mapping ``column name -> fitted encoder`` (an object exposing
            ``classes_``) for the categorical columns. Required when
            ``is_train`` is False; ignored and replaced by freshly fitted
            encoders when ``is_train`` is True.
        is_train: When True, fit new label encoders and add the ``end_x`` /
            ``end_y`` "last" aggregations (used as regression targets by the
            caller).

    Returns:
        Tuple ``(df_agg, encoders)``: the aggregated frame with flattened
        ``<column>_<agg>`` column names and NaNs replaced by 0, plus the
        encoders that were fitted or passed in.

    Raises:
        ValueError: If ``is_train`` is False and ``encoders`` is None.
    """
    df = df.copy()

    # 1. Distance / angle from the event start to the point (105, 34)
    #    (presumably the goal centre of a 105 x 68 pitch -- TODO confirm).
    df['dist_to_goal'] = np.sqrt((105 - df['start_x'])**2 + (34 - df['start_y'])**2)
    # NOTE(review): angle_to_goal is not listed in the aggregations below, so
    # it never reaches the returned frame.
    df['angle_to_goal'] = np.arctan2((34 - df['start_y']), (105 - df['start_x']))

    # 2. Categorical encoding
    cat_cols = ['type_name', 'result_name']
    if is_train:
        encoders = {}
        for col in cat_cols:
            le = LabelEncoder()
            df[col] = df[col].fillna('Unknown').astype(str)
            df[col] = le.fit_transform(df[col])
            encoders[col] = le
    else:
        if encoders is None:
            raise ValueError("is_train=False 인 경우 encoders 를 전달해야 합니다.")
        for col in cat_cols:
            df[col] = df[col].fillna('Unknown').astype(str)
            le = encoders[col]
            # A label's code is its position in classes_, so one dict lookup
            # replaces a per-row transform() call; unseen labels become -1.
            code_of = {label: code for code, label in enumerate(le.classes_)}
            df[col] = df[col].map(code_of).fillna(-1).astype('int64')

    # 3. Aggregation (per-episode statistics)
    num_aggs = {
        'start_x': ['mean', 'std', 'min', 'max', 'last'],
        'start_y': ['mean', 'std', 'min', 'max', 'last'],
        'dist_to_goal': ['mean', 'last'],
        'time_seconds': ['count', 'max', 'min']
    }
    cat_aggs = {
        'type_name': ['nunique', 'last'],
        'result_name': ['last']
    }

    agg_dict = {**num_aggs, **cat_aggs}

    if is_train:
        agg_dict['end_x'] = ['last']
        agg_dict['end_y'] = ['last']

    # 'last' follows the row order of the input frame within each episode.
    grp = df.groupby(['game_id', 'episode_id'])
    df_agg = grp.agg(agg_dict)

    # Flatten the (column, agg) MultiIndex into names such as 'start_x_mean'.
    df_agg.columns = ['_'.join(col).strip() for col in df_agg.columns.values]
    df_agg = df_agg.reset_index()

    # std is NaN for single-row episodes; replace every NaN with 0.
    df_agg = df_agg.fillna(0)

    return df_agg, encoders

# ==============================================================================
# [3] PyTorch Dataset & Model 정의
# ==============================================================================
class SoccerDataset(Dataset):
    """Dataset holding a float32 feature tensor and optional float32 targets.

    Indexing yields ``(features, target)`` when targets were supplied and the
    bare feature tensor otherwise.
    """

    def __init__(self, X, y=None):
        # Both inputs are converted to float32 tensors up front.
        self.X = torch.FloatTensor(X)
        self.y = None if y is None else torch.FloatTensor(y)

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        sample = self.X[idx]
        if self.y is None:
            # No targets were given: return features only.
            return sample
        return sample, self.y[idx]

class SimpleMLP(nn.Module):
    """Two-hidden-layer MLP (128 -> 64) producing two outputs per input row.

    Args:
        input_dim: Number of input features per sample.
    """

    def __init__(self, input_dim):
        super().__init__()
        hidden_1, hidden_2, n_outputs = 128, 64, 2
        layers = [
            nn.Linear(input_dim, hidden_1),
            nn.BatchNorm1d(hidden_1),   # stabilises training
            nn.ReLU(),
            nn.Dropout(0.2),            # regularisation against overfitting
            nn.Linear(hidden_1, hidden_2),
            nn.BatchNorm1d(hidden_2),
            nn.ReLU(),
            nn.Linear(hidden_2, n_outputs),  # outputs: end_x, end_y
        ]
        # Keep the attribute name `net` so state-dict keys stay 'net.<i>.*'.
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Run *x* through the stacked layers and return the result."""
        return self.net(x)

# ==============================================================================
# [4] 데이터 준비 및 스케일링
# ==============================================================================
print("1. 데이터 로드 및 전처리...")
df_train = pd.read_csv(TRAIN_FILE)
train_agg, saved_encoders = process_data(df_train, is_train=True)

# Targets are the last end_x / end_y of each episode; every other aggregated
# column except the two grouping keys is used as a feature.
target_cols = ['end_x_last', 'end_y_last']
features = [
    c for c in train_agg.columns
    if c not in ('game_id', 'episode_id') and c not in target_cols
]

# Standardise the features; the fitted scaler is reused on the test set.
scaler = StandardScaler()
scaler.fit(train_agg[features])
X_train = scaler.transform(train_agg[features])
y_train = train_agg[target_cols].to_numpy()

# Dataset and shuffled DataLoader for training
train_dataset = SoccerDataset(X_train, y_train)
train_loader = DataLoader(train_dataset, BATCH_SIZE, shuffle=True)

# Model, loss (MSE, since this is a regression task) and optimiser
model = SimpleMLP(len(features))
model = model.to(DEVICE)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

# ==============================================================================
# [5] 모델 학습
# ==============================================================================
print(f"2. 모델 학습 시작 (Epochs: {EPOCHS})...")

# Training mode: Dropout is active and BatchNorm uses per-batch statistics.
model.train()
for epoch in range(EPOCHS):
    total_loss = 0.0   # sum of per-sample losses seen this epoch
    n_samples = 0      # number of samples that contributed to total_loss
    for X_batch, y_batch in train_loader:
        batch_len = X_batch.size(0)
        # BatchNorm1d cannot compute batch statistics from a single sample in
        # training mode and raises; this happens for the trailing batch when
        # the dataset size is 1 modulo BATCH_SIZE, so skip such a batch.
        if batch_len < 2:
            continue

        X_batch, y_batch = X_batch.to(DEVICE), y_batch.to(DEVICE)

        optimizer.zero_grad()
        outputs = model(X_batch)
        loss = criterion(outputs, y_batch)
        loss.backward()
        optimizer.step()

        # criterion returns the batch mean; weight it by the batch size so a
        # short trailing batch does not distort the reported epoch average.
        total_loss += loss.item() * batch_len
        n_samples += batch_len

    if (epoch + 1) % 5 == 0:
        print(f"   Epoch [{epoch+1}/{EPOCHS}], Loss: {total_loss / max(n_samples, 1):.4f}")

# ==============================================================================
# [6] Test 예측 및 결과 저장
# ==============================================================================
print("3. Test 데이터 예측...")

# Collect every CSV below TEST_FOLDER (recursively). Sorting makes the
# concatenation order independent of the filesystem's listing order.
test_files = sorted(
    os.path.join(root, name)
    for root, _dirs, names in os.walk(TEST_FOLDER)
    for name in names
    if name.endswith(".csv")
)

# pd.concat on an empty list fails with an unhelpful "No objects to
# concatenate"; report the actual problem instead.
if not test_files:
    raise FileNotFoundError(f"'{TEST_FOLDER}' 폴더에 CSV 파일이 없습니다.")

df_test_list = [pd.read_csv(f) for f in test_files]
df_test = pd.concat(df_test_list, ignore_index=True)

# Same preprocessing as training, reusing the encoders fitted on train.
test_agg, _ = process_data(df_test, encoders=saved_encoders, is_train=False)

# Scale with the statistics fitted on the training set.
X_test = scaler.transform(test_agg[features])

# Predict in order (shuffle=False) so rows line up with test_agg.
test_dataset = SoccerDataset(X_test)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

# Eval mode: Dropout disabled, BatchNorm uses its running statistics.
model.eval()
all_preds = []
with torch.no_grad():
    for X_batch in test_loader:
        X_batch = X_batch.to(DEVICE)
        outputs = model(X_batch)
        all_preds.append(outputs.cpu().numpy())

predictions = np.vstack(all_preds)

# Clip predictions into [0, MAX_X] x [0, MAX_Y].
MAX_X, MAX_Y = 105.0, 68.0
test_agg['pred_end_x'] = np.clip(predictions[:, 0], 0, MAX_X)
test_agg['pred_end_y'] = np.clip(predictions[:, 1], 0, MAX_Y)

# ==============================================================================
# [7] Sample Submission 매핑 (절대 기준)
# ==============================================================================
print("4. Sample Submission 파일 병합...")

# The sample submission defines which rows are written and in what order.
submission = pd.read_csv(SAMPLE_FILE)

# Split the key into its parts (e.g. 153363_1 -> 153363, 1).
submission['game_id'] = submission['game_episode'].apply(lambda x: int(x.split('_')[0]))
submission['episode_id'] = submission['game_episode'].apply(lambda x: int(x.split('_')[1]))

# Left merge keeps every sample-submission row, with or without a prediction.
final_df = pd.merge(submission, test_agg[['game_id', 'episode_id', 'pred_end_x', 'pred_end_y']],
                    on=['game_id', 'episode_id'],
                    how='left')

# Overwrite the placeholder columns with the model predictions.
final_df['end_x'] = final_df['pred_end_x']
final_df['end_y'] = final_df['pred_end_y']

# Episodes absent from the test data have no prediction. They are still
# filled with 0 (best effort), but the count is reported so that a key
# mismatch does not go unnoticed.
n_missing = int(final_df['pred_end_x'].isna().sum())
if n_missing:
    print(f" -> [경고] 예측이 없는 에피소드 {n_missing}개를 0으로 채웁니다.")
final_df = final_df.fillna(0)

# Write the final file with only the required columns.
final_output = final_df[['game_episode', 'end_x', 'end_y']]
final_output.to_csv(OUTPUT_FILE, index=False)

print(f"\n[성공] '{OUTPUT_FILE}' 생성 완료.")
print(f" -> 데이터 확인:\n{final_output.head()}")

--- [High Accuracy] LSTM Sequence Model ---
 -> 사용 장치: cpu
1. 데이터 로드 및 시퀀스 변환...
 -> 시퀀스 변환 중... (Train)
2. 학습 시작 (Epochs: 50)...


KeyboardInterrupt: 