In [None]:
# v2: map additional physics features onto each event and sort by action_id
# (the time order may differ from the action order).
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence
from sklearn.preprocessing import StandardScaler
import os
import warnings

# Suppress every warning category for the rest of the run.
warnings.filterwarnings("ignore")

print("--- [Physics Enhanced] LSTM with Action-ID Sorting ---")

# ==============================================================================
# [1] Configuration
# ==============================================================================
# Input / output locations (relative to the working directory).
TRAIN_FILE = "train.csv"
TEST_FOLDER = "test/"
SAMPLE_FILE = "sample_submission.csv"
OUTPUT_FILE = "submission_physics.csv"

# Model and optimisation hyper-parameters.
BATCH_SIZE = 64
HIDDEN_DIM = 256
NUM_LAYERS = 2
DROPOUT = 0.3
LEARNING_RATE = 1e-3
EPOCHS = 50

# Run on the GPU when torch can see one, otherwise on the CPU.
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Fail fast: stop before any work is done if the sample submission is absent.
if not os.path.exists(SAMPLE_FILE):
    raise FileNotFoundError(f"'{SAMPLE_FILE}' 파일이 없습니다.")

# ==============================================================================
# [2] 핵심 구현 1: 물리적 특징 매핑 (Physics Embedding)
# ==============================================================================
def apply_physics_features(df):
    """Add three hand-crafted "physics" scores looked up from ``type_name``.

    Instead of label-encoding the event type, every event is described by
    numeric scores taken from fixed lookup tables. An event type that is
    missing from a table receives that table's fallback value.

    Args:
        df: DataFrame with a ``type_name`` column. It is modified in place.

    Returns:
        The same DataFrame object, with these columns added:
        ``physics_speed`` (fallback 0.4), ``physics_control`` (fallback 0.5)
        and ``physics_contact`` (fallback 0.1).
    """
    # (output column, lookup table, fallback for unlisted event types)
    score_tables = (
        # How fast the ball travels during the event: passes, shots and
        # clearances are fast; duels and similar events are slow.
        (
            'physics_speed',
            {
                'Pass': 0.9, 'Shot': 1.0, 'Clearance': 0.9, 'Cross': 0.9,
                'Carry': 0.4, 'Dribble': 0.5,
                'Duel': 0.1, 'Foul': 0.0, 'Interception': 0.2, 'Block': 0.1,
                'Save': 0.1, 'Take On': 0.5, 'Ball Recovery': 0.3,
            },
            0.4,
        ),
        # How much the player controls the ball: carrying/dribbling is full
        # control (1), releasing the ball is none (0), contests are 0.5.
        (
            'physics_control',
            {
                'Carry': 1.0, 'Dribble': 1.0, 'Take On': 1.0, 'Ball Touch': 1.0,
                'Pass': 0.0, 'Shot': 0.0, 'Clearance': 0.0, 'Cross': 0.0,
                'Duel': 0.5, 'Interception': 0.5, 'Block': 0.5, 'Foul': 0.0,
            },
            0.5,
        ),
        # Likelihood of body contact: highest for duels, fouls and tackles.
        (
            'physics_contact',
            {
                'Duel': 1.0, 'Foul': 1.0, 'Tackle': 1.0, 'Aerial Duel': 1.0,
                'Block': 0.8, 'Interception': 0.6,
                'Pass': 0.1, 'Carry': 0.2, 'Shot': 0.1,
            },
            0.1,
        ),
    )

    event_types = df['type_name']
    for column, table, fallback in score_tables:
        df[column] = event_types.map(table).fillna(fallback)

    return df

# ==============================================================================
# [3] 데이터 전처리 및 시퀀스 생성
# ==============================================================================
def create_sequences(df, scaler=None, is_train=True):
    """Convert an event table into one scaled feature sequence per episode.

    Rows are ordered by ``game_id`` -> ``episode_id`` -> ``action_id`` rather
    than by ``time_seconds``. Per the original author's note on the data
    description, action ids and timestamps may disagree, so the action id is
    used as the ordering key.

    Args:
        df: Event table. Columns read here: ``game_id``, ``episode_id``,
            ``action_id``, ``type_name``, ``start_x``, ``start_y``,
            ``time_seconds`` and, when ``is_train`` is true, ``end_x`` and
            ``end_y``. A ``game_episode`` column is derived when absent.
            The caller's frame is not modified; a copy is processed.
        scaler: Fitted scaler exposing ``transform``. Required when
            ``is_train`` is false. Ignored when ``is_train`` is true, in
            which case a new ``StandardScaler`` is fitted.
        is_train: When true, fit the scaler and collect targets.

    Returns:
        Tuple ``(sequences, targets, keys, scaler, n_features)``:
            sequences: list of float32 tensors, one per
                ``(game_id, episode_id)`` group, each of shape
                ``(n_events, n_features)``.
            targets: list of float32 tensors holding ``end_x, end_y`` of the
                last row of each group; empty when ``is_train`` is false.
            keys: list of ``(game_id, episode_id)`` tuples aligned with
                ``sequences``.
            scaler: the scaler that was fitted or passed in.
            n_features: number of feature columns per event.

    Raises:
        ValueError: If ``is_train`` is false and no scaler is supplied.
    """
    # Fail before doing any work instead of crashing later on None.transform.
    if not is_train and scaler is None:
        raise ValueError("is_train=False일 때는 학습된 scaler를 전달해야 합니다.")

    df = df.copy()

    # Ordering guarantee: action_id (not time_seconds) is the sort key inside
    # each game/episode so that the sequence order is preserved.
    df = df.sort_values(['game_id', 'episode_id', 'action_id'])

    # 1. Physics scores derived from the event type.
    df = apply_physics_features(df)

    # 2. Distance and angle from the start position to the fixed point
    #    (105, 34) -- presumably the centre of the attacking goal; confirm
    #    against the pitch coordinate system of the dataset.
    df['dist_to_goal'] = np.sqrt((105 - df['start_x'])**2 + (34 - df['start_y'])**2)
    df['angle_to_goal'] = np.arctan2((34 - df['start_y']), (105 - df['start_x']))

    # 3. Time elapsed since the previous action of the same episode.
    if 'game_episode' not in df.columns:
        df['game_episode'] = df['game_id'].astype(str) + '_' + df['episode_id'].astype(str)

    # The first action of each episode has no predecessor -> 0. Because rows
    # follow action_id, time can run backwards; clamp negative deltas to 0.
    df['time_delta'] = (
        df.groupby('game_episode')['time_seconds'].diff().fillna(0).clip(lower=0)
    )

    # 4. Model inputs: physics scores are used instead of an encoded type_name.
    feature_cols = [
        'start_x', 'start_y',
        'dist_to_goal', 'angle_to_goal',
        'time_delta',
        'physics_speed', 'physics_control', 'physics_contact'
    ]

    # 5. Standardise features: fit on training data, reuse the fit otherwise.
    if is_train:
        scaler = StandardScaler()
        df[feature_cols] = scaler.fit_transform(df[feature_cols].values)
    else:
        df[feature_cols] = scaler.transform(df[feature_cols].values)

    # 6. One sequence per episode; rows inside each group keep the sort order.
    grouped = df.groupby(['game_id', 'episode_id'])

    sequences = []
    targets = []
    keys = []

    print(f" -> 시퀀스 변환 및 물리 피처 적용 ({'Train' if is_train else 'Test'})")

    for (game_id, episode_id), group in grouped:
        sequences.append(torch.tensor(group[feature_cols].values, dtype=torch.float32))
        keys.append((game_id, episode_id))

        # Target (training only): end position of the episode's last action.
        if is_train:
            target = group[['end_x', 'end_y']].iloc[-1].values
            targets.append(torch.tensor(target, dtype=torch.float32))

    return sequences, targets, keys, scaler, len(feature_cols)

# ==============================================================================
# [4] Dataset & LSTM Model (이전과 동일 구조)
# ==============================================================================
class SoccerSeqDataset(Dataset):
    """Dataset of per-episode feature sequences with optional targets.

    Args:
        sequences: Indexable collection of sequences, one per episode.
        targets: Optional indexable collection aligned with ``sequences``.
            ``None`` or an empty collection means "no targets".

    Items are ``(sequence, target)`` pairs when targets are present and the
    bare ``sequence`` otherwise.
    """

    def __init__(self, sequences, targets=None):
        self.sequences = sequences
        self.targets = targets

    def __len__(self):
        return len(self.sequences)

    def _has_targets(self):
        """Return True when a non-empty targets collection was supplied."""
        # Explicit None/length check instead of truthiness: truth-testing a
        # multi-element tensor or ndarray raises instead of returning a bool.
        return self.targets is not None and len(self.targets) > 0

    def __getitem__(self, idx):
        if self._has_targets():
            return self.sequences[idx], self.targets[idx]
        return self.sequences[idx]

def collate_fn(batch):
    """Pad a batch of variable-length sequences into one tensor.

    Args:
        batch: List of dataset items. Each item is either a sequence tensor
            or a ``(sequence, target)`` tuple; the first item decides which
            layout is assumed for the whole batch.

    Returns:
        Tuple ``(padded_seqs, targets, lengths)``: the sequences zero-padded
        along the time axis with the batch dimension first, the stacked
        targets (``None`` when items carry no target), and a tensor with the
        original length of each sequence.
    """
    if isinstance(batch[0], tuple):
        seq_list = [sample[0] for sample in batch]
        stacked_targets = torch.stack([sample[1] for sample in batch])
    else:
        seq_list, stacked_targets = batch, None

    # Original lengths are needed later to pack the padded batch.
    seq_lengths = torch.tensor(list(map(len, seq_list)))
    padded = pad_sequence(seq_list, batch_first=True, padding_value=0)
    return padded, stacked_targets, seq_lengths

class SoccerLSTM(nn.Module):
    """LSTM encoder followed by a two-layer head that outputs two values.

    Args:
        input_dim: Number of features per time step.
        hidden_dim: Size of the LSTM hidden state.
        num_layers: Number of stacked LSTM layers.
        dropout: Dropout passed to ``nn.LSTM`` (applied between LSTM layers).
    """

    def __init__(self, input_dim, hidden_dim, num_layers, dropout):
        super().__init__()
        half_dim = hidden_dim // 2
        self.lstm = nn.LSTM(
            input_size=input_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            batch_first=True,
            dropout=dropout,
        )
        # Regression head: hidden_dim -> hidden_dim // 2 -> 2 outputs.
        self.fc = nn.Sequential(
            nn.Linear(hidden_dim, half_dim),
            nn.ReLU(),
            nn.Linear(half_dim, 2),
        )

    def forward(self, x, lengths):
        """Return one 2-value prediction per sequence in the batch.

        Args:
            x: Padded batch of sequences, batch dimension first.
            lengths: Tensor with the true length of each sequence.
        """
        # Packing makes the LSTM stop at each sequence's true length, so the
        # final hidden state is not affected by padding.
        packed = pack_padded_sequence(
            x, lengths.cpu(), batch_first=True, enforce_sorted=False
        )
        _, (h_n, _) = self.lstm(packed)
        # h_n[-1] is the final hidden state of the top LSTM layer.
        return self.fc(h_n[-1])

# ==============================================================================
# [5] 실행 파이프라인
# ==============================================================================
print("1. 데이터 로드...")
df_train = pd.read_csv(TRAIN_FILE)

# Build one scaled feature sequence and one target per training episode; the
# scaler fitted here is reused unchanged for the test data below.
train_seqs, train_targets, _, scaler, input_dim = create_sequences(df_train, is_train=True)

train_dataset = SoccerSeqDataset(train_seqs, train_targets)
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)

# Model, loss and optimiser.
model = SoccerLSTM(input_dim, HIDDEN_DIM, NUM_LAYERS, DROPOUT).to(DEVICE)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

print(f"2. 학습 시작 (Epochs: {EPOCHS})...")
model.train()
for epoch in range(EPOCHS):
    total_loss = 0
    for padded_seqs, targets, lengths in train_loader:
        # lengths stays on the CPU; the model moves it there anyway for packing.
        padded_seqs, targets = padded_seqs.to(DEVICE), targets.to(DEVICE)

        optimizer.zero_grad()
        outputs = model(padded_seqs, lengths)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    # Report the mean batch loss every fifth epoch.
    if (epoch + 1) % 5 == 0:
        print(f"   Epoch [{epoch+1}/{EPOCHS}], Loss: {total_loss/len(train_loader):.4f}")

print("3. Test 예측 및 병합...")
# Collect every CSV below TEST_FOLDER, including nested directories.
test_files = [os.path.join(r, f) for r, d, fs in os.walk(TEST_FOLDER) for f in fs if f.endswith(".csv")]
if not test_files:
    # pd.concat([]) would otherwise fail with an unrelated-looking message.
    raise FileNotFoundError(f"'{TEST_FOLDER}' 폴더에서 CSV 파일을 찾지 못했습니다.")
df_test = pd.concat([pd.read_csv(f) for f in test_files], ignore_index=True)

# Same preprocessing as training (sorting, physics features), reusing the
# scaler fitted on the training data.
test_seqs, _, test_keys, _, _ = create_sequences(df_test, scaler=scaler, is_train=False)

test_dataset = SoccerSeqDataset(test_seqs)
# shuffle=False keeps predictions aligned with test_keys.
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn)

model.eval()
all_preds = []
with torch.no_grad():
    for padded_seqs, _, lengths in test_loader:
        padded_seqs = padded_seqs.to(DEVICE)
        outputs = model(padded_seqs, lengths)
        all_preds.append(outputs.cpu().numpy())

predictions = np.vstack(all_preds)
# Clamp predictions to the ranges [0, 105] for x and [0, 68] for y.
pred_x = np.clip(predictions[:, 0], 0, 105.0)
pred_y = np.clip(predictions[:, 1], 0, 68.0)

# Attach each prediction to its (game_id, episode_id) key.
result_df = pd.DataFrame(test_keys, columns=['game_id', 'episode_id'])
result_df['pred_end_x'] = pred_x
result_df['pred_end_y'] = pred_y

# Merge into the sample submission; its 'game_episode' values are split on
# '_' into integer game_id and episode_id.
submission = pd.read_csv(SAMPLE_FILE)
submission['game_id'] = submission['game_episode'].apply(lambda x: int(x.split('_')[0]))
submission['episode_id'] = submission['game_episode'].apply(lambda x: int(x.split('_')[1]))

final_df = pd.merge(submission, result_df, on=['game_id', 'episode_id'], how='left')
final_df['end_x'] = final_df['pred_end_x']
final_df['end_y'] = final_df['pred_end_y']

# Episodes listed in the sample submission but absent from the test data have
# no prediction; make that visible before they are filled with 0.
missing_count = int(final_df['pred_end_x'].isna().sum())
if missing_count:
    print(f"[경고] 예측이 없는 에피소드 {missing_count}개는 (0, 0)으로 채워집니다.")
final_df = final_df.fillna(0)

final_df[['game_episode', 'end_x', 'end_y']].to_csv(OUTPUT_FILE, index=False)
print(f"\n[성공] '{OUTPUT_FILE}' 생성 완료")

--- [Physics Enhanced] LSTM with Action-ID Sorting ---
1. 데이터 로드...
 -> 시퀀스 변환 및 물리 피처 적용 중... (Train)
2. 학습 시작 (Epochs: 50)...
   Epoch [5/50], Loss: 194.8685
   Epoch [10/50], Loss: 180.7657
   Epoch [15/50], Loss: 171.1107
   Epoch [20/50], Loss: 152.0418
   Epoch [25/50], Loss: 121.1683
   Epoch [30/50], Loss: 91.6479
   Epoch [35/50], Loss: 70.8074
   Epoch [40/50], Loss: 56.8144
   Epoch [45/50], Loss: 46.4760
   Epoch [50/50], Loss: 41.7839
3. Test 예측 및 병합...
 -> 시퀀스 변환 및 물리 피처 적용 중... (Test)

[성공] 'submission_physics.csv' 생성 완료.
