In [None]:
import pandas as pd#버전1에서LSTM로 변경 
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence
from sklearn.preprocessing import StandardScaler, LabelEncoder
import os
import warnings

# Silence every Python warning for the rest of the run.
warnings.filterwarnings('ignore')

print("--- [High Accuracy] LSTM Sequence Model ---")

# ==============================================================================
# [1] Settings and hyperparameters
# ==============================================================================
# Input / output locations, all relative to the current working directory.
TRAIN_FILE = 'train.csv'
TEST_FOLDER = 'test/'
SAMPLE_FILE = 'sample_submission.csv'
OUTPUT_FILE = 'submission_lstm.csv'

# Hyperparameters (can be tuned for accuracy)
BATCH_SIZE = 64
HIDDEN_DIM = 256  # enlarged hidden-state size
NUM_LAYERS = 2    # stacked layers, intended to learn more complex patterns
DROPOUT = 0.3
LEARNING_RATE = 0.001
EPOCHS = 50       # intended to be enough training
# Use the GPU when PyTorch reports CUDA is available, otherwise the CPU.
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Fail fast, before any training, if the submission template is missing.
if not os.path.exists(SAMPLE_FILE):
    raise FileNotFoundError(f"'{SAMPLE_FILE}' 파일이 없습니다.")

print(f" -> 사용 장치: {DEVICE}")

# ==============================================================================
# [2] Data preprocessing: sequence-building function
# ==============================================================================
def create_sequences(df, scaler=None, encoders=None, is_train=True):
    """Convert an event table into one feature tensor per (game_id, episode_id).

    The input frame is copied, never mutated. For every row the function adds
    ``dist_to_goal`` / ``angle_to_goal`` (distance and angle from the start
    position to the point (105, 34)) and ``time_delta`` (difference of
    ``time_seconds`` to the previous row of the same episode, 0 for the first
    row). ``type_name`` and ``result_name`` are label-encoded, then all seven
    feature columns are standardised.

    Args:
        df: DataFrame with ``game_id``, ``episode_id``, ``time_seconds``,
            ``start_x``, ``start_y``, ``type_name``, ``result_name`` and, when
            ``is_train`` is true, ``end_x`` / ``end_y``.
        scaler: Fitted scaler exposing ``transform``; required when
            ``is_train`` is false, ignored (re-fitted) otherwise.
        encoders: Mapping column name -> fitted encoder exposing ``classes_``;
            required when ``is_train`` is false, ignored otherwise.
        is_train: Fit the encoders/scaler and extract targets when true;
            reuse the supplied ones and skip targets when false.

    Returns:
        ``(sequences, targets, keys, scaler, encoders, n_features)`` where
        ``sequences`` is a list of float32 tensors of shape
        (rows_in_episode, n_features), ``targets`` is a list of float32
        tensors holding the last row's ``end_x, end_y`` (empty when not
        training), and ``keys`` lists the matching ``(game_id, episode_id)``
        tuples in the same order.

    Raises:
        ValueError: ``is_train`` is false and ``scaler`` or ``encoders`` is
            missing.
    """
    if not is_train and (scaler is None or encoders is None):
        raise ValueError(
            "create_sequences(is_train=False) requires the fitted `scaler` "
            "and `encoders` returned by the training call."
        )

    df = df.copy()

    # 1. Geometric features relative to the point (105, 34).
    df['dist_to_goal'] = np.sqrt((105 - df['start_x'])**2 + (34 - df['start_y'])**2)
    df['angle_to_goal'] = np.arctan2((34 - df['start_y']), (105 - df['start_x']))

    # Build the episode key used for grouping when the table does not carry it.
    if 'game_episode' not in df.columns:
        df['game_episode'] = df['game_id'].astype(str) + '_' + df['episode_id'].astype(str)

    # Time elapsed since the previous event of the same episode.
    df['time_delta'] = df.groupby('game_episode')['time_seconds'].diff().fillna(0)

    # 2. Categorical encoding.
    cat_cols = ['type_name', 'result_name']
    if is_train:
        encoders = {}
    for col in cat_cols:
        df[col] = df[col].fillna('Unknown').astype(str)
        if is_train:
            le = LabelEncoder()
            df[col] = le.fit_transform(df[col])
            encoders[col] = le
        else:
            # A fitted encoder's code for a label is its index in `classes_`,
            # so one dict lookup per row replaces one `transform` call per
            # row. Labels unseen during fitting become -1.
            code_of = {label: code for code, label in enumerate(encoders[col].classes_)}
            df[col] = df[col].map(code_of).fillna(-1).astype('int64')

    # 3. Scaling (the LSTM is sensitive to feature scale).
    feature_cols = ['start_x', 'start_y', 'dist_to_goal', 'angle_to_goal', 'time_delta', 'type_name', 'result_name']

    if is_train:
        scaler = StandardScaler()
        df[feature_cols] = scaler.fit_transform(df[feature_cols].values)
    else:
        df[feature_cols] = scaler.transform(df[feature_cols].values)

    # 4. One (seq_len, n_features) tensor per episode; groupby yields the
    # episodes in sorted key order, which is the order of all returned lists.
    sequences = []
    targets = []
    keys = []

    print(f" -> 시퀀스 변환 중... ({'Train' if is_train else 'Test'})")

    for (game_id, episode_id), group in df.groupby(['game_id', 'episode_id']):
        sequences.append(torch.tensor(group[feature_cols].values, dtype=torch.float32))
        keys.append((game_id, episode_id))

        # Target: end_x, end_y of the episode's last row.
        if is_train:
            target = group[['end_x', 'end_y']].iloc[-1].values
            targets.append(torch.tensor(target, dtype=torch.float32))

    return sequences, targets, keys, scaler, encoders, len(feature_cols)

# ==============================================================================
# [3] Dataset & collate function (padding)
# ==============================================================================
class SoccerSeqDataset(Dataset):
    """Dataset over per-episode feature sequences with optional targets.

    Args:
        sequences: Indexable collection of sequences (one per episode).
        targets: Optional indexable collection aligned with ``sequences``.
            ``None`` or an empty collection means "no targets".
    """

    def __init__(self, sequences, targets=None):
        self.sequences = sequences
        self.targets = targets

    def __len__(self):
        """Return the number of sequences."""
        return len(self.sequences)

    def __getitem__(self, idx):
        """Return ``(sequence, target)`` when targets exist, else ``sequence``."""
        # Explicit None/length check: plain truthiness raises for a
        # multi-element tensor or ndarray of targets.
        if self.targets is not None and len(self.targets) > 0:
            return self.sequences[idx], self.targets[idx]
        return self.sequences[idx]

def collate_fn(batch):
    """Pad a batch of variable-length sequences to a common length.

    Args:
        batch: Either a list of ``(sequence, target)`` tuples or a list of
            bare sequences; the first element decides which.

    Returns:
        ``(padded, targets, lengths)``: ``padded`` is the zero-padded batch
        with the batch dimension first, ``targets`` is the stacked targets
        (``None`` for bare sequences) and ``lengths`` holds each sequence's
        original length for later packing.
    """
    first = batch[0]
    if not isinstance(first, tuple):
        seq_list, stacked_targets = batch, None
    else:
        seq_list = [sample[0] for sample in batch]
        stacked_targets = torch.stack([sample[1] for sample in batch])

    # Original lengths are needed by pack_padded_sequence downstream.
    seq_lengths = torch.tensor(list(map(len, seq_list)))

    # Pad every sequence with zeros up to the longest one in the batch.
    batch_tensor = pad_sequence(seq_list, batch_first=True, padding_value=0)

    return batch_tensor, stacked_targets, seq_lengths

# ==============================================================================
# [4] LSTM model definition
# ==============================================================================
class SoccerLSTM(nn.Module):
    """Stacked LSTM followed by a two-layer head that outputs two values.

    Args:
        input_dim: Number of features per time step.
        hidden_dim: LSTM hidden size; the head narrows it to ``hidden_dim // 2``.
        num_layers: Number of stacked LSTM layers.
        dropout: Dropout passed to ``nn.LSTM``.
    """

    def __init__(self, input_dim, hidden_dim, num_layers, dropout):
        super().__init__()
        half_dim = hidden_dim // 2

        self.lstm = nn.LSTM(
            input_size=input_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            batch_first=True,
            dropout=dropout,
        )

        # Regression head: hidden_dim -> hidden_dim // 2 -> 2 (end_x, end_y).
        self.fc = nn.Sequential(
            nn.Linear(hidden_dim, half_dim),
            nn.ReLU(),
            nn.Linear(half_dim, 2),
        )

    def forward(self, x, lengths):
        """Return a (batch, 2) prediction for a padded batch.

        Args:
            x: Padded input of shape (batch, seq_len, input_dim).
            lengths: Tensor with the true length of every sequence in ``x``.
        """
        # Packing makes the LSTM skip the padded steps, so the final hidden
        # state belongs to each sequence's last real step.
        packed = pack_padded_sequence(
            x, lengths.cpu(), batch_first=True, enforce_sorted=False
        )

        # Only the final hidden states are needed, so nothing is unpacked.
        _, (h_n, _) = self.lstm(packed)

        # h_n is (num_layers, batch, hidden_dim); take the top layer.
        return self.fc(h_n[-1])

# ==============================================================================
# [5] Data preparation and training
# ==============================================================================
print("1. 데이터 로드 및 시퀀스 변환...")
df_train = pd.read_csv(TRAIN_FILE)

# Fit the encoders/scaler on the training table and build per-episode sequences.
(train_seqs, train_targets, _, scaler,
 saved_encoders, input_dim) = create_sequences(df_train, is_train=True)

# Shuffled mini-batches, padded per batch by collate_fn.
train_dataset = SoccerSeqDataset(train_seqs, train_targets)
train_loader = DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    collate_fn=collate_fn,
)

# Model, loss and optimizer.
model = SoccerLSTM(input_dim, HIDDEN_DIM, NUM_LAYERS, DROPOUT).to(DEVICE)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

print(f"2. 학습 시작 (Epochs: {EPOCHS})...")
model.train()

for epoch in range(EPOCHS):
    total_loss = 0
    for padded_seqs, targets, lengths in train_loader:
        # lengths stays on the CPU; the model moves it there before packing anyway.
        optimizer.zero_grad()
        outputs = model(padded_seqs.to(DEVICE), lengths)
        loss = criterion(outputs, targets.to(DEVICE))
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # Report the mean batch loss every fifth epoch only.
    if (epoch + 1) % 5 != 0:
        continue
    avg_loss = total_loss / len(train_loader)
    print(f"   Epoch [{epoch+1}/{EPOCHS}], Loss: {avg_loss:.4f}")

# ==============================================================================
# [6] Test prediction and result assembly
# ==============================================================================
print("3. Test 데이터 처리...")

# Collect every CSV below TEST_FOLDER. os.walk order depends on the
# filesystem, so sort to make the load order reproducible.
test_files = sorted(
    os.path.join(root, file)
    for root, dirs, files in os.walk(TEST_FOLDER)
    for file in files
    if file.endswith(".csv")
)

# Without this guard pd.concat fails with an unrelated-looking
# "No objects to concatenate" error.
if not test_files:
    raise FileNotFoundError(f"'{TEST_FOLDER}' 폴더에 CSV 파일이 없습니다.")

df_test_list = [pd.read_csv(f) for f in test_files]
df_test = pd.concat(df_test_list, ignore_index=True)

# Build test sequences with the scaler/encoders fitted on the training data.
# No targets are produced; the keys identify each prediction's episode.
test_seqs, _, test_keys, _, _, _ = create_sequences(df_test, scaler=scaler, encoders=saved_encoders, is_train=False)

# shuffle=False keeps the predictions aligned with test_keys.
test_dataset = SoccerSeqDataset(test_seqs)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn)

print("4. 예측 수행...")
model.eval()
all_preds = []

with torch.no_grad():
    for padded_seqs, _, lengths in test_loader:
        padded_seqs = padded_seqs.to(DEVICE)
        outputs = model(padded_seqs, lengths)
        all_preds.append(outputs.cpu().numpy())

predictions = np.vstack(all_preds)

# Clamp predictions to the [0, 105] x [0, 68] coordinate range.
MAX_X, MAX_Y = 105.0, 68.0
pred_x = np.clip(predictions[:, 0], 0, MAX_X)
pred_y = np.clip(predictions[:, 1], 0, MAX_Y)

# Predictions come out in test_keys order, so they can be attached row by row.
result_df = pd.DataFrame(test_keys, columns=['game_id', 'episode_id'])
result_df['pred_end_x'] = pred_x
result_df['pred_end_y'] = pred_y

# ==============================================================================
# [7] Merge into the sample submission (key column as in the screenshot: game_episode)
# ==============================================================================
print("5. Sample Submission 파일 병합...")

submission = pd.read_csv(SAMPLE_FILE)

# Split "<game_id>_<episode_id>" into the two integer merge keys.
submission['game_id'] = submission['game_episode'].apply(lambda x: int(x.split('_')[0]))
submission['episode_id'] = submission['game_episode'].apply(lambda x: int(x.split('_')[1]))

# Left join keeps every row (and the row order) of the sample submission.
final_df = pd.merge(submission, result_df, on=['game_id', 'episode_id'], how='left')

# Overwrite the template's coordinates with the predictions.
final_df['end_x'] = final_df['pred_end_x']
final_df['end_y'] = final_df['pred_end_y']

# Episodes without a prediction are still written as 0 so the file stays
# complete, but the count is reported instead of being hidden.
missing_count = int(final_df['pred_end_x'].isna().sum())
if missing_count:
    print(f" -> [경고] 예측이 없는 에피소드 {missing_count}개를 0으로 채웁니다.")
final_df = final_df.fillna(0)

# Final file: game_episode, end_x, end_y.
final_output = final_df[['game_episode', 'end_x', 'end_y']]
final_output.to_csv(OUTPUT_FILE, index=False)

print(f"\n[성공] '{OUTPUT_FILE}' 생성 완료.")
print(f" -> 데이터 확인:\n{final_output.head()}")

--- [High Accuracy] LSTM Sequence Model ---
 -> 사용 장치: cpu
1. 데이터 로드 및 시퀀스 변환...
 -> 시퀀스 변환 중... (Train)
2. 학습 시작 (Epochs: 50)...
   Epoch [5/50], Loss: 192.5835
   Epoch [10/50], Loss: 177.9152
   Epoch [15/50], Loss: 168.7846
   Epoch [20/50], Loss: 152.9207
   Epoch [25/50], Loss: 127.3425
   Epoch [30/50], Loss: 97.6084
   Epoch [35/50], Loss: 75.6162
   Epoch [40/50], Loss: 62.3177
   Epoch [45/50], Loss: 50.3468
   Epoch [50/50], Loss: 43.1649
3. Test 데이터 처리...
 -> 시퀀스 변환 중... (Test)
4. 예측 수행...
5. Sample Submission 파일 병합...

[성공] 'submission_lstm.csv' 생성 완료.
 -> 데이터 확인:
  game_episode      end_x      end_y
0     153363_1  64.134933  11.696858
1     153363_2  33.889278  58.680855
2     153363_6  61.135262  61.424477
3     153363_7  74.947136  20.479153
4     153363_8  79.275505   4.700223
