### 필요 패키지 Import

In [1]:
import pandas as pd
import gc

from src.pipeline import main

### 임베딩 데이터 로드

In [None]:
# 1. 데이터 읽기 및 병합
what_columns = \
    [f'mean_{i}' for i in range(64)] \
    # + [f'last_{i}' for i in range(64)] \
    # + ['row_id']

# 각 fold 데이터 읽기
fold_dfs = []
for i in range(5):
    fold_path = f'./data/seq_w2v_embedding/embed_valid_fold{i}.parquet'
    fold_df = pd.read_parquet(fold_path, columns=what_columns)
    fold_dfs.append(fold_df)

# 병합 (axis=0으로 행 방향 병합)
train_seq_embeddings = pd.concat(fold_dfs, axis=0, ignore_index=True)

# 메모리 정리
del fold_dfs
gc.collect()

# 2. row_id 기준으로 정렬 및 인덱스 재설정
train_seq_embeddings.sort_values(by='row_id', inplace=True)

# 3. 중복 row_id 확인 및 처리
print(f"중복 row_id 개수: {train_seq_embeddings['row_id'].duplicated().sum()}")

# 중복이 있다면 처리 (첫 번째 값만 유지)
if train_seq_embeddings['row_id'].duplicated().any():
    print("중복 row_id 제거 중...")
    train_seq_embeddings = train_seq_embeddings.drop_duplicates(subset=['row_id'], keep='first')

# 4. row_id를 인덱스로 설정 후 컬럼 제거
train_seq_embeddings = train_seq_embeddings.set_index('row_id')

# 5. train_df의 인덱스와 일치하도록 재정렬 (필요한 경우)
# train_df의 row_id 또는 인덱스 순서와 일치시키기
train_seq_embeddings = train_seq_embeddings.reset_index(drop=True)

print(f"최종 train shape: {train_seq_embeddings.shape}")
print(f"인덱스 중복 확인: {train_seq_embeddings.index.duplicated().any()}")

test_seq_embeddings = pd.read_parquet('./data/seq_w2v_embedding/test/test_ensemble.parquet', columns=what_columns)
test_seq_embeddings.drop(columns=['row_id'], inplace=True)

print('\n최종 test.shape', test_seq_embeddings.shape)

중복 row_id 개수: 0
최종 train shape: (10704179, 128)
인덱스 중복 확인: False

최종 test.shape (1527298, 128)


### 통계치 데이터 로드

In [None]:
# ===================================
# 1. 데이터 불러오기 및 학습 데이터 준비
# ===================================

# 1. seq 통계치 데이터 (len, unique_len, first, last, max, min, mean, std)
train_seq_stats = pd.read_parquet('./data/seq_stats/train_seq_stats.parquet')
test_seq_stats = pd.read_parquet('./data/seq_stats/test_seq_stats.parquet')
print('train_seq_stats.shape', train_seq_stats.shape)
print('test_seq_stats.shape', test_seq_stats.shape)

# 2. seq 반복 패턴 데이터 ('max_streak', 'consecutive_dupe_ratio', 'is_last_in_streak', 'num_unique_streaks', 'avg_streak_length')
train_seq_repeat_pattern = pd.read_parquet('./data/seq_repeat_pattern/train_seq_repeat_pattern.parquet')
test_seq_repeat_pattern = pd.read_parquet('./data/seq_repeat_pattern/test_seq_repeat_pattern.parquet')
print('train_seq_repeat_pattern.shape', train_seq_repeat_pattern.shape)
print('test_seq_repeat_pattern.shape', test_seq_repeat_pattern.shape)

# 3. train, test 데이터
train_df = pd.read_parquet('./data/train_optimized.parquet')
test_df = pd.read_parquet('./data/test_optimized.parquet')
print('train_df.shape', train_df.shape)
print('test_df.shape', test_df.shape)

# 4. 상호작용 데이터 생성
train_df['age_inv_interaction'] = train_df['age_group'].astype(str) + '_' + train_df['inventory_id'].astype(str)
test_df['age_inv_interaction'] = test_df['age_group'].astype(str) + '_' + test_df['inventory_id'].astype(str)

train_df['unique_len_ratio'] = train_df['seq_unique_len'] / train_df['seq_len']
test_df['unique_len_ratio'] = test_df['seq_unique_len'] / test_df['seq_len']

train_df['unique_len_diff'] = train_df['seq_len'] - train_df['seq_unique_len']
test_df['unique_len_diff'] = test_df['seq_len'] - test_df['seq_unique_len']

print('\n', '='*30)
print('상호작용 데이터 생성 후')
print('='*30)
print('train.shape', train_df.shape)
print('test.shape', test_df.shape)

# 5. 테이블 병합
train_df = pd.concat([train_df, train_seq_stats, train_seq_repeat_pattern, test_seq_embeddings], axis=1)
test_df = pd.concat([test_df, test_seq_stats, test_seq_repeat_pattern, test_seq_embeddings], axis=1)

print('\n', '='*30)
print('병합 후')
print('='*30)
print('train.shape', train_df.shape)
print('test.shape', test_df.shape)

train_seq_stats.shape (10704179, 8)
test_seq_stats.shape (1527298, 8)
train_seq_repeat_pattern.shape (10704179, 5)
test_seq_repeat_pattern.shape (1527298, 5)
train_df.shape (10704179, 119)
test_df.shape (1527298, 118)

병합 후
train.shape (10704179, 260)
test.shape (10704179, 259)


In [6]:
# 실행
oof_preds, test_preds = main(train_df, test_df)

Feature types updated.
✅ Loaded fold assignments: ./data/seq_w2v_embedding/fold_assign.parquet
   Shape: (10704179, 2)
   Folds: [4 3 2 0 1]
Using saved fold assignments for reproducibility

===== Fold 0 =====
Train size: 8,563,343, Val size: 2,140,836
Scale pos weight: 51.43
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[181]	valid_0's toss_score: 0.348136
Fold 0 Score: 0.34814

===== Fold 1 =====
Train size: 8,563,343, Val size: 2,140,836
Scale pos weight: 51.43
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[176]	valid_0's toss_score: 0.347671
Fold 1 Score: 0.34767

===== Fold 2 =====
Train size: 8,563,343, Val size: 2,140,836
Scale pos weight: 51.43
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[174]	valid_0's toss_score: 0.347253
Fold 2 Score: 0.34725

===== Fold 3 =====
Train size: 8,563,343, Val size: 2,140,836
Scale pos weight: 

FileNotFoundError: [Errno 2] No such file or directory: './raw_data/sample_submission.csv'