In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import zipfile
import os
from pathlib import Path
from tqdm import tqdm

# 데이터 디렉토리
DATA_DIR = Path("../data")
RAW_ZIP_PATH = DATA_DIR / "geolife_raw_clean.zip"
RAW_EXTRACTED_PATH = DATA_DIR / "raw"  # 압축 해제 후 경로
PROCESSED_DIR = DATA_DIR / "processed"
EXTERNAL_DIR = DATA_DIR / "external"

PROCESSED_DIR.mkdir(parents=True, exist_ok=True)
EXTERNAL_DIR.mkdir(parents=True, exist_ok=True)

In [None]:
if not RAW_EXTRACTED_PATH.exists():
    with zipfile.ZipFile(RAW_ZIP_PATH, 'r') as zip_ref:
        zip_ref.extractall(RAW_EXTRACTED_PATH)
    print("압축 해제 완료")
else:
    print("이미 압축 해제됨")

In [6]:
def load_trajectory_file(filepath):
    df = pd.read_csv(
        filepath,
        skiprows=6,
        names=["lat", "lon", "zero", "altitude", "date_days", "date", "time"],
    )
    df["datetime"] = pd.to_datetime(df["date"] + " " + df["time"])
    return df[["datetime", "lat", "lon", "altitude"]]

# 예시
sample_file = list(RAW_EXTRACTED_PATH.glob("*.plt"))[0]
df_sample = load_trajectory_file(sample_file)
df_sample.head()

Unnamed: 0,datetime,lat,lon,altitude
0,2009-07-05 02:53:07,39.99984,116.325001,487
1,2009-07-05 02:53:12,39.999899,116.324809,477
2,2009-07-05 02:53:17,40.000017,116.324672,468
3,2009-07-05 02:53:22,40.000234,116.324729,460
4,2009-07-05 02:53:27,40.000363,116.32467,450


In [8]:
def add_context_features(df, user_id):
    df = df.copy()
    df["user_id"] = user_id
    df["hour"] = df["datetime"].dt.hour
    df["dayofweek"] = df["datetime"].dt.dayofweek
    df["month"] = df["datetime"].dt.month
    return df

all_data = []

for user_path in tqdm(sorted(RAW_EXTRACTED_PATH.glob("*"))):
    user_id = user_path.name
    traj_files = list(user_path.glob("*.plt"))
    
    for traj_file in traj_files:
        try:
            df_traj = load_trajectory_file(traj_file)
            df_traj = add_context_features(df_traj, user_id)
            all_data.append(df_traj)
        except Exception as e:
            print(f"❌ Failed to load {traj_file.name}: {e}")

df_all = pd.concat(all_data).reset_index(drop=True)
df_all.head()

100%|██████████| 17784/17784 [00:00<00:00, 89384.88it/s]


ValueError: No objects to concatenate

In [None]:
sns.countplot(x="hour", data=df_all)
plt.title("시간대별 위치 기록 수")
plt.show()

In [None]:
# 전체 trajectory 데이터
df_all.to_parquet(PROCESSED_DIR / "trajectories.parquet", index=False)

# 맥락 정보만 별도로 저장
df_context = df_all[["datetime", "user_id", "hour", "dayofweek", "month"]]
df_context.to_parquet(EXTERNAL_DIR / "context_features.parquet", index=False)

print("✅ 저장 완료")