In [3]:
# 자동화된 전처리 스크립트 (MovieLens 1M 데이터셋 기준)

import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
import os

def preprocess_movielens_1m(ratings_path, slot_count=10, min_rating=4.0, min_core=10, output_dir="pda"):
    """Turn a MovieLens-1M ratings file into time-sliced implicit-feedback files.

    Pipeline:
      1. Read the ``user::item::rating::timestamp`` file.
      2. Keep only ratings >= ``min_rating`` and mark each kept row as a click.
      3. Convert the timestamp (seconds) to datetime and sort chronologically.
      4. Encode user and item ids as integer codes (``uid`` / ``iid``).
      5. Apply iterative k-core filtering with ``k = min_core``.
      6. Cut the remaining time range into ``slot_count`` equal-width slots.
      7. Write the result files.

    Files written, relative to the current working directory:
      * ``train_with_time.txt``: one ``uid iid time stars`` line per interaction.
      * ``split_data/t_<slot>.txt``: for every slot, one line per item of the
        form ``iid uid uid ...`` listing the users that interacted with it.

    Note:
        Ids are encoded *before* the core filtering, so the codes that survive
        the filter are not necessarily contiguous.

    Args:
        ratings_path: Path to the ``::``-separated ratings file.
        slot_count: Number of time slots to create.
        min_rating: Minimum rating for an interaction to be kept.
        min_core: Minimum number of interactions every remaining user and item
            must have.
        output_dir: Kept for backward compatibility. It is not used: all files
            are written relative to the working directory.

    Returns:
        The first 10 rows of the filtered frame, restricted to the columns
        ``uid``, ``iid``, ``time`` and ``stars``.

    Raises:
        ValueError: If no interaction survives the rating and core filters.
    """
    # 1. Load ratings (multi-character separator requires the python engine)
    df = pd.read_csv(ratings_path, sep="::", engine="python",
                     names=["user_id", "item_id", "rating", "timestamp"])

    # 2. Filter by rating threshold (implicit feedback)
    df = df[df["rating"] >= min_rating].copy()
    df["click"] = 1

    # 3. Convert timestamp to datetime
    df["timestamp"] = pd.to_datetime(df["timestamp"], unit="s")
    df = df.sort_values("timestamp")

    # 4. k-core filtering helpers
    def filter_g_k_one(data, k=10, u_name='uid', i_name='iid', y_name='click'):
        """Run one pass: drop items with < k rows, then users with < k rows."""
        item_counts = data.groupby(i_name)[y_name].count()
        kept_items = item_counts[item_counts >= k].index
        data_new = data[data[i_name].isin(kept_items)]
        user_counts = data_new.groupby(u_name)[y_name].count()
        kept_users = user_counts[user_counts >= k].index
        return data_new[data_new[u_name].isin(kept_users)]

    def filter_tot(data, k=10, u_name='uid', i_name='iid', y_name='click'):
        """Repeat the single pass until every user and item has >= k rows."""
        data_new = data
        while True:
            data_new = filter_g_k_one(data_new, k=k, u_name=u_name, i_name=i_name, y_name=y_name)
            if data_new.empty:
                # Nothing left: min() of an empty group is NaN and NaN >= k is
                # never true, so without this guard the loop would never end.
                break
            item_min = data_new.groupby(i_name)[y_name].count().min()
            user_min = data_new.groupby(u_name)[y_name].count().min()
            if item_min >= k and user_min >= k:
                break
        return data_new

    # 5. Encode user/item ids as sorted integer codes (0..n-1 over the
    #    rating-filtered data)
    df["uid"] = pd.factorize(df["user_id"], sort=True)[0]
    df["iid"] = pd.factorize(df["item_id"], sort=True)[0]

    # 6. Apply core filtering; copy so the column assignments below operate on
    #    an independent frame instead of a slice of `df`
    df_filtered = filter_tot(df, k=min_core, u_name="uid", i_name="iid", y_name="click").copy()
    if df_filtered.empty:
        raise ValueError(
            f"No interactions left after filtering with min_rating={min_rating} "
            f"and min_core={min_core}"
        )

    # 7. Time slot assignment: slot_count equal-width bins over the observed range
    min_time, max_time = df_filtered["timestamp"].min(), df_filtered["timestamp"].max()
    time_bins = pd.date_range(start=min_time, end=max_time, periods=slot_count + 1)
    df_filtered["time"] = pd.cut(df_filtered["timestamp"], bins=time_bins, labels=False, include_lowest=True)

    # 8. Add stars (constant 1 for every kept interaction)
    df_filtered["stars"] = 1

    output_columns = ["uid", "iid", "time", "stars"]

    # 9. Save train_with_time.txt
    train_with_time_path = "train_with_time.txt"
    df_filtered[output_columns].to_csv(train_with_time_path, sep=" ", index=False, header=False)

    # 10. Save t_*.txt files for each slot
    split_dir = "split_data"
    # The directory must exist before the per-slot files can be opened.
    os.makedirs(split_dir, exist_ok=True)
    for t in range(slot_count):
        df_t = df_filtered[df_filtered["time"] == t]
        item_user_dict = df_t.groupby("iid")["uid"].apply(list)
        lines = [f"{item} " + " ".join(map(str, users)) for item, users in item_user_dict.items()]
        with open(os.path.join(split_dir, f"t_{t}.txt"), "w") as f:
            f.write("\n".join(lines))

    return df_filtered[output_columns].head(10)

# Check the actual path and file name of the ratings file, then set it here
preprocess_movielens_1m(ratings_path="ml-1m/ratings.dat")



Unnamed: 0,uid,iid,time,stars
1000138,6037,758,0,1
1000153,6037,2082,0,1
999873,6037,563,0,1
1000007,6037,1688,0,1
1000192,6037,1745,0,1
999967,6037,2767,0,1
999920,6037,202,0,1
999868,6037,547,0,1
999980,6037,3116,0,1
999888,6037,807,0,1


In [4]:
import pandas as pd

# Input: the combined interaction file produced by the preprocessing step
train_file = "train_with_time.txt"

# Load the headerless, space-separated (uid, iid, time, stars) records
df = pd.read_csv(train_file, sep=' ', names=["uid", "iid", "time", "stars"])

# Hold out the last time slot for testing and the one before it for validation
test_slot = df["time"].max()   # e.g. 9 when there are 10 slots
valid_slot = test_slot - 1     # e.g. 8

# Select the rows belonging to each hold-out slot
valid_df = df.loc[df["time"].eq(valid_slot)]
test_df = df.loc[df["time"].eq(test_slot)]

# Write both splits in the same headerless, space-separated layout as the input
for split_df, split_path in ((valid_df, "valid_with_time.txt"), (test_df, "test_with_time.txt")):
    split_df.to_csv(split_path, sep=' ', index=False, header=False)

print("✅ valid/test 데이터 생성 완료!")


✅ valid/test 데이터 생성 완료!
