In [77]:
import pandas as pd
import datetime

In [78]:
# Options shared by all three reads: fields are separated by "::" (a
# multi-character separator, hence the python parser engine) and the files
# carry no header row, so column names are assigned right after loading.
dat_options = dict(sep="::", engine="python", header=None)

# Users: name the columns, then discard the zip column.
users = pd.read_csv("../data/ml-1m/users.dat", **dat_options)
users.columns = ["user_id", "gender", "age", "occupation", "zip"]
users = users.drop(columns="zip")

# Movies.
movies = pd.read_csv("../data/ml-1m/movies.dat", **dat_options)
movies.columns = ["movie_id", "title", "genre"]

# Ratings.
ratings = pd.read_csv("../data/ml-1m/ratings.dat", **dat_options)
ratings.columns = ["user_id", "movie_id", "rating", "timestamp"]

In [79]:
# Lookup table used to turn the numeric codes in the users' `age` column
# into readable labels.
ages = dict(
    zip(
        (1, 18, 25, 35, 45, 50, 56),
        ("Under 18", "18-24", "25-34", "35-44", "45-49", "50-55", "56+"),
    )
)

# Lookup table for the users' `occupation` column; the position of a label in
# the list below is its numeric code (0 through 20).
occupations = dict(
    enumerate(
        [
            "other",
            "academic/educator",
            "artist",
            "clerical/admin",
            "college/grad student",
            "customer service",
            "doctor/health care",
            "executive/managerial",
            "farmer",
            "homemaker",
            "K-12 student",
            "lawyer",
            "programmer",
            "retired",
            "sales/marketing",
            "scientist",
            "self-employed",
            "technician/engineer",
            "tradesman/craftsman",
            "unemployed",
            "writer",
        ]
    )
)

In [80]:
# Swap the numeric age / occupation codes for their text labels; values that
# are not keys of the lookup tables are left untouched by `replace`.
users["age"] = users["age"].replace(ages)
users["occupation"] = users["occupation"].replace(occupations)

In [81]:
# Pull the four-digit year in parentheses out of each title and bucket it into
# its decade, stored as a string (e.g. "(1995)" -> "1990").
# - The pattern is a raw string: "\(" and "\d" are invalid escape sequences in
#   a normal string literal and trigger a SyntaxWarning on recent Python.
# - expand=False yields a Series (one capture group) rather than a one-column
#   DataFrame.
# - A title without a "(YYYY)" part produces a missing value, on which the
#   integer conversion below raises (the original per-row int() did as well).
release_year = movies["title"].str.extract(r"\((\d{4})\)", expand=False)
movies["year"] = (release_year.astype(int) // 10 * 10).astype(str)

In [82]:
# Order the ratings chronologically, then turn the epoch-second integers into
# pandas timestamps.
# `datetime.datetime.fromtimestamp` converts to the *local* timezone of the
# machine running the notebook, so any date-based split would change from one
# machine to another. `pd.to_datetime(..., unit="s")` reads the values as
# seconds since the Unix epoch and returns timezone-naive UTC timestamps,
# which is reproducible everywhere (and vectorised).
ratings = ratings.sort_values(by="timestamp")
ratings["timestamp"] = pd.to_datetime(ratings["timestamp"], unit="s")

In [83]:
# Cut-off for the chronological train/test split: 2000-12-01 00:00 (naive).
split_date = datetime.datetime(2000, 12, 1)

In [84]:
# Chronological hold-out: ratings strictly before `split_date` form the
# training set, ratings on or after it form the test set.
train_df = ratings.loc[ratings["timestamp"] < split_date]
test_df = ratings.loc[ratings["timestamp"] >= split_date]

In [85]:
def build_raw_sequences(ratings_df):
    """Collapse a ratings frame into one movie-id string per user.

    The movie ids of each user are collected in the row order of
    ``ratings_df`` and joined with single spaces.

    Returns a Series indexed by ``user_id`` and named ``"sequence"``.
    """
    ids_per_user = ratings_df.groupby("user_id").movie_id.agg(list)
    joined = ids_per_user.apply(lambda ids: " ".join(map(str, ids)))
    return joined.rename("sequence")


train_raw_sequences = build_raw_sequences(train_df)
test_raw_sequences = build_raw_sequences(test_df)

In [86]:
# Persist the intermediate artefacts. The two sequence Series are written
# together with their user_id index; the movies / users tables are written
# without an index column.
for table, csv_path, with_index in (
    (train_raw_sequences, "../data/ml-1m/train_raw_sequences.csv", True),
    (test_raw_sequences, "../data/ml-1m/test_raw_sequences.csv", True),
    (movies, "../data/ml-1m/movies.csv", False),
    (users, "../data/ml-1m/users.csv", False),
):
    table.to_csv(csv_path, index=with_index)

In [87]:
# NOTE(review): this assertion always fails, so a "Run All" stops at this
# cell. Presumably a deliberate guard that keeps the cells below from running
# automatically -- confirm, and consider replacing it with a markdown note.
assert False

AssertionError: 

In [None]:
# Reload the raw sequences saved earlier in this notebook and turn each
# space-separated string back into a list of movie-id strings.
# The Series were renamed to "sequence" before being written, so the CSV
# columns are `user_id,sequence`; accessing `.movie_id` on the reloaded frame
# raises AttributeError on a fresh run.
train_raw_sequences = pd.read_csv("../data/ml-1m/train_raw_sequences.csv", index_col="user_id")
train_raw_sequences["sequence"] = train_raw_sequences["sequence"].apply(lambda s: s.split(" "))

test_raw_sequences = pd.read_csv("../data/ml-1m/test_raw_sequences.csv", index_col="user_id")
test_raw_sequences["sequence"] = test_raw_sequences["sequence"].apply(lambda s: s.split(" "))

In [None]:
# Show the reloaded training sequences (the cell's last expression is rendered).
# NOTE(review): the saved output of this cell shows a `movie_id` column, while
# the CSV written earlier in the notebook stores the data under `sequence`;
# the output looks stale -- re-run to confirm.
train_raw_sequences

Unnamed: 0_level_0,movie_id
user_id,Unnamed: 1_level_1
688,"[2028, 2729, 1196, 2670, 3701, 1193, 1278]"
689,"[1197, 2915, 1334, 1936, 3074, 908, 2568, 3845..."
690,"[1213, 1717, 1197, 1097, 2519, 1517, 1193, 266..."
691,"[1193, 1194, 1196, 553, 1197, 3819, 1148, 720,..."
692,"[2433, 1729, 1193, 174, 1617]"
...,...
6036,"[1721, 1883, 3438, 2376, 2428, 2683, 2572, 270..."
6037,"[1882, 702, 1267, 2028, 3508, 3148, 562, 858, ..."
6038,"[3396, 920, 1210, 2146, 1387, 356, 1079, 1148,..."
6039,"[111, 282, 2067, 1230, 930, 947, 3088, 3022, 3..."


In [None]:
# Use the last n items of each sequence as test data.
import random

random.seed(0)

# NOTE(review): `sequences` is not defined in any cell visible here. The loop
# unpacks every row into (user_id, movie_id), so it is presumably a two-column
# frame of user ids and per-user id lists -- confirm where it is built.

# Collected (user_id, train_l, train_seq, test_seq) tuples, one per user.
a = []

# Number of users assigned to each bucket: -1 (sequence kept whole, no test
# part) and the train lengths 0, 10, ..., 50.
counter = {-1: 0}
counter.update((step * 10, 0) for step in range(5 + 1))

for _, (user_id, movie_id) in sequences.iterrows():
    seq_l = len(movie_id)

    # Keep drawing until a bucket accepts this user; a draw of -1 always does.
    while True:
        draw = random.randint(-1, 5)
        if draw == -1:
            # Take the sequence as is: everything is training data.
            train_l = -1
            train_seq, test_seq = movie_id, []
        else:
            train_l = draw * 10
            test_l = 20
            # Draw again when the bucket already holds 500 users or when the
            # sequence is shorter than train_l + test_l items.
            if counter[train_l] == 500 or seq_l < train_l + test_l:
                continue
            # The final test_l items are the test part; the train_l items
            # immediately before them are the train part.
            test_seq = movie_id[-test_l:]
            train_seq = movie_id[-(train_l + test_l):-test_l]

        counter[train_l] += 1
        a.append((user_id, train_l, train_seq, test_seq))
        break

In [None]:
# user_id -> space-joined sequence string.
train_sequences = {}
all_test_sequences = {}

# Bucket keys. Despite the name, these are matched against each record's
# `train_l` value below.
test_size = [0, 10, 20, 30, 40, 50]
test_sequences = {sz: {} for sz in test_size}

for user_id, train_l, train_seq, test_seq in a:
    # Every user may appear only once.
    assert user_id not in train_sequences
    train_sequences[user_id] = " ".join(train_seq)
    if train_l < 0:
        # Negative train_l: the record contributes no test sequence.
        continue
    joined_test = " ".join(test_seq)
    all_test_sequences[user_id] = joined_test
    test_sequences[train_l][user_id] = joined_test

# One CSV per bucket, indexed by user_id.
for sz in test_size:
    test_sz_df = pd.DataFrame.from_dict(
        test_sequences[sz], orient="index", columns=["sequence"]
    ).rename_axis(index="user_id")
    test_sz_df.to_csv(f"../data/ml-1m-new/test-{sz}.csv")

In [None]:
# One row per user with the space-joined training sequence, indexed by user_id.
train_df = pd.DataFrame.from_dict(
    train_sequences, orient="index", columns=["sequence"]
).rename_axis(index="user_id")
train_df.to_csv("../data/ml-1m-new/train.csv")

In [None]:
# One row per user that has a test sequence (all buckets combined), indexed
# by user_id.
test_df = pd.DataFrame.from_dict(
    all_test_sequences, orient="index", columns=["sequence"]
).rename_axis(index="user_id")
test_df.to_csv("../data/ml-1m-new/test.csv")