In [1]:
import pandas as pd

# The three .dat files under ml-1m/ share one layout: "::"-separated fields,
# latin1-encoded text and no header row. pandas treats a separator longer
# than one character as a regular expression, which needs the python engine.
_dat_format = dict(sep="::", encoding="latin1", engine="python", header=None)

# One row per movie.
movies = pd.read_csv(
    "ml-1m/movies.dat",
    names=["movie_id", "title", "genres"],
    **_dat_format,
)
# One row per (user, movie) rating; "date" is still a raw integer here.
ratings = pd.read_csv(
    "ml-1m/ratings.dat",
    names=["user_id", "movie_id", "rating", "date"],
    **_dat_format,
)
# One row per user; "age" and "occupation" are numeric codes at this point.
users = pd.read_csv(
    "ml-1m/users.dat",
    names=["user_id", "gender", "age", "occupation", "zip"],
    **_dat_format,
)

In [2]:
# ref : https://recruit.gmo.jp/engineer/jisedai/blog/python_movie_recommendation/
# ref : https://recruit.gmo.jp/engineer/jisedai/blog/movielens_fmm/

# Extract the release year from the "(YYYY)" that ends each title.
# str.extract is vectorized and returns the captured group directly, so no
# per-row lambda is needed. A title without a trailing year yields NaN, and
# astype(int) then fails on that missing value instead of the lambda raising
# an IndexError from indexing an empty match list.
movies["release"] = (
    movies["title"].str.extract(r"\((\d{4})\)$", expand=False).astype(int)
)
# Turn the "|"-delimited genre string of each movie into a list of genres.
# NOTE(review): the column is overwritten in place, so a second run of this
# statement would no longer see strings -- reload `movies` before re-running.
movies["genres"] = movies["genres"].str.split(pat="|")

# Convert the rating time from UNIX time (integer seconds) to datetimes.
ratings["date"] = pd.to_datetime(
    arg=ratings["date"],
    unit="s",
)

# Lookup tables for converting the age and occupation category codes
# into readable labels.

# Age code -> age-bracket label. Apart from code 1, each code equals the
# lower bound shown in its label.
age_map = dict(
    [
        (1, " -18"),
        (18, "18-24"),
        (25, "25-34"),
        (35, "35-44"),
        (45, "45-49"),
        (50, "50-55"),
        (56, "56+"),
    ]
)

# Occupation code -> occupation label. The codes are the consecutive integers
# 0..20, so the position of a label in this list is its code.
occupation_map = dict(
    enumerate(
        [
            "other",  # 0
            "academic/educator",  # 1
            "artist",  # 2
            "clerical/admin",  # 3
            "college/grad student",  # 4
            "customer service",  # 5
            "doctor/health care",  # 6
            "executive/managerial",  # 7
            "farmer",  # 8
            "homemaker",  # 9
            "K-12 student",  # 10
            "lawyer",  # 11
            "programmer",  # 12
            "retired",  # 13
            "sales/marketing",  # 14
            "scientist",  # 15
            "self-employed",  # 16
            "technician/engineer",  # 17
            "tradesman/craftsman",  # 18
            "unemployed",  # 19
            "writer",  # 20
        ]
    )
)
# Replace the numeric age and occupation codes in `users` with their labels.
# Series.map with a dict turns any code that is not a key into NaN.
for _column, _labels in (("age", age_map), ("occupation", occupation_map)):
    users[_column] = users[_column].map(_labels)