In [94]:
import pandas as pd
import re
movies=pd.read_csv('movies.csv')
movies_db=pd.read_csv('movies_db.csv')

### 1.数据清洗

In [95]:
movies_db=movies_db.drop(
    columns=['durations','votes']
)
movies_db['INFO'] = (
    movies_db['genres'].fillna('').astype(str) + ' ' +
    movies_db['countries'].fillna('').astype(str) + ' ' +
    movies_db['reviews'].fillna('').astype(str)
)

movies_db = movies_db.drop(columns=['genres', 'countries', 'reviews'])
movies_db['title'] = movies_db['title'].apply(lambda x: ''.join(re.findall(r'[\u4e00-\u9fff]+', str(x))))
movies_db.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1172 entries, 0 to 1171
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   subject_id  1172 non-null   object
 1   title       1172 non-null   object
 2   year        1172 non-null   object
 3   rating      1168 non-null   object
 4   directors   1169 non-null   object
 5   INFO        1172 non-null   object
dtypes: object(6)
memory usage: 55.1+ KB


删除无用列并只保留评分大于6.5的电影

In [96]:
movies = movies.drop(
    columns=['COVER','IMDB_ID','MINS','OFFICIAL_SITE','RELEASE_DATE','SLUG','ACTOR_IDS','DIRECTOR_IDS','LANGUAGES','GENRES','ALIAS','ACTORS']
)
movies = movies[movies['DOUBAN_SCORE'] >= 6.5]

筛选评分人数大于5000的电影并降序排列

In [97]:
movies_new = movies[(movies['DOUBAN_VOTES'] >= 3000)].sort_values(by=['DOUBAN_SCORE','DOUBAN_VOTES'], ascending=[False,False])[['DIRECTORS','MOVIE_ID','NAME','DOUBAN_SCORE','STORYLINE','TAGS','REGIONS','YEAR']]

### 2.余弦相似度模型构建

In [98]:
movies_new['INFO'] = (
    movies['STORYLINE'].fillna('').astype(str) + " " +
    movies['TAGS'].fillna('').astype(str) + " " +
    movies['REGIONS'].fillna('').astype(str)
#    + " " +
#    movies['DIRECTORS'].fillna('').astype(str) + " " +
#    movies['ACTORS'].fillna('').astype(str) + " " +
#    movies['YEAR'].fillna('').astype(str)
)
movies_new = movies_new.drop(columns=['STORYLINE', 'TAGS', 'REGIONS'
                                      #, 'DIRECTORS', 'ACTORS', 'YEAR'
                                     ])

In [99]:
movies_db_renamed = movies_db.rename(columns={
    'subject_id': 'MOVIE_ID',
    'title': 'NAME',
    'year': 'YEAR',
    'rating': 'DOUBAN_SCORE',
    'directors': 'DIRECTORS'
})
movies_db_renamed = movies_db_renamed[['DIRECTORS', 'MOVIE_ID', 'NAME', 'DOUBAN_SCORE', 'YEAR', 'INFO']]
movies_new = pd.concat([movies_new, movies_db_renamed], ignore_index=True)

print(movies_new.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5058 entries, 0 to 5057
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   DIRECTORS     4913 non-null   object
 1   MOVIE_ID      5058 non-null   object
 2   NAME          5058 non-null   object
 3   DOUBAN_SCORE  5054 non-null   object
 4   YEAR          5058 non-null   object
 5   INFO          5058 non-null   object
dtypes: object(6)
memory usage: 237.2+ KB
None


In [100]:
director_label = pd.read_csv("director_label.csv")
director_to_label = dict(zip(director_label["DIRECTOR"], director_label["LABEL"]))
movies_new["LABEL"] = movies_new["DIRECTORS"].apply(
    lambda x: ",".join({director_to_label.get(d.strip()) for d in str(x).split("/") if director_to_label.get(d.strip())})
    if pd.notna(x) else None
)

In [101]:
movies_new.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5058 entries, 0 to 5057
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   DIRECTORS     4913 non-null   object
 1   MOVIE_ID      5058 non-null   object
 2   NAME          5058 non-null   object
 3   DOUBAN_SCORE  5054 non-null   object
 4   YEAR          5058 non-null   object
 5   INFO          5058 non-null   object
 6   LABEL         4913 non-null   object
dtypes: object(7)
memory usage: 276.7+ KB


In [102]:
import jieba
from sklearn.feature_extraction.text import CountVectorizer

In [103]:
# 中文分词器
def chinese_tokenizer(text):
    return jieba.lcut(str(text))
stopwords = [
    "的", "了", "在", "是", "我", "有", "和", "就", "不", "人", 
    "都", "一", "一个", "上", "也", "很", "到", "说", "要", "去",
    "你", "会", "着", "没有", "看", "好", "自己", "这", "那", 
    "为", "之", "对", "与", "而", "并", "等", "被", "及", "或",
    "但", "所以", "如果", "因为", "然后", "而且", "那么", "他们", 
    "我们", "你们", "它们", "什么", "哪个", "哪些", "哪里", "时候",
    "他", "她", "它", "咱们", "大家", "谁", "怎样", "怎么", "多少", "为什么",
    "这里", "那里", "这样", "那样", "这个", "那个", "这些", "那些",
    "地", "得", "所", "过", "吗", "呢", "吧", "啊", "呀", "嘛", "哇", "啦",
    "从", "自", "以", "向", "关于", "对于", "根据", "按照", "通过", "由于",
    "并且", "或者", "虽然", "即使", "尽管", "不管", "只要", "只有", "除非",
    "最", "太", "更", "非常", "十分", "特别", "极其", "比较", "稍微", "有点",
    "刚", "才", "正在", "已经", "曾经", "马上", "立刻", "永远", "一直", "总是",
    "常常", "经常", "往往", "不断", "偶尔", "又", "再", "还", "仅", "光",
    "能", "能够", "可以", "可能", "应该", "应当", "想", "愿意", "肯", "敢",
    "来", "去", "进", "出", "回", "起", "开",
    "些", "一些", "所有", "每个", "某个", "各种", "多个", "几个", "第一", "第二",
    "就是", "只是", "可是", "真是", "也是", "不是", "也是", "就是", "正是",
    "一样", "一般", "一点", "一起", "一直", "一下", "一些", "一种", "一次"
]
cv = CountVectorizer(
    max_features=10000,
    tokenizer=chinese_tokenizer,
    stop_words=stopwords, 
    token_pattern=None  
)

vector = cv.fit_transform(movies_new['INFO'].astype(str)).toarray()

In [104]:
# ========= 去噪变分自编码器（DVAE）— 稳定版（自定义Layer加KL） =========
import numpy as np
import tensorflow as tf
from tensorflow import keras

inp_dim    = vector.shape[1]
code_dim   = 64            # 可改 64/128
epochs     = 20
batch_size = 256
beta_kl    = 1.0          # β-VAE 系数

# 确保 dtype 稳定
vector = vector.astype("float32", copy=False)

# --- 编码器：输入端去噪 ---
inputs = keras.Input(shape=(inp_dim,), name="bow_counts")
x = keras.layers.GaussianNoise(0.15)(inputs)          # 去噪；也可换 Dropout(0.3)
x = keras.layers.Dense(1000, activation="selu")(x)
x = keras.layers.Dense(256,  activation="selu")(x)
z_mean   = keras.layers.Dense(code_dim, name="z_mean")(x)
z_logvar = keras.layers.Dense(code_dim, name="z_logvar")(x)

def reparameterize(args):
    mu, logvar = args
    eps = tf.random.normal(shape=tf.shape(mu))
    return mu + tf.exp(0.5 * logvar) * eps

z = keras.layers.Lambda(reparameterize, name="z")([z_mean, z_logvar])

# 打包一个 encoder（包含三个输出：mean, logvar, z）
encoder = keras.Model(inputs, [z_mean, z_logvar, z], name="dvae_encoder")

# --- 解码器：线性输出，用 MSE 重构计数 ---
latent_inputs = keras.Input(shape=(code_dim,), name="z_in")
d = keras.layers.Dense(256,  activation="selu")(latent_inputs)
d = keras.layers.Dense(1000, activation="selu")(d)
recons = keras.layers.Dense(inp_dim, activation=None, name="recon")(d)
decoder = keras.Model(latent_inputs, recons, name="dvae_decoder")

# --- 自定义 KL 层：把 KL 正则通过 layer.add_loss() 注入 ---
class KLDivergenceLayer(keras.layers.Layer):
    def __init__(self, beta=1.0, scale=1.0, **kwargs):
        super().__init__(**kwargs)
        self.beta = beta
        self.scale = scale  # 用于与 MSE 标度对齐

    def call(self, inputs):
        mu, logvar = inputs
        # KL = -0.5 * sum(1 + logvar - exp(logvar) - mu^2)
        kl_per_sample = -0.5 * tf.reduce_sum(
            1.0 + logvar - tf.exp(logvar) - tf.square(mu), axis=1
        )
        kl = tf.reduce_mean(kl_per_sample) / float(self.scale)
        self.add_loss(self.beta * kl)
        # 返回个“占位输出”，不参与后续计算
        return tf.zeros_like(mu[:, :1])

# --- 组装 DVAE：重构 + KL层（只为加loss，不改计算图）---
z_mean_out, z_logvar_out, z_out = encoder(inputs)
_ = KLDivergenceLayer(beta=beta_kl, scale=inp_dim, name="kl_reg")([z_mean_out, z_logvar_out])
recons_out = decoder(z_out)

vae = keras.Model(inputs, recons_out, name="dvae")

# --- 训练：重构用 MSE；KL 已由 KL 层注入 ---
vae.compile(optimizer=keras.optimizers.Adam(1e-3), loss="mse")
vae.fit(vector, vector, epochs=epochs, batch_size=batch_size, validation_split=0.1, verbose=1)

# --- 用 z_mean 作为电影向量（更稳、更可插值）---
z_mean_val = encoder.predict(vector, verbose=0)[0]   # 形状: [N, code_dim]
feature = z_mean_val

# ========= DVAE 结束 =========

from sklearn.metrics.pairwise import cosine_similarity
similarity = cosine_similarity(feature)


Epoch 1/20
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 132ms/step - loss: 0.0486 - val_loss: 0.0869
Epoch 2/20
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 122ms/step - loss: 0.0198 - val_loss: 0.0610
Epoch 3/20
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 119ms/step - loss: 0.0148 - val_loss: 0.0466
Epoch 4/20
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 123ms/step - loss: 0.0130 - val_loss: 0.0408
Epoch 5/20
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 211ms/step - loss: 0.0123 - val_loss: 0.0371
Epoch 6/20
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 121ms/step - loss: 0.0117 - val_loss: 0.0345
Epoch 7/20
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 276ms/step - loss: 0.0113 - val_loss: 0.0335
Epoch 8/20
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 117ms/step - loss: 0.0110 - val_loss: 0.0333
Epoch 9/20
[1m18/18[0m [32m━━━━━━━━━━

In [105]:
from sklearn.metrics.pairwise import cosine_similarity

In [152]:
def recommand(movie_name, sample_top=15, pick_n=5):
    label_idx = movies_new.index[movies_new['NAME'] == movie_name]
    if len(label_idx) == 0:
        print("未找到该影片")
        return
    pos = movies_new.index.get_loc(label_idx[0])

    sims = similarity[pos]
    cand = np.argsort(-sims)   # 按相似度降序排列索引
    cand = cand[cand != pos]   # 去掉自身
    top_candidates = cand[:sample_top]  # 取前15个
    
    # 如果数量足够，从前15中随机选5个（无放回）
    n_pick = min(pick_n, len(top_candidates))
    selected = np.random.choice(top_candidates, n_pick, replace=False)

    recs = []
    for j in selected:
        recs.append({
            "电影名": movies_new.iloc[j]['NAME'],
            "豆瓣评分": movies_new.iloc[j]['DOUBAN_SCORE'],
            "流派": movies_new.iloc[j]['LABEL'],
            "相似度": sims[j],
            "导演": movies_new.iloc[j]['DIRECTORS']
        })
    df = pd.DataFrame(recs).sort_values(by="相似度", ascending=False).reset_index(drop=True)
    return df

In [153]:
import os
import pandas as pd

USER_LIKE = "user_wish.csv"
USER_DISLIKE = "user_dislike.csv"
REQUIRED = ["douban_id", "title", "info", "mark_time", "my_rating_stars", "my_rating_label"]

SCHEMA = {
    "douban_id": "Int64",     # 可空整型
    "title": "string",
    "info": "string",
    "mark_time": "string",
    "my_rating_stars": "Float64",  # 可空浮点
    "my_rating_label": "string",
}

def _empty_df_with_schema():
    return pd.DataFrame({c: pd.Series(dtype=t) for c, t in SCHEMA.items()})[REQUIRED]

def _ensure_csv(path: str):
    if not os.path.exists(path):
        _empty_df_with_schema().to_csv(path, index=False, encoding="utf-8-sig")

def _read_csv(path: str) -> pd.DataFrame:
    _ensure_csv(path)
    df = pd.read_csv(path, encoding="utf-8-sig")
    # 补齐缺列并按 schema 强制类型
    for c in REQUIRED:
        if c not in df.columns:
            df[c] = pd.NA
    df = df[REQUIRED].copy()
    for c, t in SCHEMA.items():
        try:
            if t == "Int64":
                df[c] = pd.to_numeric(df[c], errors="coerce").astype("Int64")
            elif t == "Float64":
                df[c] = pd.to_numeric(df[c], errors="coerce").astype("Float64")
            else:
                df[c] = df[c].astype("string")
        except Exception:
            pass
    return df

def _record_from_movies_new_by_title(title: str) -> pd.DataFrame | None:
    """从 movies_new 里用片名精确匹配一行，并转为六列记录；找不到返回 None"""
    m = movies_new.loc[movies_new["NAME"] == title]
    if m.empty:
        return None
    r = m.iloc[0]
    rec = {
        "douban_id": r.get("MOVIE_ID", pd.NA),
        "title": r.get("NAME", pd.NA),
        "info": r.get("INFO", pd.NA),
        "mark_time": pd.NA,            # 初始无标记时间
        "my_rating_stars": pd.NA,      # 初始无个人评分
        "my_rating_label": pd.NA,      # 初始无标签
    }
    return pd.DataFrame([rec], columns=REQUIRED)

def _write_union(path: str, rec_df: pd.DataFrame) -> pd.DataFrame:
    """把一条规范记录写入 path，并按 douban_id 去重；返回写入后的 DataFrame"""
    # 先把新纪录也套用 schema
    rec_df = rec_df.reindex(columns=REQUIRED)
    for c, t in SCHEMA.items():
        try:
            if t == "Int64":
                rec_df[c] = pd.to_numeric(rec_df[c], errors="coerce").astype("Int64")
            elif t == "Float64":
                rec_df[c] = pd.to_numeric(rec_df[c], errors="coerce").astype("Float64")
            else:
                rec_df[c] = rec_df[c].astype("string")
        except Exception:
            pass

    df = _read_csv(path)

    if df.empty:
        out = rec_df.dropna(subset=["douban_id"])
    else:
        out = pd.concat([rec_df, df], ignore_index=True)
        out = out.dropna(subset=["douban_id"]).drop_duplicates(subset=["douban_id"], keep="first")

    out = out[REQUIRED]
    out.to_csv(path, index=False, encoding="utf-8-sig")
    return out  # ← 返回最新表

def _remove_by_id(path: str, douban_id_val) -> pd.DataFrame:
    """从 path 中移除某 douban_id；返回删除后的 DataFrame"""
    df = _read_csv(path)
    df["douban_id"] = pd.to_numeric(df["douban_id"], errors="coerce").astype("Int64")
    df = df[df["douban_id"] != pd.Series([douban_id_val]).astype("Int64").iloc[0]]
    df.to_csv(path, index=False, encoding="utf-8-sig")
    return df  # ← 返回最新表

def _feedback(title: str, like: bool = True) -> tuple[pd.DataFrame, pd.DataFrame]:
    """把片名对应的 movies_new 行格式化后写入 like/dislike，并保持互斥；
       返回 (wish_new_df, dislike_new_df)"""
    rec = _record_from_movies_new_by_title(title)
    if rec is None:
        print(f"未在 movies_new 中找到：{title}")
        return _read_csv(USER_LIKE), _read_csv(USER_DISLIKE)

    did = pd.to_numeric(rec.iloc[0]["douban_id"], errors="coerce")
    if pd.isna(did):
        print(f"该影片缺少 MOVIE_ID，已跳过：{title}")
        return _read_csv(USER_LIKE), _read_csv(USER_DISLIKE)

    if like:
        wish_new = _write_union(USER_LIKE, rec)
        dislike_new = _remove_by_id(USER_DISLIKE, did)
        print(f"✅ 已加入想看：{title}")
    else:
        dislike_new = _write_union(USER_DISLIKE, rec)
        wish_new = _remove_by_id(USER_LIKE, did)
        print(f"✅ 已加入不喜欢：{title}")

    return wish_new, dislike_new


#### 3.添加个人信息

In [154]:
user_like = "user_wish.csv"
user_dislike = "user_dislike.csv"

wish_src = "me_movies_wish.csv"
done_src = "me_movies_done.csv"

REQUIRED = ["douban_id","title","info","mark_time","my_rating_stars","my_rating_label"]

wish = pd.read_csv(wish_src, encoding="utf-8-sig")
done = pd.read_csv(done_src, encoding="utf-8-sig")

wish = wish.reindex(columns=REQUIRED, fill_value=pd.NA)
done = done.reindex(columns=REQUIRED, fill_value=pd.NA)

done["my_rating_stars"] = pd.to_numeric(done["my_rating_stars"], errors="coerce")

likes_from_done = done[done["my_rating_stars"] >= 4]
dislikes_from_done = done[done["my_rating_stars"] <= 2]

user_like_df = pd.concat([likes_from_done, wish], ignore_index=True) \
                 .drop_duplicates(subset=["douban_id"], keep="first")

user_dislike_df = dislikes_from_done.drop_duplicates(subset=["douban_id"], keep="first")

user_like_df.to_csv(user_like, index=False, encoding="utf-8-sig", columns=REQUIRED)
user_dislike_df.to_csv(user_dislike, index=False, encoding="utf-8-sig", columns=REQUIRED)

进行推荐

In [160]:
df = recommand("潜行者", sample_top=15, pick_n=5)
if df is None or len(df) == 0:
    print("没有可推荐的结果。")
else:
    print("\n为你推荐：")
    for _, row in df.iterrows():
        title = str(row.get("电影名", row.get("NAME", row.get("title", ""))))
        score = row.get("豆瓣评分", row.get("DOUBAN_SCORE", row.get("my_rating_stars", ""))) or ""
        print(f"- {title}（豆瓣：{score}）")

        ans = input("喜欢输入 y，不喜欢输入 n（回车跳过，q 结束）：").strip().lower()
        if ans == "y":
            user_like_df, user_dislike_df = _feedback(title, like=True)   # ← 内存立即更新
        elif ans == "n":
            user_like_df, user_dislike_df = _feedback(title, like=False)  # ← 内存立即更新
        elif ans == "q":
            break

    print("like条数：", len(user_like_df), " | dislike条数：", len(user_dislike_df))


为你推荐：
- 躲藏（豆瓣：6.8）


喜欢输入 y，不喜欢输入 n（回车跳过，q 结束）： n


✅ 已加入不喜欢：躲藏
- 屏住呼吸（豆瓣：7.1）


喜欢输入 y，不喜欢输入 n（回车跳过，q 结束）： n


✅ 已加入不喜欢：屏住呼吸
- 超人：钢铁之躯（豆瓣：7.0）


喜欢输入 y，不喜欢输入 n（回车跳过，q 结束）： n


✅ 已加入不喜欢：超人：钢铁之躯
- 不道德的审判（豆瓣：8.2）


喜欢输入 y，不喜欢输入 n（回车跳过，q 结束）： y


✅ 已加入想看：不道德的审判
- 乡愁（豆瓣：9.0）


喜欢输入 y，不喜欢输入 n（回车跳过，q 结束）： y


✅ 已加入想看：乡愁
like条数： 520  | dislike条数： 35


In [151]:
user_like_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 520 entries, 0 to 519
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   douban_id        520 non-null    Int64  
 1   title            520 non-null    string 
 2   info             520 non-null    string 
 3   mark_time        517 non-null    string 
 4   my_rating_stars  388 non-null    Float64
 5   my_rating_label  0 non-null      string 
dtypes: Float64(1), Int64(1), string(4)
memory usage: 25.5 KB


In [None]:
#根据id匹配导演和流派