In [47]:
import pandas as pd
import re
movies=pd.read_csv('movies.csv')
movies_db=pd.read_csv('movies_db.csv')

### 1.数据清洗

In [48]:
movies_db=movies_db.drop(
    columns=['durations','votes']
)
movies_db['INFO'] = (
    movies_db['genres'].fillna('').astype(str) + ' ' +
    movies_db['countries'].fillna('').astype(str) + ' ' +
    movies_db['reviews'].fillna('').astype(str)
)

movies_db = movies_db.drop(columns=['genres', 'countries', 'reviews'])
movies_db['title'] = movies_db['title'].apply(lambda x: ''.join(re.findall(r'[\u4e00-\u9fff]+', str(x))))
movies_db.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1172 entries, 0 to 1171
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   subject_id  1172 non-null   object
 1   title       1172 non-null   object
 2   year        1172 non-null   object
 3   rating      1168 non-null   object
 4   directors   1169 non-null   object
 5   INFO        1172 non-null   object
dtypes: object(6)
memory usage: 55.1+ KB


删除无用列并只保留评分大于6.5的电影

In [49]:
movies = movies.drop(
    columns=['COVER','IMDB_ID','MINS','OFFICIAL_SITE','RELEASE_DATE','SLUG','ACTOR_IDS','DIRECTOR_IDS','LANGUAGES','GENRES','ALIAS','ACTORS']
)
movies = movies[movies['DOUBAN_SCORE'] >= 6.5]

筛选评分人数大于5000的电影并降序排列

In [50]:
movies_new = movies[(movies['DOUBAN_VOTES'] >= 3000)].sort_values(by=['DOUBAN_SCORE','DOUBAN_VOTES'], ascending=[False,False])[['DIRECTORS','MOVIE_ID','NAME','DOUBAN_SCORE','STORYLINE','TAGS','REGIONS','YEAR']]

### 2.余弦相似度模型构建

In [51]:
movies_new['INFO'] = (
    movies['STORYLINE'].fillna('').astype(str) + " " +
    movies['TAGS'].fillna('').astype(str) + " " +
    movies['REGIONS'].fillna('').astype(str)
#    + " " +
#    movies['DIRECTORS'].fillna('').astype(str) + " " +
#    movies['ACTORS'].fillna('').astype(str) + " " +
#    movies['YEAR'].fillna('').astype(str)
)
movies_new = movies_new.drop(columns=['STORYLINE', 'TAGS', 'REGIONS'
                                      #, 'DIRECTORS', 'ACTORS', 'YEAR'
                                     ])

In [52]:
movies_db_renamed = movies_db.rename(columns={
    'subject_id': 'MOVIE_ID',
    'title': 'NAME',
    'year': 'YEAR',
    'rating': 'DOUBAN_SCORE',
    'directors': 'DIRECTORS'
})
movies_db_renamed = movies_db_renamed[['DIRECTORS', 'MOVIE_ID', 'NAME', 'DOUBAN_SCORE', 'YEAR', 'INFO']]
movies_new = pd.concat([movies_new, movies_db_renamed], ignore_index=True)
print(movies_new.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5058 entries, 0 to 5057
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   DIRECTORS     4913 non-null   object
 1   MOVIE_ID      5058 non-null   object
 2   NAME          5058 non-null   object
 3   DOUBAN_SCORE  5054 non-null   object
 4   YEAR          5058 non-null   object
 5   INFO          5058 non-null   object
dtypes: object(6)
memory usage: 237.2+ KB
None


In [53]:
#添加中文分词库
import jieba
from sklearn.feature_extraction.text import CountVectorizer

In [54]:
# 中文分词器
def chinese_tokenizer(text):
    return jieba.lcut(str(text))
stopwords = [
    "的", "了", "在", "是", "我", "有", "和", "就", "不", "人", 
    "都", "一", "一个", "上", "也", "很", "到", "说", "要", "去",
    "你", "会", "着", "没有", "看", "好", "自己", "这", "那", 
    "为", "之", "对", "与", "而", "并", "等", "被", "及", "或",
    "但", "所以", "如果", "因为", "然后", "而且", "那么", "他们", 
    "我们", "你们", "它们", "什么", "哪个", "哪些", "哪里", "时候",
    "他", "她", "它", "咱们", "大家", "谁", "怎样", "怎么", "多少", "为什么",
    "这里", "那里", "这样", "那样", "这个", "那个", "这些", "那些",
    "地", "得", "所", "过", "吗", "呢", "吧", "啊", "呀", "嘛", "哇", "啦",
    "从", "自", "以", "向", "关于", "对于", "根据", "按照", "通过", "由于",
    "并且", "或者", "虽然", "即使", "尽管", "不管", "只要", "只有", "除非",
    "最", "太", "更", "非常", "十分", "特别", "极其", "比较", "稍微", "有点",
    "刚", "才", "正在", "已经", "曾经", "马上", "立刻", "永远", "一直", "总是",
    "常常", "经常", "往往", "不断", "偶尔", "又", "再", "还", "仅", "光",
    "能", "能够", "可以", "可能", "应该", "应当", "想", "愿意", "肯", "敢",
    "来", "去", "进", "出", "回", "起", "开",
    "些", "一些", "所有", "每个", "某个", "各种", "多个", "几个", "第一", "第二",
    "就是", "只是", "可是", "真是", "也是", "不是", "也是", "就是", "正是",
    "一样", "一般", "一点", "一起", "一直", "一下", "一些", "一种", "一次"
]
cv = CountVectorizer(
    max_features=10000,
    tokenizer=chinese_tokenizer,
    stop_words=stopwords,   # 中文停用词表（如果有就传 list）
    token_pattern=None      # 必须设 None，否则 tokenizer 会被覆盖
)

vector = cv.fit_transform(movies_new['INFO'].astype(str)).toarray()

In [55]:
# ========= 去噪变分自编码器（DVAE）— 稳定版（自定义Layer加KL） =========
import numpy as np
import tensorflow as tf
from tensorflow import keras

inp_dim    = vector.shape[1]
code_dim   = 64            # 可改 64/128
epochs     = 20
batch_size = 256
beta_kl    = 1.0          # β-VAE 系数

# 确保 dtype 稳定
vector = vector.astype("float32", copy=False)

# --- 编码器：输入端去噪 ---
inputs = keras.Input(shape=(inp_dim,), name="bow_counts")
x = keras.layers.GaussianNoise(0.15)(inputs)          # 去噪；也可换 Dropout(0.3)
x = keras.layers.Dense(1000, activation="selu")(x)
x = keras.layers.Dense(256,  activation="selu")(x)
z_mean   = keras.layers.Dense(code_dim, name="z_mean")(x)
z_logvar = keras.layers.Dense(code_dim, name="z_logvar")(x)

def reparameterize(args):
    mu, logvar = args
    eps = tf.random.normal(shape=tf.shape(mu))
    return mu + tf.exp(0.5 * logvar) * eps

z = keras.layers.Lambda(reparameterize, name="z")([z_mean, z_logvar])

# 打包一个 encoder（包含三个输出：mean, logvar, z）
encoder = keras.Model(inputs, [z_mean, z_logvar, z], name="dvae_encoder")

# --- 解码器：线性输出，用 MSE 重构计数 ---
latent_inputs = keras.Input(shape=(code_dim,), name="z_in")
d = keras.layers.Dense(256,  activation="selu")(latent_inputs)
d = keras.layers.Dense(1000, activation="selu")(d)
recons = keras.layers.Dense(inp_dim, activation=None, name="recon")(d)
decoder = keras.Model(latent_inputs, recons, name="dvae_decoder")

# --- 自定义 KL 层：把 KL 正则通过 layer.add_loss() 注入 ---
class KLDivergenceLayer(keras.layers.Layer):
    def __init__(self, beta=1.0, scale=1.0, **kwargs):
        super().__init__(**kwargs)
        self.beta = beta
        self.scale = scale  # 用于与 MSE 标度对齐

    def call(self, inputs):
        mu, logvar = inputs
        # KL = -0.5 * sum(1 + logvar - exp(logvar) - mu^2)
        kl_per_sample = -0.5 * tf.reduce_sum(
            1.0 + logvar - tf.exp(logvar) - tf.square(mu), axis=1
        )
        kl = tf.reduce_mean(kl_per_sample) / float(self.scale)
        self.add_loss(self.beta * kl)
        # 返回个“占位输出”，不参与后续计算
        return tf.zeros_like(mu[:, :1])

# --- 组装 DVAE：重构 + KL层（只为加loss，不改计算图）---
z_mean_out, z_logvar_out, z_out = encoder(inputs)
_ = KLDivergenceLayer(beta=beta_kl, scale=inp_dim, name="kl_reg")([z_mean_out, z_logvar_out])
recons_out = decoder(z_out)

vae = keras.Model(inputs, recons_out, name="dvae")

# --- 训练：重构用 MSE；KL 已由 KL 层注入 ---
vae.compile(optimizer=keras.optimizers.Adam(1e-3), loss="mse")
vae.fit(vector, vector, epochs=epochs, batch_size=batch_size, validation_split=0.1, verbose=1)

# --- 用 z_mean 作为电影向量（更稳、更可插值）---
z_mean_val = encoder.predict(vector, verbose=0)[0]   # 形状: [N, code_dim]
feature = z_mean_val

# ========= DVAE 结束 =========

from sklearn.metrics.pairwise import cosine_similarity
similarity = cosine_similarity(feature)


Epoch 1/20
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 248ms/step - loss: 0.0486 - val_loss: 0.0909
Epoch 2/20
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 234ms/step - loss: 0.0204 - val_loss: 0.0637
Epoch 3/20
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 251ms/step - loss: 0.0152 - val_loss: 0.0466
Epoch 4/20
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 241ms/step - loss: 0.0131 - val_loss: 0.0405
Epoch 5/20
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 248ms/step - loss: 0.0123 - val_loss: 0.0629
Epoch 6/20
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 231ms/step - loss: 0.0119 - val_loss: 0.0377
Epoch 7/20
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 267ms/step - loss: 0.0113 - val_loss: 0.0337
Epoch 8/20
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 262ms/step - loss: 0.0112 - val_loss: 0.0325
Epoch 9/20
[1m18/18[0m [32m━━━━━━━━━━

In [56]:
from sklearn.metrics.pairwise import cosine_similarity

In [57]:
def recommand(movie_name, sample_top=15, pick_n=5):
    label_idx = movies_new.index[movies_new['NAME'] == movie_name]
    if len(label_idx) == 0:
        print("未找到该影片")
        return
    pos = movies_new.index.get_loc(label_idx[0])

    sims = similarity[pos]
    cand = np.argsort(-sims)   # 按相似度降序排列索引
    cand = cand[cand != pos]   # 去掉自身
    top_candidates = cand[:sample_top]  # 取前15个
    
    # 如果数量足够，从前15中随机选5个（无放回）
    n_pick = min(pick_n, len(top_candidates))
    selected = np.random.choice(top_candidates, n_pick, replace=False)

    recs = []
    for j in selected:
        recs.append({
            "电影名": movies_new.iloc[j]['NAME'],
            "豆瓣评分": movies_new.iloc[j]['DOUBAN_SCORE'],
            "相似度": sims[j]
        })
    df = pd.DataFrame(recs).sort_values(by="相似度", ascending=False).reset_index(drop=True)
    return df


In [58]:
recommand("苦月亮")

Unnamed: 0,电影名,豆瓣评分,相似度
0,钢铁侠,8.2,0.943882
1,情迷六月花,7.5,0.942665
2,美国派2,7.0,0.939954
3,秘密与谎言,8.2,0.938467
4,心慌方,7.9,0.937128
