In [1]:
import pandas as pd
movies=pd.read_csv('movies.csv')

### 1.数据清洗

In [2]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 140502 entries, 0 to 140501
Data columns (total 21 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   MOVIE_ID       140502 non-null  int64  
 1   NAME           140502 non-null  object 
 2   ALIAS          30322 non-null   object 
 3   ACTORS         82851 non-null   object 
 4   COVER          50654 non-null   object 
 5   DIRECTORS      70244 non-null   object 
 6   DOUBAN_SCORE   140502 non-null  float64
 7   DOUBAN_VOTES   140502 non-null  float64
 8   GENRES         136452 non-null  object 
 9   IMDB_ID        113256 non-null  object 
 10  LANGUAGES      131442 non-null  object 
 11  MINS           140502 non-null  float64
 12  OFFICIAL_SITE  9821 non-null    object 
 13  REGIONS        136501 non-null  object 
 14  RELEASE_DATE   77687 non-null   object 
 15  SLUG           140502 non-null  object 
 16  STORYLINE      87052 non-null   object 
 17  TAGS           97398 non-null

删除无用列并只保留评分大于6.5的电影

In [3]:
movies["GENRES"].head(10)

0       剧情/爱情
1       动作/爱情
2          剧情
3          爱情
4       剧情/历史
5       剧情/爱情
6          悬疑
7          科幻
8    动作/悬疑/古装
9          科幻
Name: GENRES, dtype: object

In [4]:
movies = movies.drop(
    columns=['COVER','IMDB_ID','MINS','OFFICIAL_SITE','RELEASE_DATE','SLUG','ACTOR_IDS','DIRECTOR_IDS','LANGUAGES','GENRES','ALIAS']
)
movies = movies[movies['DOUBAN_SCORE'] >= 6.5]

In [5]:
movies.isnull().sum()

MOVIE_ID           0
NAME               0
ACTORS          2023
DIRECTORS       2688
DOUBAN_SCORE       0
DOUBAN_VOTES       0
REGIONS            0
STORYLINE        492
TAGS               0
YEAR               0
dtype: int64

筛选评分人数大于5000的电影并降序排列

In [6]:
movies_new = movies[(movies['DOUBAN_VOTES'] >= 3000)].sort_values(by=['DOUBAN_SCORE','DOUBAN_VOTES'], ascending=[False,False])[['ACTORS','DIRECTORS','MOVIE_ID','NAME','DOUBAN_SCORE','STORYLINE','TAGS','REGIONS','YEAR']]

### 2.余弦相似度模型构建

In [7]:
movies_new['INFO'] = (
    movies['STORYLINE'].fillna('').astype(str) + " " +
    movies['TAGS'].fillna('').astype(str) + " " +
    movies['REGIONS'].fillna('').astype(str)
#    + " " +
#    movies['DIRECTORS'].fillna('').astype(str) + " " +
#    movies['ACTORS'].fillna('').astype(str) + " " +
#    movies['YEAR'].fillna('').astype(str)
)
movies_new = movies_new.drop(columns=['STORYLINE', 'TAGS', 'REGIONS'
                                      #, 'DIRECTORS', 'ACTORS', 'YEAR'
                                     ])

In [8]:
movies_new.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3886 entries, 64156 to 40584
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   ACTORS        3825 non-null   object 
 1   DIRECTORS     3744 non-null   object 
 2   MOVIE_ID      3886 non-null   int64  
 3   NAME          3886 non-null   object 
 4   DOUBAN_SCORE  3886 non-null   float64
 5   YEAR          3886 non-null   float64
 6   INFO          3886 non-null   object 
dtypes: float64(2), int64(1), object(4)
memory usage: 242.9+ KB


In [9]:
#添加中文分词库
import jieba
from sklearn.feature_extraction.text import CountVectorizer

In [10]:
# 中文分词器
def chinese_tokenizer(text):
    return jieba.lcut(str(text))
stopwords = [
    "的", "了", "在", "是", "我", "有", "和", "就", "不", "人", 
    "都", "一", "一个", "上", "也", "很", "到", "说", "要", "去",
    "你", "会", "着", "没有", "看", "好", "自己", "这", "那", 
    "为", "之", "对", "与", "而", "并", "等", "被", "及", "或",
    "但", "所以", "如果", "因为", "然后", "而且", "那么", "他们", 
    "我们", "你们", "它们", "什么", "哪个", "哪些", "哪里", "时候"
]
cv = CountVectorizer(
    max_features=5000,
    tokenizer=chinese_tokenizer,
    stop_words=stopwords,   # 中文停用词表（如果有就传 list）
    token_pattern=None      # 必须设 None，否则 tokenizer 会被覆盖
)

vector = cv.fit_transform(movies_new['INFO'].astype(str)).toarray()

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\86187\AppData\Local\Temp\jieba.cache
Loading model cost 1.552 seconds.
Prefix dict has been built successfully.


In [11]:
vector.shape

(3886, 5000)

In [12]:
from sklearn.metrics.pairwise import cosine_similarity

In [13]:
similarity=cosine_similarity(vector)

In [14]:
similarity

array([[1.        , 0.34076584, 0.39717755, ..., 0.4830387 , 0.2358598 ,
        0.25715143],
       [0.34076584, 1.        , 0.71652678, ..., 0.64992399, 0.70318363,
        0.70137366],
       [0.39717755, 0.71652678, 1.        , ..., 0.62121336, 0.63570436,
        0.63871908],
       ...,
       [0.4830387 , 0.64992399, 0.62121336, ..., 1.        , 0.58649709,
        0.60962353],
       [0.2358598 , 0.70318363, 0.63570436, ..., 0.58649709, 1.        ,
        0.62765135],
       [0.25715143, 0.70137366, 0.63871908, ..., 0.60962353, 0.62765135,
        1.        ]])

In [15]:
movies_new.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3886 entries, 64156 to 40584
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   ACTORS        3825 non-null   object 
 1   DIRECTORS     3744 non-null   object 
 2   MOVIE_ID      3886 non-null   int64  
 3   NAME          3886 non-null   object 
 4   DOUBAN_SCORE  3886 non-null   float64
 5   YEAR          3886 non-null   float64
 6   INFO          3886 non-null   object 
dtypes: float64(2), int64(1), object(4)
memory usage: 242.9+ KB


In [16]:
import numpy as np

def recommand(movie_name, topk=5):
    label_idx = movies_new.index[movies_new['NAME'] == movie_name]
    if len(label_idx) == 0:
        print("未找到该影片")
        return
    pos = movies_new.index.get_loc(label_idx[0])

    sims = similarity[pos]
    cand = np.argsort(-sims)
    cand = cand[cand != pos][:topk]

    recs = []
    for j in cand:
        recs.append({
            "电影名": movies_new.iloc[j]['NAME'],
            "豆瓣评分": movies_new.iloc[j]['DOUBAN_SCORE'],
        })
    df = pd.DataFrame(recs)
    return df

In [17]:
recommand("乡愁")

Unnamed: 0,电影名,豆瓣评分
0,质数的孤独,7.0
1,八部半,8.5
2,全面回忆,7.0
3,控制的极限,6.8
4,尤里西斯的凝视,9.0


### 后续需添加导演和时代标签等关系网