In [65]:
import pandas as pd
movies=pd.read_csv('movies.csv')

### 1.数据清洗

In [66]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 140502 entries, 0 to 140501
Data columns (total 21 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   MOVIE_ID       140502 non-null  int64  
 1   NAME           140502 non-null  object 
 2   ALIAS          30322 non-null   object 
 3   ACTORS         82851 non-null   object 
 4   COVER          50654 non-null   object 
 5   DIRECTORS      70244 non-null   object 
 6   DOUBAN_SCORE   140502 non-null  float64
 7   DOUBAN_VOTES   140502 non-null  float64
 8   GENRES         136452 non-null  object 
 9   IMDB_ID        113256 non-null  object 
 10  LANGUAGES      131442 non-null  object 
 11  MINS           140502 non-null  float64
 12  OFFICIAL_SITE  9821 non-null    object 
 13  REGIONS        136501 non-null  object 
 14  RELEASE_DATE   77687 non-null   object 
 15  SLUG           140502 non-null  object 
 16  STORYLINE      87052 non-null   object 
 17  TAGS           97398 non-null

删除无用列并只保留评分大于6.5的电影

In [67]:
movies = movies.drop(
    columns=['COVER','IMDB_ID','MINS','OFFICIAL_SITE','RELEASE_DATE','SLUG','ACTOR_IDS','DIRECTOR_IDS','LANGUAGES','GENRES','ALIAS']
)
movies = movies[movies['DOUBAN_SCORE'] >= 6.5]

In [68]:
movies.isnull().sum()

MOVIE_ID           0
NAME               0
ACTORS          2023
DIRECTORS       2688
DOUBAN_SCORE       0
DOUBAN_VOTES       0
REGIONS            0
STORYLINE        492
TAGS               0
YEAR               0
dtype: int64

筛选评分人数大于5000的电影并降序排列

In [69]:
movies_new = movies[(movies['DOUBAN_VOTES'] >= 5000)].sort_values(by=['DOUBAN_SCORE','DOUBAN_VOTES'], ascending=[False,False])[['ACTORS','DIRECTORS','MOVIE_ID','NAME','DOUBAN_SCORE','STORYLINE','TAGS','REGIONS','YEAR']]

### 2.余弦相似度模型构建

In [70]:
movies_new['INFO'] = (
    movies['STORYLINE'].fillna('').astype(str) + " " +
    movies['TAGS'].fillna('').astype(str) + " " +
    movies['REGIONS'].fillna('').astype(str)
#    + " " +
#    movies['DIRECTORS'].fillna('').astype(str) + " " +
#    movies['ACTORS'].fillna('').astype(str) + " " +
#    movies['YEAR'].fillna('').astype(str)
)
movies_new = movies_new.drop(columns=['STORYLINE', 'TAGS', 'REGIONS'
                                      #, 'DIRECTORS', 'ACTORS', 'YEAR'
                                     ])

In [71]:
movies_new.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3056 entries, 56283 to 25580
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   ACTORS        3024 non-null   object 
 1   DIRECTORS     2992 non-null   object 
 2   MOVIE_ID      3056 non-null   int64  
 3   NAME          3056 non-null   object 
 4   DOUBAN_SCORE  3056 non-null   float64
 5   YEAR          3056 non-null   float64
 6   INFO          3056 non-null   object 
dtypes: float64(2), int64(1), object(4)
memory usage: 191.0+ KB


In [72]:
#添加中文分词库
import jieba
from sklearn.feature_extraction.text import CountVectorizer

In [73]:
# 中文分词器
def chinese_tokenizer(text):
    return jieba.lcut(str(text))
stopwords = [
    "的", "了", "在", "是", "我", "有", "和", "就", "不", "人", 
    "都", "一", "一个", "上", "也", "很", "到", "说", "要", "去",
    "你", "会", "着", "没有", "看", "好", "自己", "这", "那", 
    "为", "之", "对", "与", "而", "并", "等", "被", "及", "或",
    "但", "所以", "如果", "因为", "然后", "而且", "那么", "他们", 
    "我们", "你们", "它们", "什么", "哪个", "哪些", "哪里", "时候"
]
cv = CountVectorizer(
    max_features=10000,
    tokenizer=chinese_tokenizer,
    stop_words=stopwords,   # 中文停用词表（如果有就传 list）
    token_pattern=None      # 必须设 None，否则 tokenizer 会被覆盖
)

vector = cv.fit_transform(movies_new['INFO'].astype(str)).toarray()

In [74]:
vector.shape

(3056, 10000)

In [75]:
from sklearn.metrics.pairwise import cosine_similarity

In [76]:
similarity=cosine_similarity(vector)

In [77]:
similarity

array([[1.        , 0.7011158 , 0.52947991, ..., 0.72615163, 0.68596309,
        0.65913964],
       [0.7011158 , 1.        , 0.47090606, ..., 0.67462559, 0.62135833,
        0.60410053],
       [0.52947991, 0.47090606, 1.        , ..., 0.42168307, 0.38451942,
        0.3755704 ],
       ...,
       [0.72615163, 0.67462559, 0.42168307, ..., 1.        , 0.66989075,
        0.6192904 ],
       [0.68596309, 0.62135833, 0.38451942, ..., 0.66989075, 1.        ,
        0.70092582],
       [0.65913964, 0.60410053, 0.3755704 , ..., 0.6192904 , 0.70092582,
        1.        ]])

In [78]:
movies_new.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3056 entries, 56283 to 25580
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   ACTORS        3024 non-null   object 
 1   DIRECTORS     2992 non-null   object 
 2   MOVIE_ID      3056 non-null   int64  
 3   NAME          3056 non-null   object 
 4   DOUBAN_SCORE  3056 non-null   float64
 5   YEAR          3056 non-null   float64
 6   INFO          3056 non-null   object 
dtypes: float64(2), int64(1), object(4)
memory usage: 191.0+ KB


In [79]:
import numpy as np

def recommand(movie_name, topk=5):
    label_idx = movies_new.index[movies_new['NAME'] == movie_name]
    if len(label_idx) == 0:
        print("未找到该影片")
        return
    pos = movies_new.index.get_loc(label_idx[0])

    sims = similarity[pos]
    cand = np.argsort(-sims)
    cand = cand[cand != pos][:topk]

    recs = []
    for j in cand:
        recs.append({
            "电影名": movies_new.iloc[j]['NAME'],
            "豆瓣评分": movies_new.iloc[j]['DOUBAN_SCORE'],
        })
    df = pd.DataFrame(recs)
    return df

In [81]:
recommand("乡愁")

Unnamed: 0,电影名,豆瓣评分
0,八部半,8.5
1,质数的孤独,7.0
2,全面回忆,7.0
3,生化危机4：战神再生,6.5
4,你好，再见,8.1


### 后续需添加导演和时代标签等关系网