## 读取电影数据

In [1]:
import tqdm
from functools import reduce
import pandas as pd
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics.pairwise import pairwise_distances

In [2]:
df = pd.read_csv("../dataset/processed_movies.csv")
df.head()

Unnamed: 0,actors,director,name,rat,region,traits,type,movie_id
0,杰夫里·乔 Jeffrey Chyau|姜星 Sung Kang|吴玉 Jade Wu|萨曼...,迈克·姜 Michael Kang,汽车旅馆,7.0,美国,青春|文艺,喜剧|剧情,1
1,海瑟·格拉汉姆|杰瑞·奥康奈尔|约翰·考伯特,Brian Herzlinger,安吉拉怀孕记,5.7,美国,女性,爱情|喜剧,2
2,史蒂夫·卡瑞尔|摩根·弗里曼|劳伦·格拉汉姆|约翰尼·西蒙斯|约翰·古德曼,汤姆·沙迪亚克,冒牌天神2,6.5,美国,魔幻|励志|搞笑,科幻|喜剧|奇幻|剧情,3
3,安吉拉·兰斯伯瑞|杰拉丁·卓别林|托尼·柯蒂斯|爱德华·福克斯|罗克·赫德森,盖伊·汉弥尔顿,破镜谋杀案,7.0,英国,经典,悬疑|惊悚|犯罪,4
4,北野武|浅野忠信|大楠道代|夏川结衣|岸部一德,北野武,座头市,8.0,日本,经典,音乐|犯罪|动作|剧情|武侠,5


In [3]:
df.shape

(23034, 8)

## 对type, actors, director, trait四个类型数据分别构建一个频度统计字典，用于之后的one-hot编码

In [4]:
def get_dim_dict(df, dim_name):
    type_list = list(map(lambda x: x.split('|'), df[dim_name]))
    type_list = [x for l in type_list for x in l]
    def reduce_func(x, y):
        for i in x:
            if i[0] == y[0][0]:
                x.remove(i)
                x.append(((i[0], i[1] + 1)))
                return x
        x.append(y[0])
        return x
    l = filter(lambda x: x != None, map(lambda x: [(x, 1)], type_list))
    type_zip = reduce(reduce_func, list(l))
    type_dict = {}
    for i in type_zip:
        type_dict[i[0]] = i[1]
    return type_dict

In [5]:
type_dict = get_dim_dict(df, 'type')
print(type_dict['历史'])

1295


In [6]:
actors_dict = get_dim_dict(df, 'actors')
print(actors_dict['金标'])

1


In [7]:
director_dict = get_dim_dict(df, 'director')
print(director_dict['吴思远'])

5


In [8]:
trait_dict = get_dim_dict(df, 'traits')
print(trait_dict['经典'])

5994


## 删除无用或冗余维度

In [9]:
df_ = df.drop(['name', 'rat', 'movie_id'], axis=1)

## 将电影数据转换为字典列表，由于演员和导演均过万维，实际计算时过于稀疏，当演员或导演只出现一次时，标记为冷门演员或导演

In [10]:
movie_dict_list = []
for i in df_.index:
    movie_dict = {}
    # type
    for s_type in df_.iloc[i]['type'].split('|'):
        movie_dict[s_type] = 1
    # actors
    for s_actor in df_.iloc[i]['actors'].split('|'):
        if actors_dict[s_actor] < 2:
            movie_dict['other_actor'] = 1
        else:
            movie_dict[s_actor] = 1
    # regions
    movie_dict[df_.iloc[i]['region']] = 1
    # director
    for s_director in df_.iloc[i]['director'].split('|'):
        if director_dict[s_director] < 2:
            movie_dict['other_director'] = 1
        else:
            movie_dict[s_director] = 1
    # trait
    for s_trait in df_.iloc[i]['traits'].split('|'):
        movie_dict[s_trait] = 1
    movie_dict_list.append(movie_dict)

In [11]:
movie_dict_list

[{'喜剧': 1,
  '剧情': 1,
  'other_actor': 1,
  '美国': 1,
  'other_director': 1,
  '青春': 1,
  '文艺': 1},
 {'爱情': 1,
  '喜剧': 1,
  '海瑟·格拉汉姆': 1,
  '杰瑞·奥康奈尔': 1,
  '约翰·考伯特': 1,
  '美国': 1,
  'other_director': 1,
  '女性': 1},
 {'科幻': 1,
  '喜剧': 1,
  '奇幻': 1,
  '剧情': 1,
  '史蒂夫·卡瑞尔': 1,
  '摩根·弗里曼': 1,
  '劳伦·格拉汉姆': 1,
  '约翰尼·西蒙斯': 1,
  '约翰·古德曼': 1,
  '美国': 1,
  '汤姆·沙迪亚克': 1,
  '魔幻': 1,
  '励志': 1,
  '搞笑': 1},
 {'悬疑': 1,
  '惊悚': 1,
  '犯罪': 1,
  '安吉拉·兰斯伯瑞': 1,
  '杰拉丁·卓别林': 1,
  '托尼·柯蒂斯': 1,
  '爱德华·福克斯': 1,
  '罗克·赫德森': 1,
  '英国': 1,
  '盖伊·汉弥尔顿': 1,
  '经典': 1},
 {'音乐': 1,
  '犯罪': 1,
  '动作': 1,
  '剧情': 1,
  '武侠': 1,
  '北野武': 1,
  '浅野忠信': 1,
  '大楠道代': 1,
  '夏川结衣': 1,
  '岸部一德': 1,
  '日本': 1,
  '经典': 1},
 {'爱情': 1,
  '冒险': 1,
  '喜剧': 1,
  '马克·韦伯': 1,
  '泽娜·格雷': 1,
  '埃曼纽尔·施莱琪': 1,
  '克里斯·艾略特': 1,
  '珍·斯马特': 1,
  '美国': 1,
  'other_director': 1,
  '青春': 1},
 {'爱情': 1,
  '喜剧': 1,
  '剧情': 1,
  '周迅': 1,
  '佟大为': 1,
  '钟汉良': 1,
  '张梓琳': 1,
  '郭书瑶': 1,
  '中国大陆': 1,
  '郭在容': 1,
  '青春': 1,
  '搞笑': 1},
 {'爱情': 1,
  '剧情

In [12]:
movie_dict_list[0]

{'喜剧': 1,
 '剧情': 1,
 'other_actor': 1,
 '美国': 1,
 'other_director': 1,
 '青春': 1,
 '文艺': 1}

## 使用DictVectorizer进行向量化，做one-hot编码

In [13]:
v = DictVectorizer()
X = v.fit_transform(movie_dict_list)
X.shape

(23034, 16952)

In [14]:
movie_dict_list[0:1]

[{'喜剧': 1,
  '剧情': 1,
  'other_actor': 1,
  '美国': 1,
  'other_director': 1,
  '青春': 1,
  '文艺': 1}]

## 计算cosine相似度

In [15]:
item_similarity = pairwise_distances(X, metric='cosine')

## 测试代码，查看相似度的准确性

In [16]:
compare_index = 3
index = 0
_max_index = 0
_max = 1
for i in item_similarity[compare_index]:
    if i < _max and i != 0:
        _max = 1
        _max_index = index
    index = index + 1

In [17]:
_max_index, _max

(23026, 1)

In [18]:
index_of_sim = _max_index

In [19]:
df.iloc[index_of_sim]

actors      Ray Liotta|Linda Boston|Ele Bardha|Ron Causey|...
director                                         Chris Fisher
name                                                街头之王2：汽车城
rat                                                       4.8
region                                                     美国
traits                                                     黑帮
type                                              犯罪|动作|悬疑|剧情
movie_id                                                23027
Name: 23026, dtype: object

In [20]:
movie_dict_list[index_of_sim]

{'犯罪': 1,
 '动作': 1,
 '悬疑': 1,
 '剧情': 1,
 'Ray Liotta': 1,
 'other_actor': 1,
 '美国': 1,
 'other_director': 1,
 '黑帮': 1}

In [21]:
df_106 = pd.DataFrame(data=X.todense()[index_of_sim], columns=v.feature_names_)
df_0 = pd.DataFrame(data=X.todense()[compare_index], columns=v.feature_names_)
df_diff = pd.concat([df_0, df_106], axis=0, ignore_index=True)
df_diff = df_diff.T

In [22]:
df_diff[(df_diff[0] != 0) | (df_diff[1] != 0)]

Unnamed: 0,0,1
Ray Liotta,0.0,1.0
other_actor,0.0,1.0
other_director,0.0,1.0
剧情,0.0,1.0
动作,0.0,1.0
安吉拉·兰斯伯瑞,1.0,0.0
悬疑,1.0,1.0
惊悚,1.0,0.0
托尼·柯蒂斯,1.0,0.0
杰拉丁·卓别林,1.0,0.0


In [23]:
df_sim = pd.DataFrame(data=item_similarity)
item_similarity.shape[0]

23034

## 最相似的前200个电影

In [24]:
df_sim.nsmallest(200, 6600)[6600].to_dict().items()

dict_items([(6600, 0.0), (18013, 0.49748109237039395), (21127, 0.5165062215847719), (1813, 0.5232687053772038), (19634, 0.5232687053772038), (9883, 0.5441576941614482), (10743, 0.5441576941614482), (16840, 0.5441576941614482), (18452, 0.5441576941614482), (22587, 0.5454545454545454), (2471, 0.5648058601107554), (13764, 0.5648058601107554), (19039, 0.5648058601107554), (19151, 0.5648058601107554), (2271, 0.5735985672887791), (4030, 0.5735985672887791), (5631, 0.5735985672887791), (8681, 0.5735985672887791), (15140, 0.5735985672887791), (21518, 0.5735985672887791), (21817, 0.5735985672887791), (2344, 0.5818789949964546), (3049, 0.5818789949964546), (5709, 0.5818789949964546), (7373, 0.5818789949964546), (10567, 0.5818789949964546), (11728, 0.5818789949964546), (14316, 0.5818789949964546), (17606, 0.5818789949964546), (19025, 0.5818789949964546), (2629, 0.5954800825220548), (3105, 0.5954800825220548), (3852, 0.5970885179873099), (6497, 0.5970885179873099), (8886, 0.5970885179873099), (112