In [68]:
import pandas as pd 
from py2neo import Graph, Node, Subgraph, NodeMatcher,Relationship, RelationshipMatcher
'''
    加载数据集
'''
def load_data() -> dict[str:pd.DataFrame]:
    df_actor_movie = pd.read_csv('./dataset/movie_act.csv')
    df_actor = pd.read_csv('./dataset/movie_actor.csv')
    df_movie = pd.read_csv('./dataset/movie_movie.csv')
    df_popularity = pd.read_csv('./dataset/movie_popularity.csv')
    df_user = pd.read_csv('./dataset/user_user.csv')

    return {
        'actor_movie' : df_actor_movie,
        'actor': df_actor,
        'movie': df_movie,
        'popularity': df_popularity,
        'user': df_user
    }
df = load_data()


dict_movie = {}
dict_actor = {}
dict_actor_movie = {}

'''
    将数据集的电影、演员和演员->电影的映射关系数据转化为字典
'''
# 获取受欢迎的电影ID
list_popularity_movie = list(df['popularity']['movieid_id'])

# 电影转成字典
for i in range(len(df['movie'])):
    row = df['movie'].iloc[i]
    if(row['movieid'] in list_popularity_movie):    # 只选择受欢迎的
        dict_movie.update({row['movieid'] : row.to_dict()})
# 获取受欢迎电影的所有演员
filter = df['actor_movie']['movieid_id'].isin(list_popularity_movie)
list_popularity_actor =  list(df['actor_movie']['actorid_id'][filter])
# 演员转成字典
for i in range(len(df['actor'])):
    row = df['actor'].iloc[i]                      
    if(row['actorid'] in list_popularity_actor):# 只选择受欢迎的
        dict_actor.update({row['actorid'] : row.to_dict()})
# 演员->电影的映射转成字典
for i in range(len(df['actor_movie'])):
    row = df['actor_movie'].iloc[i]                 
    dict_actor_movie.update({row['actorid_id'] : row.to_dict()})
url = "http://localhost:7474"
username = 'neo4j'
password = '123456'
''''
    连接Neo4j 将字典型的数据插入到Neo4j
'''
# 插入到Neo4j 
graph = Graph(url, auth=(username, password))
# 清空数据库
graph.delete_all()
############## 批量创建电影节点

nodes = []
for mid, node_movie in dict_movie.items():
    nodes.append(Node('movie', **node_movie))
############## 批量创建演员节点
for aid, node_actor in dict_actor.items():
    nodes.append(Node('actor', **node_actor))
## 使用 Subject 子图插入节点(每批次插入50)
batch_size = 50
for i in range(0, len(nodes), batch_size):
    graph.create(Subgraph(nodes=nodes[i: i+batch_size]))
################# 批量创建关系
rels = []
# 定义查找节点的匹配器
nodeMatcher = NodeMatcher(graph)
# 遍历演员->电影映射数据
for aid, node_actor_movie in dict_actor_movie.items():
    
    mid = node_actor_movie['movieid_id']
    # 查找关系的电影节点和演员节点
    node_movie = nodeMatcher.match('movie', movieid = mid).first()
    node_actor = nodeMatcher.match('actor', actorid = aid).first()
    # 建立关系
    if node_movie != None and node_actor != None:
        rels.append(Relationship(node_actor, 'acted', node_movie,name='acted'))
## 最后使用Subjecgt子图插入关系(每批次插入50)
batch_size = 50
for i in range(0, len(rels), batch_size):
    graph.create(Subgraph(relationships=rels[i:i+batch_size]))
print('-' * 50)

rm = RelationshipMatcher(graph)
print('查询 movie 电影节点的统计结果: ', nodeMatcher.match('movie').count())
print('查询 actor 演员节点的统计结果: ', nodeMatcher.match('actor').count())
print('查询 acted 扮演关系的统计结果: ', rm.match(name='acted').count())

--------------------------------------------------
查询 movie 电影节点的统计结果:  62
查询 actor 演员节点的统计结果:  240
查询 acted 扮演关系的统计结果:  91


In [69]:
'''
    删除不包含演员信息的电影
'''
i = 0
for node in nodeMatcher.match('movie').all():
    if(rm.match([None, node], 'acted').count() == 0):
        i += 1
        graph.delete(node)
print('删除不包含演员信息的电影统计: ', i)

删除不包含演员信息的电影统计:  24


In [70]:
'''
    删除没有扮演电影信息的演员【可能是数据的缺失导致的】
'''
i = 0
for node in nodeMatcher.match('actor').all():
    if(rm.match([node, None], 'acted').count() == 0):
        i += 1
        graph.delete(node)
print('删除没有扮演电影信息的演员: ', i)

删除没有扮演电影信息的演员:  149


In [71]:
print('movie 电影节点: ', nodeMatcher.match('movie').count())
print('actor 演员节点: ', nodeMatcher.match('actor').count())
print('acted 扮演关系: ', rm.match(name='acted').count())

movie 电影节点:  38
actor 演员节点:  91
acted 扮演关系:  91


In [78]:
'''
    查找包含演员最多的电影 Top 3
'''

dict_movies = {}
for node in nodeMatcher.match('movie'):
    nodes_actors = rm.match([None, node], name='acted').all()
    dict_movies.update({node:{
        'node_movie' : node,
        'count': len(nodes_actors),
        'actors': nodes_actors
    }})
# 对字典进行降序排序, 结果为元组

top3 = sorted(dict_movies.items(), key=lambda x : x[1]['count'], reverse=True)[:3]

print(' 查找受欢迎电影中包含演员最多的电影 Top 3')
for node in top3:
    print(f"电影编号:{node[0]['movieid']} 电影名称: {node[0]['name']} 出现演员: {node[1]['count']}")

 查找受欢迎电影中包含演员最多的电影 Top 3
电影编号:tt0118480 电影名称: Stargate SG-1 出现演员: 5
电影编号:tt0110912 电影名称: Pulp Fiction 出现演员: 5
电影编号:tt1068678 电影名称: Veronika Decides to Die 出现演员: 5
