In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from functools import reduce
import time

In [2]:
# movie path
movie_filepath = "../dataset/movie.csv"
# user path
user_filepath = "../dataset/user.csv"

In [4]:
movies = pd.read_csv(movie_filepath)
movies.shape

(93160, 7)

In [5]:
movies.head()

Unnamed: 0,类型,主演,地区,导演,特色,评分,电影名
0,剧情,徐峥|王传君|周一围|谭卓|章宇,中国大陆,文牧野,经典,8.9,我不是药神
1,剧情,冯小刚|许晴|张涵予|刘桦|李易峰,中国大陆,管虎,经典,7.8,老炮儿
2,剧情,王宝强|刘昊然|肖央|刘承羽|尚语贤,中国大陆,陈思诚,经典,6.7,唐人街探案2
3,剧情,任素汐|大力|刘帅良|裴魁山|阿如那,中国大陆,周申|刘露,经典,8.3,驴得水
4,剧情,徐峥|王宝强|李曼|李小璐|左小青,中国大陆,叶伟民,经典,7.5,人在囧途


In [6]:
users = pd.read_csv(user_filepath)

In [7]:
users.shape

(199813, 6)

In [8]:
users.head()

Unnamed: 0,评分,用户名,评论时间,用户ID,电影名,类型
0,2,身似,2018-01-05 15:05:06,1,心雨花露,爱情
1,4,有意识的贱民,2018-01-05 15:05:06,3,战争的恐怖,战争
2,2,亿万露电,2018-01-05 15:05:06,4,豪勇七蛟龙,战争
3,2,Marni,2018-01-05 15:05:06,5,无序之主,犯罪
4,4,马西嘻嘻嘻,2018-01-05 15:05:06,6,时装店风波,同性


In [9]:
# 准备电影数据
def prepare_movie_data(movies):
    names_set = set(movies['电影名'].tolist())  # 全量的电影名

    movies_list = []  # 接受电影信息的字典
    for movie_name in tqdm(names_set):
        movieinfo = movies[movies['电影名'] == movie_name]

        g = lambda x : movieinfo[x].iloc[0]  # 取某个列中的第一个数据
        actors = movieinfo['主演'].tolist()  # 主演的列表
        actors = reduce(lambda x, y: x + '|' + y, list(set(actors)))  # 主演列表去重再使用|进行分割
        types = movieinfo['类型'].tolist()  # 电影类型的列表
        types = reduce(lambda x, y : x + '|' + y, list(set(types)))  # 电影类型列表去重再使用|进行分割
        traits = movieinfo['特色'].tolist()  # 电影特色列表
        traits = reduce(lambda x, y : x + '|' + y, list(set(traits)))  # 特色列表去重再使用|进行分割

        movies_info = {"name": movie_name,
                       "type": types,
                       "actors": actors,
                       "region": g("地区"),
                       "director": g("导演"),
                       "traits": traits,
                       "rat": g("评分")}
        movies_list.append(movies_info)
    movies_df = pd.DataFrame(movies_list)  # 字典转换成DataFrame
    movies_df['movie_id'] = range(1, len(movies_df.index) + 1)  # 添加电影id
    movies_df.to_csv("../dataset/processed_movies.csv", index=False)
    return movies_df

In [10]:
prepare_movie_data(movies)  # 处理电影数据

100%|███████████████████████████████████████████████████████████████████████████| 23034/23034 [02:30<00:00, 152.93it/s]


Unnamed: 0,actors,director,name,rat,region,traits,type,movie_id
0,杰夫里·乔 Jeffrey Chyau|姜星 Sung Kang|吴玉 Jade Wu|萨曼...,迈克·姜 Michael Kang,汽车旅馆,7.0,美国,青春|文艺,喜剧|剧情,1
1,海瑟·格拉汉姆|杰瑞·奥康奈尔|约翰·考伯特,Brian Herzlinger,安吉拉怀孕记,5.7,美国,女性,爱情|喜剧,2
2,史蒂夫·卡瑞尔|摩根·弗里曼|劳伦·格拉汉姆|约翰尼·西蒙斯|约翰·古德曼,汤姆·沙迪亚克,冒牌天神2,6.5,美国,魔幻|励志|搞笑,科幻|喜剧|奇幻|剧情,3
3,安吉拉·兰斯伯瑞|杰拉丁·卓别林|托尼·柯蒂斯|爱德华·福克斯|罗克·赫德森,盖伊·汉弥尔顿,破镜谋杀案,7.0,英国,经典,悬疑|惊悚|犯罪,4
4,北野武|浅野忠信|大楠道代|夏川结衣|岸部一德,北野武,座头市,8.0,日本,经典,音乐|犯罪|动作|剧情|武侠,5
5,马克·韦伯|泽娜·格雷|埃曼纽尔·施莱琪|克里斯·艾略特|珍·斯马特,克里斯·科赫,下雪的日子,6.1,美国,青春,爱情|冒险|喜剧,6
6,周迅|佟大为|钟汉良|张梓琳|郭书瑶,郭在容,我的早更女友,5.4,中国大陆,青春|搞笑,爱情|喜剧|剧情,7
7,陈小春|罗家英|富田靖子|莫文蔚,严浩,我爱厨房,7.4,日本,文艺,爱情|剧情,8
8,米基·鲁尼|库尔特·拉塞尔|珀尔·贝利|杰克·艾伯森|桑迪·邓肯,阿特·史蒂文斯|Ted Berman,狐狸与猎狗,8.2,美国,经典,冒险,9
9,乌比·戈德堡|玛丽·露易斯·帕克|德鲁·巴里摩尔|马修·麦康纳,赫伯特·罗斯,潇洒有情天,8.1,法国,女性,同性|喜剧|剧情,10


In [14]:
# 得出每个用户评分的统计数据
def prepare_rating_statistics_data(movies_df, users):
    tqdm.pandas(desc="process_movie_id")
    # 根据电影名，关联得到电影id
    users['movie_id'] = users['电影名'].progress_map(lambda x: movies_df[movies_df['name'] == x].movie_id.iloc[0])
    tqdm.pandas(desc="process_timestamp")
    users['timestamp'] = pd.to_datetime(users['评论时间'], format='%Y-%m-%d %H:%M:%S')
    users['评分'] = users['评分'].astype(float)
    users.rename(columns={"用户ID": "UserId", "movie_id": "MovieId", "评分": "Rating", "timestamp": "Timestamp"}, inplace=True)
    users = users.drop(['用户名', '评论时间', '电影名', '类型'], axis=1)
    list_users_statistics = []
    tqdm.pandas(desc="process_statistics")
    for UserId, users_ele in tqdm(users.groupby(by=['UserId'])):
        users_ele['rmax'] = users_ele['Rating'].max()
        users_ele['rmin'] = users_ele['Rating'].min()
        users_ele['rcount'] = len(users_ele['Rating'].dropna())
        users_ele['rsum'] = users_ele['Rating'].sum()
        users_ele['ravg'] = users_ele['Rating'].mean()
        users_ele['rmedian'] = np.median(np.array(users_ele['Rating']).astype(float))
        list_users_statistics.append(users_ele)
    users_statistics = pd.concat(list_users_statistics, axis=0)
    users_statistics.to_csv("../dataset/processed_users_ratings_statistics.csv", index=False)

In [15]:
# 读取处理后的电影数据
movies_df = pd.read_csv("../dataset/processed_movies.csv")

In [16]:
prepare_rating_statistics_data(movies_df, users)

process_movie_id: 100%|███████████████████████████████████████████████████████| 199813/199813 [06:24<00:00, 519.66it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()
A value is trying to be set on a copy of a slice from a DataFrame.
Try