# movielens电影评分数据分析(上)

In [1]:
import numpy as np
import pandas as pd

## 读取数据

In [2]:
# 从用户表读取用户信息
users = pd.read_table('users.dat', header=None, names=['UserID','Gender','Age','Occupation','Zip-code'], sep='::',engine='python')

In [3]:
# 打印列表长度，共有6040条记录
print(len(users))

6040


In [4]:
# 查看前五条记录
users.head(5)

Unnamed: 0,UserID,Gender,Age,Occupation,Zip-code
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,2460
4,5,M,25,20,55455


In [5]:
# 同样方法，导入电影评分表
ratings = pd.read_table('ratings.dat', header=None, names=['UserID', 'MovieID', 'Rating', 'Timestamp'], sep='::',engine='python')
# 打印列表长度
print(len(ratings))
print(ratings.head(5))
# 同样方法，导入电影数据表
movies = pd.read_table('movies.dat', header=None, names=['MovieID', 'Title', 'Genres'], sep='::',engine='python')
print(len(movies))
print(movies.head(5))

1000209
   UserID  MovieID  Rating  Timestamp
0       1     1193       5  978300760
1       1      661       3  978302109
2       1      914       3  978301968
3       1     3408       4  978300275
4       1     2355       5  978824291
3883
   MovieID                               Title                        Genres
0        1                    Toy Story (1995)   Animation|Children's|Comedy
1        2                      Jumanji (1995)  Adventure|Children's|Fantasy
2        3             Grumpier Old Men (1995)                Comedy|Romance
3        4            Waiting to Exhale (1995)                  Comedy|Drama
4        5  Father of the Bride Part II (1995)                        Comedy


## 合并数据表

In [6]:
# 导入完成之后，我们可以发现这三张表类似于数据库中的表
# 要进行数据分析，我们就要将多张表进行合并才有助于分析 先将users与ratings两张表合并再跟movied合并
data = pd.merge(pd.merge(users, ratings), movies)
data.head(10)

Unnamed: 0,UserID,Gender,Age,Occupation,Zip-code,MovieID,Rating,Timestamp,Title,Genres
0,1,F,1,10,48067,1193,5,978300760,One Flew Over the Cuckoo's Nest (1975),Drama
1,2,M,56,16,70072,1193,5,978298413,One Flew Over the Cuckoo's Nest (1975),Drama
2,12,M,25,12,32793,1193,4,978220179,One Flew Over the Cuckoo's Nest (1975),Drama
3,15,M,25,7,22903,1193,4,978199279,One Flew Over the Cuckoo's Nest (1975),Drama
4,17,M,50,1,95350,1193,5,978158471,One Flew Over the Cuckoo's Nest (1975),Drama
5,18,F,18,3,95825,1193,4,978156168,One Flew Over the Cuckoo's Nest (1975),Drama
6,19,M,1,10,48073,1193,5,982730936,One Flew Over the Cuckoo's Nest (1975),Drama
7,24,F,25,7,10023,1193,5,978136709,One Flew Over the Cuckoo's Nest (1975),Drama
8,28,F,25,1,14607,1193,3,978125194,One Flew Over the Cuckoo's Nest (1975),Drama
9,33,M,45,3,55421,1193,5,978557765,One Flew Over the Cuckoo's Nest (1975),Drama


## 对数据初步描述分析

In [7]:
data.describe()

Unnamed: 0,UserID,Age,Occupation,MovieID,Rating,Timestamp
count,1000209.0,1000209.0,1000209.0,1000209.0,1000209.0,1000209.0
mean,3024.512,29.73831,8.036138,1865.54,3.581564,972243700.0
std,1728.413,11.75198,6.531336,1096.041,1.117102,12152560.0
min,1.0,1.0,0.0,1.0,1.0,956703900.0
25%,1506.0,25.0,2.0,1030.0,3.0,965302600.0
50%,3070.0,25.0,7.0,1835.0,4.0,973018000.0
75%,4476.0,35.0,14.0,2770.0,4.0,975220900.0
max,6040.0,56.0,20.0,3952.0,5.0,1046455000.0


In [8]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1000209 entries, 0 to 1000208
Data columns (total 10 columns):
UserID        1000209 non-null int64
Gender        1000209 non-null object
Age           1000209 non-null int64
Occupation    1000209 non-null int64
Zip-code      1000209 non-null object
MovieID       1000209 non-null int64
Rating        1000209 non-null int64
Timestamp     1000209 non-null int64
Title         1000209 non-null object
Genres        1000209 non-null object
dtypes: int64(6), object(4)
memory usage: 83.9+ MB


## 查看数据

In [9]:
# 合并后的每一条记录反映了每个人的年龄，职业，性别，邮编，电影ID，评分，时间戳，电影信息，电影分类等一系列信息
# 比如我们查看用户id为12的所有信息
data[data.UserID==12]

Unnamed: 0,UserID,Gender,Age,Occupation,Zip-code,MovieID,Rating,Timestamp,Title,Genres
2,12,M,25,12,32793,1193,4,978220179,One Flew Over the Cuckoo's Nest (1975),Drama
8929,12,M,25,12,32793,2804,5,978220237,"Christmas Story, A (1983)",Comedy|Drama
11044,12,M,25,12,32793,919,5,978220120,"Wizard of Oz, The (1939)",Adventure|Children's|Drama|Musical
133412,12,M,25,12,32793,1198,5,978218949,Raiders of the Lost Ark (1981),Action|Adventure
135926,12,M,25,12,32793,593,5,978220193,"Silence of the Lambs, The (1991)",Drama|Thriller
180939,12,M,25,12,32793,1247,3,978220216,"Graduate, The (1967)",Drama|Romance
183406,12,M,25,12,32793,1641,3,978218568,"Full Monty, The (1997)",Comedy
352131,12,M,25,12,32793,1221,5,978218949,"Godfather: Part II, The (1974)",Action|Crime|Drama
375401,12,M,25,12,32793,111,5,978220179,Taxi Driver (1976),Drama|Thriller
380562,12,M,25,12,32793,3265,4,978218916,Hard-Boiled (Lashou shentan) (1992),Action|Crime


## 查看每一部电影不同性别的平均评分并计算分歧差值，之后排序

In [10]:
# 查看每一部电影不同性别的平均评分 data_gender接收
data_gender=data.pivot_table(index='Title',columns='Gender',values='Rating')
data_gender.head()

Gender,F,M
Title,Unnamed: 1_level_1,Unnamed: 2_level_1
"$1,000,000 Duck (1971)",3.375,2.761905
'Night Mother (1986),3.388889,3.352941
'Til There Was You (1997),2.675676,2.733333
"'burbs, The (1989)",2.793478,2.962085
...And Justice for All (1979),3.828571,3.689024


In [18]:
# 查看电影分歧最大的那部电影，在原数据中体现
data_gender['diff']=np.abs(data_gender.F-data_gender.M)
print(data_gender.shape)
data_gender.head(10)

(3706, 3)


Gender,F,M,diff
Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"$1,000,000 Duck (1971)",3.375,2.761905,0.613095
'Night Mother (1986),3.388889,3.352941,0.035948
'Til There Was You (1997),2.675676,2.733333,0.057658
"'burbs, The (1989)",2.793478,2.962085,0.168607
...And Justice for All (1979),3.828571,3.689024,0.139547
1-900 (1994),2.0,3.0,1.0
10 Things I Hate About You (1999),3.646552,3.311966,0.334586
101 Dalmatians (1961),3.791444,3.5,0.291444
101 Dalmatians (1996),3.24,2.911215,0.328785
12 Angry Men (1957),4.184397,4.328421,0.144024


In [19]:
# 男女电影分歧最大进行排序 data_gender_sorted接收
data_gender_sorted=data_gender.sort_values(by='diff',ascending=False)
data_gender_sorted.head()

Gender,F,M,diff
Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Tigrero: A Film That Was Never Made (1994),1.0,4.333333,3.333333
"Spiders, The (Die Spinnen, 1. Teil: Der Goldene See) (1919)",4.0,1.0,3.0
"Neon Bible, The (1995)",1.0,4.0,3.0
"James Dean Story, The (1957)",4.0,1.0,3.0
Country Life (1994),5.0,2.0,3.0


## 算出每部电影平均得分并对其进行排序

In [21]:
#算出每部电影平均得分并对其进行排序 data_mean_rating 接收
data_mean_rating=data.pivot_table(index='Title',values='Rating')
data_mean_rating['size'] = 
data_mean_rating.head()

Unnamed: 0_level_0,Rating
Title,Unnamed: 1_level_1
"$1,000,000 Duck (1971)",3.027027
'Night Mother (1986),3.371429
'Til There Was You (1997),2.692308
"'burbs, The (1989)",2.910891
...And Justice for All (1979),3.713568


In [22]:
# 对电影平均得分排序
data_mean_rating_sorted=data_mean_rating.sort_values(by='Rating',ascending=False)
data_mean_rating_sorted.head()

Unnamed: 0_level_0,Rating
Title,Unnamed: 1_level_1
Ulysses (Ulisse) (1954),5.0
Lured (1947),5.0
Follow the Bitch (1998),5.0
Bittersweet Motel (2000),5.0
Song of Freedom (1936),5.0


## 查看评分次数多的电影并进行排序 

In [41]:
# 查看评分次数多的电影并进行排序   data_rating_num接收
data_rating_num = pd.crosstab(data.Title, data.Rating)
data_rating_num['count'] = np.sum(data_rating_num, axis=1)
data_rating_num.head()


Rating,1,2,3,4,5,count
Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"$1,000,000 Duck (1971)",3,8,15,7,4,37
'Night Mother (1986),4,10,25,18,13,70
'Til There Was You (1997),5,20,15,10,2,52
"'burbs, The (1989)",36,69,107,68,23,303
...And Justice for All (1979),2,12,65,82,38,199


In [43]:
#进行排序
data_rating_num_sorted = data_rating_num.sort_values(by='count',ascending=False)
print(data_rating_num_sorted.shape)
data_rating_num_sorted.head()

Rating,1,2,3,4,5,count
Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
American Beauty (1999),83,134,358,890,1963,3428
Star Wars: Episode IV - A New Hope (1977),19,62,288,796,1826,2991
Star Wars: Episode V - The Empire Strikes Back (1980),22,83,375,1027,1483,2990
Star Wars: Episode VI - Return of the Jedi (1983),39,128,589,1099,1028,2883
Jurassic Park (1993),62,197,683,1098,632,2672


# movielens电影评分数据分析(下)

## 过滤掉评分条目数不足250条的电影

In [54]:
#过滤掉评分条目数不足250条的电影
data_rating_num_sorted = data_rating_num_sorted[data_rating_num_sorted.count<50]
print(data_rating_num_sorted.shape)

In [53]:
#对评分数量进行排序，并取前20条数据


## 评分最高的十部电影


In [52]:
#评分最高的十部电影


## 查看不同年龄的分布情况并且采用直方图进行可视化

## 在原数据中标记出用户位于的年龄分组

## 电影评分表中计算不同类型电影的频数

In [51]:
#对数据进行规整-movies


In [50]:
#删除level_1列，将columns为0的列重命名为genres,并重新定义数据框为movies_genres


In [49]:
#将原movies数据中的genres列替换成movies_genres，得到规整化处理后的movies数据 



In [48]:
#合并。构建电影评分数据集movie_ratings



In [47]:
#计算movies_ratings中不同类型电影的频数

