## Python for Data Analysis - C02

### 2. MovieLens 영화평점 dataset

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Notebook-wide display defaults: wide figures and 4-digit NumPy printing.
plt.rcParams['figure.figsize'] = (12, 5)
np.set_printoptions(precision=4)

In [3]:
# Load the three MovieLens tables (paths are relative to the notebook's
# working directory); files are read in the order listed.
ratings, movies, users = map(pd.read_csv, (
    'data/movielens_ratings.csv',
    'data/movielens_movies.csv',
    'data/movielens_users.csv',
))

In [4]:
# Preview the last five rows of the ratings table.
ratings.tail(n=5)

Unnamed: 0,userId,movieId,rating,timestamp
99999,671,6268,2.5,1065579370
100000,671,6269,4.0,1065149201
100001,671,6365,4.0,1070940363
100002,671,6385,2.5,1070979663
100003,671,6565,3.5,1074784724


In [5]:
# Preview the last five rows of the movies table.
movies.tail(n=5)

Unnamed: 0,movieId,title,genres
9120,162672,Mohenjo Daro (2016),Adventure|Drama|Romance
9121,163056,Shin Godzilla (2016),Action|Adventure|Fantasy|Sci-Fi
9122,163949,The Beatles: Eight Days a Week - The Touring Y...,Documentary
9123,164977,The Gay Desperado (1936),Comedy
9124,164979,"Women of '69, Unboxed",Documentary


In [6]:
# Preview the last five rows of the users table.
users.tail(n=5)

Unnamed: 0,userId,gender,age
6035,6036,F,25
6036,6037,F,45
6037,6038,F,56
6038,6039,F,45
6039,6040,M,25


In [7]:
# (rows, columns) of the ratings table.
ratings.shape

(100004, 4)

In [8]:
# (rows, columns) of the movies table.
movies.shape

(9125, 3)

In [9]:
# (rows, columns) of the users table.
users.shape

(6040, 3)

In [10]:
# Merge the datasets on their common columns

In [11]:
# Combine ratings with user attributes, then with movie metadata.
# The join keys are spelled out instead of being inferred from whichever
# column names the frames happen to share: if any CSV gained another
# overlapping column, an implicit merge would silently join on it as well.
# how='inner' (the pandas default) drops ratings whose userId or movieId has
# no match in the other table.
data = pd.merge(
    pd.merge(ratings, users, on='userId', how='inner'),
    movies,
    on='movieId',
    how='inner',
)

### user 평점 개수 순위

In [12]:
# Number of ratings per user; the unnamed size Series becomes a one-column
# frame whose column label is 0.
x = data.groupby('userId').size().to_frame()
# Users with the most ratings first.
x.sort_values(by=0, ascending=False).head()

Unnamed: 0_level_0,0
userId,Unnamed: 1_level_1
547,2391
564,1868
624,1735
15,1700
73,1610


In [13]:
# First rows for userId 547, the most prolific rater in the ranking above.
data.loc[data['userId'] == 547].head()

Unnamed: 0,userId,movieId,rating,timestamp,gender,age,title,genres
72,547,1029,5.0,1011142236,M,35,Dumbo (1941),Animation|Children|Drama|Musical
156,547,1129,3.5,1073443756,M,35,Escape from New York (1981),Action|Adventure|Sci-Fi|Thriller
207,547,1172,5.0,1373125067,M,35,Cinema Paradiso (Nuovo cinema Paradiso) (1989),Drama
250,547,1263,5.0,974810170,M,35,"Deer Hunter, The (1978)",Drama|War
343,547,1293,5.0,981312829,M,35,Gandhi (1982),Drama


### 각 영화의 성별 평균 평점

In [14]:
# Mean rating per title, one column per gender.
# fill_value replaces missing cells, so a 0 here means "no ratings from that
# gender for this title", not a genuine average of zero.
mean_ratings = data.pivot_table(
    values='rating',
    index='title',
    columns='gender',
    aggfunc='mean',
    fill_value=0,
)
mean_ratings.head()

gender,F,M
title,Unnamed: 1_level_1,Unnamed: 2_level_1
"""Great Performances"" Cats (1998)",0.0,1.75
$9.99 (2008),2.5,4.5
'Hellboy': The Seeds of Creation (2004),0.0,2.0
'Neath the Arizona Skies (1934),0.0,0.5
'Round Midnight (1986),0.0,2.25


In [15]:
# Keep only the titles that received at least 220 ratings.
ratings_by_title = data.groupby('title').size()
active_titles = ratings_by_title[ratings_by_title >= 220].index
active_titles

Index(['American Beauty (1999)', 'Back to the Future (1985)',
       'Braveheart (1995)', 'Fargo (1996)', 'Forrest Gump (1994)',
       'Jurassic Park (1993)', 'Matrix, The (1999)', 'Pulp Fiction (1994)',
       'Raiders of the Lost Ark (Indiana Jones and the Raiders of the Lost Ark) (1981)',
       'Schindler's List (1993)', 'Shawshank Redemption, The (1994)',
       'Silence of the Lambs, The (1991)',
       'Star Wars: Episode IV - A New Hope (1977)',
       'Star Wars: Episode V - The Empire Strikes Back (1980)',
       'Terminator 2: Judgment Day (1991)', 'Toy Story (1995)'],
      dtype='object', name='title')

In [16]:
# Per-gender mean ratings restricted to the active titles
# (those with at least 220 ratings, as selected into active_titles).
mean_ratings = mean_ratings.loc[active_titles, :]
mean_ratings

gender,F,M
title,Unnamed: 1_level_1,Unnamed: 2_level_1
American Beauty (1999),4.263158,4.226994
Back to the Future (1985),4.064516,3.996951
Braveheart (1995),4.0,3.917219
Fargo (1996),4.164384,4.301325
Forrest Gump (1994),4.106481,4.030043
Jurassic Park (1993),3.695652,3.711538
"Matrix, The (1999)",4.014286,4.246032
Pulp Fiction (1994),4.25,4.259174
Raiders of the Lost Ark (Indiana Jones and the Raiders of the Lost Ark) (1981),4.105263,4.223926
Schindler's List (1993),4.267123,4.318713


### 여성이 가장 좋아한 영화 순위

In [17]:
# Rank titles by the female mean rating, highest first, and show the top ten.
top_female_ratings = mean_ratings.sort_values('F', ascending=False)
top_female_ratings.head(10)

gender,F,M
title,Unnamed: 1_level_1,Unnamed: 2_level_1
"Shawshank Redemption, The (1994)",4.39604,4.530952
Schindler's List (1993),4.267123,4.318713
American Beauty (1999),4.263158,4.226994
Pulp Fiction (1994),4.25,4.259174
Fargo (1996),4.164384,4.301325
Star Wars: Episode IV - A New Hope (1977),4.160494,4.245238
Star Wars: Episode V - The Empire Strikes Back (1980),4.147541,4.263006
Forrest Gump (1994),4.106481,4.030043
Raiders of the Lost Ark (Indiana Jones and the Raiders of the Lost Ark) (1981),4.105263,4.223926
"Silence of the Lambs, The (1991)",4.071429,4.163636


### 남녀간의 평균 평점 차이

In [18]:
# Add a 'diff' column: female mean minus male mean
# (positive values mean women rated the title higher).
mean_ratings['diff'] = mean_ratings['F'].sub(mean_ratings['M'])
mean_ratings.head()

gender,F,M,diff
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
American Beauty (1999),4.263158,4.226994,0.036164
Back to the Future (1985),4.064516,3.996951,0.067565
Braveheart (1995),4.0,3.917219,0.082781
Fargo (1996),4.164384,4.301325,-0.136941
Forrest Gump (1994),4.106481,4.030043,0.076439


In [19]:
# Titles ordered from most female-preferred to most male-preferred.
mean_ratings.sort_values('diff', ascending=False)

gender,F,M,diff
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Braveheart (1995),4.0,3.917219,0.082781
Forrest Gump (1994),4.106481,4.030043,0.076439
Back to the Future (1985),4.064516,3.996951,0.067565
American Beauty (1999),4.263158,4.226994,0.036164
Terminator 2: Judgment Day (1991),4.030303,3.997076,0.033227
Pulp Fiction (1994),4.25,4.259174,-0.009174
Jurassic Park (1993),3.695652,3.711538,-0.015886
Schindler's List (1993),4.267123,4.318713,-0.05159
Star Wars: Episode IV - A New Hope (1977),4.160494,4.245238,-0.084744
"Silence of the Lambs, The (1991)",4.071429,4.163636,-0.092208


### 평점의 표준편차가 큰 영화

In [20]:
# Standard deviation of ratings per title, limited to the active titles,
# then listed from the most to the least divisive.
ratings_by_sd = data.groupby('title')['rating'].std().loc[active_titles]
ratings_by_sd.sort_values(ascending=False)

title
Braveheart (1995)                                                                 1.023006
Toy Story (1995)                                                                  0.958981
Jurassic Park (1993)                                                              0.917073
Star Wars: Episode IV - A New Hope (1977)                                         0.908682
Matrix, The (1999)                                                                0.901202
Fargo (1996)                                                                      0.887102
Schindler's List (1993)                                                           0.882051
Star Wars: Episode V - The Empire Strikes Back (1980)                             0.880296
Terminator 2: Judgment Day (1991)                                                 0.879957
Silence of the Lambs, The (1991)                                                  0.874459
Forrest Gump (1994)                                                               0.