In [1]:
import pandas as pd
# Render DataFrames as plain text instead of HTML tables.
# The full option key is used because the abbreviation 'html' matches several
# pandas options and is rejected as ambiguous by current pandas releases.
pd.set_option('display.notebook_repr_html', False)

In [2]:
# Prévia do layout dos arquivos
!head README -n5
!echo
!head movies.dat

SUMMARY

These files contain 1,000,209 anonymous ratings of approximately 3,900 movies 
made by 6,040 MovieLens users who joined MovieLens in 2000.

1::Toy Story (1995)::Animation|Children's|Comedy
2::Jumanji (1995)::Adventure|Children's|Fantasy
3::Grumpier Old Men (1995)::Comedy|Romance
4::Waiting to Exhale (1995)::Comedy|Drama
5::Father of the Bride Part II (1995)::Comedy
6::Heat (1995)::Action|Crime|Thriller
7::Sabrina (1995)::Comedy|Romance
8::Tom and Huck (1995)::Adventure|Children's
9::Sudden Death (1995)::Action
10::GoldenEye (1995)::Action|Adventure|Thriller


In [3]:
# Options shared by the three '::'-delimited, header-less .dat files.
# A multi-character separator is only handled by the pure-Python parser, so it
# is requested explicitly instead of relying on the fallback (which warns).
dat_options = dict(sep='::', header=None, engine='python')

mnames = ['movie_id', 'title', 'genres']
# NOTE(review): movies.dat appears to be Latin-1 encoded -- the recorded output
# shows a replacement character in "Life Is Beautiful (La Vita ... bella)".
# Confirm against the dataset documentation.
movies = pd.read_table('movies.dat', names=mnames, encoding='latin-1',
                       **dat_options)

unames = ['user_id', 'gender', 'age', 'occupation', 'zip']
# Keep zip codes as text so values such as '02460' never lose a leading zero,
# regardless of what dtype inference would pick.
users = pd.read_table('users.dat', names=unames, dtype={'zip': str},
                      **dat_options)

rnames = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_table('ratings.dat', names=rnames, **dat_options)





In [4]:
# Show the first five rows of the movies table.
movies.head(n=5)

   movie_id                               title                        genres
0         1                    Toy Story (1995)   Animation|Children's|Comedy
1         2                      Jumanji (1995)  Adventure|Children's|Fantasy
2         3             Grumpier Old Men (1995)                Comedy|Romance
3         4            Waiting to Exhale (1995)                  Comedy|Drama
4         5  Father of the Bride Part II (1995)                        Comedy

In [5]:
# Show the first five rows of the users table.
users.head(n=5)

   user_id gender  age  occupation    zip
0        1      F    1          10  48067
1        2      M   56          16  70072
2        3      M   25          15  55117
3        4      M   45           7  02460
4        5      M   25          20  55455

In [6]:
# Show the first five rows of the ratings table.
ratings.head(n=5)

   user_id  movie_id  rating  timestamp
0        1      1193       5  978300760
1        1       661       3  978302109
2        1       914       3  978301968
3        1      3408       4  978300275
4        1      2355       5  978824291

In [7]:
# Combine the three tables into one row per rating: first attach the user's
# attributes, then the movie's title and genres.
# The join keys are spelled out so the result does not depend on which column
# names the frames happen to share.
ratings_users = pd.merge(ratings, users, on='user_id')
data = pd.merge(ratings_users, movies, on='movie_id')
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1000209 entries, 0 to 1000208
Data columns (total 10 columns):
user_id       1000209 non-null int64
movie_id      1000209 non-null int64
rating        1000209 non-null int64
timestamp     1000209 non-null int64
gender        1000209 non-null object
age           1000209 non-null int64
occupation    1000209 non-null int64
zip           1000209 non-null object
title         1000209 non-null object
genres        1000209 non-null object
dtypes: int64(6), object(4)
memory usage: 83.9+ MB


In [8]:
# Find the movies that received more than 1000 ratings.
rating_counts = data.groupby('title').size()   # number of rating rows per title
is_frequent = rating_counts > 1000
freq_titles = rating_counts[is_frequent].index
freq_titles

Index(['2001: A Space Odyssey (1968)', 'Abyss, The (1989)',
       'African Queen, The (1951)', 'Air Force One (1997)', 'Airplane! (1980)',
       'Aladdin (1992)', 'Alien (1979)', 'Aliens (1986)', 'Amadeus (1984)',
       'American Beauty (1999)', 
       ...
       'Untouchables, The (1987)', 'Usual Suspects, The (1995)',
       'Wayne's World (1992)', 'When Harry Met Sally... (1989)',
       'Who Framed Roger Rabbit? (1988)',
       'Willy Wonka and the Chocolate Factory (1971)', 'Witness (1985)',
       'Wizard of Oz, The (1939)', 'X-Men (2000)',
       'Young Frankenstein (1974)'],
      dtype='object', name='title', length=207)

In [9]:
# Walk through the method chain step by step.
# Series.order() no longer exists in pandas; sort_values() is its replacement
# and also sorts in ascending order by default.
highest_rated = (
    data.groupby('title')['rating'].mean()   # mean rating per title
    .loc[freq_titles]                         # keep only the frequently rated titles
    .sort_values()                            # ascending by mean rating
    .tail(20)                                 # the 20 highest means
)
highest_rated

title
Life Is Beautiful (La Vita � bella) (1997)                                     4.329861
Monty Python and the Holy Grail (1974)                                         4.335210
Saving Private Ryan (1998)                                                     4.337354
Chinatown (1974)                                                               4.339241
Silence of the Lambs, The (1991)                                               4.351823
Godfather: Part II, The (1974)                                                 4.357565
North by Northwest (1959)                                                      4.384030
Citizen Kane (1941)                                                            4.388889
One Flew Over the Cuckoo's Nest (1975)                                         4.390725
Maltese Falcon, The (1941)                                                     4.395973
Sixth Sense, The (1999)                                                        4.406263
Casablanca (1942)         

In [11]:
# Rating counts per gender for the top-rated titles.
# Take an explicit copy of the selection: assigning a column on a boolean-indexed
# slice of `data` triggers SettingWithCopyWarning and is not guaranteed to work.
filtered = data[data['title'].isin(highest_rated.index)].copy()
# Cut titles to their first 25 characters (presumably to keep the table narrow).
# Titles that share the same 25-character prefix would be grouped together.
filtered['title'] = filtered['title'].str[:25]
filtered.groupby(['title', 'gender'])['rating'].count().unstack()

gender                       F     M
title                               
Casablanca (1942)          505  1164
Chinatown (1974)           255   930
Citizen Kane (1941)        280   836
Dr. Strangelove or: How I  231  1136
Godfather, The (1972)      483  1740
Godfather: Part II, The (  342  1350
Life Is Beautiful (La Vit  367   785
Maltese Falcon, The (1941  235   808
Monty Python and the Holy  352  1247
North by Northwest (1959)  332   983
One Flew Over the Cuckoo'  444  1281
Raiders of the Lost Ark (  572  1942
Rear Window (1954)         291   759
Saving Private Ryan (1998  575  2078
Schindler's List (1993)    615  1689
Shawshank Redemption, The  627  1600
Silence of the Lambs, The  706  1872
Sixth Sense, The (1999)    664  1795
Star Wars: Episode IV - A  647  2344
Usual Suspects, The (1995  413  1370

In [12]:
# Pivot-table notation: mean rating per title (rows) split by gender (columns).
mean_ratings = data.pivot_table(
    values='rating',
    index='title',
    columns=['gender'],
    aggfunc='mean',
)
mean_ratings.tail(20)

gender                                             F         M
title                                                         
Year of the Horse (1997)                         NaN  3.250000
Yellow Submarine (1968)                     3.714286  3.689286
Yojimbo (1961)                              4.423077  4.402116
You Can't Take It With You (1938)           4.192308  3.921569
You So Crazy (1994)                         3.666667  2.300000
You've Got Mail (1998)                      3.542424  3.275591
Young Doctors in Love (1982)                1.923077  2.742424
Young Frankenstein (1974)                   4.289963  4.239177
Young Guns (1988)                           3.371795  3.425620
Young Guns II (1990)                        2.934783  2.904025
Young Poisoner's Handbook, The (1995)       4.000000  3.532258
Young Sherlock Holmes (1985)                3.514706  3.363344
Young and Innocent (1937)                   2.500000  3.500000
Your Friends and Neighbors (1998)           2.888889  3