# Best films in every genre

## *An analysis of a movie ratings dataset*

In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib as plt

<h3>Acquiring dataset</h3>

Here are the links to the data source and location:
* **Data Source:** MovieLens web site (filename: ml-20m.zip)
* **Location:** https://grouplens.org/datasets/movielens/

In [2]:
!ls ./database_from_movielens

README.txt        genome-tags.csv   movies.csv        tags.csv
genome-scores.csv links.csv         ratings.csv


In [233]:
movies=pd.read_csv('./database_from_movielens/movies.csv',sep=',')
movies.head(5)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
#Extract only the movies that are in the sci-fi category
sci_movies_filter=movies['genres'].str.contains('Sci-Fi')
movies[sci_movies_filter].shape

(1743, 3)

In [5]:
tags=pd.read_csv('./database_from_movielens/tags.csv',sep=',')
tags.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,18,4141,Mark Waters,1240597180
1,65,208,dark hero,1368150078
2,65,353,dark hero,1368150079
3,65,521,noir thriller,1368149983
4,65,592,dark hero,1368150078


In [6]:
ratings=pd.read_csv('./database_from_movielens/ratings.csv',sep=',')
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,1112486027
1,1,29,3.5,1112484676
2,1,32,3.5,1112484819
3,1,47,3.5,1112484727
4,1,50,3.5,1112484580


In [13]:
ratings['rating'].describe()

count    2.000026e+07
mean     3.525529e+00
std      1.051989e+00
min      5.000000e-01
25%      3.000000e+00
50%      3.500000e+00
75%      4.000000e+00
max      5.000000e+00
Name: rating, dtype: float64

In [252]:
tag_count=tags['tag'].value_counts()
tag_count[:10]

sci-fi             3384
based on a book    3281
atmospheric        2917
comedy             2779
action             2657
surreal            2427
BD-R               2334
twist ending       2323
funny              2072
dystopia           1991
Name: tag, dtype: int64

## Cleaning data

In [9]:
#Check if any row is null in movies
movies.isnull().any()

movieId    False
title      False
genres     False
dtype: bool

In [10]:
#Check if any row is null in ratings
ratings.isnull().any()

userId       False
movieId      False
rating       False
timestamp    False
dtype: bool

## Exploring the genres

In [254]:
#Find the unique genres
movie_genres=movies['genres'].str.cat(sep=',')
string_movie_genres=movie_genres.replace('|',',')
list_movie_genres=string_movie_genres.split(',')
unique_genres=list(set(list_movie_genres))
unique_genres.remove('(no genres listed)')
unique_genres

['Adventure',
 'Comedy',
 'Western',
 'Action',
 'Film-Noir',
 'IMAX',
 'Fantasy',
 'Documentary',
 'Animation',
 'Horror',
 'Children',
 'Mystery',
 'Musical',
 'War',
 'Drama',
 'Romance',
 'Sci-Fi',
 'Thriller',
 'Crime']

In [251]:
#Find the number of counts for each genre
count_dict={}
for g in unique_genres:
    counts=list_movie_genres.count(g)
    count_dict[g]=counts
genre_counts=pd.DataFrame.from_dict(count_dict,orient='index',columns=['Counts'])
genre_counts.sort_values(by=['Counts'], inplace=True,ascending=False)
genre_counts[:10]

Unnamed: 0,Counts
Drama,13344
Comedy,8374
Thriller,4178
Romance,4127
Action,3520
Crime,2939
Horror,2611
Documentary,2471
Adventure,2329
Sci-Fi,1743


In [163]:
#delete the rows in movies that have no genres listed
no_genre_filter=movies['genres'].str.contains('no genres listed')==False
movies=movies[no_genre_filter]
movies.shape

(27032, 3)

In [158]:
#find all the films that have been rated more than a hundred times
rating_count=ratings[['movieId','rating']].groupby('movieId', as_index=False).count()
rating_count_filter=rating_count['rating']>100

In [159]:
#get the average rating for each movie and apply the rating count filter
avg_ratings = ratings.groupby('movieId', as_index=False).mean() #Don't make movieId the index
del avg_ratings['timestamp']
del avg_ratings['userId']
filt_avg_ratings=avg_ratings[rating_count_filter]

In [160]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [164]:
#merge movies df and filtered average ratings df
box_office=movies.merge(filt_avg_ratings,on='movieId',how='inner')
box_office.shape

(8532, 4)

In [260]:
i = 0
while i<len(unique_genres):
    df1=box_office[(box_office['genres'].str.contains(unique_genres[i]))]
    df2=df1.sort_values(by='rating')[-10:]
    print('Top 10 {} films: \n {}'.format(unique_genres[i],df2[['title','rating']]))
    print()
    i = i + 1

Top 10 Adventure films: 
                                                   title    rating
962              Monty Python and the Holy Grail (1975)  4.174146
1002                         Princess Bride, The (1987)  4.176732
1001  Star Wars: Episode V - The Empire Strikes Back...  4.188202
239           Star Wars: Episode IV - A New Hope (1977)  4.190672
4461  Spirited Away (Sen to Chihiro no kamikakushi) ...  4.203810
2536                                     Yojimbo (1961)  4.211717
1003  Raiders of the Lost Ark (Indiana Jones and the...  4.219009
765                           North by Northwest (1959)  4.233538
4677                City of God (Cidade de Deus) (2002)  4.235410
1659        Seven Samurai (Shichinin no samurai) (1954)  4.274180

Top 10 Comedy films: 
                                                   title    rating
1037                                  Sting, The (1973)  4.173556
2756                                 City Lights (1931)  4.174123
962              Monty Pyt