### Постройте топ фильмов в категориях Action и Comedy

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from tqdm import tqdm_notebook

In [2]:
links = pd.read_csv('../data/links.csv')
movies = pd.read_csv('../data/movies.csv')
ratings = pd.read_csv('../data/ratings.csv')
tags = pd.read_csv('../data/tags.csv')

In [3]:
links.head()

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0


In [4]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [5]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [6]:
tags.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200


In [7]:
# соединим таблицу с рейтингами и названиями фильмов
joined_ratings = ratings.join(movies.set_index('movieId'), on='movieId')

In [8]:
# достанем по каждому фильму количество рейтингов
title_num_ratings = {}

for title, group in tqdm_notebook(joined_ratings.groupby('title')):
    title_num_ratings[title] = group.userId.unique().shape[0]

HBox(children=(IntProgress(value=0, max=9719), HTML(value='')))




In [9]:
# считаем средний рейтинг на каждый фильм
title_mean_rating = {}

for title, group in tqdm_notebook(joined_ratings.groupby('title')):
    title_mean_rating[title] = group.rating.mean()

HBox(children=(IntProgress(value=0, max=9719), HTML(value='')))




In [10]:
# соединим уже созданную таблицу с таблицей с проставленными тегами по фильмам
joined_with_tags = joined_ratings.join(tags.set_index('movieId'), on='movieId', lsuffix='_left', rsuffix='_right')

In [11]:
joined_with_tags.head()

Unnamed: 0,userId_left,movieId,rating,timestamp_left,title,genres,userId_right,tag,timestamp_right
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,336.0,pixar,1139046000.0
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,474.0,pixar,1137207000.0
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,567.0,fun,1525286000.0
1,1,3,4.0,964981247,Grumpier Old Men (1995),Comedy|Romance,289.0,moldy,1143425000.0
1,1,3,4.0,964981247,Grumpier Old Men (1995),Comedy|Romance,289.0,old,1143425000.0


In [12]:
# достанем по каждому фильму количество рейтингов
title_num_tags = {}

for title, group in tqdm_notebook(joined_with_tags.groupby('title')):
    title_num_tags[title] = group.userId_right.unique().shape[0]

HBox(children=(IntProgress(value=0, max=9719), HTML(value='')))




In [13]:
title_num_tags_gen = [title_num_tags[f] for f in title_num_tags.keys()]
min_num_tags = np.min(title_num_tags_gen)
max_num_tags = np.max(title_num_tags_gen)
mean_num_tags = np.mean(title_num_tags_gen)
median_num_tags = np.median(title_num_tags_gen)

In [14]:
print("Min:", min_num_tags)
print("Max:", max_num_tags)
print("Mean:", mean_num_tags)
print("Median:", median_num_tags)

Min: 1
Max: 10
Mean: 1.0210927050108036
Median: 1.0


In [15]:
film_with_new_mark = []

for f in title_num_ratings.keys():
    # посчитаем нашу новую метрику для каждого фильма из датасета
    film_with_new_mark.append(
        (f, title_mean_rating[f] * (title_num_tags[f] - min_num_tags) / (max_num_tags - min_num_tags))
    )

In [16]:
film_with_new_mark[:20]

[("'71 (2014)", 0.0),
 ("'Hellboy': The Seeds of Creation (2004)", 0.0),
 ("'Round Midnight (1986)", 0.0),
 ("'Salem's Lot (2004)", 0.0),
 ("'Til There Was You (1997)", 0.0),
 ("'Tis the Season for Love (2015)", 0.0),
 ("'burbs, The (1989)", 0.0),
 ("'night Mother (1986)", 0.0),
 ('(500) Days of Summer (2009)', 0.0),
 ('*batteries not included (1987)', 0.0),
 ('...All the Marbles (1981)', 0.0),
 ('...And Justice for All (1979)', 0.0),
 ('00 Schneider - Jagd auf Nihil Baxter (1994)', 0.0),
 ('1-900 (06) (1994)', 0.0),
 ('10 (1979)', 0.0),
 ('10 Cent Pistol (2015)', 0.0),
 ('10 Cloverfield Lane (2016)', 0.0),
 ('10 Items or Less (2006)', 0.0),
 ('10 Things I Hate About You (1999)', 0.0),
 ('10 Years (2011)', 0.0)]

In [17]:
# создадим подмножество фильмов с жанром Action
action_films_list = list(movies[movies["genres"].str.contains("Action")].title)

In [18]:
action_films_list[:10]

['Heat (1995)',
 'Sudden Death (1995)',
 'GoldenEye (1995)',
 'Cutthroat Island (1995)',
 'Money Train (1995)',
 'Assassins (1995)',
 'Dead Presidents (1995)',
 'Mortal Kombat (1995)',
 'Lawnmower Man 2: Beyond Cyberspace (1996)',
 'From Dusk Till Dawn (1996)']

In [19]:
# создадим подмножество фильмов с жанром Comedy
comedy_films_list = list(movies[movies["genres"].str.contains("Comedy")].title)

In [20]:
comedy_films_list[:10]

['Toy Story (1995)',
 'Grumpier Old Men (1995)',
 'Waiting to Exhale (1995)',
 'Father of the Bride Part II (1995)',
 'Sabrina (1995)',
 'American President, The (1995)',
 'Dracula: Dead and Loving It (1995)',
 'Four Rooms (1995)',
 'Ace Ventura: When Nature Calls (1995)',
 'Money Train (1995)']

In [21]:
# Рейтинг фильмов (top 20) с жанром Action
list(sorted(filter(lambda x: x[0] in action_films_list, film_with_new_mark), key=lambda x: x[1], reverse=True))[:20]

[('Star Wars: Episode IV - A New Hope (1977)', 4.231075697211155),
 ('Fight Club (1999)', 1.4243119266055047),
 ('Blade Runner (1982)', 1.3669354838709675),
 ('Inception (2010)', 1.3554778554778555),
 ('Matrix, The (1999)', 0.9316546762589929),
 ('City of God (Cidade de Deus) (2002)', 0.9214814814814815),
 ('Léon: The Professional (a.k.a. The Professional) (Léon) (1994)',
  0.8930659983291562),
 ('Terminator 2: Judgment Day (1991)', 0.8824404761904762),
 ('Dark Knight, The (2008)', 0.47091722595078295),
 ('Star Wars: Episode V - The Empire Strikes Back (1980)',
  0.46840442338072674),
 ('Raiders of the Lost Ark (Indiana Jones and the Raiders of the Lost Ark) (1981)',
  0.46749999999999997),
 ('North by Northwest (1959)', 0.4649122807017544),
 ('John Wick: Chapter Two (2017)', 0.46031746031746035),
 ('Lord of the Rings: The Return of the King, The (2003)',
  0.45765765765765765),
 ('Braveheart (1995)', 0.4479606188466948),
 ('Avengers: Infinity War - Part I (2018)', 0.4444444444444444),

In [22]:
# Рейтинг фильмов (top 20) с жанром Comedy
list(sorted(filter(lambda x: x[0] in comedy_films_list, film_with_new_mark), key=lambda x: x[1], reverse=True))[:20]

[('Dr. Strangelove or: How I Learned to Stop Worrying and Love the Bomb (1964)',
  1.8969072164948453),
 ('Pulp Fiction (1994)', 1.3990228013029318),
 ('Anchorman: The Legend of Ron Burgundy (2004)', 1.2573099415204678),
 ('Forrest Gump (1994)', 0.9253630530226276),
 ('Toy Story (1995)', 0.8713178294573644),
 ('Step Brothers (2008)', 0.7896825396825397),
 ('Corpse Bride (2005)', 0.7853535353535354),
 ('Happy Gilmore (1996)', 0.7643097643097643),
 ('Monty Python and the Holy Grail (1975)', 0.4624183006535948),
 ('Life Is Beautiful (La Vita è bella) (1997)', 0.46085858585858586),
 ('Fargo (1996)', 0.4573357888275015),
 ('Kiss Kiss Bang Bang (2005)', 0.45238095238095233),
 ('Trainspotting (1996)', 0.44880174291938996),
 ('Lost in Translation (2003)', 0.4481981981981982),
 ('Three Colors: White (Trzy kolory: Bialy) (1994)', 0.44722222222222224),
 ('This Is Spinal Tap (1984)', 0.44612794612794615),
 ('Finding Nemo (2003)', 0.4401103230890465),
 ('Big Lebowski, The (1998)', 0.436058700209643