In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from collections import Counter
from datetime import datetime
import re
from itertools import combinations

In [2]:
data = pd.read_csv('movie_bd_v5.csv')
data.sample(5)

Unnamed: 0,imdb_id,budget,revenue,original_title,cast,director,tagline,overview,runtime,genres,production_companies,release_date,vote_average,release_year
580,tt0255798,22000000,84772742,The Animal,Rob Schneider|Colleen Haskell|John C. McGinley...,Luke Greenfield,He wasn't much of a man... Now he's not much o...,When loser Marvin Mange is involved in a horri...,84,Action|Comedy,Revolution Studios|Happy Madison Productions,6/1/2001,4.5,2001
1207,tt1588173,35000000,116980662,Warm Bodies,Nicholas Hoult|Teresa Palmer|Analeigh Tipton|R...,Jonathan Levine,Cold body. Warm heart.,After a zombie becomes involved with the girlf...,97,Horror|Comedy|Romance,Summit Entertainment,1/31/2013,6.4,2013
1311,tt1966566,30000000,68129518,Ð¡Ñ‚Ð°Ð»Ð¸Ð½Ð³Ñ€Ð°Ð´,Thomas Kretschmann|Yanina Studilina|Philippe R...,Fyodor Bondarchuk,The Epic Battle That Turned The Tide Of World ...,"Drama set in 1942, during one of the most impo...",131,War|Action,Art Pictures Studio,10/9/2013,5.5,2013
659,tt0824747,55000000,113020255,Changeling,Angelina Jolie|Jeffrey Donovan|John Malkovich|...,Clint Eastwood,"To find her son, she did what no one else dared.",Christine Collins is overjoyed when her kidnap...,141,Crime|Drama|Mystery,Imagine Entertainment|Malpaso Productions|Rela...,1/30/2008,7.1,2008
1574,tt0338348,165000000,305875730,The Polar Express,Tom Hanks|Michael Jeter|Eddie Deezen|Chris Cop...,Robert Zemeckis,Journey Beyond Your Imagination.,This is the story of a young boy on Christmas ...,100,Animation|Adventure|Family|Fantasy,Castle Rock Entertainment|Golden Mean|Universa...,11/9/2004,6.4,2004


In [3]:
data.describe()

Unnamed: 0,budget,revenue,runtime,vote_average,release_year
count,1889.0,1889.0,1889.0,1889.0,1889.0
mean,54310830.0,155365300.0,109.658549,6.140762,2007.860773
std,48587210.0,214669800.0,18.017041,0.764763,4.468841
min,5000000.0,2033165.0,63.0,3.3,2000.0
25%,20000000.0,34560580.0,97.0,5.6,2004.0
50%,38000000.0,83615410.0,107.0,6.1,2008.0
75%,72000000.0,178262600.0,120.0,6.6,2012.0
max,380000000.0,2781506000.0,214.0,8.1,2015.0


In [4]:
data.tail()

Unnamed: 0,imdb_id,budget,revenue,original_title,cast,director,tagline,overview,runtime,genres,production_companies,release_date,vote_average,release_year
1884,tt0120903,75000000,157299717,X-Men,Patrick Stewart|Hugh Jackman|Ian McKellen|Hall...,Bryan Singer,Evolution Begins,"Two mutants, Rogue and Wolverine, come to a pr...",104,Adventure|Action|Science Fiction,Twentieth Century Fox Film Corporation|Donners...,7/13/2000,6.6,2000
1885,tt0192255,22000000,13555988,The Little Vampire,Richard E. Grant|Jonathan Lipnicki|Jim Carter|...,Uli Edel,"They're not just best friends, they're blood b...","Based on the popular books, the story tells of...",95,Horror|Family|Foreign,New Line Cinema,10/27/2000,6.4,2000
1886,tt0131704,76000000,35134820,The Adventures of Rocky & Bullwinkle,Rene Russo|Jason Alexander|Piper Perabo|Randy ...,Des McAnuff,This summer it's not the same old bull.,Rocky and Bullwinkle have been living off the ...,88,Adventure|Animation|Action|Comedy|Family,Universal Pictures|Capella International|KC Me...,6/30/2000,4.0,2000
1887,tt0162983,40000000,36037909,Hanging Up,Meg Ryan|Diane Keaton|Lisa Kudrow|Walter Matth...,Diane Keaton,Every family has a few hang-ups.,A trio of sisters bond over their ambivalence ...,94,Comedy|Drama,Laurence Mark Productions|Columbia Pictures Co...,2/16/2000,5.2,2000
1888,tt0163676,15000000,5217498,The In Crowd,Susan Ward|Lori Heuring|Matthew Settle|Nathan ...,Mary Lambert,What would you do to get in?,A mentally disturbed young woman takes a job a...,105,Thriller,Warner Bros. Pictures,7/19/2000,5.2,2000


# Предобработка

In [5]:
answers = {} # создадим словарь для ответов

In [6]:
# добавим столбец с прибылью
data['profit'] = data['revenue'] - data['budget']

In [7]:
# преобразуем дату выпуска в формат datetime
data.release_date = data.release_date.apply(lambda x:datetime.strptime(x,"%m/%d/%Y"))

In [8]:
# проверяем, что год выпуска совпадает в столбцах release_date и release_year
len(data[data['release_date'].dt.year!=data['release_year']])

0

In [9]:
# преобразуем множественные данные в списки
def splitter (s):
    return s.apply(lambda x:x.split('|'))

In [10]:
data.cast = splitter(data.cast)
data.director = splitter(data.director)
data.genres = splitter(data.genres)
data.production_companies = splitter(data.production_companies)

In [11]:
# функция показывает описания фильмов от студии comp. Можно указывать неполное название студии. (для вопроса № 25)
def show_overview(company):
    return data[data.production_companies.apply(lambda x:company in str(x))].overview.values

In [12]:
data.head()

Unnamed: 0,imdb_id,budget,revenue,original_title,cast,director,tagline,overview,runtime,genres,production_companies,release_date,vote_average,release_year,profit
0,tt0369610,150000000,1513528810,Jurassic World,"[Chris Pratt, Bryce Dallas Howard, Irrfan Khan...",[Colin Trevorrow],The park is open.,Twenty-two years after the events of Jurassic ...,124,"[Action, Adventure, Science Fiction, Thriller]","[Universal Studios, Amblin Entertainment, Lege...",2015-06-09,6.5,2015,1363528810
1,tt1392190,150000000,378436354,Mad Max: Fury Road,"[Tom Hardy, Charlize Theron, Hugh Keays-Byrne,...",[George Miller],What a Lovely Day.,An apocalyptic story set in the furthest reach...,120,"[Action, Adventure, Science Fiction, Thriller]","[Village Roadshow Pictures, Kennedy Miller Pro...",2015-05-13,7.1,2015,228436354
2,tt2908446,110000000,295238201,Insurgent,"[Shailene Woodley, Theo James, Kate Winslet, A...",[Robert Schwentke],One Choice Can Destroy You,Beatrice Prior must confront her inner demons ...,119,"[Adventure, Science Fiction, Thriller]","[Summit Entertainment, Mandeville Films, Red W...",2015-03-18,6.3,2015,185238201
3,tt2488496,200000000,2068178225,Star Wars: The Force Awakens,"[Harrison Ford, Mark Hamill, Carrie Fisher, Ad...",[J.J. Abrams],Every generation has a story.,Thirty years after defeating the Galactic Empi...,136,"[Action, Adventure, Science Fiction, Fantasy]","[Lucasfilm, Truenorth Productions, Bad Robot]",2015-12-15,7.5,2015,1868178225
4,tt2820852,190000000,1506249360,Furious 7,"[Vin Diesel, Paul Walker, Jason Statham, Miche...",[James Wan],Vengeance Hits Home,Deckard Shaw seeks revenge against Dominic Tor...,137,"[Action, Crime, Thriller]","[Universal Pictures, Original Film, Media Righ...",2015-04-01,7.3,2015,1316249360


# 1. У какого фильма из списка самый большой бюджет?

Использовать варианты ответов в коде решения запрещено.    
Вы думаете и в жизни у вас будут варианты ответов?)

In [13]:
# в словарь вставляем номер вопроса и ваш ответ на него
# Пример: 
answers['1'] = '2. Spider-Man 3 (tt0413300)'
# запишите свой вариант ответа
answers['1'] = 'Pirates of the Caribbean: On Stranger Tides (tt1298650)'
# если ответили верно, можете добавить комментарий со значком "+"

In [14]:
# тут пишем ваш код для решения данного вопроса:
data[data.budget==data.budget.max()]

Unnamed: 0,imdb_id,budget,revenue,original_title,cast,director,tagline,overview,runtime,genres,production_companies,release_date,vote_average,release_year,profit
723,tt1298650,380000000,1021683000,Pirates of the Caribbean: On Stranger Tides,"[Johnny Depp, PenÃ©lope Cruz, Geoffrey Rush, I...",[Rob Marshall],Live Forever Or Die Trying.,Captain Jack Sparrow crosses paths with a woma...,136,"[Adventure, Action, Fantasy]","[Walt Disney Pictures, Jerry Bruckheimer Films...",2011-05-11,6.3,2011,641683000


ВАРИАНТ 2

In [15]:
# можно добавлять разные варианты решения

# 2. Какой из фильмов самый длительный (в минутах)?

In [16]:
# думаю логику работы с этим словарем вы уже поняли, 
# по этому не буду больше его дублировать
answers['2'] = 'Gods and Generals (tt0279111)'

In [17]:
data[data.runtime==data.runtime.max()]


Unnamed: 0,imdb_id,budget,revenue,original_title,cast,director,tagline,overview,runtime,genres,production_companies,release_date,vote_average,release_year,profit
1157,tt0279111,56000000,12923936,Gods and Generals,"[Stephen Lang, Jeff Daniels, Robert Duvall, Ke...",[Ronald F. Maxwell],The nations heart was touched by...,The film centers mostly around the personal an...,214,"[Drama, History, War]","[Turner Pictures, Antietam Filmworks]",2003-02-21,5.8,2003,-43076064


# 3. Какой из фильмов самый короткий (в минутах)?





In [18]:
answers['3'] = 'Winnie the Pooh (tt1449283)'

In [19]:
data[data.runtime==data.runtime.min()]

Unnamed: 0,imdb_id,budget,revenue,original_title,cast,director,tagline,overview,runtime,genres,production_companies,release_date,vote_average,release_year,profit
768,tt1449283,30000000,14460000,Winnie the Pooh,"[Jim Cummings, Travis Oates, Jim Cummings, Bud...","[Stephen Anderson, Don Hall]",Oh Pooh.,"During an ordinary day in Hundred Acre Wood, W...",63,"[Animation, Family]","[Walt Disney Pictures, Walt Disney Animation S...",2011-04-13,6.8,2011,-15540000


# 4. Какова средняя длительность фильмов?


In [20]:
answers['4'] = round(data.runtime.mean())
answers['4']

110

# 5. Каково медианное значение длительности фильмов? 

In [21]:
answers['5'] = round(data.runtime.median())
answers['5']

107

# 6. Какой самый прибыльный фильм?
#### Внимание! Здесь и далее под «прибылью» или «убытками» понимается разность между сборами и бюджетом фильма. (прибыль = сборы - бюджет) в нашем датасете это будет (profit = revenue - budget) 

In [22]:
data[data.profit==data.profit.max()]

Unnamed: 0,imdb_id,budget,revenue,original_title,cast,director,tagline,overview,runtime,genres,production_companies,release_date,vote_average,release_year,profit
239,tt0499549,237000000,2781505847,Avatar,"[Sam Worthington, Zoe Saldana, Sigourney Weave...",[James Cameron],Enter the World of Pandora.,"In the 22nd century, a paraplegic Marine is di...",162,"[Action, Adventure, Fantasy, Science Fiction]","[Ingenious Film Partners, Twentieth Century Fo...",2009-12-10,7.1,2009,2544505847


In [23]:
answers['6'] = 'Avatar (tt0499549)'

# 7. Какой фильм самый убыточный? 

In [24]:
data[data.profit==data.profit.min()]

Unnamed: 0,imdb_id,budget,revenue,original_title,cast,director,tagline,overview,runtime,genres,production_companies,release_date,vote_average,release_year,profit
1245,tt1210819,255000000,89289910,The Lone Ranger,"[Johnny Depp, Armie Hammer, William Fichtner, ...",[Gore Verbinski],Never Take Off the Mask,The Texas Rangers chase down a gang of outlaws...,149,"[Action, Adventure, Western]","[Walt Disney Pictures, Jerry Bruckheimer Films...",2013-07-03,6.0,2013,-165710090


In [25]:
answers['7'] = 'The Lone Ranger (tt1210819)'

# 8. У скольких фильмов из датасета объем сборов оказался выше бюджета?

In [26]:
answers['8'] = data[data.profit>0]['imdb_id'].count()
answers['8']

1478

# 9. Какой фильм оказался самым кассовым в 2008 году?

In [27]:
data.query('release_year == 2008').query('revenue==revenue.max()')

Unnamed: 0,imdb_id,budget,revenue,original_title,cast,director,tagline,overview,runtime,genres,production_companies,release_date,vote_average,release_year,profit
599,tt0468569,185000000,1001921825,The Dark Knight,"[Christian Bale, Michael Caine, Heath Ledger, ...",[Christopher Nolan],Why So Serious?,Batman raises the stakes in his war on crime. ...,152,"[Drama, Action, Crime, Thriller]","[DC Comics, Legendary Pictures, Warner Bros., ...",2008-07-16,8.1,2008,816921825


In [28]:
answers['9'] = 'The Dark Knight (tt0468569)'

# 10. Самый убыточный фильм за период с 2012 по 2014 г. (включительно)?


In [29]:
data.query('2012<=release_year<=2014').query('profit==profit.min()')

Unnamed: 0,imdb_id,budget,revenue,original_title,cast,director,tagline,overview,runtime,genres,production_companies,release_date,vote_average,release_year,profit
1245,tt1210819,255000000,89289910,The Lone Ranger,"[Johnny Depp, Armie Hammer, William Fichtner, ...",[Gore Verbinski],Never Take Off the Mask,The Texas Rangers chase down a gang of outlaws...,149,"[Action, Adventure, Western]","[Walt Disney Pictures, Jerry Bruckheimer Films...",2013-07-03,6.0,2013,-165710090


In [30]:
answers['10'] = 'The Lone Ranger (tt1210819)'

# 11. Какого жанра фильмов больше всего?

In [31]:
# эту задачу тоже можно решать разными подходами, попробуй реализовать разные варианты
# если будешь добавлять функцию - выноси ее в предобработку что в начале
Counter(data.genres.explode()).most_common()

[('Drama', 782),
 ('Comedy', 683),
 ('Thriller', 596),
 ('Action', 582),
 ('Adventure', 415),
 ('Crime', 315),
 ('Romance', 308),
 ('Family', 260),
 ('Science Fiction', 248),
 ('Fantasy', 222),
 ('Horror', 176),
 ('Mystery', 168),
 ('Animation', 139),
 ('Music', 64),
 ('History', 62),
 ('War', 58),
 ('Western', 19),
 ('Documentary', 8),
 ('Foreign', 2)]

ВАРИАНТ 2

In [32]:
genres_dict = {}
for item in data.genres:
    for genre in item:
        if genre in genres_dict.keys():
            genres_dict[genre]+=1
        else:
            genres_dict.setdefault(genre,1)
sorted(genres_dict.items(), key=lambda x:x[1], reverse=True)

[('Drama', 782),
 ('Comedy', 683),
 ('Thriller', 596),
 ('Action', 582),
 ('Adventure', 415),
 ('Crime', 315),
 ('Romance', 308),
 ('Family', 260),
 ('Science Fiction', 248),
 ('Fantasy', 222),
 ('Horror', 176),
 ('Mystery', 168),
 ('Animation', 139),
 ('Music', 64),
 ('History', 62),
 ('War', 58),
 ('Western', 19),
 ('Documentary', 8),
 ('Foreign', 2)]

In [33]:
answers['11'] = 'Drama'

# 12. Фильмы какого жанра чаще всего становятся прибыльными? 

In [34]:
Counter(data[data.profit>0].genres.explode()).most_common()

[('Drama', 560),
 ('Comedy', 551),
 ('Thriller', 446),
 ('Action', 444),
 ('Adventure', 337),
 ('Romance', 242),
 ('Crime', 231),
 ('Family', 226),
 ('Science Fiction', 195),
 ('Fantasy', 188),
 ('Horror', 150),
 ('Animation', 120),
 ('Mystery', 119),
 ('Music', 47),
 ('History', 46),
 ('War', 41),
 ('Western', 12),
 ('Documentary', 7)]

In [35]:
answers['12'] = 'Drama'

# 13. У какого режиссера самые большие суммарные кассовые сборы?

In [36]:

data.explode('director').groupby(['director']).revenue.sum().sort_values(ascending=False)

director
Peter Jackson        6490593685
Christopher Nolan    4167548502
David Yates          4154295625
Michael Bay          3886938960
J.J. Abrams          3579169916
                        ...    
David MichÃ´d           2295423
Steven Shainberg        2281089
Paul Schrader           2062066
Keanu Reeves            2054941
Simon Hunter            2033165
Name: revenue, Length: 997, dtype: int64

In [37]:
answers['13'] = 'Peter Jackson'

# 14. Какой режисер снял больше всего фильмов в стиле Action?

In [38]:
data[data['genres'].apply(lambda x:"Action" in x)].explode('director').director.value_counts()

Robert Rodriguez      9
Michael Bay           7
Paul W.S. Anderson    7
Ridley Scott          6
Antoine Fuqua         6
                     ..
Hironobu Sakaguchi    1
George Clooney        1
Aleksander Bach       1
Stephen St. Leger     1
Henry Selick          1
Name: director, Length: 364, dtype: int64

In [39]:
# Вариант №2
Counter(data.explode('genres').explode('director').query('genres=="Action"').director).most_common()

[('Robert Rodriguez', 9),
 ('Michael Bay', 7),
 ('Paul W.S. Anderson', 7),
 ('Antoine Fuqua', 6),
 ('Ridley Scott', 6),
 ('Brett Ratner', 5),
 ('Quentin Tarantino', 5),
 ('Zack Snyder', 5),
 ('Roland Emmerich', 5),
 ('Andrzej Bartkowiak', 5),
 ('Louis Leterrier', 5),
 ('Tony Scott', 5),
 ('Paul Greengrass', 5),
 ('Peter Jackson', 5),
 ('Lee Tamahori', 5),
 ('Gore Verbinski', 5),
 ('Rob Cohen', 5),
 ('Peter Berg', 5),
 ('J.J. Abrams', 4),
 ('Lana Wachowski', 4),
 ('Lilly Wachowski', 4),
 ('Olivier Megaton', 4),
 ('Guy Ritchie', 4),
 ('Bryan Singer', 4),
 ('Tim Story', 4),
 ('Renny Harlin', 4),
 ('Mark Neveldine', 4),
 ('Brian Taylor', 4),
 ('Dominic Sena', 4),
 ('Justin Lin', 4),
 ('Christopher Nolan', 4),
 ('Simon West', 4),
 ('John Moore', 4),
 ('Steven Spielberg', 4),
 ('Martin Campbell', 4),
 ('John Singleton', 4),
 ('Len Wiseman', 4),
 ('Brad Bird', 3),
 ('Matthew Vaughn', 3),
 ('Neill Blomkamp', 3),
 ('Jaume Collet-Serra', 3),
 ('Breck Eisner', 3),
 ('Pierre Morel', 3),
 ('David A

In [40]:
answers['14'] = 'Robert Rodriguez'

# 15. Фильмы с каким актером принесли самые высокие кассовые сборы в 2012 году? 

In [41]:
data[data.release_year==2012].explode('cast').groupby(['cast'])['revenue'].sum().sort_values(ascending=False)

cast
Chris Hemsworth      2027450773
Denis Leary          1629460639
Anne Hathaway        1522851057
Robert Downey Jr.    1519557910
Mark Ruffalo         1519557910
                        ...    
Michael Nyqvist         3428048
Danny Huston            2106557
Josh Lucas              2106557
Sami Gayle              2106557
Nicolas Cage            2106557
Name: revenue, Length: 466, dtype: int64

In [42]:
answers['15'] = 'Chris Hemsworth'

# 16. Какой актер снялся в большем количестве высокобюджетных фильмов?

In [43]:
data[data.budget>data.budget.mean()].explode('cast').cast.value_counts()

Matt Damon           18
Adam Sandler         17
Angelina Jolie       16
Eddie Murphy         15
Samuel L. Jackson    15
                     ..
Avery Brooks          1
Jeffrey Donovan       1
Julie Kavner          1
John Hawkes           1
Rob Schneider         1
Name: cast, Length: 1505, dtype: int64

In [44]:
# Вариант №2
Counter(data[data.budget>data.budget.mean()].explode('cast').cast).most_common()

[('Matt Damon', 18),
 ('Adam Sandler', 17),
 ('Angelina Jolie', 16),
 ('Tom Cruise', 15),
 ('Samuel L. Jackson', 15),
 ('Eddie Murphy', 15),
 ('Mark Wahlberg', 14),
 ('Hugh Jackman', 14),
 ('Ben Stiller', 14),
 ('Jamie Foxx', 14),
 ('Russell Crowe', 14),
 ('Johnny Depp', 13),
 ('Jude Law', 13),
 ('Bruce Willis', 13),
 ('Will Smith', 13),
 ('Dwayne Johnson', 12),
 ('Ian McKellen', 12),
 ('Brad Pitt', 12),
 ('Owen Wilson', 12),
 ('Cameron Diaz', 12),
 ('Nicolas Cage', 12),
 ('Robert De Niro', 12),
 ('Leonardo DiCaprio', 11),
 ('Daniel Craig', 11),
 ('Robert Downey Jr.', 11),
 ('Steve Buscemi', 11),
 ('Vin Diesel', 10),
 ('Mark Ruffalo', 10),
 ('Jack Black', 10),
 ('Denzel Washington', 10),
 ('Emma Watson', 10),
 ('Gary Oldman', 10),
 ('Tom Hanks', 10),
 ('Orlando Bloom', 10),
 ('Ralph Fiennes', 9),
 ('Cate Blanchett', 9),
 ('Michael Caine', 9),
 ('Ewan McGregor', 9),
 ('Paul Bettany', 9),
 ('Brendan Gleeson', 9),
 ('Shia LaBeouf', 9),
 ('Nicole Kidman', 9),
 ('Colin Farrell', 9),
 ('Rach

In [45]:
answers['16'] = 'Matt Damon'

# 17. В фильмах какого жанра больше всего снимался Nicolas Cage? 

In [46]:
data[data['cast'].apply(lambda x:"Nicolas Cage" in x)].explode('genres').genres.value_counts()

Action             17
Thriller           15
Drama              12
Crime              10
Fantasy             8
Adventure           7
Comedy              6
Science Fiction     4
Animation           3
Family              3
Mystery             3
History             2
Horror              1
Romance             1
War                 1
Name: genres, dtype: int64

In [47]:
# Вариант № 2
Counter(data.explode('cast').query('cast=="Nicolas Cage"').explode('genres').genres).most_common()

[('Action', 17),
 ('Thriller', 15),
 ('Drama', 12),
 ('Crime', 10),
 ('Fantasy', 8),
 ('Adventure', 7),
 ('Comedy', 6),
 ('Science Fiction', 4),
 ('Mystery', 3),
 ('Animation', 3),
 ('Family', 3),
 ('History', 2),
 ('War', 1),
 ('Horror', 1),
 ('Romance', 1)]

In [48]:
answers['17'] = 'Action'

# 18. Самый убыточный фильм от Paramount Pictures

In [49]:
data[data['production_companies'].apply(lambda x:"Paramount Pictures" in x)].query('profit==profit.min()')

Unnamed: 0,imdb_id,budget,revenue,original_title,cast,director,tagline,overview,runtime,genres,production_companies,release_date,vote_average,release_year,profit
925,tt0267626,100000000,35168966,K-19: The Widowmaker,"[Harrison Ford, Liam Neeson, Peter Sarsgaard, ...",[Kathryn Bigelow],Fate has found its hero.,When Russia's first nuclear submarine malfunct...,138,"[Thriller, Drama, History]","[Paramount Pictures, Intermedia Films, Nationa...",2002-07-19,6.0,2002,-64831034


In [50]:
answers['18'] = 'K-19: The Widowmaker (tt0267626)'

# 19. Какой год стал самым успешным по суммарным кассовым сборам?

In [51]:
data.groupby(['release_year'])['revenue'].sum().sort_values(ascending=False)

release_year
2015    25449202382
2014    23405862953
2013    23213799791
2012    23079001687
2011    22676791872
2010    21071204961
2009    20261791024
2008    18252781990
2007    18162406801
2004    15663430720
2005    15309425558
2006    14775042320
2003    14346123312
2002    14136361487
2001    13017764865
2000    10664099805
Name: revenue, dtype: int64

In [52]:
answers['19'] = '2015'

# 20. Какой самый прибыльный год для студии Warner Bros?

In [53]:
df = data.explode('production_companies')[['production_companies','release_year','profit']]
display(df[df['production_companies'].str.contains("Warner Bros")].
        groupby('release_year')['profit'].sum().sort_values(ascending=False))
del(df)

release_year
2014    2292949646
2007    2201675217
2008    2134595031
2010    1974712985
2011    1871393682
2003    1844008221
2009    1822454136
2013    1636453400
2004    1631933725
2005    1551980298
2001    1282142579
2012    1258020056
2002    1022709901
2015     870368348
2006     625351872
2000     452631386
Name: profit, dtype: int64

In [54]:
answers['20'] = '2014'

# 21. В каком месяце за все годы суммарно вышло больше всего фильмов?

In [55]:
data.groupby(data['release_date'].dt.month)['imdb_id'].count().sort_values(ascending=False)

release_date
9     227
12    190
10    186
8     161
3     156
4     149
6     147
11    146
7     142
5     140
2     135
1     110
Name: imdb_id, dtype: int64

In [56]:
answers['21'] = 'Сентябрь'

# 22. Сколько суммарно вышло фильмов летом? (за июнь, июль, август)

In [57]:
data.query('release_date.dt.month in [6,7,8]').imdb_id.count()

450

In [58]:
answers['22'] = '450'

# 23. Для какого режиссера зима – самое продуктивное время года? 

In [59]:
df = data.explode('director')[['director','release_date']]
display(df.query('release_date.dt.month in [1,11,12]').director.value_counts())
del(df)

Peter Jackson            8
Clint Eastwood           7
Steven Soderbergh        6
Peter Farrelly           4
Edward Zwick             4
                        ..
Wes Anderson             1
Todd Strauss-Schulson    1
Seth Rogen               1
Tim Story                1
Richard Eyre             1
Name: director, Length: 371, dtype: int64

In [60]:
# Вариант № 2
df = data.explode('director')[['director','release_date']]
display(df.query('release_date.dt.month in [1,11,12]').groupby('director')['director'].count().sort_values(ascending=False))
del(df)

director
Peter Jackson        8
Clint Eastwood       7
Steven Soderbergh    6
Nancy Meyers         4
Adam Shankman        4
                    ..
James Gunn           1
James Duffy          1
James Cameron        1
James Bobin          1
Xavier Gens          1
Name: director, Length: 371, dtype: int64

In [61]:
# Вариант №3
Counter(data.explode('director').query('release_date.dt.month in [1,11,12]').director).most_common()

[('Peter Jackson', 8),
 ('Clint Eastwood', 7),
 ('Steven Soderbergh', 6),
 ('Francis Lawrence', 4),
 ('Ron Howard', 4),
 ('Peter Farrelly', 4),
 ('Robert Zemeckis', 4),
 ('Nancy Meyers', 4),
 ('Edward Zwick', 4),
 ('Adam Shankman', 4),
 ('Martin Scorsese', 4),
 ('Shawn Levy', 3),
 ('Ridley Scott', 3),
 ('Rob Marshall', 3),
 ('Bobby Farrelly', 3),
 ('Tim Burton', 3),
 ('Steve Carr', 3),
 ('Tony Scott', 3),
 ('Martin Campbell', 3),
 ('Chris Columbus', 3),
 ('Steven Spielberg', 3),
 ('Brett Ratner', 3),
 ('Quentin Tarantino', 2),
 ('Olivier Megaton', 2),
 ('Adam McKay', 2),
 ('Tom Hooper', 2),
 ('Michael Mann', 2),
 ('Jonathan Levine', 2),
 ('Jessie Nelson', 2),
 ('Sean Anders', 2),
 ('Paul Thomas Anderson', 2),
 ('George Clooney', 2),
 ('Guy Ritchie', 2),
 ('Ron Clements', 2),
 ('John Musker', 2),
 ('John Lee Hancock', 2),
 ('Jim Sheridan', 2),
 ('David S. Goyer', 2),
 ('Patrick Lussier', 2),
 ('Scott Cooper', 2),
 ('Marc Lawrence', 2),
 ('Byron Howard', 2),
 ('Ethan Coen', 2),
 ('Joel C

In [62]:
answers['23'] = 'Peter Jackson'

# 24. Какая студия дает самые длинные названия своим фильмам по количеству символов?

# Замечание
На мой взгляд, формулировка вопроса нуждается в уточнении. Разные студии выпускают разное количество фильмов, некоторые выпустили всего один фильм. Или студия выпустила 2 фильма с очень длинным и очень коротким названиями. На основании какой выборки делать вывод о длине названий?
Однозначно можно ответить на вопрос "Какая студия дала..."
Или уточнить "У какой студии средняя/медианная длина названия больше"

In [63]:
# ответ на первый вопрос из замечания
df = data.explode('production_companies')[['production_companies','original_title']]
display(df.groupby('production_companies').original_title.apply(lambda x:x.str.len().max()).sort_values(ascending=False))
del(df)

production_companies
Twentieth Century Fox Film Corporation    83
Four By Two Productions                   83
Walt Disney                               62
Walden Media                              62
21 Laps Entertainment                     59
                                          ..
Everest Entertainment                      3
Berlanti Productions                       3
Global Entertainment Group                 2
XM2 Productions                            2
Ixtlan Productions                         2
Name: original_title, Length: 1771, dtype: int64

In [64]:
# поскольку самое длинное название фильма (со значительным отрывом) встретилось у двух студий, посмотрим, что это за фильм
data[data.original_title.str.len()==data.original_title.str.len().max()]

Unnamed: 0,imdb_id,budget,revenue,original_title,cast,director,tagline,overview,runtime,genres,production_companies,release_date,vote_average,release_year,profit
1448,tt0443453,18000000,261572744,Borat: Cultural Learnings of America for Make ...,"[Sacha Baron Cohen, Ken Davitian, Luenell, Pam...",[Larry Charles],"Come to Kazakhstan, it's nice!",Kazakh journalist Borat Sagdiyev travels to Am...,82,[Comedy],"[Twentieth Century Fox Film Corporation, Four ...",2006-11-02,6.4,2006,243572744


In [65]:
# уточним список студий, выпустивших этот фильм
data[data.imdb_id=='tt0443453'].production_companies.values[0]

['Twentieth Century Fox Film Corporation', 'Four By Two Productions']

In [66]:
# посмотрим, какие еще фильмы выпустила менее популярная студия
data[data.production_companies.apply(lambda x:'Four By Two' in str(x))]

Unnamed: 0,imdb_id,budget,revenue,original_title,cast,director,tagline,overview,runtime,genres,production_companies,release_date,vote_average,release_year,profit
1448,tt0443453,18000000,261572744,Borat: Cultural Learnings of America for Make ...,"[Sacha Baron Cohen, Ken Davitian, Luenell, Pam...",[Larry Charles],"Come to Kazakhstan, it's nice!",Kazakh journalist Borat Sagdiyev travels to Am...,82,[Comedy],"[Twentieth Century Fox Film Corporation, Four ...",2006-11-02,6.4,2006,243572744


In [67]:
# уже ради интереса, посмотрим на название и еще раз убедимся, что его длина 83
data[data.imdb_id=='tt0443453'].original_title.values[0]

'Borat: Cultural Learnings of America for Make Benefit Glorious Nation of Kazakhstan'

In [68]:
len(data[data.imdb_id=='tt0443453'].original_title.values[0])

83

In [69]:
# очевидно, что длина названия единственного фильма от студии Four By Two Productions
# всегда будет больше при сравнении max/mean/median значений, поэтому ответ:
answers['24'] = 'Four By Two Productions'

# 25. Описание фильмов какой студии в среднем самые длинные по количеству слов?

In [70]:
# артикли считаются как слова
df = data.explode('production_companies')[['production_companies','overview']]
df['overview_len'] = df.overview.apply(lambda x:len(re.findall("[a-zA-Z_]+", x)))
display(df.groupby('production_companies')['overview_len'].mean().sort_values(ascending=False))
del(df)

production_companies
Midnight Picture Show                    176.0
98 MPH Productions                       169.0
Room 9 Entertainment                     169.0
Heineken Branded Entertainment           160.0
Brookwell-McNamara Entertainment         159.0
                                         ...  
Phantom Four                              13.0
Henceforth                                13.0
London Boulevard                          13.0
Empire Pictures                           12.0
Motion Picture Corporation of America     11.0
Name: overview_len, Length: 1771, dtype: float64

In [71]:
# посмотрим описания фильмов у разных студий
show_overview('Midnight Pict')

array(["The photographer Leon lives with his girlfriend and waitress Maya waiting for a chance to get in the photo business. When Maya contacts their friend Jurgis, he schedules a meeting for Leon with the successful owner of arts gallery Susan Hoff; she analyzes Leon's work and asks him to improve the quality of his photos. During the night, the upset Leon decides to wander on the streets taking pictures with his camera, and he follows three punks down to the subway station; when the gang attacks a young woman, Leon defends her and the guys move on. On the next morning, Leon discovers that the woman is missing. He goes to the police station, but Detective Lynn Hadley does not give much attention to him and discredits his statement. Leon becomes obsessed to find what happened with the stranger and he watches the subway station. When he sees the elegant butcher Mahogany in the train, Leon believes he might be a murderer and stalks him everywhere, in the beginning of his journey to the d

In [72]:
show_overview('98 MPH')

array(["Jim Morris never made it out of the minor leagues before a shoulder injury ended his pitching career twelve years ago. Now a married-with-children high-school chemistry teacher and baseball coach in Texas, Jim's team makes a deal with him: if they win the district championship, Jim will try out with a major-league organization. The bet proves incentive enough for the team, and they go from worst to first, making it to state for the first time in the history of the school. Jim, forced to live up to his end of the deal, is nearly laughed off the try-out field--until he gets onto the mound, where he confounds the scouts (and himself) by clocking successive 98 mph fastballs, good enough for a minor-league contract with the Tampa Bay Devil Rays. Jim's still got a lot of pitches to throw before he makes it to The Show, but with his big-league dreams revived, there's no telling where he could go."],
      dtype=object)

In [73]:
show_overview('Motion Picture Corporation')

array(['Two straight men mistakenly end up on a "gays only" cruise.'],
      dtype=object)

In [74]:
answers['25'] = 'Midnight Picture Show'

# 26. Какие фильмы входят в 1 процент лучших по рейтингу? 
по vote_average

In [75]:
# Вариант №1
# разобьем интревал значений индексов на 100 подинтервалов
s=data.vote_average.value_counts(bins=100)
# отсортируем и вычислим подинтервал, соответствующий запросу "1 процент лучших по рейтингу (по vote_average)"
s = s.sort_index(ascending=False).index[0]
# выведем список фильмов, с рейтингом в расчитанном интервале
data[data.vote_average.apply(lambda x:x in s)]

Unnamed: 0,imdb_id,budget,revenue,original_title,cast,director,tagline,overview,runtime,genres,production_companies,release_date,vote_average,release_year,profit
599,tt0468569,185000000,1001921825,The Dark Knight,"[Christian Bale, Michael Caine, Heath Ledger, ...",[Christopher Nolan],Why So Serious?,Batman raises the stakes in his war on crime. ...,152,"[Drama, Action, Crime, Thriller]","[DC Comics, Legendary Pictures, Warner Bros., ...",2008-07-16,8.1,2008,816921825


слишком мало победителей :) Поэтому используем план Б

In [76]:
# Вариант № 2
# количество фильмов в 1%
q = int(round(len(data)/100))
# отсеим лишнюю информацию
df = data[['imdb_id','original_title','vote_average']]
# отсортируем фильмы по рейтингу и выведем q первых
display(df.sort_values(by='vote_average',ascending=False).head(q))
del(df)

Unnamed: 0,imdb_id,original_title,vote_average
599,tt0468569,The Dark Knight,8.1
118,tt0816692,Interstellar,8.0
125,tt2084970,The Imitation Game,8.0
9,tt2096673,Inside Out,8.0
34,tt3170832,Room,8.0
1183,tt0993846,The Wolf of Wall Street,7.9
128,tt2267998,Gone Girl,7.9
1191,tt2024544,12 Years a Slave,7.9
119,tt2015381,Guardians of the Galaxy,7.9
1081,tt0167260,The Lord of the Rings: The Return of the King,7.9


In [77]:
answers['26'] = 'Inside Out, The Dark Knight, 12 Years a Slave'

# 27. Какие актеры чаще всего снимаются в одном фильме вместе?


In [78]:
# отсечем лишние данные
df = data[['imdb_id','original_title','cast']].copy()
# добавим столбец с комбинациями пары актеров
df['cast_pairs'] = df.cast.map(lambda x:list(combinations(x,2)))
display(Counter(df.explode('cast_pairs').cast_pairs.astype(str)).most_common())
del(df)

[("('Daniel Radcliffe', 'Rupert Grint')", 8),
 ("('Daniel Radcliffe', 'Emma Watson')", 8),
 ("('Rupert Grint', 'Emma Watson')", 7),
 ("('Ben Stiller', 'Owen Wilson')", 6),
 ("('Johnny Depp', 'Helena Bonham Carter')", 6),
 ("('Adam Sandler', 'Kevin James')", 5),
 ("('Hugh Jackman', 'Ian McKellen')", 5),
 ("('Kristen Stewart', 'Robert Pattinson')", 5),
 ("('Kristen Stewart', 'Taylor Lautner')", 5),
 ("('Robert Pattinson', 'Taylor Lautner')", 5),
 ("('Jennifer Lawrence', 'Josh Hutcherson')", 4),
 ("('Jennifer Lawrence', 'Liam Hemsworth')", 4),
 ("('Jennifer Lawrence', 'Woody Harrelson')", 4),
 ("('Josh Hutcherson', 'Liam Hemsworth')", 4),
 ("('Josh Hutcherson', 'Woody Harrelson')", 4),
 ("('Liam Hemsworth', 'Woody Harrelson')", 4),
 ("('Tobin Bell', 'Costas Mandylor')", 4),
 ("('Tobin Bell', 'Betsy Russell')", 4),
 ("('Costas Mandylor', 'Betsy Russell')", 4),
 ("('Vin Diesel', 'Jordana Brewster')", 4),
 ("('Paul Walker', 'Jordana Brewster')", 4),
 ("('Mike Myers', 'Eddie Murphy')", 4),
 (

In [79]:
answers['27'] = 'Daniel Radcliffe, Rupert Grint'

# Submission

In [80]:
# в конце можно посмотреть свои ответы к каждому вопросу
answers

{'1': 'Pirates of the Caribbean: On Stranger Tides (tt1298650)',
 '2': 'Gods and Generals (tt0279111)',
 '3': 'Winnie the Pooh (tt1449283)',
 '4': 110,
 '5': 107,
 '6': 'Avatar (tt0499549)',
 '7': 'The Lone Ranger (tt1210819)',
 '8': 1478,
 '9': 'The Dark Knight (tt0468569)',
 '10': 'The Lone Ranger (tt1210819)',
 '11': 'Drama',
 '12': 'Drama',
 '13': 'Peter Jackson',
 '14': 'Robert Rodriguez',
 '15': 'Chris Hemsworth',
 '16': 'Matt Damon',
 '17': 'Action',
 '18': 'K-19: The Widowmaker (tt0267626)',
 '19': '2015',
 '20': '2014',
 '21': 'Сентябрь',
 '22': '450',
 '23': 'Peter Jackson',
 '24': 'Four By Two Productions',
 '25': 'Midnight Picture Show',
 '26': 'Inside Out, The Dark Knight, 12 Years a Slave',
 '27': 'Daniel Radcliffe, Rupert Grint'}

In [81]:
# и убедиться что ни чего не пропустил)
len(answers)

27