In [219]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from collections import Counter
import re

In [220]:
data = pd.read_csv('movie_bd_v5.xls')
data.sample(5)

Unnamed: 0,imdb_id,budget,revenue,original_title,cast,director,tagline,overview,runtime,genres,production_companies,release_date,vote_average,release_year
30,tt1823672,49000000,102069268,Chappie,Sharlto Copley|Dev Patel|Ninja|Yolandi Visser|...,Neill Blomkamp,I am consciousness. I am alive. I am Chappie.,Every child comes into the world full of promi...,120,Crime|Action|Science Fiction,Columbia Pictures|Media Rights Capital|Sony Pi...,3/4/2015,6.6,2015
1232,tt1931435,35000000,21819348,The Big Wedding,Robert De Niro|Diane Keaton|Katherine Heigl|Am...,Justin Zackham,It's never too late to start acting like a family,To the amusement of their adult children and f...,90,Comedy,Millenium Films|Two Ton Films,4/25/2013,5.6,2013
1854,tt0158622,83000000,59468275,The Flintstones in Viva Rock Vegas,Mark Addy|Stephen Baldwin|Kristen Johnston|Jan...,Brian Levant,Get ready to rock!,The Flintstones are at it again. The Flintston...,90,Science Fiction|Comedy|Family|Romance,Universal Pictures|Amblin Entertainment,4/28/2000,4.4,2000
655,tt0425061,80000000,230685453,Get Smart,Steve Carell|Anne Hathaway|Dwayne Johnson|Alan...,Peter Segal,Saving The World...And Loving It!,When the identities of secret agents from Cont...,110,Action|Comedy|Thriller,Village Roadshow Pictures|Atlas Entertainment|...,6/19/2008,5.9,2008
1846,tt0204946,28000000,90449929,Bring It On,Kirsten Dunst|Jesse Bradford|Huntley Ritter|El...,Peyton Reed,May the best moves win.,The Toro cheerleading squad from Rancho Carne ...,98,Comedy,Beacon Communications,8/25/2000,5.7,2000


In [221]:
data.describe()
#data.head(16)
#data.info()
#data.head(5)

Unnamed: 0,budget,revenue,runtime,vote_average,release_year
count,1889.0,1889.0,1889.0,1889.0,1889.0
mean,54310830.0,155365300.0,109.658549,6.140762,2007.860773
std,48587210.0,214669800.0,18.017041,0.764763,4.468841
min,5000000.0,2033165.0,63.0,3.3,2000.0
25%,20000000.0,34560580.0,97.0,5.6,2004.0
50%,38000000.0,83615410.0,107.0,6.1,2008.0
75%,72000000.0,178262600.0,120.0,6.6,2012.0
max,380000000.0,2781506000.0,214.0,8.1,2015.0


# Предобработка

In [222]:
'''
Defining a function that accepts a result 
and an output type as arguments and returns
a result in a way that can be written directly
into a dictinary that is used below 
to store answers.

This function is here to avoid copying and
pasting the results all the time.

More comments inside the function itself.
'''

def store_answer(result, out_type):
    # id_and_movie returns an answer 
    # formatted like this (example below):
    # 'Some movie name (tt123456)'
    if out_type == 'id_and_movie':
        return f"{result['original_title'].values[0]} ({result['imdb_id'].values[0]})"
    elif out_type == 'single_value':
        return result
    else:
        return "Undefined output type. Please fix."
    
# dictionary for the answers
answers = {}

# adding a profit column to our dataframe
data['profit'] = data['revenue'] - data['budget']

# creating a separate column where genres are
# stored as lists (not as strings separeted by |)
data['genres_list'] = data['genres'].apply(lambda x: x.split(sep='|'))

# separate column where directors are stored
# as lists (some movies have multiple directors)
data['director_list'] = data['director'].apply(lambda x: x.split(sep='|'))

# separate column where actors are stored
# as lists (movies have multiple actors)
data['cast_list'] = data['cast'].apply(lambda x: x.split(sep='|'))

# separate column where companies are stored
# as lists (movies have multiple production companies)
data['production_companies_list'] = data['production_companies'].apply(lambda x: x.split(sep='|'))

# changing date string to datetime format
data['release_date'] = pd.to_datetime(data['release_date']) 

# adding a separate release_month column
data['release_month'] = pd.DatetimeIndex(data['release_date']).month_name()

# adding a separate column with title length
data['title_length'] = data['original_title'].str.len()

# adding a separate column with title word count
# using regex '\W+' pattern to split the words
data['overview_word_count'] = data['overview'].apply(lambda x: len(re.split('\W+', x)))

# 1. У какого фильма из списка самый большой бюджет?

Использовать варианты ответов в коде решения запрещено.    
Вы думаете и в жизни у вас будут варианты ответов?)

In [223]:
answer_1 = data[data['budget'] == data['budget'].max()]
answers['1'] = store_answer(answer_1, 'id_and_movie')

# 2. Какой из фильмов самый длительный (в минутах)?

In [224]:
answer_2 = data[data['runtime'] == data['runtime'].max()]
answers['2'] = store_answer(answer_2, 'id_and_movie')

# 3. Какой из фильмов самый короткий (в минутах)?





In [225]:
answer_3 = data[data['runtime'] == data['runtime'].min()]
answers['3'] = store_answer(answer_3, 'id_and_movie')

# 4. Какова средняя длительность фильмов?


In [226]:
answer_4 = int(round(data['runtime'].mean(),0))
answers['4'] = store_answer(answer_4, 'single_value')

# 5. Каково медианное значение длительности фильмов? 

In [227]:
answer_5 = int(round(data['runtime'].median(),0))
answers['5'] = store_answer(answer_5, 'single_value')

# 6. Какой самый прибыльный фильм?
#### Внимание! Здесь и далее под «прибылью» или «убытками» понимается разность между сборами и бюджетом фильма. (прибыль = сборы - бюджет) в нашем датасете это будет (profit = revenue - budget) 

In [228]:
answer_6 = data[data['profit'] == data['profit'].max()]
answers['6'] = store_answer(answer_6, 'id_and_movie')

# 7. Какой фильм самый убыточный? 

In [229]:
answer_7 = data[data['profit'] == data['profit'].min()]
answers['7'] = store_answer(answer_7, 'id_and_movie')

# 8. У скольких фильмов из датасета объем сборов оказался выше бюджета?

In [230]:
answer_8 = len(data[data['profit'] > 0])
answers['8'] = store_answer(answer_8, 'single_value')

# 9. Какой фильм оказался самым кассовым в 2008 году?

In [231]:
answer_9 = data[data['release_year'] == 2008].sort_values(by='revenue', ascending=False).head(1)
answers['9'] = store_answer(answer_9, 'id_and_movie')

# 10. Самый убыточный фильм за период с 2012 по 2014 г. (включительно)?


In [232]:
answer_10 = data[(data['release_year'] >= 2012) & (data['release_year'] <= 2014)].sort_values(by='profit').head(1)
answers['10'] = store_answer(answer_10, 'id_and_movie')

# 11. Какого жанра фильмов больше всего?

In [233]:
answer_11 = data['genres_list'].explode().value_counts().index[0]
answers['11'] = store_answer(answer_11, 'single_value')

ВАРИАНТ 2

In [234]:
'''
Alternative approach:

Creating a concatenated string for all the genres
from all the rows of a dataframe (separated by |)
and splitting them into a list. Casting a list 
to Counter and using Counter.most_common() method
to get the first entry from the top

Commenting the code below out because I don't want
it to affect the result (both this cell and 
the previous one produce the same output
'''

#answer_11 = Counter(data['genres'].str.cat(sep='|').split(sep='|')).most_common(1)[0][0]
#answers['11'] = store_answer(answer_11, 'single_value')

"\nAlternative approach:\n\nCreating a concatenated string for all the genres\nfrom all the rows of a dataframe (separated by |)\nand splitting them into a list. Casting a list \nto Counter and using Counter.most_common() method\nto get the first entry from the top\n\nCommenting the code below out because I don't want\nit to affect the result (both this cell and \nthe previous one produce the same output\n"

# 12. Фильмы какого жанра чаще всего становятся прибыльными? 

In [235]:
answer_12 = data[data['profit'] > 0]['genres_list'].explode().value_counts().index[0]
answers['12'] = store_answer(answer_12, 'single_value')

# 13. У какого режиссера самые большие суммарные кассовые сбооры?

In [236]:
answer_13 = data.explode('director_list').groupby(by=['director_list'])['revenue'].sum().sort_values(ascending=False).index[0]
answers['13'] = store_answer(answer_13, 'single_value')

# 14. Какой режисер снял больше всего фильмов в стиле Action?

In [237]:
answer_14 = data[data['genres'].str.lower().str.contains('action')]['director_list'].explode().value_counts().index[0]
answers['14'] = store_answer(answer_14, 'single_value')

# 15. Фильмы с каким актером принесли самые высокие кассовые сборы в 2012 году? 

In [238]:
answer_15 = data[data['release_year'] == 2012].explode('cast_list').groupby(by='cast_list')['revenue'].sum().sort_values(ascending=False).index[0]
answers['15'] = store_answer(answer_15, 'single_value')

# 16. Какой актер снялся в большем количестве высокобюджетных фильмов?

In [239]:
avg_budget = data['budget'].mean()
answer_16 = data[data['budget'] > avg_budget].explode('cast_list')['cast_list'].value_counts().index[0]
answers['16'] = store_answer(answer_16, 'single_value')

# 17. В фильмах какого жанра больше всего снимался Nicolas Cage? 

In [240]:
answer_17 = data[data['cast'].str.contains('Nicolas Cage')].explode('genres_list')['genres_list'].value_counts().index[0]
answers['17'] = store_answer(answer_17, 'single_value')

# 18. Самый убыточный фильм от Paramount Pictures

In [241]:
answer_18 = data[data['production_companies'].str.contains('Paramount Pictures')].sort_values(by='profit', ascending=True)
answers['18'] = store_answer(answer_18, 'id_and_movie')

# 19. Какой год стал самым успешным по суммарным кассовым сборам?

In [242]:
answer_19 = data.groupby(by=['release_year'])['revenue'].sum().sort_values(ascending=False).index[0]
answers['19'] = store_answer(answer_19, 'single_value')

# 20. Какой самый прибыльный год для студии Warner Bros?

In [243]:
answer_20 = data[data['production_companies'].str.contains('Warner Bros')].groupby(by=['release_year'])['profit'].sum().sort_values(ascending=False).index[0]
answers['20'] = store_answer(answer_20, 'single_value')

# 21. В каком месяце за все годы суммарно вышло больше всего фильмов?

In [244]:
answer_21 = data['release_month'].value_counts().index[0]
answers['21'] = store_answer(answer_21, 'single_value')

# 22. Сколько суммарно вышло фильмов летом? (за июнь, июль, август)

In [245]:
answer_22 = len(data[data['release_month'].isin(['June', 'July', 'August'])])
answers['22'] = store_answer(answer_22, 'single_value')

# 23. Для какого режиссера зима – самое продуктивное время года? 

In [246]:
answer_23 = data[data['release_month'].isin(['December', 'January', 'February'])].explode('director_list')['director_list'].value_counts().index[0]
answers['23'] = store_answer(answer_23, 'single_value')

# 24. Какая студия дает самые длинные названия своим фильмам по количеству символов?

In [247]:
answer_24 = data.explode('production_companies_list').groupby(by='production_companies_list')['title_length'].mean().sort_values(ascending=False).index[0]
answers['24'] = store_answer(answer_24, 'single_value')

# 25. Описание фильмов какой студии в среднем самые длинные по количеству слов?

In [248]:
answer_25 = data.explode('production_companies_list').groupby(by='production_companies_list')['overview_word_count'].mean().sort_values(ascending=False).index[0]
answers['25'] = store_answer(answer_25, 'single_value')

# 26. Какие фильмы входят в 1 процент лучших по рейтингу? 
по vote_average

In [249]:
quantile_99 = data['vote_average'].quantile(0.99)
data[data['vote_average'] > quantile_99].sort_values(by='vote_average', ascending=False)

Unnamed: 0,imdb_id,budget,revenue,original_title,cast,director,tagline,overview,runtime,genres,...,vote_average,release_year,profit,genres_list,director_list,cast_list,production_companies_list,release_month,title_length,overview_word_count
599,tt0468569,185000000,1001921825,The Dark Knight,Christian Bale|Michael Caine|Heath Ledger|Aaro...,Christopher Nolan,Why So Serious?,Batman raises the stakes in his war on crime. ...,152,Drama|Action|Crime|Thriller,...,8.1,2008,816921825,"[Drama, Action, Crime, Thriller]",[Christopher Nolan],"[Christian Bale, Michael Caine, Heath Ledger, ...","[DC Comics, Legendary Pictures, Warner Bros., ...",July,15,68
9,tt2096673,175000000,853708609,Inside Out,Amy Poehler|Phyllis Smith|Richard Kind|Bill Ha...,Pete Docter,Meet the little voices inside your head.,"Growing up can be a bumpy road, and it's no ex...",94,Comedy|Animation|Family,...,8.0,2015,678708609,"[Comedy, Animation, Family]",[Pete Docter],"[Amy Poehler, Phyllis Smith, Richard Kind, Bil...","[Walt Disney Pictures, Pixar Animation Studios...",June,10,115
34,tt3170832,6000000,35401758,Room,Brie Larson|Jacob Tremblay|Joan Allen|Sean Bri...,Lenny Abrahamson,Love knows no boundaries,Jack is a young boy of 5 years old who has liv...,117,Drama|Thriller,...,8.0,2015,29401758,"[Drama, Thriller]",[Lenny Abrahamson],"[Brie Larson, Jacob Tremblay, Joan Allen, Sean...","[Element Pictures, No Trace Camping, A24, Dupe...",October,4,50
118,tt0816692,165000000,621752480,Interstellar,Matthew McConaughey|Jessica Chastain|Anne Hath...,Christopher Nolan,Mankind was born on Earth. It was never meant ...,Interstellar chronicles the adventures of a gr...,169,Adventure|Drama|Science Fiction,...,8.0,2014,456752480,"[Adventure, Drama, Science Fiction]",[Christopher Nolan],"[Matthew McConaughey, Jessica Chastain, Anne H...","[Paramount Pictures, Legendary Pictures, Warne...",November,12,36
125,tt2084970,14000000,233555708,The Imitation Game,Benedict Cumberbatch|Keira Knightley|Matthew G...,Morten Tyldum,The true enigma was the man who cracked the code.,Based on the real life story of legendary cryp...,113,History|Drama|Thriller|War,...,8.0,2014,219555708,"[History, Drama, Thriller, War]",[Morten Tyldum],"[Benedict Cumberbatch, Keira Knightley, Matthe...","[Black Bear Pictures, Bristol Automotive]",November,18,51
119,tt2015381,170000000,773312399,Guardians of the Galaxy,Chris Pratt|Zoe Saldana|Dave Bautista|Vin Dies...,James Gunn,All heroes start somewhere.,"Light years from Earth, 26 years after being a...",121,Action|Science Fiction|Adventure,...,7.9,2014,603312399,"[Action, Science Fiction, Adventure]",[James Gunn],"[Chris Pratt, Zoe Saldana, Dave Bautista, Vin ...","[Marvel Studios, Moving Picture Company (MPC),...",July,23,29
128,tt2267998,61000000,369330363,Gone Girl,Ben Affleck|Rosamund Pike|Carrie Coon|Neil Pat...,David Fincher,You don't know what you've got 'til it's...,With his wife's disappearance having become th...,145,Mystery|Thriller|Drama,...,7.9,2014,308330363,"[Mystery, Thriller, Drama]",[David Fincher],"[Ben Affleck, Rosamund Pike, Carrie Coon, Neil...","[Twentieth Century Fox Film Corporation, Regen...",October,9,33
138,tt2278388,30000000,174600318,The Grand Budapest Hotel,Ralph Fiennes|Tony Revolori|F. Murray Abraham|...,Wes Anderson,A perfect holiday without leaving home.,The Grand Budapest Hotel tells of a legendary ...,99,Comedy|Drama,...,7.9,2014,144600318,"[Comedy, Drama]",[Wes Anderson],"[Ralph Fiennes, Tony Revolori, F. Murray Abrah...","[Fox Searchlight Pictures, Scott Rudin Product...",February,24,68
370,tt1375666,160000000,825500000,Inception,Leonardo DiCaprio|Joseph Gordon-Levitt|Ellen P...,Christopher Nolan,Your mind is the scene of the crime.,"Cobb, a skilled thief who commits corporate es...",148,Action|Thriller|Science Fiction|Mystery|Adventure,...,7.9,2010,665500000,"[Action, Thriller, Science Fiction, Mystery, A...",[Christopher Nolan],"[Leonardo DiCaprio, Joseph Gordon-Levitt, Elle...","[Legendary Pictures, Warner Bros., Syncopy]",July,9,47
872,tt0253474,35000000,120072577,The Pianist,Adrien Brody|Thomas Kretschmann|Frank Finlay|M...,Roman Polanski,Music was his passion. Survival was his master...,The Pianist is a film adapted from the biograp...,150,Drama|War,...,7.9,2002,85072577,"[Drama, War]",[Roman Polanski],"[Adrien Brody, Thomas Kretschmann, Frank Finla...","[Bac Films, Canal+Polska, Heritage Films, Stud...",September,11,41


In [250]:
# entering an answer manually
# based on the result from a 
# previous cell
answers['26'] = 'Inside Out, The Dark Knight, 12 Years a Slave'

# 27. Какие актеры чаще всего снимаются в одном фильме вместе?


In [251]:
# two temporary dataframes that 
# will be joined later
df1 = data[['imdb_id','cast_list']].explode('cast_list')
df2 = data[['imdb_id','cast_list']].explode('cast_list')

df3 = df1.merge(df2, left_on='imdb_id', right_on='imdb_id')

# counter will be used for summing later
df3['counter'] = 1

# leaving only the entries where
# cast_list_x != cast_list_y because
# they refer to the same actor
df3[~(df3['cast_list_x'] == df3['cast_list_y'])].groupby(['cast_list_x', 'cast_list_y'])['counter'].sum().sort_values(ascending=False).head(20)


cast_list_x           cast_list_y         
Rupert Grint          Daniel Radcliffe        8
Emma Watson           Rupert Grint            8
Rupert Grint          Emma Watson             8
Daniel Radcliffe      Emma Watson             8
Emma Watson           Daniel Radcliffe        8
Daniel Radcliffe      Rupert Grint            8
Owen Wilson           Ben Stiller             6
Helena Bonham Carter  Johnny Depp             6
Ben Stiller           Owen Wilson             6
Johnny Depp           Helena Bonham Carter    6
Kristen Stewart       Taylor Lautner          5
Robert Pattinson      Kristen Stewart         5
Vin Diesel            Paul Walker             5
Hugh Jackman          Ian McKellen            5
Ian McKellen          Hugh Jackman            5
Robert Pattinson      Taylor Lautner          5
Adam Sandler          Kevin James             5
Kevin James           Adam Sandler            5
Taylor Lautner        Robert Pattinson        5
Kristen Stewart       Robert Pattinson       

In [252]:
# entering an answer manually
# based on the result from a 
# previous cell
answers['27'] = 'Rupert Grint & Daniel Radcliffe'

# Submission

In [253]:
# в конце можно посмотреть свои ответы к каждому вопросу
answers

{'1': 'Pirates of the Caribbean: On Stranger Tides (tt1298650)',
 '2': 'Gods and Generals (tt0279111)',
 '3': 'Winnie the Pooh (tt1449283)',
 '4': 110,
 '5': 107,
 '6': 'Avatar (tt0499549)',
 '7': 'The Lone Ranger (tt1210819)',
 '8': 1478,
 '9': 'The Dark Knight (tt0468569)',
 '10': 'The Lone Ranger (tt1210819)',
 '11': 'Drama',
 '12': 'Drama',
 '13': 'Peter Jackson',
 '14': 'Robert Rodriguez',
 '15': 'Chris Hemsworth',
 '16': 'Matt Damon',
 '17': 'Action',
 '18': 'K-19: The Widowmaker (tt0267626)',
 '19': 2015,
 '20': 2014,
 '21': 'September',
 '22': 450,
 '23': 'Peter Jackson',
 '24': 'Four By Two Productions',
 '25': 'Midnight Picture Show',
 '26': 'Inside Out, The Dark Knight, 12 Years a Slave',
 '27': 'Rupert Grint & Daniel Radcliffe'}

In [254]:
# и убедиться что ни чего не пропустил)
len(answers)

27