In [60]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from collections import Counter

In [2]:
# Load the movie dataset into a DataFrame. The file name ends in .xls,
# but it is parsed with read_csv here, i.e. as delimited text.
data = pd.read_csv('movie_bd_v5.xls')
# Show five randomly chosen rows as a quick look at the data
# (no random_state is passed, so the rows differ from run to run).
data.sample(5)

Unnamed: 0,imdb_id,budget,revenue,original_title,cast,director,tagline,overview,runtime,genres,production_companies,release_date,vote_average,release_year
1406,tt0365485,10000000,17297244,The Matador,Pierce Brosnan|Greg Kinnear|Hope Davis|Portia ...,Richard Shepard,A hitman and a salesman walk into a bar...,"The life of Danny Wright, a salesman forever o...",96,Action|Comedy|Crime|Drama|Thriller,Miramax Films,5/12/2005,6.0,2005
672,tt1034331,60000000,73174566,Righteous Kill,Robert De Niro|Carla Gugino|50 Cent|Al Pacino|...,Jon Avnet,Most people respect the badge. Everybody respe...,Two veteran New York City detectives work to i...,101,Action|Adventure|Crime|Drama|Thriller,Grosvenor Park Media Ltd.|InVenture Entertainm...,9/11/2008,5.8,2008
20,tt1964418,190000000,209035668,Tomorrowland,Britt Robertson|George Clooney|Raffey Cassidy|...,Brad Bird,Imagine a world where nothing is impossible.,"Bound by a shared destiny, a bright, optimisti...",130,Action|Family|Science Fiction|Adventure|Mystery,Walt Disney Pictures|Babieka|A113,5/19/2015,6.2,2015
1816,tt0141926,62000000,127666415,U-571,Matthew McConaughey|Bill Paxton|Harvey Keitel|...,Jonathan Mostow,Heroes are ordinary men who do extraordinary t...,"In the midst of World War II, the battle below...",116,Action|Drama|Thriller|War,Universal Pictures|Canal Plus|Dino De Laurenti...,4/20/2000,6.1,2000
215,tt2103254,20000000,100525432,Tammy,Melissa McCarthy|Susan Sarandon|Kathy Bates|Al...,Ben Falcone,She hit the road. The road hit back.,After losing her job and learning that her hus...,97,Comedy,New Line Cinema|Gary Sanchez Productions,7/2/2014,5.1,2014


In [99]:
# Preview the first five rows of the DataFrame.
# NOTE(review): the saved output of this cell already contains columns such
# as profit, genres_list and title_length, which are only created by the
# preprocessing cell further down -- so this cell was last executed after it.
# Other quick inspections, currently disabled:
#data.describe()
#data.head(16)
#data.info()
data.head(5)

Unnamed: 0,imdb_id,budget,revenue,original_title,cast,director,tagline,overview,runtime,genres,...,release_date,vote_average,release_year,profit,genres_list,director_list,cast_list,release_month,production_companies_list,title_length
0,tt0369610,150000000,1513528810,Jurassic World,Chris Pratt|Bryce Dallas Howard|Irrfan Khan|Vi...,Colin Trevorrow,The park is open.,Twenty-two years after the events of Jurassic ...,124,Action|Adventure|Science Fiction|Thriller,...,2015-06-09,6.5,2015,1363528810,"[Action, Adventure, Science Fiction, Thriller]",[Colin Trevorrow],"[Chris Pratt, Bryce Dallas Howard, Irrfan Khan...",June,"[Universal Studios, Amblin Entertainment, Lege...",14
1,tt1392190,150000000,378436354,Mad Max: Fury Road,Tom Hardy|Charlize Theron|Hugh Keays-Byrne|Nic...,George Miller,What a Lovely Day.,An apocalyptic story set in the furthest reach...,120,Action|Adventure|Science Fiction|Thriller,...,2015-05-13,7.1,2015,228436354,"[Action, Adventure, Science Fiction, Thriller]",[George Miller],"[Tom Hardy, Charlize Theron, Hugh Keays-Byrne,...",May,"[Village Roadshow Pictures, Kennedy Miller Pro...",18
2,tt2908446,110000000,295238201,Insurgent,Shailene Woodley|Theo James|Kate Winslet|Ansel...,Robert Schwentke,One Choice Can Destroy You,Beatrice Prior must confront her inner demons ...,119,Adventure|Science Fiction|Thriller,...,2015-03-18,6.3,2015,185238201,"[Adventure, Science Fiction, Thriller]",[Robert Schwentke],"[Shailene Woodley, Theo James, Kate Winslet, A...",March,"[Summit Entertainment, Mandeville Films, Red W...",9
3,tt2488496,200000000,2068178225,Star Wars: The Force Awakens,Harrison Ford|Mark Hamill|Carrie Fisher|Adam D...,J.J. Abrams,Every generation has a story.,Thirty years after defeating the Galactic Empi...,136,Action|Adventure|Science Fiction|Fantasy,...,2015-12-15,7.5,2015,1868178225,"[Action, Adventure, Science Fiction, Fantasy]",[J.J. Abrams],"[Harrison Ford, Mark Hamill, Carrie Fisher, Ad...",December,"[Lucasfilm, Truenorth Productions, Bad Robot]",28
4,tt2820852,190000000,1506249360,Furious 7,Vin Diesel|Paul Walker|Jason Statham|Michelle ...,James Wan,Vengeance Hits Home,Deckard Shaw seeks revenge against Dominic Tor...,137,Action|Crime|Thriller,...,2015-04-01,7.3,2015,1316249360,"[Action, Crime, Thriller]",[James Wan],"[Vin Diesel, Paul Walker, Jason Statham, Miche...",April,"[Universal Pictures, Original Film, Media Righ...",9


# Предобработка

In [98]:
def store_answer(result, out_type):
    """Format a computed result so it can be written into the answers dictionary.

    Exists to avoid copy-pasting the same formatting code into every
    answer cell.

    Parameters
    ----------
    result
        For ``'id_and_movie'``: an object indexable by the column names
        ``'original_title'`` and ``'imdb_id'`` (e.g. a DataFrame); only the
        first row is used. For ``'single_value'``: any value.
    out_type : str
        ``'id_and_movie'`` or ``'single_value'``.

    Returns
    -------
    For ``'id_and_movie'`` a string such as ``'Some movie name (tt123456)'``;
    for ``'single_value'`` the ``result`` itself, unchanged; for any other
    ``out_type`` a fixed message string saying the type is undefined.
    """
    # Plain values are stored as they are.
    if out_type == 'single_value':
        return result

    # Unknown output types yield a message string rather than an exception.
    if out_type != 'id_and_movie':
        return "Undefined output type. Please fix."

    # Title and IMDb id are taken from the first row of the result.
    title = result['original_title'].values[0]
    imdb_id = result['imdb_id'].values[0]
    return f"{title} ({imdb_id})"
    
# Dictionary that collects the answer to each question, keyed by the question
# number as a string ('1', '2', ...). It has to be created here: every answer
# cell below assigns into it, so leaving it undefined makes a fresh
# "Restart Kernel -> Run All" fail with NameError on the first answer cell.
# Note that re-running this cell resets previously stored answers.
answers = {}

# Profit is defined as revenue minus budget.
data['profit'] = data['revenue'] - data['budget']

# genres, director, cast and production_companies hold '|'-separated strings.
# Keep the original columns and add list-valued counterparts that can be
# passed to explode(). A single-character separator is treated literally by
# Series.str.split; missing values stay missing instead of raising.
data['genres_list'] = data['genres'].str.split('|')
data['director_list'] = data['director'].str.split('|')
data['cast_list'] = data['cast'].str.split('|')
data['production_companies_list'] = data['production_companies'].str.split('|')

# Convert the release date strings to datetime values.
data['release_date'] = pd.to_datetime(data['release_date'])

# English month name of the release date (e.g. 'June').
data['release_month'] = data['release_date'].dt.month_name()

# Length of the original title in characters.
data['title_length'] = data['original_title'].str.len()

# 1. У какого фильма из списка самый большой бюджет?

Использовать варианты ответов в коде решения запрещено.    
Вы думаете и в жизни у вас будут варианты ответов?)

In [28]:
# Q1: row(s) whose budget equals the largest budget in the dataset.
answer_1 = data.loc[data['budget'].eq(data['budget'].max())]
answers['1'] = store_answer(answer_1, 'id_and_movie')

# 2. Какой из фильмов самый длительный (в минутах)?

In [29]:
# Q2: row(s) whose runtime equals the longest runtime in the dataset.
answer_2 = data.loc[data['runtime'].eq(data['runtime'].max())]
answers['2'] = store_answer(answer_2, 'id_and_movie')

# 3. Какой из фильмов самый короткий (в минутах)?





In [30]:
# Q3: row(s) whose runtime equals the shortest runtime in the dataset.
answer_3 = data.loc[data['runtime'].eq(data['runtime'].min())]
answers['3'] = store_answer(answer_3, 'id_and_movie')

# 4. Какова средняя длительность фильмов?


In [31]:
# Q4: mean runtime, rounded to zero decimals and stored as an int.
mean_runtime = data['runtime'].mean()
answer_4 = int(round(mean_runtime, 0))
answers['4'] = store_answer(answer_4, 'single_value')

# 5. Каково медианное значение длительности фильмов? 

In [32]:
# Q5: median runtime, rounded to zero decimals and stored as an int.
median_runtime = data['runtime'].median()
answer_5 = int(round(median_runtime, 0))
answers['5'] = store_answer(answer_5, 'single_value')

# 6. Какой самый прибыльный фильм?
#### Внимание! Здесь и далее под «прибылью» или «убытками» понимается разность между сборами и бюджетом фильма. (прибыль = сборы - бюджет) в нашем датасете это будет (profit = revenue - budget) 

In [33]:
# Q6: row(s) whose profit equals the highest profit in the dataset.
answer_6 = data.loc[data['profit'].eq(data['profit'].max())]
answers['6'] = store_answer(answer_6, 'id_and_movie')

# 7. Какой фильм самый убыточный? 

In [34]:
# Q7: row(s) whose profit equals the lowest profit in the dataset.
answer_7 = data.loc[data['profit'].eq(data['profit'].min())]
answers['7'] = store_answer(answer_7, 'id_and_movie')

# 8. У скольких фильмов из датасета объем сборов оказался выше бюджета?

In [35]:
# Q8: number of movies with a strictly positive profit.
# Summing the boolean mask counts the True entries; int() keeps a plain Python int.
answer_8 = int(data['profit'].gt(0).sum())
answers['8'] = store_answer(answer_8, 'single_value')

# 9. Какой фильм оказался самым кассовым в 2008 году?

In [36]:
# Q9: among movies released in 2008, the one with the highest revenue.
answer_9 = (
    data.loc[data['release_year'].eq(2008)]
    .sort_values('revenue', ascending=False)
    .head(1)
)
answers['9'] = store_answer(answer_9, 'id_and_movie')

# 10. Самый убыточный фильм за период с 2012 по 2014 г. (включительно)?


In [37]:
# Q10: lowest-profit movie released from 2012 to 2014, both years included
# (Series.between is inclusive on both ends by default).
answer_10 = (
    data.loc[data['release_year'].between(2012, 2014)]
    .sort_values('profit')
    .head(1)
)
answers['10'] = store_answer(answer_10, 'id_and_movie')

# 11. Какого жанра фильмов больше всего?

In [38]:
# Q11: the genre that occurs most often.
# explode() gives one row per (movie, genre); value_counts() sorts by frequency,
# so the first index label is the most frequent genre.
genre_counts = data['genres_list'].explode().value_counts()
answer_11 = genre_counts.index[0]
answers['11'] = store_answer(answer_11, 'single_value')

ВАРИАНТ 2

In [15]:
# Alternative approach to Q11:
# join the genres strings of all rows into one '|'-separated string,
# split it back into a flat list of genre names, count them with Counter
# and take the top entry returned by Counter.most_common().
all_genres = data['genres'].str.cat(sep='|').split('|')
answer_11 = Counter(all_genres).most_common(1)[0][0]
answers['11'] = store_answer(answer_11, 'single_value')

# 12. Фильмы какого жанра чаще всего становятся прибыльными? 

In [39]:
# Q12: the most frequent genre among movies with a strictly positive profit.
answer_12 = (
    data.loc[data['profit'].gt(0), 'genres_list']
    .explode()
    .value_counts()
    .index[0]
)
answers['12'] = store_answer(answer_12, 'single_value')

# 13. У какого режиссера самые большие суммарные кассовые сборы?

In [40]:
# Q13: director with the largest total revenue.
# Exploding director_list credits a co-directed movie to each of its directors.
answer_13 = (
    data.explode('director_list')
    .groupby('director_list')['revenue']
    .sum()
    .sort_values(ascending=False)
    .index[0]
)
answers['13'] = store_answer(answer_13, 'single_value')

# 14. Какой режиссер снял больше всего фильмов в стиле Action?

In [41]:
# Q14: director who appears most often among movies whose genres string
# contains 'action' (compared in lower case).
answer_14 = (
    data.loc[data['genres'].str.lower().str.contains('action'), 'director_list']
    .explode()
    .value_counts()
    .index[0]
)
answers['14'] = store_answer(answer_14, 'single_value')

# 15. Фильмы с каким актером принесли самые высокие кассовые сборы в 2012 году? 

In [42]:
# Q15: actor whose 2012 movies have the highest summed revenue.
# Each movie's revenue is counted once for every actor in its cast list.
answer_15 = (
    data.loc[data['release_year'].eq(2012)]
    .explode('cast_list')
    .groupby('cast_list')['revenue']
    .sum()
    .sort_values(ascending=False)
    .index[0]
)
answers['15'] = store_answer(answer_15, 'single_value')

# 16. Какой актер снялся в большем количестве высокобюджетных фильмов?

In [43]:
# Q16: actor appearing most often in high-budget movies, where "high budget"
# means a budget strictly above the dataset mean.
avg_budget = data['budget'].mean()
answer_16 = (
    data.loc[data['budget'].gt(avg_budget), 'cast_list']
    .explode()
    .value_counts()
    .index[0]
)
answers['16'] = store_answer(answer_16, 'single_value')

# 17. В фильмах какого жанра больше всего снимался Nicolas Cage? 

In [44]:
# Q17: the most frequent genre among movies whose cast string
# contains 'Nicolas Cage'.
answer_17 = (
    data.loc[data['cast'].str.contains('Nicolas Cage'), 'genres_list']
    .explode()
    .value_counts()
    .index[0]
)
answers['17'] = store_answer(answer_17, 'single_value')

# 18. Самый убыточный фильм от Paramount Pictures

In [46]:
# Q18: lowest-profit movie among those whose production_companies string
# contains 'Paramount Pictures'.
# Sorting by profit in ascending order puts the biggest loss first; head(1)
# keeps only that row, so answer_18 holds the single answer (as in Q9 and Q10)
# instead of the whole sorted frame. The stored answer is unchanged.
answer_18 = (
    data[data['production_companies'].str.contains('Paramount Pictures')]
    .sort_values(by='profit', ascending=True)
    .head(1)
)
answers['18'] = store_answer(answer_18, 'id_and_movie')

# 19. Какой год стал самым успешным по суммарным кассовым сборам?

In [47]:
# Q19: release year with the highest total revenue.
revenue_by_year = data.groupby('release_year')['revenue'].sum()
answer_19 = revenue_by_year.sort_values(ascending=False).index[0]
answers['19'] = store_answer(answer_19, 'single_value')

# 20. Какой самый прибыльный год для студии Warner Bros?

In [48]:
# Q20: release year with the highest total profit among movies whose
# production_companies string contains 'Warner Bros'.
answer_20 = (
    data.loc[data['production_companies'].str.contains('Warner Bros')]
    .groupby('release_year')['profit']
    .sum()
    .sort_values(ascending=False)
    .index[0]
)
answers['20'] = store_answer(answer_20, 'single_value')

# 21. В каком месяце за все годы суммарно вышло больше всего фильмов?

In [74]:
# Q21: the month name with the most releases across all years.
# value_counts() sorts by frequency, so the first label is the busiest month.
releases_per_month = data['release_month'].value_counts()
answer_21 = releases_per_month.index[0]
answers['21'] = store_answer(answer_21, 'single_value')

# 22. Сколько суммарно вышло фильмов летом? (за июнь, июль, август)

In [81]:
# Q22: number of movies released in June, July or August.
# Summing the boolean mask counts the matching rows; int() keeps a plain Python int.
answer_22 = int(data['release_month'].isin(['June', 'July', 'August']).sum())
answers['22'] = store_answer(answer_22, 'single_value')

# 23. Для какого режиссера зима – самое продуктивное время года? 

In [87]:
# Q23: director with the most movies released in December, January or February.
answer_23 = (
    data.loc[data['release_month'].isin(['December', 'January', 'February']), 'director_list']
    .explode()
    .value_counts()
    .index[0]
)
answers['23'] = store_answer(answer_23, 'single_value')

# 24. Какая студия дает самые длинные названия своим фильмам по количеству символов?

In [102]:
# Q24: production company whose movies have the longest titles on average,
# measured in characters. Exploding production_companies_list counts a
# movie once for each company listed on it.
answer_24 = (
    data.explode('production_companies_list')
    .groupby('production_companies_list')['title_length']
    .mean()
    .sort_values(ascending=False)
    .index[0]
)
answers['24'] = store_answer(answer_24, 'single_value')

# 25. Описание фильмов какой студии в среднем самые длинные по количеству слов?

# 26. Какие фильмы входят в 1 процент лучших по рейтингу? 
по vote_average

# 27. Какие актеры чаще всего снимаются в одном фильме вместе?


ВАРИАНТ 2

# Submission

In [103]:
# Display the collected answers for every question.
answers

{'1': 'Pirates of the Caribbean: On Stranger Tides (tt1298650)',
 '2': 'Gods and Generals (tt0279111)',
 '3': 'Winnie the Pooh (tt1449283)',
 '4': 110,
 '5': 107,
 '6': 'Avatar (tt0499549)',
 '7': 'The Lone Ranger (tt1210819)',
 '8': 1478,
 '9': 'The Dark Knight (tt0468569)',
 '10': 'The Lone Ranger (tt1210819)',
 '11': 'Drama',
 '12': 'Drama',
 '13': 'Peter Jackson',
 '14': 'Robert Rodriguez',
 '15': 'Chris Hemsworth',
 '16': 'Matt Damon',
 '17': 'Action',
 '18': 'K-19: The Widowmaker (tt0267626)',
 '19': 2015,
 '20': 2014,
 '21': 'September',
 '22': 450,
 '23': 'Peter Jackson',
 '24': 'Four By Two Productions'}

In [0]:
# Count the stored answers to make sure no question was skipped.
len(answers)