In [1]:
import pandas as pd

df = pd.read_csv('./data/IMDB-Movie-Data.csv')
df = df.fillna(value=0)

In [2]:
# Question1
print("Q1：Top‐3 movies with the highest ratings in 2016?")

# 先找出2016年的電影再排序
df1 = df.copy()
df1 = df1[df1['Year'] == 2016]
df1.sort_values(by="Rating")
df1[["Title", "Rating"]].head(3)

Q1：Top‐3 movies with the highest ratings in 2016?


Unnamed: 0,Title,Rating
2,Dangal,8.8
4,Kimi no na wa,8.6
15,Koe no katachi,8.4


In [2]:
# Question2
print("Q2: The actor generating the highest average revenue?")

df2 = df.copy()

# 收集所有Actor的名字及收益
all_actors = [],[]
df2["Actors"] = df2["Actors"].str.split("|")

for idx, actors in enumerate(df2["Actors"]):
    for actor in actors:
        all_actors[0].append(actor.strip(" "))
        all_actors[1].append(df2["Revenue (Millions)"][idx])

# 將array轉成dataframe形式並利用groupby求mean
actor_df = pd.DataFrame(all_actors, index=["actor", "revenue"])
actor_df = actor_df.T
actor_df["revenue"] = actor_df["revenue"].astype("float")
actor_df = actor_df.groupby("actor").mean()
max_revenue = actor_df["revenue"].max()
actor_df[actor_df['revenue'] == max_revenue]

Q2: The actor generating the highest average revenue?


Unnamed: 0_level_0,revenue
actor,Unnamed: 1_level_1
Daisy Ridley,936.63
John Boyega,936.63


In [3]:
# Question3
print("Q3: The average rating of Emma Watson’s movies?") 

# 蒐集Emma Watson所有rating
total_rating = []
for idx in range(len(df2)):
    data = df2.loc[idx]
    if " Emma Watson" in data["Actors"] or "Emma Watson" in data["Actors"]:
        total_rating.append(data["Rating"])
                   
rating_mean = pd.Series(data=total_rating, dtype=float).mean()
print("A3：%.2f" % rating_mean)

Q3: The average rating of Emma Watson’s movies?
A3：7.17


In [4]:
# Question 4   
print("Q4: Top-3 directors who collaborate with the most actors?")

#蒐集導演和演員
all_directors=[], []
for idx, actors in enumerate(df2["Actors"]):
    for actor in actors:
        all_directors[0].append(df2["Director"][idx])
        all_directors[1].append(actor.strip(" "))
        
director_df = pd.DataFrame(all_directors, index=["director", "actor"])
director_df.T.groupby("director").size().nlargest(3)

Q4: Top-3 directors who collaborate with the most actors?


director
Ridley Scott          32
David Yates           24
M. Night Shyamalan    24
dtype: int64

In [5]:
# Question 5   
print("Q5: Top-2 actors playing in the most genres of movies?")

# 收集所有Actor的名字及電影種類
all_actors = [],[]
df2["Genre"] = df2["Genre"].str.split("|")

for idx, actors in enumerate(df2["Actors"]):
    for actor in actors:
        for genre in df2["Genre"][idx]:
            all_actors[0].append(actor.strip(" "))
            all_actors[1].append(genre)
            
# 去除重複的演員及演出種類            
genre_df = pd.DataFrame(all_actors, index=["actor", "genre"])
genre_df = genre_df.T.drop_duplicates()            
genre_df.groupby("actor").size().nlargest(2)

Q5: Top-2 actors playing in the most genres of movies?


actor
Brad Pitt    14
Amy Adams    13
dtype: int64

In [6]:
# Question 6   
print("Q6: All actors whose movies lead to the largest maximum gap of years?")

# 收集所有Actor的名字及電影種類
all_actors = [],[]

for idx, actors in enumerate(df2["Actors"]):
    for actor in actors:
        all_actors[0].append(actor.strip(" "))
        all_actors[1].append(df2["Year"][idx])

# 去除重複的演員及演出年        
year_df = pd.DataFrame(all_actors, index=["actor", "year"])
year_df = year_df.T.drop_duplicates().groupby("actor")

# 計算所有actor的gap_year
year_diff_df = year_df.apply(lambda x: x.max() - x.min())
max_year = year_diff_df["year"].max()
year_diff_df = year_diff_df[year_diff_df['year'] == max_year]
print("A6: 最大GAP YEAR為%d年，共有%d個演員！" % (max_year,len(year_diff_df)))
print(year_diff_df.index.values)

Q6: All actors whose movies lead to the largest maximum gap of years?
A6: 最大GAP YEAR為10年，共有53個演員！
['Abbie Cornish' 'Anne Hathaway' 'Audrey Tautou' 'Ben Kingsley'
 'Ben Whishaw' 'Bob Balaban' 'Brad Pitt' 'Bryce Dallas Howard'
 'Chiwetel Ejiofor' 'Christian Bale' 'Christopher Plummer'
 'Denzel Washington' 'Dominic West' 'Dustin Hoffman' 'Edward Norton'
 'Ellen Burstyn' 'Emily Blunt' 'Eva Green' 'Gerard Butler' 'Hugh Jackman'
 'Jack Davenport' 'Jennifer Aniston' 'Jennifer Connelly' 'Jeremy Irons'
 'Jessica Biel' 'Johnny Depp' 'Judi Dench' 'Justin Theroux' 'Kang-ho Song'
 'Kate Bosworth' 'Kevin Spacey' 'Kirsten Dunst' 'Luke Wilson'
 'Marion Cotillard' 'Mark Wahlberg' 'Matt Damon' 'Maya Rudolph'
 'Meryl Streep' 'Michelle Monaghan' 'Morgan Freeman' 'Owen Wilson'
 'Paula Patton' 'Rachel Weisz' 'Russell Crowe' 'Sacha Baron Cohen'
 'Samuel L. Jackson' 'Scarlett Johansson' 'Steve Carell' 'Tom Cruise'
 'Tom Hanks' 'Toni Collette' 'Will Ferrell' 'Will Smith']


In [7]:
# Question 7   
print("Q7: Find all actors who collaborate with Johnny Depp in direct and indirect ways.")

# 收集所有和Johnny Depp有直接關係的Actor名字
all_actors = []

for actors in df2["Actors"]:
    if "Johnny Depp" in actors or " Johnny Depp" in actors:
        for actor in actors:
            all_actors.append(actor.strip(" "))

actor_sr = pd.Series(all_actors).drop_duplicates()

# 找出與Johnny Depp有間接關係的
old_len = 0
new_len = len(actor_sr)
while(new_len != old_len):
    for actors in df2["Actors"]:
        for actor in actor_sr:
            if actor in actors:
                actor_sr = actor_sr.append(pd.Series(actors))
                break
    actor_sr = actor_sr.drop_duplicates()
    old_len = new_len
    new_len = len(actor_sr)

print("A7: 總共有%d個演員和Johnny Depp有關係~~" %len(actor_sr))
print(actor_sr.values)

Q7: Find all actors who collaborate with Johnny Depp in direct and indirect ways.
A7: 總共有1808個演員和Johnny Depp有關係~~
['Johnny Depp' 'Helena Bonham Carter' 'Alan Rickman' ... 'Nia Vardalos'
 ' Michael Constantine' ' Lainie Kazan']
