In [264]:
import pandas as pd

df = pd.read_csv("../Data/imdb_top_1000.csv")

# Ger rid of useless columns

df.drop(columns=["Poster_Link", "Certificate", "Runtime", "Overview", "Meta_score"], inplace=True)
df.dropna(inplace=True)

# Cleanup data
df.drop([966, 1], inplace=True)
df["Genre"] = df["Genre"].str.split(',')
df["Gross"] = df["Gross"].str.replace(',', '')
df["Gross"] = df["Gross"].astype(int)
df.head()

Unnamed: 0,Series_Title,Released_Year,Genre,IMDB_Rating,Director,Star1,Star2,Star3,Star4,No_of_Votes,Gross
0,The Shawshank Redemption,1994,[Drama],9.3,Frank Darabont,Tim Robbins,Morgan Freeman,Bob Gunton,William Sadler,2343110,28341469
2,The Dark Knight,2008,"[Action, Crime, Drama]",9.0,Christopher Nolan,Christian Bale,Heath Ledger,Aaron Eckhart,Michael Caine,2303232,534858444
3,The Godfather: Part II,1974,"[Crime, Drama]",9.0,Francis Ford Coppola,Al Pacino,Robert De Niro,Robert Duvall,Diane Keaton,1129952,57300000
4,12 Angry Men,1957,"[Crime, Drama]",9.0,Sidney Lumet,Henry Fonda,Lee J. Cobb,Martin Balsam,John Fiedler,689845,4360000
5,The Lord of the Rings: The Return of the King,2003,"[Action, Adventure, Drama]",8.9,Peter Jackson,Elijah Wood,Viggo Mortensen,Ian McKellen,Orlando Bloom,1642758,377845905


In [265]:
# Oldest Movie

title = df.loc[df[["Released_Year"]].idxmin()]['Series_Title'].values[0]
print(f'The oldest movie is "{title}".')

The oldest movie is "The Kid".


In [266]:
# Newest Movie

newest_year = df[["Released_Year"]].max()
titles = df.loc[df["Released_Year"].astype(int) == int(newest_year.iloc[0])]['Series_Title'].values.tolist()

print(f"The most recent release year is {int(newest_year.iloc[0])}")
print(f"{len(titles)} entries were released in {int(newest_year.iloc[0])}. \nThey are:")
for t in titles:
    print(t)


The most recent release year is 2019
16 entries were released in 2019. 
They are:
Gisaengchung
Joker
Avengers: Endgame
1917
Chhichhore
Portrait de la jeune fille en feu
Ford v Ferrari
Gully Boy
Knives Out
Marriage Story
Jojo Rabbit
The Irishman
Little Women
Toy Story 4
Once Upon a Time... in Hollywood
The Peanut Butter Falcon


In [267]:
# Top 10 movies by IMDB rating

df.sort_values('IMDB_Rating', ascending=False, inplace=True)
df.head(10)

Unnamed: 0,Series_Title,Released_Year,Genre,IMDB_Rating,Director,Star1,Star2,Star3,Star4,No_of_Votes,Gross
0,The Shawshank Redemption,1994,[Drama],9.3,Frank Darabont,Tim Robbins,Morgan Freeman,Bob Gunton,William Sadler,2343110,28341469
2,The Dark Knight,2008,"[Action, Crime, Drama]",9.0,Christopher Nolan,Christian Bale,Heath Ledger,Aaron Eckhart,Michael Caine,2303232,534858444
3,The Godfather: Part II,1974,"[Crime, Drama]",9.0,Francis Ford Coppola,Al Pacino,Robert De Niro,Robert Duvall,Diane Keaton,1129952,57300000
4,12 Angry Men,1957,"[Crime, Drama]",9.0,Sidney Lumet,Henry Fonda,Lee J. Cobb,Martin Balsam,John Fiedler,689845,4360000
6,Pulp Fiction,1994,"[Crime, Drama]",8.9,Quentin Tarantino,John Travolta,Uma Thurman,Samuel L. Jackson,Bruce Willis,1826188,107928762
7,Schindler's List,1993,"[Biography, Drama, History]",8.9,Steven Spielberg,Liam Neeson,Ralph Fiennes,Ben Kingsley,Caroline Goodall,1213505,96898818
5,The Lord of the Rings: The Return of the King,2003,"[Action, Adventure, Drama]",8.9,Peter Jackson,Elijah Wood,Viggo Mortensen,Ian McKellen,Orlando Bloom,1642758,377845905
8,Inception,2010,"[Action, Adventure, Sci-Fi]",8.8,Christopher Nolan,Leonardo DiCaprio,Joseph Gordon-Levitt,Elliot Page,Ken Watanabe,2067042,292576195
9,Fight Club,1999,[Drama],8.8,David Fincher,Brad Pitt,Edward Norton,Meat Loaf,Zach Grenier,1854740,37030102
10,The Lord of the Rings: The Fellowship of the Ring,2001,"[Action, Adventure, Drama]",8.8,Peter Jackson,Elijah Wood,Ian McKellen,Orlando Bloom,Sean Bean,1661481,315544750


In [268]:
# Top movie for each genre
e_df = df.explode("Genre")
e_df = e_df[["Genre", "Series_Title", "No_of_Votes", "Gross"]]
# setup new df of genres
genres = df["Genre"]
# flatten into list
genres = [elem for elems in genres for elem in elems]
# strip out spaces
genres = [genre.strip(' ') for genre in genres]
# convert to dict then back to list to remove duplicates
genres = list(dict.fromkeys(genres))
top_by_genre = []


for genre in genres:
    new_df = e_df[e_df["Genre"].str.contains(genre)]
    top_movie = new_df.loc[new_df[["No_of_Votes"]].idxmax()]['Series_Title'].values[0]
    top_by_genre.append([genre, top_movie])

for movie in top_by_genre:
    print(f"The top movie in {movie[0]} is: {movie[1]}.")

The top movie in Drama is: The Shawshank Redemption.
The top movie in Action is: The Dark Knight.
The top movie in Crime is: The Dark Knight.
The top movie in Biography is: Schindler's List.
The top movie in History is: Schindler's List.
The top movie in Adventure is: Inception.
The top movie in Sci-Fi is: Inception.
The top movie in Romance is: Forrest Gump.
The top movie in Western is: Django Unchained.
The top movie in Fantasy is: Star Wars.
The top movie in Comedy is: Back to the Future.
The top movie in Thriller is: The Silence of the Lambs.
The top movie in Animation is: WALL·E.
The top movie in Family is: WALL·E.
The top movie in Mystery is: Se7en.
The top movie in War is: Inglourious Basterds.
The top movie in Horror is: The Shining.
The top movie in Music is: The Pianist.
The top movie in Sport is: The Big Lebowski.
The top movie in Film-Noir is: The Third Man.
The top movie in Musical is: Singin' in the Rain.


In [269]:
# Director with the most movies
directors = df["Director"].to_list()
directors = list(dict.fromkeys(directors))
num_rows = 0
most_movies = 0
most_director = ''

for direct in directors:
    new_df = df[df["Director"].str.contains(direct)]
    num_rows = len(new_df.index)
    if num_rows >= most_movies:
        most_movies = num_rows
        most_director = direct
print(f"{most_director} has directed the most movies, a total of {most_movies}.")

Steven Spielberg has directed the most movies, a total of 13.


In [270]:
# Star with the most movies

star_df = df[["Series_Title", "Star1", "Star2", "Star3", "Star4"]]
star_df["Stars"] = star_df[["Star1", "Star2", "Star3", "Star4"]].values.tolist()
star_df.drop(["Star1", "Star2", "Star3", "Star4"], inplace=True, axis=1)
star_df = star_df.explode("Stars")
stars = star_df["Stars"]
stars = list(dict.fromkeys(stars))
num_rows = 0
most_movies = 0
most_star = ''

for star in stars:
    new_df = star_df[star_df["Stars"].str.contains(star)]
    num_rows = len(new_df.index)
    if num_rows >= most_movies:
        most_movies = num_rows
        most_star = star
print(f"{most_star} has starred in the most movies, a total of {most_movies}.")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  star_df["Stars"] = star_df[["Star1", "Star2", "Star3", "Star4"]].values.tolist()
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  star_df.drop(["Star1", "Star2", "Star3", "Star4"], inplace=True, axis=1)


Robert De Niro has starred in the most movies, a total of 17.


In [271]:
# Highest grossing movie for each genre
highest_by_genre = []
top_movie = ''

for genre in genres:
    new_df = e_df[e_df["Genre"].str.contains(genre)]
    top_movie = new_df.loc[new_df[["Gross"]].idxmax()]['Series_Title'].values[0]
    highest_by_genre.append([genre, top_movie])

for movie in highest_by_genre:
    print(f"The highest grossing movie in {movie[0]} is: {movie[1]}.")


The highest grossing movie in Drama is: Avengers: Endgame.
The highest grossing movie in Action is: Star Wars: Episode VII - The Force Awakens.
The highest grossing movie in Crime is: The Dark Knight.
The highest grossing movie in Biography is: The Blind Side.
The highest grossing movie in History is: Gone with the Wind.
The highest grossing movie in Adventure is: Star Wars: Episode VII - The Force Awakens.
The highest grossing movie in Sci-Fi is: Star Wars: Episode VII - The Force Awakens.
The highest grossing movie in Romance is: Titanic.
The highest grossing movie in Western is: Dances with Wolves.
The highest grossing movie in Fantasy is: Avatar.
The highest grossing movie in Comedy is: Toy Story 4.
The highest grossing movie in Thriller is: Joker.
The highest grossing movie in Animation is: Incredibles 2.
The highest grossing movie in Family is: E.T. the Extra-Terrestrial.
The highest grossing movie in Mystery is: The Sixth Sense.
The highest grossing movie in War is: Saving Priva

In [275]:
# Lowest grossing movie for each director

lowest_by_genre = []
top_movie = ''

for genre in genres:
    new_df = e_df[e_df["Genre"].str.contains(genre)]
    top_movie = new_df.loc[new_df[["Gross"]].idxmin()]['Series_Title'].values[0]
    lowest_by_genre.append([genre, top_movie])

for movie in lowest_by_genre:
    print(f"The lowest grossing movie in {movie[0]} is: {movie[1]}.")

The lowest grossing movie in Drama is: Adams æbler.
The lowest grossing movie in Action is: Knockin' on Heaven's Door.
The lowest grossing movie in Crime is: Adams æbler.
The lowest grossing movie in Biography is: La passion de Jeanne d'Arc.
The lowest grossing movie in History is: La passion de Jeanne d'Arc.
The lowest grossing movie in Adventure is: La montaña sagrada.
The lowest grossing movie in Sci-Fi is: La planète sauvage.
The lowest grossing movie in Romance is: Mr. Nobody.
The lowest grossing movie in Western is: Giù la testa.
The lowest grossing movie in Fantasy is: Mr. Nobody.
The lowest grossing movie in Comedy is: Adams æbler.
The lowest grossing movie in Thriller is: Dead Man's Shoes.
The lowest grossing movie in Animation is: Tôkyô goddofâzâzu.
The lowest grossing movie in Family is: Modern Times.
The lowest grossing movie in Mystery is: Salinui chueok.
The lowest grossing movie in War is: La battaglia di Algeri.
The lowest grossing movie in Horror is: Les yeux sans visa

In [279]:
# Save the dataframe as a parquet file

df.to_parquet('../Data/dataframetoparquet.parquet.gzip', compression='gzip')



In [285]:
# Aggregate the gross revenue for all 1000 movies.
df.agg({"Gross":['sum', 'mean', 'median', 'min', 'max']})

Unnamed: 0,Gross
sum,56228070000.0
mean,67826390.0
median,23383990.0
min,1305.0
max,936662200.0
