# Predicting with Director Rankings and Actor Rankings

## 1. Getting the relevant datasets

In [1]:
#Importing the basic libraries
import numpy as np
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt

### 1.1 Importing the datasets for:
> 1. **Top 50 Directors** 
> 2. **Top 1000 Actors and Actresses**
> 3. **Top 250 Films**
> 4. **The Oscar Award**   

#### Top 50 directors:

In [2]:
# Read the Top 50 directors ranking from disk and preview its first five rows
directorData = pd.read_csv(filepath_or_buffer='Top 50 directors.csv')
directorData.head(n=5)

Unnamed: 0,Rank,Name of Director
0,1,Steven Spielberg
1,2,Martin Scorsese
2,3,Francis Ford Coppola
3,4,Stanley Kubrick
4,5,Alfred Hitchcock


Check the vital statistics of the dataset using the built-in `type()` function and the `shape` attribute.

In [3]:
# Vital statistics: the Python type of the object, then its shape tuple
print("Data type : ", type(directorData))
print("Data dims : ", directorData.shape)

Data type :  <class 'pandas.core.frame.DataFrame'>
Data dims :  (50, 2)


Check the variables (and their types) in the dataset using the dtypes attribute.

In [4]:
# List every column of directorData together with its dtype
print(directorData.dtypes)

Rank                 int64
Name of Director    object
dtype: object


#### Top 1000 actors and actresses:

In [5]:
# Read the Top 1000 actors and actresses list from disk and preview its first five rows
actorData = pd.read_csv(filepath_or_buffer='Top 1000 Actors and Actresses.csv')
actorData.head(n=5)

Unnamed: 0,Position,Const,Created,Modified,Description,Name,Known For,Birth Date
0,1,nm0000134,2014-03-09,2014-03-09,,Robert De Niro,Raging Bull,1943-08-17
1,2,nm0000197,2014-03-09,2015-10-25,,Jack Nicholson,Chinatown,1937-04-22
2,3,nm0000008,2014-03-09,2014-03-09,,Marlon Brando,Apocalypse Now,1924-04-03
3,4,nm0000243,2014-03-09,2014-03-09,,Denzel Washington,Fences,1954-12-28
4,5,nm0000031,2014-03-09,2014-03-09,,Katharine Hepburn,The Lion in Winter,1907-05-12


Check the vital statistics of the dataset using the built-in `type()` function and the `shape` attribute.

In [6]:
# Vital statistics: the Python type of the object, then its shape tuple
print("Data type : ", type(actorData))
print("Data dims : ", actorData.shape)

Data type :  <class 'pandas.core.frame.DataFrame'>
Data dims :  (1000, 8)


Check the variables (and their types) in the dataset using the dtypes attribute.

In [7]:
# List every column of actorData together with its dtype
print(actorData.dtypes)

Position         int64
Const           object
Created         object
Modified        object
Description    float64
Name            object
Known For       object
Birth Date      object
dtype: object


#### Top 250 films:

In [8]:
# Read the IMDb Top 250 film records from disk and preview the first five rows
imdbData = pd.read_csv(filepath_or_buffer='imdbTop250.csv')
imdbData.head(n=5)

Unnamed: 0,Ranking,IMDByear,IMDBlink,Title,Date,RunTime,Genre,Rating,Score,Votes,Gross,Director,Cast1,Cast2,Cast3,Cast4
0,1,1996,/title/tt0076759/,Star Wars: Episode IV - A New Hope,1977,121,"Action, Adventure, Fantasy",8.6,90.0,1299781,322.74,George Lucas,Mark Hamill,Harrison Ford,Carrie Fisher,Alec Guinness
1,2,1996,/title/tt0111161/,The Shawshank Redemption,1994,142,Drama,9.3,80.0,2529673,28.34,Frank Darabont,Tim Robbins,Morgan Freeman,Bob Gunton,William Sadler
2,3,1996,/title/tt0117951/,Trainspotting,1996,93,Drama,8.1,83.0,665213,16.5,Danny Boyle,Ewan McGregor,Ewen Bremner,Jonny Lee Miller,Kevin McKidd
3,4,1996,/title/tt0114814/,The Usual Suspects,1995,106,"Crime, Drama, Mystery",8.5,77.0,1045626,23.34,Bryan Singer,Kevin Spacey,Gabriel Byrne,Chazz Palminteri,Stephen Baldwin
4,5,1996,/title/tt0108598/,The Wrong Trousers,1993,30,"Animation, Short, Comedy",8.3,,53316,,Nick Park,Peter Sallis,Peter Hawkins,,


Check the vital statistics of the dataset using the built-in `type()` function and the `shape` attribute.

In [9]:
# Vital statistics: the Python type of the object, then its shape tuple
print("Data type : ", type(imdbData))
print("Data dims : ", imdbData.shape)

Data type :  <class 'pandas.core.frame.DataFrame'>
Data dims :  (6500, 16)


Check the variables (and their types) in the dataset using the dtypes attribute.

In [10]:
# List every column of imdbData together with its dtype
print(imdbData.dtypes)

Ranking       int64
IMDByear      int64
IMDBlink     object
Title        object
Date          int64
RunTime       int64
Genre        object
Rating      float64
Score       float64
Votes         int64
Gross       float64
Director     object
Cast1        object
Cast2        object
Cast3        object
Cast4        object
dtype: object


#### The Oscar Award:

In [11]:
# Read the cleaned Oscar award records from disk and preview the first five rows
awardData = pd.read_csv(filepath_or_buffer='clean_oscarAward.csv')
awardData.head(n=5)

Unnamed: 0.1,Unnamed: 0,year_film,year_ceremony,ceremony,category,name,film,winner,Number of wins
0,0,1927,1928,1,ACTOR,Richard Barthelmess,The Noose,False,0
1,1,1927,1928,1,ACTOR,Emil Jannings,The Last Command,True,1
2,2,1927,1928,1,ACTRESS,Louise Dresser,A Ship Comes In,False,0
3,3,1927,1928,1,ACTRESS,Janet Gaynor,7th Heaven,True,3
4,4,1927,1928,1,ACTRESS,Gloria Swanson,Sadie Thompson,False,0


Check the vital statistics of the dataset using the built-in `type()` function and the `shape` attribute.

In [12]:
# Vital statistics: the Python type of the object, then its shape tuple
print("Data type : ", type(awardData))
print("Data dims : ", awardData.shape)

Data type :  <class 'pandas.core.frame.DataFrame'>
Data dims :  (4834, 9)


Check the variables (and their types) in the dataset using the dtypes attribute.

In [13]:
# List every column of awardData together with its dtype
print(awardData.dtypes)

Unnamed: 0         int64
year_film          int64
year_ceremony      int64
ceremony           int64
category          object
name              object
film              object
winner              bool
Number of wins     int64
dtype: object


### 1.2 Adding directorData and actorData into imdbData

#### Inserting columns into imdbData:

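The default value of each cast rank is set to 100000, so that cast members who appear in the Top 1000 Actors and Actresses list (positions 1–1000) weigh far more than cast members who are not on the list.

The default value of the sum of rankings is set to 400000, which is the sum of the default values of the four cast ranks.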

In [14]:
# Add the placeholder rank columns, each one directly after the column it
# describes. Positions are resolved from the existing column names rather
# than hard-coded integers, and a column that already exists is skipped so
# the cell can be re-run without DataFrame.insert raising "already exists".
# Defaults: 0 for a director rank, 100000 for each cast rank, and
# 400000 (4 x 100000) for the sum of the four cast ranks.
rank_column_specs = [
    # (column to add, existing column it follows, default value)
    ("Director Rank", "Director", 0),
    ("Cast1 Rank", "Cast1", 100000),
    ("Cast2 Rank", "Cast2", 100000),
    ("Cast3 Rank", "Cast3", 100000),
    ("Cast4 Rank", "Cast4", 100000),
    ("Sum of Rankings", "Cast4 Rank", 400000),
]
for new_column, anchor_column, default_value in rank_column_specs:
    if new_column not in imdbData.columns:
        insert_at = imdbData.columns.get_loc(anchor_column) + 1
        imdbData.insert(insert_at, new_column, default_value)

In [15]:
# Display the frame to confirm the new rank columns and their default values
imdbData

Unnamed: 0,Ranking,IMDByear,IMDBlink,Title,Date,RunTime,Genre,Rating,Score,Votes,...,Director Rank,Cast1,Cast1 Rank,Cast2,Cast2 Rank,Cast3,Cast3 Rank,Cast4,Cast4 Rank,Sum of Rankings
0,1,1996,/title/tt0076759/,Star Wars: Episode IV - A New Hope,1977,121,"Action, Adventure, Fantasy",8.6,90.0,1299781,...,0,Mark Hamill,100000,Harrison Ford,100000,Carrie Fisher,100000,Alec Guinness,100000,400000
1,2,1996,/title/tt0111161/,The Shawshank Redemption,1994,142,Drama,9.3,80.0,2529673,...,0,Tim Robbins,100000,Morgan Freeman,100000,Bob Gunton,100000,William Sadler,100000,400000
2,3,1996,/title/tt0117951/,Trainspotting,1996,93,Drama,8.1,83.0,665213,...,0,Ewan McGregor,100000,Ewen Bremner,100000,Jonny Lee Miller,100000,Kevin McKidd,100000,400000
3,4,1996,/title/tt0114814/,The Usual Suspects,1995,106,"Crime, Drama, Mystery",8.5,77.0,1045626,...,0,Kevin Spacey,100000,Gabriel Byrne,100000,Chazz Palminteri,100000,Stephen Baldwin,100000,400000
4,5,1996,/title/tt0108598/,The Wrong Trousers,1993,30,"Animation, Short, Comedy",8.3,,53316,...,0,Peter Sallis,100000,Peter Hawkins,100000,,100000,,100000,400000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6495,246,2021,/title/tt0058946/,The Battle of Algiers,1966,121,"Drama, War",8.1,96.0,57995,...,0,Brahim Hadjadj,100000,Jean Martin,100000,Yacef Saadi,100000,Samia Kerbash,100000,400000
6496,247,2021,/title/tt0050783/,Nights of Cabiria,1957,110,Drama,8.1,,47318,...,0,Giulietta Masina,100000,François Périer,100000,Franca Marzi,100000,Dorian Gray,100000,400000
6497,248,2021,/title/tt0093779/,The Princess Bride,1987,98,"Adventure, Family, Fantasy",8.1,77.0,416207,...,0,Cary Elwes,100000,Mandy Patinkin,100000,Robin Wright,100000,Chris Sarandon,100000,400000
6498,249,2021,/title/tt7060344/,Raatchasan,2018,170,"Crime, Drama, Mystery",8.4,,37474,...,0,Vishnu Vishal,100000,Amala Paul,100000,Radha Ravi,100000,Sangili Murugan,100000,400000


#### Adding director rankings to the IMDB Dataset using data from the Top 50 Directors dataset:

In [17]:
# Build a name -> rank lookup from the Top 50 directors table.
# Rows without a name are dropped so a missing film director can never match,
# and keep='last' means that if a name were listed twice the later row wins.
director_rank_by_name = (
    directorData.dropna(subset=['Name of Director'])
    .drop_duplicates(subset='Name of Director', keep='last')
    .set_index('Name of Director')['Rank']
)

# One vectorised lookup per film instead of comparing every film against
# every director in nested Python loops. Directors that are not in the list
# get rank 0, stored as an integer (not the string '0') so the column keeps
# a numeric dtype.
imdbData['Director Rank'] = (
    imdbData['Director']
    .map(director_rank_by_name)
    .fillna(0)
    .astype('int64')
)

In [18]:
# Display the frame to inspect the 'Director Rank' column after the lookup
imdbData

Unnamed: 0,Ranking,IMDByear,IMDBlink,Title,Date,RunTime,Genre,Rating,Score,Votes,...,Director Rank,Cast1,Cast1 Rank,Cast2,Cast2 Rank,Cast3,Cast3 Rank,Cast4,Cast4 Rank,Sum of Rankings
0,1,1996,/title/tt0076759/,Star Wars: Episode IV - A New Hope,1977,121,"Action, Adventure, Fantasy",8.6,90.0,1299781,...,0,Mark Hamill,100000,Harrison Ford,100000,Carrie Fisher,100000,Alec Guinness,100000,400000
1,2,1996,/title/tt0111161/,The Shawshank Redemption,1994,142,Drama,9.3,80.0,2529673,...,20,Tim Robbins,100000,Morgan Freeman,100000,Bob Gunton,100000,William Sadler,100000,400000
2,3,1996,/title/tt0117951/,Trainspotting,1996,93,Drama,8.1,83.0,665213,...,36,Ewan McGregor,100000,Ewen Bremner,100000,Jonny Lee Miller,100000,Kevin McKidd,100000,400000
3,4,1996,/title/tt0114814/,The Usual Suspects,1995,106,"Crime, Drama, Mystery",8.5,77.0,1045626,...,43,Kevin Spacey,100000,Gabriel Byrne,100000,Chazz Palminteri,100000,Stephen Baldwin,100000,400000
4,5,1996,/title/tt0108598/,The Wrong Trousers,1993,30,"Animation, Short, Comedy",8.3,,53316,...,0,Peter Sallis,100000,Peter Hawkins,100000,,100000,,100000,400000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6495,246,2021,/title/tt0058946/,The Battle of Algiers,1966,121,"Drama, War",8.1,96.0,57995,...,0,Brahim Hadjadj,100000,Jean Martin,100000,Yacef Saadi,100000,Samia Kerbash,100000,400000
6496,247,2021,/title/tt0050783/,Nights of Cabiria,1957,110,Drama,8.1,,47318,...,0,Giulietta Masina,100000,François Périer,100000,Franca Marzi,100000,Dorian Gray,100000,400000
6497,248,2021,/title/tt0093779/,The Princess Bride,1987,98,"Adventure, Family, Fantasy",8.1,77.0,416207,...,0,Cary Elwes,100000,Mandy Patinkin,100000,Robin Wright,100000,Chris Sarandon,100000,400000
6498,249,2021,/title/tt7060344/,Raatchasan,2018,170,"Crime, Drama, Mystery",8.4,,37474,...,0,Vishnu Vishal,100000,Amala Paul,100000,Radha Ravi,100000,Sangili Murugan,100000,400000


#### Adding Cast1 rankings to the IMDB Dataset using data from the Top 1000 Actors and Actresses dataset:

In [19]:
# Build a name -> position lookup from the actors and actresses table.
# Rows without a name are dropped so a missing cast entry can never match,
# and keep='last' means that if a name were listed twice the later row wins.
actor_position_by_name = (
    actorData.dropna(subset=['Name'])
    .drop_duplicates(subset='Name', keep='last')
    .set_index('Name')['Position']
)

# Strip surrounding whitespace from the cast name, then look it up in one
# vectorised step. Missing names and names that are not in the list get the
# default 100000, stored as an integer (not the string '100000') so the
# column stays numeric and can be summed later.
imdbData['Cast1 Rank'] = (
    imdbData['Cast1']
    .str.strip()
    .map(actor_position_by_name)
    .fillna(100000)
    .astype('int64')
)

In [20]:
# Display the frame to inspect the 'Cast1 Rank' column after the lookup
imdbData

Unnamed: 0,Ranking,IMDByear,IMDBlink,Title,Date,RunTime,Genre,Rating,Score,Votes,...,Director Rank,Cast1,Cast1 Rank,Cast2,Cast2 Rank,Cast3,Cast3 Rank,Cast4,Cast4 Rank,Sum of Rankings
0,1,1996,/title/tt0076759/,Star Wars: Episode IV - A New Hope,1977,121,"Action, Adventure, Fantasy",8.6,90.0,1299781,...,0,Mark Hamill,914,Harrison Ford,100000,Carrie Fisher,100000,Alec Guinness,100000,400000
1,2,1996,/title/tt0111161/,The Shawshank Redemption,1994,142,Drama,9.3,80.0,2529673,...,20,Tim Robbins,159,Morgan Freeman,100000,Bob Gunton,100000,William Sadler,100000,400000
2,3,1996,/title/tt0117951/,Trainspotting,1996,93,Drama,8.1,83.0,665213,...,36,Ewan McGregor,713,Ewen Bremner,100000,Jonny Lee Miller,100000,Kevin McKidd,100000,400000
3,4,1996,/title/tt0114814/,The Usual Suspects,1995,106,"Crime, Drama, Mystery",8.5,77.0,1045626,...,43,Kevin Spacey,618,Gabriel Byrne,100000,Chazz Palminteri,100000,Stephen Baldwin,100000,400000
4,5,1996,/title/tt0108598/,The Wrong Trousers,1993,30,"Animation, Short, Comedy",8.3,,53316,...,0,Peter Sallis,100000,Peter Hawkins,100000,,100000,,100000,400000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6495,246,2021,/title/tt0058946/,The Battle of Algiers,1966,121,"Drama, War",8.1,96.0,57995,...,0,Brahim Hadjadj,100000,Jean Martin,100000,Yacef Saadi,100000,Samia Kerbash,100000,400000
6496,247,2021,/title/tt0050783/,Nights of Cabiria,1957,110,Drama,8.1,,47318,...,0,Giulietta Masina,100000,François Périer,100000,Franca Marzi,100000,Dorian Gray,100000,400000
6497,248,2021,/title/tt0093779/,The Princess Bride,1987,98,"Adventure, Family, Fantasy",8.1,77.0,416207,...,0,Cary Elwes,903,Mandy Patinkin,100000,Robin Wright,100000,Chris Sarandon,100000,400000
6498,249,2021,/title/tt7060344/,Raatchasan,2018,170,"Crime, Drama, Mystery",8.4,,37474,...,0,Vishnu Vishal,100000,Amala Paul,100000,Radha Ravi,100000,Sangili Murugan,100000,400000


#### Adding Cast2 rankings to the IMDB Dataset using data from the Top 1000 Actors and Actresses dataset:

In [21]:
# Build a name -> position lookup from the actors and actresses table.
# Rows without a name are dropped so a missing cast entry can never match,
# and keep='last' means that if a name were listed twice the later row wins.
actor_position_by_name = (
    actorData.dropna(subset=['Name'])
    .drop_duplicates(subset='Name', keep='last')
    .set_index('Name')['Position']
)

# Strip surrounding whitespace from the cast name, then look it up in one
# vectorised step. Missing names and names that are not in the list get the
# default 100000, stored as an integer (not the string '100000') so the
# column stays numeric and can be summed later.
imdbData['Cast2 Rank'] = (
    imdbData['Cast2']
    .str.strip()
    .map(actor_position_by_name)
    .fillna(100000)
    .astype('int64')
)


#### Adding Cast3 rankings to the IMDB Dataset using data from the Top 1000 Actors and Actresses dataset:

In [22]:
# Build a name -> position lookup from the actors and actresses table.
# Rows without a name are dropped so a missing cast entry can never match,
# and keep='last' means that if a name were listed twice the later row wins.
actor_position_by_name = (
    actorData.dropna(subset=['Name'])
    .drop_duplicates(subset='Name', keep='last')
    .set_index('Name')['Position']
)

# Some films have no third cast member (the value is missing); those rows,
# like names that are not in the list, get the default 100000. The default
# is stored as an integer (not the string '100000') so the column stays
# numeric and can be summed later.
imdbData['Cast3 Rank'] = (
    imdbData['Cast3']
    .str.strip()
    .map(actor_position_by_name)
    .fillna(100000)
    .astype('int64')
)

#### Adding Cast4 rankings to the IMDB Dataset using data from the Top 1000 Actors and Actresses dataset:

In [23]:
# Build a name -> position lookup from the actors and actresses table.
# Rows without a name are dropped so a missing cast entry can never match,
# and keep='last' means that if a name were listed twice the later row wins.
actor_position_by_name = (
    actorData.dropna(subset=['Name'])
    .drop_duplicates(subset='Name', keep='last')
    .set_index('Name')['Position']
)

# Some films have no fourth cast member (the value is missing); those rows,
# like names that are not in the list, get the default 100000. The default
# is stored as an integer (not the string '100000') so the column stays
# numeric and can be summed later.
imdbData['Cast4 Rank'] = (
    imdbData['Cast4']
    .str.strip()
    .map(actor_position_by_name)
    .fillna(100000)
    .astype('int64')
)

#### Adding the sum of the four cast rankings (derived from the Top 1000 Actors and Actresses dataset) to the IMDB Dataset:

In [24]:
# Row-wise total of the four cast rank columns, computed in one vectorised
# step. This avoids rebinding the name `sum`, which would shadow Python's
# built-in sum() for every later cell in the kernel session.
cast_rank_columns = ['Cast1 Rank', 'Cast2 Rank', 'Cast3 Rank', 'Cast4 Rank']

# Coerce to integers first so that a rank stored as the text '100000' is
# added numerically instead of breaking the addition.
imdbData['Sum of Rankings'] = (
    imdbData[cast_rank_columns].astype('int64').sum(axis=1)
)

#### New imdbData dataset:

In [25]:
# Display the final frame with the director rank, cast ranks and 'Sum of Rankings'
imdbData

Unnamed: 0,Ranking,IMDByear,IMDBlink,Title,Date,RunTime,Genre,Rating,Score,Votes,...,Director Rank,Cast1,Cast1 Rank,Cast2,Cast2 Rank,Cast3,Cast3 Rank,Cast4,Cast4 Rank,Sum of Rankings
0,1,1996,/title/tt0076759/,Star Wars: Episode IV - A New Hope,1977,121,"Action, Adventure, Fantasy",8.6,90.0,1299781,...,0,Mark Hamill,914,Harrison Ford,58,Carrie Fisher,731,Alec Guinness,165,1868
1,2,1996,/title/tt0111161/,The Shawshank Redemption,1994,142,Drama,9.3,80.0,2529673,...,20,Tim Robbins,159,Morgan Freeman,34,Bob Gunton,100000,William Sadler,100000,200193
2,3,1996,/title/tt0117951/,Trainspotting,1996,93,Drama,8.1,83.0,665213,...,36,Ewan McGregor,713,Ewen Bremner,100000,Jonny Lee Miller,100000,Kevin McKidd,100000,300713
3,4,1996,/title/tt0114814/,The Usual Suspects,1995,106,"Crime, Drama, Mystery",8.5,77.0,1045626,...,43,Kevin Spacey,618,Gabriel Byrne,100000,Chazz Palminteri,771,Stephen Baldwin,100000,201389
4,5,1996,/title/tt0108598/,The Wrong Trousers,1993,30,"Animation, Short, Comedy",8.3,,53316,...,0,Peter Sallis,100000,Peter Hawkins,100000,,100000,,100000,400000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6495,246,2021,/title/tt0058946/,The Battle of Algiers,1966,121,"Drama, War",8.1,96.0,57995,...,0,Brahim Hadjadj,100000,Jean Martin,100000,Yacef Saadi,100000,Samia Kerbash,100000,400000
6496,247,2021,/title/tt0050783/,Nights of Cabiria,1957,110,Drama,8.1,,47318,...,0,Giulietta Masina,100000,François Périer,100000,Franca Marzi,100000,Dorian Gray,100000,400000
6497,248,2021,/title/tt0093779/,The Princess Bride,1987,98,"Adventure, Family, Fantasy",8.1,77.0,416207,...,0,Cary Elwes,903,Mandy Patinkin,100000,Robin Wright,340,Chris Sarandon,100000,201243
6498,249,2021,/title/tt7060344/,Raatchasan,2018,170,"Crime, Drama, Mystery",8.4,,37474,...,0,Vishnu Vishal,100000,Amala Paul,100000,Radha Ravi,100000,Sangili Murugan,100000,400000


In [26]:
# Save the enriched table to disk. With to_csv's default settings the row
# index is written too, as an unnamed first column.
imdbData.to_csv('IMDBtotal.csv')