## Import the necessary basic libraries

In [1]:
import numpy as np
import pandas as pd
import csv

## Cleaning the Oscar Award dataset

### Obtaining the dataset

In [None]:
oscarData = pd.read_csv('the_oscar_award.csv')
oscarData.head()

###### Check the vital statistics of the dataset using the type and shape attributes:

In [None]:
print("Data type : ", type(oscarData))
print("Data dims : ", oscarData.shape)

###### Check the variables (and their types) in the dataset using the dtypes attribute.

In [None]:
print(oscarData.dtypes)

###### Information about the variables:

In [None]:
oscarData.info()

- We can see that there are missing values for the film category

In [None]:
oscarData['film'].isnull().value_counts()

- There are a total of 304 missing values for the film category

- The categories with missing values for the film category

In [None]:
print(oscarData[oscarData['film'].isnull()]['category'].unique())

In [None]:
oscarData[oscarData['film'].isnull()].head()

- There are a few awards that are not related to the movie itself, hence we should drop these lines

In [None]:
dropRows = oscarData[
    (oscarData['category'] == 'HONORARY AWARD') |
    (oscarData['category'] == 'SPECIAL AWARD') |
    (oscarData['category'] == 'IRVING G. THALBERG MEMORIAL AWARD') |
    (oscarData['category'] == 'JEAN HERSHOLT HUMANITARIAN AWARD') |
    (oscarData['category'] == 'SPECIAL ACHIEVEMENT AWARD')
].index

oscarData = oscarData.drop(dropRows)

In [None]:
oscarData = oscarData.dropna(how='all')

In [None]:
oscarData.head()

- These are the remaining categories with empty film names:

In [None]:
print(oscarData[oscarData['film'].isnull()]['category'].unique())
oscarData[oscarData['film'].isnull()]

- Through observation, we can see that there are 2 categories which have the film name in the 'name' column instead of the 'film' column. These categories are the "SPECIAL FOREIGN LANGUAGE FILM AWARD" and the "HONORARY FOREIGN LANGUAGE FILM AWARD".

In [None]:
for row in oscarData [
    (oscarData['category'] == 'SPECIAL FOREIGN LANGUAGE FILM AWARD') |
    (oscarData['category'] == 'HONORARY FOREIGN LANGUAGE FILM AWARD')
].iterrows():
    
    print(row[1][4])

- We should move the film name to the 'film' column for each of these categories

In [None]:
oscarData['film'] = oscarData.apply(
    lambda x: x['name'].split('-')[0] if x['category'] in (
        'SPECIAL FOREIGN LANGUAGE FILM AWARD', 'HONORARY FOREIGN LANGUAGE FILM AWARD'
    )
    else x['film'],
    axis=1
)

- These are the remaining categories with empty film names:

In [None]:
print(oscarData[oscarData['film'].isnull()]['category'].unique())
oscarData[oscarData['film'].isnull()]

- Upon conducting some research, we found that these 4 remaining categories have since been removed. Hence, we will not be considering the award winners for these categories and will be dropping them from our oscarData dataset.

In [None]:
dropRows = oscarData[
    (
        (oscarData['category'] == 'ENGINEERING EFFECTS') |
        (oscarData['category'] == 'WRITING (Title Writing)') |
        (oscarData['category'] == 'SOUND RECORDING') |
        (oscarData['category'] == 'ASSISTANT DIRECTOR')
    )
].index

oscarData = oscarData.drop(dropRows)

oscarData.info()

In [None]:
oscarData = oscarData.dropna(how = 'all')
oscarData.head()

In [None]:
oscarData.to_csv('clean_oscarAward.csv')

In [None]:
oscarData

In [None]:
oscarData = pd.read_csv('clean_oscarAward.csv')
oscarData

In [None]:
del oscarData["Unnamed: 0"]
oscarData

In [None]:
oscarData['film'].nunique()

### Calculate the number of wins for each film.
(Remove duplicate films as we are only interested in the film and its number of wins at the Oscar Awards.)

###### Create a new column for the number of wins:

In [None]:
oscarData.insert(7, "Number of wins", 0)

###### Calculating and adding the number of wins for each film into the oscarData dataset

In [None]:
title = ""
wins = 0

for i in range(0, 10391):
    print(i)
    try:
        if(oscarData['Number of wins'][i] > 0):
            oscarData = oscarData.drop(i)
            continue
        
        if oscarData['winner'][i] == True:
            wins += 1
            
        title = oscarData['film'][i]
        year = oscarData['year_film'][i]
        
        for j in range(i+1, 10391):
            try:
                if oscarData['film'][j] == title and oscarData['year_film'][j] == year:
                    if oscarData['winner'][j] == True:
                        wins += 1
            except:
                continue
        
        for k in range(0, 10391):
            try:
                if oscarData['film'][k] == title and oscarData['year_film'][k] == year:
                    oscarData.at[k, 'Number of wins'] = wins
            except:
                continue
        
        wins = 0
        
    except:
        continue

In [None]:
oscarData = oscarData.dropna(how = 'all')

###### The new Oscar Award dataset:

In [None]:
oscarData

In [None]:
oscarData.to_csv('clean_oscarAwardWins.csv')

In [None]:
#helper function
oscarData = pd.read_csv('clean_oscarAwardWins.csv')
oscarData

In [None]:
#clearing unneccessary columns
del oscarData["Unnamed: 0"]
oscarData.head()

## Further cleaning the Oscar Award dataset using the IMDB dataset

###### As we are trying to predict if a film can win at the Oscars based on the actors and directors' popularity, film's runtime, film's rating and film's genre, we will first compare the IMDB dataset with our current Oscar Award dataset and remove the films from the Oscar Award dataset that does not appear in our IMDB dataset.

### Obtaining the IMDB dataset

In [None]:
imdbData = pd.read_csv('imdbTop250.csv')
imdbData.head()

### Removing the films from the Oscar Award dataset

In [None]:
yes = 0

for i in range(0,10391):
    print(i)
    try:
        for j in range(0, len(imdbData)):
            if oscarData['film'][i] == imdbData['Title'][j] and oscarData['year_film'][i] == imdbData['Date']:
                yes = 1
        if yes != 1:
            oscarData = oscarData.drop(i)
        yes = 0
    except:
        continue

In [None]:
#clearing unnecessary columns
oscarData = oscarData.dropna(how = 'all')

###### The new Oscar Award dataset:

In [None]:
oscarData.head()

In [None]:
oscarData.to_csv('oscarIMDB.csv')

In [2]:
#helper function
oscarData = pd.read_csv('oscarIMDB.csv')
oscarData.head()

Unnamed: 0.1,Unnamed: 0,year_film,year_ceremony,ceremony,category,name,film,winner,Number of wins
0,5,1927,1928,1,ART DIRECTION,Rochus Gliese,Sunrise,False,3
1,63,1929,1930,3,CINEMATOGRAPHY,(Arthur Edeson),All Quiet on the Western Front,False,2
2,105,1931,1932,5,ACTRESS,Marie Dressler,Emma,False,0
3,144,1932,1933,6,DIRECTING,George Cukor,Little Women,False,1
4,165,1934,1935,7,ACTOR,Clark Gable,It Happened One Night,True,5


In [3]:
#clearing unneccessary columns
del oscarData["Unnamed: 0"]
oscarData.head()

Unnamed: 0,year_film,year_ceremony,ceremony,category,name,film,winner,Number of wins
0,1927,1928,1,ART DIRECTION,Rochus Gliese,Sunrise,False,3
1,1929,1930,3,CINEMATOGRAPHY,(Arthur Edeson),All Quiet on the Western Front,False,2
2,1931,1932,5,ACTRESS,Marie Dressler,Emma,False,0
3,1932,1933,6,DIRECTING,George Cukor,Little Women,False,1
4,1934,1935,7,ACTOR,Clark Gable,It Happened One Night,True,5


## Further cleaning the Oscar Award dataset using the Sales dataset

###### As we are trying to predict if a film can win at the Oscars based on the film's sales, we will first compare the Sales dataset with our current Oscar Award dataset and remove the films from the Oscar Award dataset that does not appear in our Sales dataset.

### Obtaining the Sales dataset

In [4]:
salesData = pd.read_csv('Highest Hollywood Grossing Movies.csv')
salesData.head()

Unnamed: 0.1,Unnamed: 0,Title,Movie Info,Distributor,Release Date,Domestic Sales (in $),International Sales (in $),World Sales (in $),Genre,Movie Runtime,License
0,0,Star Wars: Episode VII - The Force Awakens (2015),"As a new threat to the galaxy rises, Rey, a de...",Walt Disney Studios Motion Pictures,"December 16, 2015",936662225,1132859475,2069521700,"['Action', 'Adventure', 'Sci-Fi']",2 hr 18 min,PG-13
1,1,Avengers: Endgame (2019),After the devastating events of Avengers: Infi...,Walt Disney Studios Motion Pictures,"April 24, 2019",858373000,1939128328,2797501328,"['Action', 'Adventure', 'Drama', 'Sci-Fi']",3 hr 1 min,PG-13
2,2,Avatar (2009),A paraplegic Marine dispatched to the moon Pan...,Twentieth Century Fox,"December 16, 2009",760507625,2086738578,2847246203,"['Action', 'Adventure', 'Fantasy', 'Sci-Fi']",2 hr 42 min,PG-13
3,3,Black Panther (2018),"T'Challa, heir to the hidden but advanced king...",Walt Disney Studios Motion Pictures,,700426566,647171407,1347597973,"['Action', 'Adventure', 'Sci-Fi']",2 hr 14 min,
4,4,Avengers: Infinity War (2018),The Avengers and their allies must be willing ...,Walt Disney Studios Motion Pictures,,678815482,1369544272,2048359754,"['Action', 'Adventure', 'Sci-Fi']",2 hr 29 min,


In [5]:
#clearing unneccessary columns
del salesData["Unnamed: 0"]
salesData.head()

Unnamed: 0,Title,Movie Info,Distributor,Release Date,Domestic Sales (in $),International Sales (in $),World Sales (in $),Genre,Movie Runtime,License
0,Star Wars: Episode VII - The Force Awakens (2015),"As a new threat to the galaxy rises, Rey, a de...",Walt Disney Studios Motion Pictures,"December 16, 2015",936662225,1132859475,2069521700,"['Action', 'Adventure', 'Sci-Fi']",2 hr 18 min,PG-13
1,Avengers: Endgame (2019),After the devastating events of Avengers: Infi...,Walt Disney Studios Motion Pictures,"April 24, 2019",858373000,1939128328,2797501328,"['Action', 'Adventure', 'Drama', 'Sci-Fi']",3 hr 1 min,PG-13
2,Avatar (2009),A paraplegic Marine dispatched to the moon Pan...,Twentieth Century Fox,"December 16, 2009",760507625,2086738578,2847246203,"['Action', 'Adventure', 'Fantasy', 'Sci-Fi']",2 hr 42 min,PG-13
3,Black Panther (2018),"T'Challa, heir to the hidden but advanced king...",Walt Disney Studios Motion Pictures,,700426566,647171407,1347597973,"['Action', 'Adventure', 'Sci-Fi']",2 hr 14 min,
4,Avengers: Infinity War (2018),The Avengers and their allies must be willing ...,Walt Disney Studios Motion Pictures,,678815482,1369544272,2048359754,"['Action', 'Adventure', 'Sci-Fi']",2 hr 29 min,


###### The year of release is found in the 'Title' column. As we need to compare this to the year of release in the Oscar Award dataset to ensure we remove the correct edition of a movie, we should move this to another new column called 'Release Year'

In [6]:
#creating a new column
salesData.insert(4, "Release Year", 0)
salesData.head()

Unnamed: 0,Title,Movie Info,Distributor,Release Date,Release Year,Domestic Sales (in $),International Sales (in $),World Sales (in $),Genre,Movie Runtime,License
0,Star Wars: Episode VII - The Force Awakens (2015),"As a new threat to the galaxy rises, Rey, a de...",Walt Disney Studios Motion Pictures,"December 16, 2015",0,936662225,1132859475,2069521700,"['Action', 'Adventure', 'Sci-Fi']",2 hr 18 min,PG-13
1,Avengers: Endgame (2019),After the devastating events of Avengers: Infi...,Walt Disney Studios Motion Pictures,"April 24, 2019",0,858373000,1939128328,2797501328,"['Action', 'Adventure', 'Drama', 'Sci-Fi']",3 hr 1 min,PG-13
2,Avatar (2009),A paraplegic Marine dispatched to the moon Pan...,Twentieth Century Fox,"December 16, 2009",0,760507625,2086738578,2847246203,"['Action', 'Adventure', 'Fantasy', 'Sci-Fi']",2 hr 42 min,PG-13
3,Black Panther (2018),"T'Challa, heir to the hidden but advanced king...",Walt Disney Studios Motion Pictures,,0,700426566,647171407,1347597973,"['Action', 'Adventure', 'Sci-Fi']",2 hr 14 min,
4,Avengers: Infinity War (2018),The Avengers and their allies must be willing ...,Walt Disney Studios Motion Pictures,,0,678815482,1369544272,2048359754,"['Action', 'Adventure', 'Sci-Fi']",2 hr 29 min,


In [7]:
#Moving the 'Release Year' from the 'Title' to the 'Release Year' column
new = salesData["Title"].str.split("(", n = 1, expand = True)
salesData["Title"]= new[0]
#Remove the brackets
temp = new[1].str.split(")", n = 1, expand = True)
salesData["Release Year"]= temp[0].astype(str).astype('int64')
#Remove the last whitespace
salesData['Title'] = salesData['Title'].str.strip()

###### New salesData dataframe:

In [8]:
salesData.head()

Unnamed: 0,Title,Movie Info,Distributor,Release Date,Release Year,Domestic Sales (in $),International Sales (in $),World Sales (in $),Genre,Movie Runtime,License
0,Star Wars: Episode VII - The Force Awakens,"As a new threat to the galaxy rises, Rey, a de...",Walt Disney Studios Motion Pictures,"December 16, 2015",2015,936662225,1132859475,2069521700,"['Action', 'Adventure', 'Sci-Fi']",2 hr 18 min,PG-13
1,Avengers: Endgame,After the devastating events of Avengers: Infi...,Walt Disney Studios Motion Pictures,"April 24, 2019",2019,858373000,1939128328,2797501328,"['Action', 'Adventure', 'Drama', 'Sci-Fi']",3 hr 1 min,PG-13
2,Avatar,A paraplegic Marine dispatched to the moon Pan...,Twentieth Century Fox,"December 16, 2009",2009,760507625,2086738578,2847246203,"['Action', 'Adventure', 'Fantasy', 'Sci-Fi']",2 hr 42 min,PG-13
3,Black Panther,"T'Challa, heir to the hidden but advanced king...",Walt Disney Studios Motion Pictures,,2018,700426566,647171407,1347597973,"['Action', 'Adventure', 'Sci-Fi']",2 hr 14 min,
4,Avengers: Infinity War,The Avengers and their allies must be willing ...,Walt Disney Studios Motion Pictures,,2018,678815482,1369544272,2048359754,"['Action', 'Adventure', 'Sci-Fi']",2 hr 29 min,


### Removing the films from the Oscar Award dataset

In [9]:
yes = 0

for i in range(0,len(oscarData)):
    print(i)
    
    try:
        for j in range(0, len(salesData)):
            if oscarData['film'][i] == salesData['Title'][j] and oscarData['year_film'][i] == salesData['Release Year'][j]:
                yes = 1
                
        if yes != 1:
            oscarData = oscarData.drop(i)
        
        yes = 0
    
    except:
        continue


0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
27

###### new Oscar Award dataset:

In [10]:
oscarData

Unnamed: 0,year_film,year_ceremony,ceremony,category,name,film,winner,Number of wins
10,1937,1938,10,MUSIC (Scoring),"Walt Disney Studio Music Department, Leigh Har...",Snow White and the Seven Dwarfs,False,0
193,1972,1973,45,ACTOR,Marlon Brando,The Godfather,True,3
204,1973,1974,46,ACTOR IN A SUPPORTING ROLE,Jason Miller,The Exorcist,False,2
227,1975,1976,48,FILM EDITING,Verna Fields,Jaws,True,3
246,1979,1980,52,ACTOR IN A SUPPORTING ROLE,Robert Duvall,Apocalypse Now,False,2
...,...,...,...,...,...,...,...,...
711,2019,2020,92,ACTRESS IN A LEADING ROLE,Saoirse Ronan,Little Women,False,1
714,2019,2020,92,CINEMATOGRAPHY,Roger Deakins,1917,True,3
718,2019,2020,92,FILM EDITING,Michael McCusker and Andrew Buckland,Ford v Ferrari,True,2
722,2019,2020,92,VISUAL EFFECTS,"Dan DeLeeuw, Russell Earl, Matt Aitken and Dan...",Avengers: Endgame,False,0


In [11]:
oscarData.to_csv('oscarIMDBSales')