In [93]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [94]:
# Genre slugs; each one names a CSV file under "Original Data/".
names = ['action', 'adventure', 'animation', 'biography', 'crime', 'family', 'fantasy', 'history', 'horror', 'mystery', 'romance', 'scifi', 'sports', 'thriller', 'war']

# Read every per-genre file first and concatenate once. Calling pd.concat
# inside the loop re-copies the accumulated frame on every iteration, and
# seeding it with an empty DataFrame is unnecessary.
genre_frames = [pd.read_csv(f'Original Data/{name}.csv') for name in names]

# ignore_index=True gives the combined frame a fresh 0..n-1 index.
df = pd.concat(genre_frames, ignore_index=True)

In [95]:
# Show the column labels of the combined frame.
df.columns

Index(['movie_id', 'movie_name', 'year', 'certificate', 'runtime', 'genre',
       'rating', 'description', 'director', 'director_id', 'star', 'star_id',
       'votes', 'gross(in $)'],
      dtype='object')

In [96]:
# The raw genre strings are ", "-separated (e.g. "Action, Adventure, Drama"),
# so a plain split on ',' leaves a leading space on every genre after the
# first. Strip each piece so Genre2/Genre3 hold "Adventure", not " Adventure".
genre_split = (
    df['genre']
    .str.split(',', expand=True)
    .apply(lambda col: col.str.strip())
)

# The split must yield exactly three columns for this assignment to succeed;
# rows with fewer genres are padded with missing values by expand=True.
df[['Genre1', 'Genre2', 'Genre3']] = genre_split

# Display the original string next to its split parts as a sanity check.
df[['genre', 'Genre1', 'Genre2', 'Genre3']]

Unnamed: 0,genre,Genre1,Genre2,Genre3
0,"Action, Adventure, Drama",Action,Adventure,Drama
1,"Action, Adventure, Fantasy",Action,Adventure,Fantasy
2,"Action, Thriller",Action,Thriller,
3,"Action, Adventure, Comedy",Action,Adventure,Comedy
4,"Action, Crime, Mystery",Action,Crime,Mystery
...,...,...,...,...
367309,"Drama, History, War",Drama,History,War
367310,War,War,,
367311,War,War,,
367312,"Action, Adventure, Drama",Action,Adventure,Drama


In [97]:
# Tally the missing values in each column of the combined frame.
missing_per_column = df.isna().sum()
print(missing_per_column)

movie_id            0
movie_name          4
year            53248
certificate    263145
runtime        109153
genre               0
rating         137360
description         0
director        27369
director_id     27369
star            58695
star_id         51858
votes          137356
gross(in $)    342313
Genre1              0
Genre2          84109
Genre3         186005
dtype: int64


In [98]:
# Keep only the rows that have a gross, a runtime and a director recorded.
required_columns = ['gross(in $)', 'runtime', 'director']
box_office_df = df.dropna(subset=required_columns)

# Remaining gaps per column, followed by the number of surviving rows.
print(box_office_df.isna().sum(), len(box_office_df))

movie_id          0
movie_name        0
year              0
certificate    1086
runtime           0
genre             0
rating            0
description       0
director          0
director_id       0
star              7
star_id           0
votes             0
gross(in $)       0
Genre1            0
Genre2          249
Genre3         4276
dtype: int64 24987


In [99]:
# Preview the first rows of the filtered frame.
box_office_df.head()

Unnamed: 0,movie_id,movie_name,year,certificate,runtime,genre,rating,description,director,director_id,star,star_id,votes,gross(in $),Genre1,Genre2,Genre3
12,tt1825683,Black Panther,2018,PG-13,134 min,"Action, Adventure, Sci-Fi",7.3,"T'Challa, heir to the hidden but advanced king...",Ryan Coogler,/name/nm3363032/,"Chadwick Boseman, \nMichael B. Jordan, \nLupit...","/name/nm1569276/,/name/nm0430107/,/name/nm2143...",785813.0,700059566.0,Action,Adventure,Sci-Fi
14,tt0092099,Top Gun,1986,PG,109 min,"Action, Drama",6.9,As students at the United States Navy's elite ...,Tony Scott,/name/nm0001716/,"Tom Cruise, \nTim Robbins, \nKelly McGillis, \...","/name/nm0000129/,/name/nm0000209/,/name/nm0000...",461419.0,179800601.0,Action,Drama,
15,tt0499549,Avatar,2009,PG-13,162 min,"Action, Adventure, Fantasy",7.9,A paraplegic Marine dispatched to the moon Pan...,James Cameron,/name/nm0000116/,"Sam Worthington, \nZoe Saldana, \nSigourney We...","/name/nm0941777/,/name/nm0757855/,/name/nm0000...",1322694.0,760507625.0,Action,Adventure,Fantasy
20,tt1392170,The Hunger Games,2012,PG-13,142 min,"Action, Adventure, Sci-Fi",7.2,Katniss Everdeen voluntarily takes her younger...,Gary Ross,/name/nm0002657/,"Jennifer Lawrence, \nJosh Hutcherson, \nLiam H...","/name/nm2225369/,/name/nm1242688/,/name/nm2955...",927499.0,408010692.0,Action,Adventure,Sci-Fi
23,tt1160419,Dune,2021,PG-13,155 min,"Action, Adventure, Drama",8.0,A noble family becomes embroiled in a war for ...,Denis Villeneuve,/name/nm0898288/,"Timothée Chalamet, \nRebecca Ferguson, \nZenda...","/name/nm3154303/,/name/nm0272581/,/name/nm3918...",649342.0,108327830.0,Action,Adventure,Drama


In [100]:
import requests
from bs4 import BeautifulSoup

url = 'https://d23.com/list-of-disney-films/'

# A timeout keeps the cell from hanging on a stalled connection, and
# raise_for_status() turns an HTTP error response into an exception instead
# of silently producing an empty table.
response = requests.get(url, timeout=30)
response.raise_for_status()

soup = BeautifulSoup(response.text, 'html.parser')

movie_titles = []
movie_years = []

# Candidate entries: <span> tags carrying this exact inline font-size style.
spans = soup.find_all('span', style="font-size: 20px;")

for span in spans:
    span_text = span.get_text()
    # Only spans whose text contains ':' are treated as film entries.
    if ':' not in span_text:
        continue

    # The year is taken as the last whitespace-separated word before the first ':'.
    words_before_colon = span_text.split(':')[0].split()
    # The title is read from the <em> tag nested inside the span.
    title_em = span.find('em')
    if not words_before_colon or title_em is None:
        # Nothing precedes the ':' or there is no <em> title: skip the span
        # rather than failing with an IndexError.
        continue

    movie_years.append(words_before_colon[-1])
    # Collapse every whitespace run (including non-breaking spaces, which
    # get_text() yields for &nbsp;) to a single space so titles can be
    # compared by exact string equality later.
    movie_titles.append(' '.join(title_em.get_text().split()))

# Years are kept as strings, exactly as extracted from the page text.
disney_titles = pd.DataFrame({
    'Year': movie_years,
    'Title': movie_titles
})

print(disney_titles)


     Year                                        Title
0    1937              Snow White and the Seven Dwarfs
1    1940                                    Pinocchio
2    1940                                     Fantasia
3    1941                         The Reluctant Dragon
4    1941                                        Dumbo
..    ...                                          ...
760  2022                        Better Nate Than Ever
761  2022                                   Polar Bear
762  2022  Doctor Strange in the Multiverse of Madness
763  2022                                    Pinocchio
764  2023                               Chang Can Dunk

[765 rows x 2 columns]


In [101]:
# Inner-join the box-office rows to the scraped list on an exact
# (title, year) match; rows without a counterpart on either side are dropped.
result_df = box_office_df.merge(
    disney_titles,
    how='inner',
    left_on=['movie_name', 'year'],
    right_on=['Title', 'Year'],
)
result_df.shape[0]

1209

In [102]:
# Missing values per column of the merged frame, then its row count.
merged_missing = result_df.isna().sum()
print(merged_missing, len(result_df))

movie_id        0
movie_name      0
year            0
certificate     6
runtime         0
genre           0
rating          0
description     0
director        0
director_id     0
star            0
star_id         0
votes           0
gross(in $)     0
Genre1          0
Genre2          0
Genre3         84
Year            0
Title           0
dtype: int64 1209


In [103]:
# Discard every row that is an exact copy (all columns equal) of an earlier
# row, keeping the first occurrence.
is_repeat = result_df.duplicated(keep='first')
cleaned_df = result_df[~is_repeat]
cleaned_df.shape[0]

700

In [104]:
# Remove the raw genre string and the two join-helper columns.
# errors='ignore' makes this cell safe to re-run: cleaned_df is reassigned
# from itself, so a second run would otherwise raise KeyError because the
# columns are already gone.
cleaned_df = cleaned_df.drop(columns=['genre', 'Year', 'Title'], errors='ignore')
cleaned_df

Unnamed: 0,movie_id,movie_name,year,certificate,runtime,rating,description,director,director_id,star,star_id,votes,gross(in $),Genre1,Genre2,Genre3
0,tt1825683,Black Panther,2018,PG-13,134 min,7.3,"T'Challa, heir to the hidden but advanced king...",Ryan Coogler,/name/nm3363032/,"Chadwick Boseman, \nMichael B. Jordan, \nLupit...","/name/nm1569276/,/name/nm0430107/,/name/nm2143...",785813.0,700059566.0,Action,Adventure,Sci-Fi
1,tt4154796,Avengers: Endgame,2019,PG-13,181 min,8.4,After the devastating events of Avengers: Infi...,"Anthony Russo, \nJoe Russo",/name/nm0751577/,"Robert Downey Jr., \nChris Evans, \nMark Ruffa...","/name/nm0751648/,/name/nm0000375/,/name/nm0262...",1148100.0,858373000.0,Action,Adventure,Drama
2,tt0478970,Ant-Man,2015,PG-13,117 min,7.3,Armed with a super-suit with the astonishing a...,Peyton Reed,/name/nm0715636/,"Paul Rudd, \nMichael Douglas, \nCorey Stoll, \...","/name/nm0748620/,/name/nm0000140/,/name/nm1015...",675514.0,180202163.0,Action,Comedy,Sci-Fi
3,tt9032400,Eternals,2021,PG-13,156 min,6.3,"The saga of the Eternals, a race of immortal b...",Chloé Zhao,/name/nm2125482/,"Gemma Chan, \nRichard Madden, \nAngelina Jolie...","/name/nm2110418/,/name/nm0534635/,/name/nm0001...",348052.0,164870234.0,Action,Adventure,Fantasy
4,tt4154756,Avengers: Infinity War,2018,PG-13,149 min,8.4,The Avengers and their allies must be willing ...,"Anthony Russo, \nJoe Russo",/name/nm0751577/,"Robert Downey Jr., \nChris Hemsworth, \nMark R...","/name/nm0751648/,/name/nm0000375/,/name/nm1165...",1095301.0,678815482.0,Action,Adventure,Sci-Fi
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1141,tt0140352,The Insider,1999,R,157 min,7.8,A research chemist comes under personal and pr...,Michael Mann,/name/nm0000520/,"Russell Crowe, \nAl Pacino, \nChristopher Plum...","/name/nm0000128/,/name/nm0000199/,/name/nm0001...",173299.0,28965197.0,Biography,Drama,Thriller
1174,tt0128278,Instinct,1999,R,126 min,6.5,"When noted anthropologist Dr. Ethan Powell, wh...",Jon Turteltaub,/name/nm0005509/,"Anthony Hopkins, \nCuba Gooding Jr., \nDonald ...","/name/nm0000164/,/name/nm0000421/,/name/nm0000...",33600.0,34098563.0,Drama,Thriller,
1183,tt0116827,Last Dance,1996,R,103 min,5.7,A lawyer assigned to the clemency case of a wo...,Bruce Beresford,/name/nm0000915/,"Sharon Stone, \nRob Morrow, \nRandy Quaid, \nP...","/name/nm0000232/,/name/nm0001555/,/name/nm0001...",4536.0,5858000.0,Drama,Thriller,
1199,tt0093105,"Good Morning, Vietnam",1987,R,121 min,7.3,"In 1965, an unorthodox and irreverent DJ named...",Barry Levinson,/name/nm0001469/,"Robin Williams, \nForest Whitaker, \nTom. T. T...","/name/nm0000245/,/name/nm0001845/,/name/nm0870...",144339.0,123922370.0,Biography,Comedy,Drama


In [105]:
# Write the cleaned table to disk without the index column, encoded as
# UTF-8 with a byte-order mark ('utf-8-sig').
cleaned_df.to_csv(
    'disney_movies_cleaned.csv',
    index=False,
    encoding='utf-8-sig',
)