In [1]:
#import packages from different libraries
from io import StringIO

import pandas as pd
import requests
from bs4 import BeautifulSoup

# Wikipedia article whose tables are scraped below; the URL is built from two
# adjacent string literals so the long address stays readable.
wikiurl = (
    "https://en.wikipedia.org/wiki/"
    "List_of_Disney_live-action_adaptations_and_remakes_of_Disney_animated_films"
)

In [2]:
# Download the article and parse it into a BeautifulSoup tree.
# - timeout: without it requests waits indefinitely if the server stalls.
# - raise_for_status(): fail here on an HTTP error (4xx/5xx) instead of
#   parsing an error page and hitting a confusing IndexError further down.
response = requests.get(wikiurl, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.text,'html.parser')

In [3]:
# Find every table with class "wikitable" once, instead of re-running the
# same search for each table.
wikitables = soup.find_all('table',{'class':"wikitable"})

# pd.read_html returns a list of DataFrames. The table's HTML is wrapped in
# StringIO because passing a literal HTML string to read_html is deprecated
# in recent pandas releases; a file-like object works on old and new versions.

#Convert the 1st Wiki table to a list
tb_1 = wikitables[0]
list_1 = pd.read_html(StringIO(str(tb_1)))

#Convert the 2nd Wiki table to a list
tb_2 = wikitables[1]
list_2 = pd.read_html(StringIO(str(tb_2)))

In [4]:
# Take the first parsed table from each read_html result and wrap it in a
# DataFrame: one for the first wikitable, one for the second.
released = pd.DataFrame(data=list_1[0])
upcoming = pd.DataFrame(data=list_2[0])

In [5]:
# Columns that are not needed; the same set is removed from both tables.
unneeded_columns = [
    "Director(s)",
    "Screenwriter(s)",
    "Producer(s)",
    "Co-production with",
    "Ref.",
]
released = released.drop(columns=unneeded_columns)
upcoming = upcoming.drop(columns=unneeded_columns)

In [6]:
# Give both tables the same three column names (assigned by position) so
# they are convenient to work with later (merging, etc.).
released.columns = upcoming.columns = ["title","original_film_title","release_date"]

In [7]:
#Remove bracketed [...] text (and parenthesised (...) text in the original
#film title) from DataFrame released.
# regex=True is passed explicitly: since pandas 2.0 Series.str.replace
# defaults to regex=False, which would treat these patterns as literal text
# and silently remove nothing.
released['title'] = released['title'].str.replace(r"\[.*\]", "", regex=True)
released['original_film_title'] = released['original_film_title'].str.replace(r"\[.*\]", "", regex=True)
released['original_film_title'] = released['original_film_title'].str.replace(r"\(.*\)", "", regex=True)
released['release_date'] = released['release_date'].str.replace(r"\[.*\]", "", regex=True)

In [8]:
#Remove bracketed [...] text (and parenthesised (...) text in the original
#film title) from DataFrame upcoming.
# regex=True is passed explicitly: since pandas 2.0 Series.str.replace
# defaults to regex=False, which would treat these patterns as literal text
# and silently remove nothing.
upcoming['title'] = upcoming['title'].str.replace(r"\[.*\]", "", regex=True)
upcoming['original_film_title'] = upcoming['original_film_title'].str.replace(r"\[.*\]", "", regex=True)
upcoming['original_film_title'] = upcoming['original_film_title'].str.replace(r"\(.*\)", "", regex=True)
upcoming['release_date'] = upcoming['release_date'].str.replace(r"\[.*\]", "", regex=True)

In [9]:
# Display the released-films table to check the result of the cleaning so far.
released

Unnamed: 0,title,original_film_title,release_date
0,Rudyard Kipling's The Jungle Book,The Jungle Book,"December 25, 1994"
1,101 Dalmatians,One Hundred and One Dalmatians,"November 27, 1996"
2,102 Dalmatians,One Hundred and One Dalmatians,"November 22, 2000"
3,Alice in Wonderland,Alice in Wonderland,"March 5, 2010"
4,Maleficent,Sleeping Beauty,"May 30, 2014"
5,Cinderella,Cinderella,"March 13, 2015"
6,The Jungle Book,The Jungle Book,"April 15, 2016"
7,Alice Through the Looking Glass,Alice in Wonderland,"May 27, 2016"
8,Beauty and the Beast,Beauty and the Beast,"March 17, 2017"
9,Christopher Robin,Winnie the Pooh films,"August 3, 2018"


In [10]:
#Replace missing (NaN) release dates with empty strings
# The result is assigned back to the column instead of calling
# fillna(..., inplace=True) on the column selection: that form is chained
# assignment, which warns in recent pandas and does not update the DataFrame
# when Copy-on-Write is enabled.
upcoming['release_date'] = upcoming['release_date'].fillna('')

In [11]:
# Replace every release_date that is exactly "TBA" with None.
upcoming.loc[upcoming["release_date"].eq("TBA"), "release_date"] = None

In [12]:
# Display the upcoming-films table after the "TBA" release dates were replaced.
upcoming

Unnamed: 0,title,original_film_title,release_date
0,Pinocchio,Pinocchio,"September 8, 2022"
1,Peter Pan & Wendy,Peter Pan,2022
2,The Little Mermaid,The Little Mermaid,"May 26, 2023"
3,Snow White,Snow White and the Seven Dwarfs,
4,Untitled The Jungle Book sequel,The Jungle Book,
5,Untitled The Lion King follow-up film,The Lion King,
6,Hercules,Hercules,
7,Hunchback,The Hunchback of Notre Dame,
8,Untitled Aladdin sequel,Aladdin,
9,Bambi,Bambi,


In [13]:
# List the release_date entries of upcoming that are null.
upcoming.loc[upcoming["release_date"].isna(), "release_date"]

3     None
4     None
5     None
6     None
7     None
8     None
9     None
10    None
11    None
12    None
13    None
14    None
15    None
16    None
Name: release_date, dtype: object

In [14]:
#Convert the release_date strings to datetime values
# NOTE(review): upcoming mixes full dates with other forms (e.g. a bare
# year). pandas >= 2.0 infers one format from the first value, so this call
# may raise there - confirm against the pandas version in use.
released['release_date'] = released['release_date'].pipe(pd.to_datetime)
upcoming['release_date'] = upcoming['release_date'].pipe(pd.to_datetime)

In [15]:
# Display the upcoming-films table again to check the datetime conversion.
upcoming

Unnamed: 0,title,original_film_title,release_date
0,Pinocchio,Pinocchio,2022-09-08
1,Peter Pan & Wendy,Peter Pan,2022-01-01
2,The Little Mermaid,The Little Mermaid,2023-05-26
3,Snow White,Snow White and the Seven Dwarfs,NaT
4,Untitled The Jungle Book sequel,The Jungle Book,NaT
5,Untitled The Lion King follow-up film,The Lion King,NaT
6,Hercules,Hercules,NaT
7,Hunchback,The Hunchback of Notre Dame,NaT
8,Untitled Aladdin sequel,Aladdin,NaT
9,Bambi,Bambi,NaT


In [16]:
# Write both tables to CSV files in the current working directory
# (the DataFrame index is written as well, as by default).
released.to_csv(path_or_buf='released.csv')
upcoming.to_csv(path_or_buf='upcoming.csv')