# Module 11 Challenge
## Deliverable 2: Scrape and Analyze Mars Weather Data

In [1]:
# Import relevant libraries
from splinter import Browser
from bs4 import BeautifulSoup
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

In [2]:
browser = Browser('chrome')

# PoC for scraping a page and converting to DataFrame

### Step 1: Visit the Website

In [3]:
# Visit the website
# https://static.bc-edx.com/data/web/mars_facts/temperature.html
url = "https://www.the-numbers.com/movie/budgets/all"
browser.visit(url)

### Step 2: Scrape the Table

Create a Beautiful Soup object and use it to scrape the data in the HTML table.

Note that this can also be achieved by using the Pandas `read_html` function. However, use Beautiful Soup here to continue sharpening your web scraping skills.

In [4]:
# Scrape the website
html = browser.html
# Create a Beautiful Soup object
soup = BeautifulSoup(html, "html.parser")

In [5]:
# Extract all rows
all_rows=soup.find_all("tr")

### Step 3: Store the Data


In [6]:
# Create empty lists for rows
headers = []
data_rows = []

# Loop through the scraped data to create a list of headers
header_scrape = all_rows[0].find_all("th")
for header in header_scrape:
    headers.append(header.text)

# Loop through the scraped data to create a list of data rows
for row in all_rows[1:]:
    data=[]
    for r in row.find_all("td"):
        data.append(r.text)
    data_rows.append(data)

In [7]:
# Create a Pandas DataFrame by using the list of rows and a list of the column names
df = pd.DataFrame(data_rows, columns=headers)

In [8]:
# Confirm DataFrame was created successfully
df.head()

Unnamed: 0,Unnamed: 1,ReleaseDate,Movie,ProductionBudget,DomesticGross,WorldwideGross
0,1,"Dec 9, 2022",Avatar: The Way of Water,"$460,000,000","$684,075,767","$2,319,591,720"
1,2,"Apr 23, 2019",Avengers: Endgame,"$400,000,000","$858,373,000","$2,788,912,285"
2,3,"May 20, 2011",Pirates of the Caribbean: On Stranger Tides,"$379,000,000","$241,071,802","$1,045,713,802"
3,4,"Apr 22, 2015",Avengers: Age of Ultron,"$365,000,000","$459,005,868","$1,395,316,979"
4,5,"May 17, 2023",Fast X,"$340,000,000","$145,960,660","$714,414,470"


### Step 4: Prepare Data for Analysis


In [9]:
# Examine data type of each column
df.dtypes

                    object
ReleaseDate         object
Movie               object
ProductionBudget    object
DomesticGross       object
WorldwideGross      object
dtype: object

In [10]:
# Change data types for data analysis
df["ReleaseDate"] = pd.to_datetime(df["ReleaseDate"])
# TODO: convert budget to integer (Movie title to string?)
# df = df.astype({"ProductionBudget":"int64"})
df.dtypes

                            object
ReleaseDate         datetime64[ns]
Movie                       object
ProductionBudget            object
DomesticGross               object
WorldwideGross              object
dtype: object

# PoC for iterating through pages

In [11]:
# Create the list of page clicks
import numpy as np

page_clicks=[]
page_numbers = np.arange(101, 401, 100)
for page in page_numbers:
    page_clicks.append(str(page)+"-")
page_clicks

['101-', '201-', '301-']

In [12]:
k = 1001
div = k%1000 # = 500
div2 = k//1000 # = 1
print(str(div2)+","+str(div))

1,1


In [13]:
# Visit the website
url = "https://www.the-numbers.com/movie/budgets/all"
browser.visit(url)

for i in range(len(page_clicks)):
    # Scrape the website
    html = browser.html
    # Create a Beautiful Soup object
    soup = BeautifulSoup(html, "html.parser")
    # Extract all rows
    all_row=soup.find_all("tr")
    print("----------------")
    print("page" + str(i) + ":")
    print("----------------")
    for row in all_row[1:3]:
        data=[]
        for r in row.find_all("td"):
            print(r.text)
    browser.links.find_by_partial_text(page_clicks[i]).click()
    

----------------
page0:
----------------
1
Dec 9, 2022
Avatar: The Way of Water
 $460,000,000
 $684,075,767
 $2,319,591,720
2
Apr 23, 2019
Avengers: Endgame
 $400,000,000
 $858,373,000
 $2,788,912,285
----------------
page1:
----------------
101
Jun 6, 2023
Transformers: Rise of the Beasts
 $195,000,000
 $157,066,392
 $437,669,281
102
Jun 2, 2017
The Mummy
 $195,000,000
 $80,101,125
 $409,953,905
----------------
page2:
----------------
201
Nov 24, 2004
Alexander
 $155,000,000
 $34,297,191
 $167,297,191
202
Jul 14, 2017
War for the Planet of the Apes
 $152,000,000
 $146,880,162
 $489,592,267


## Iterating through the pages to store all the data

In [14]:
url = "https://www.the-numbers.com/movie/budgets/all"
browser.visit(url)

# Create empty lists for rows
headers = []
data_rows = []

# Create the full list of page clicks
all_page_clicks=[]
all_page_numbers = np.arange(101, 6501, 100)
for page in all_page_numbers:
    if page < 1000:
        all_page_clicks.append(str(page)+"-")
    elif page < 10000:
        if page%1000 == 1:
            all_page_clicks.append(str(page//1000)+",001-")
        else:
            all_page_clicks.append(str(page//1000)+","+str(page%1000))

for i in range(len(all_page_numbers)):
    # Scrape the website
    html = browser.html
    # Create a Beautiful Soup object
    soup = BeautifulSoup(html, "html.parser")
    # Extract all rows
    all_rows = soup.find_all("tr")
    
    # Loop through the scraped data to create a list of headers for first iteration only
    if i == 0:
        header_scrape = all_rows[0].find_all("th")
        for header in header_scrape:
            headers.append(header.text)

    # Loop through the scraped data to create a list of data rows
    for row in all_rows[1:]:
        data = []
        for r in row.find_all("td"):
            data.append(r.text)
        data_rows.append(data)
    print("clicking " + all_page_clicks[i])
    browser.links.find_by_partial_text(all_page_clicks[i]).click()

clicking 101-
clicking 201-
clicking 301-
clicking 401-
clicking 501-
clicking 601-
clicking 701-
clicking 801-
clicking 901-
clicking 1,001-
clicking 1,101
clicking 1,201
clicking 1,301
clicking 1,401
clicking 1,501
clicking 1,601
clicking 1,701
clicking 1,801
clicking 1,901
clicking 2,001-
clicking 2,101
clicking 2,201
clicking 2,301
clicking 2,401
clicking 2,501
clicking 2,601
clicking 2,701
clicking 2,801
clicking 2,901
clicking 3,001-
clicking 3,101
clicking 3,201
clicking 3,301
clicking 3,401
clicking 3,501
clicking 3,601
clicking 3,701
clicking 3,801
clicking 3,901
clicking 4,001-
clicking 4,101
clicking 4,201
clicking 4,301
clicking 4,401
clicking 4,501
clicking 4,601
clicking 4,701
clicking 4,801
clicking 4,901
clicking 5,001-
clicking 5,101
clicking 5,201
clicking 5,301
clicking 5,401
clicking 5,501
clicking 5,601
clicking 5,701
clicking 5,801
clicking 5,901
clicking 6,001-
clicking 6,101
clicking 6,201
clicking 6,301
clicking 6,401


In [15]:
# Create a Pandas DataFrame by using the list of rows and a list of the column names
movie_costs_df = pd.DataFrame(data_rows, columns=headers)
movie_costs_df

Unnamed: 0,Unnamed: 1,ReleaseDate,Movie,ProductionBudget,DomesticGross,WorldwideGross
0,1,"Dec 9, 2022",Avatar: The Way of Water,"$460,000,000","$684,075,767","$2,319,591,720"
1,2,"Apr 23, 2019",Avengers: Endgame,"$400,000,000","$858,373,000","$2,788,912,285"
2,3,"May 20, 2011",Pirates of the Caribbean: On Stranger Tides,"$379,000,000","$241,071,802","$1,045,713,802"
3,4,"Apr 22, 2015",Avengers: Age of Ultron,"$365,000,000","$459,005,868","$1,395,316,979"
4,5,"May 17, 2023",Fast X,"$340,000,000","$145,960,660","$714,414,470"
...,...,...,...,...,...,...
6395,6396,"Jan 27, 2017",Emily,"$27,000","$3,547","$3,547"
6396,6397,"Jun 30, 1972",Deep Throat,"$25,000","$45,000,000","$45,000,000"
6397,6398,"Aug 1, 1997",In the Company of Men,"$25,000","$2,883,661","$2,883,661"
6398,6399,"Jan 14, 2000",The Terrorist,"$25,000","$195,043","$195,043"


In [16]:
# Examine data type of each column
movie_costs_df.dtypes

                    object
ReleaseDate         object
Movie               object
ProductionBudget    object
DomesticGross       object
WorldwideGross      object
dtype: object

In [17]:
# Convert $ objests to floats
movie_costs_df[movie_costs_df.columns[3:]] = movie_costs_df[movie_costs_df.columns[3:]].replace('[\$,]', '', regex=True).astype(float)
# Change date to datetime type
movie_costs_df["ReleaseDate"] = pd.to_datetime(movie_costs_df["ReleaseDate"])

movie_costs_df.dtypes

ParserError: Unknown string format: Unknown present at position 277

In [18]:
movie_costs_df.to_csv("movie_costs.csv")