## A Web Scrapper Project

#### Importing Libraries

In [None]:
from bs4 import BeautifulSoup as bs
import requests
import re

#### Defining function to find data 

In [None]:
def get_content_value(row_data):
    if row_data.find("li"):
        return [li.get_text(" ", strip=True).replace("\xa0", " ") for li in row_data.find_all("li")]
    elif row_data.find("br"):
        return [text for text in row_data.stripped_strings]

    else:
        return row_data.get_text(" ", strip=True).replace("\xa0", " ")

def remove_tags(soup):
    for tag in soup.find_all(["sup", "span"]):
        tag.decompose()

def get_info_box(url):
    r = requests.get(url)
    soup = bs(r.content)

    info_box = soup.find(class_="infobox vevent")
    info_rows = info_box.find_all("tr")
    remove_tags(soup)
    movie_info = {}

    for index, row in enumerate(info_rows):
        if index == 0:
            movie_info["Title"] = row.find("th").get_text(" ", strip=True)
        
        elif index == 1:
            try:
                tmp = row.find("td").find("a")
                tmp = str(tmp).split(",")[-1].split(" ")

                for line in tmp:
                    if re.search("src", line):
                        break
                
                tmp = line.split("//")[1].replace("\"", "")
                movie_info["Image"] = "https://" + tmp
            except:
                pass

        else:
            try:
                key = row.find("th").get_text(" ", strip=True)
                value = get_content_value(row.find("td"))
                movie_info[key] = value
            except:
                pass

    return movie_info

#### Getting List all movies

In [None]:
full_url = "https://en.wikipedia.org/wiki/List_of_Walt_Disney_Pictures_films"
r = requests.get(full_url)
soup = bs(r.content)

In [None]:
# Load webpages for each movie in list

movies = soup.select(".wikitable.sortable i a")

all_movies_list = []
for index, movie in enumerate(movies):
    try:
        relative_path = movie["href"]
        title = movie["title"]
        full_path = "https://en.wikipedia.org/" + relative_path
        all_movies_list.append(get_info_box(full_path))
        # print(relative_path)
        # print(title)
        # print()

    except Exception as e:
        print(movie.get_text())
        print(e)
        print(index)
        print()

#### Saving/Reloading the data in JSON format

In [2]:
import json

def save_data(title, data):
    with open(title, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=2, sort_keys=True)

In [3]:
def load_data(title):
    with open(title, encoding="utf-8") as f:
        return json.load(f)

In [None]:
save_data("disney_movies_data_cleaned_v1.json", all_movies_list)

In [None]:
# Deleting old used variable
del all_movies_list

### Cleaning data

#### Converting Running time into integers

In [None]:
movies_data_list = load_data("disney_movies_data_cleaned_v1.json")

In [None]:
# Coverting Running time into integer

def convert_running_time(data):
    if data == None:
        return None

    if type(data) == int:
        return data
    
    if isinstance(data, list):
        data = data[0]

    if "Min" in data or "min" in data:
        tmp = int(data.replace("\u2013", " ").split()[0])
        return tmp

for movie in movies_data_list:
    movie["Running time"] = convert_running_time(movie.get("Running time", None))

In [None]:
save_data("disney_movies_data_cleaned_v2.json", movies_data_list)

In [None]:
# Deleting old Variable

del movies_data_list

#### Converting Budget and Box office into integer value

In [None]:
# Loading cleaned v2 data
all_movies_data = load_data("disney_movies_data_cleaned_v2.json")

In [None]:
import re
amount = r"million|billion|thousand"
number = r"\d+(,\d{3})*\.*\d*"

def convert_to_number(data):
    if isinstance(data, list):
        for entry in data:
            if "$" in str(entry):
                data = entry
    pattern =  rf"\${number}(-|\s-|\sto\s)?\$?({number})?\s?({amount})?"
    try:
        # Specific calls for special character
        if "–" in data:
            pattern = rf"\${number}(–\s|\s–|–|\s–\s?)?\$?{number}\s?({amount})?"
        match = re.search(pattern, data).group()
        match = match.replace(",", "")
    except:
        return "None"

    value = re.search(number, match).group()
    amount_dict = {"thousand": 1000, "million": 1000000, "billion": 1000000000}
    try:
        word = re.search(amount, match).group()
        mul_factor = amount_dict[word]
    except:
        mul_factor = 1

    return int(float(value)*mul_factor)

In [None]:
# To fix the anamoly in 2 dataset

all_movies_data[335]["Budget"].replace("  ", " ")
all_movies_data[412]["Box office"].replace("\n", "")

'$7.7 million'

In [None]:
for entry in all_movies_data:
    try:
        entry["Budget"] = convert_to_number(entry["Budget"])
    except:
        entry["Budget"] = "None"
    try:
        entry["Box office"] = convert_to_number(entry["Box office"])
    except:
        entry["Box office"] = "None"

In [None]:
save_data("disney_movies_data_cleaned_v3.json", all_movies_data)

In [None]:
# Deleting unused varible

del all_movies_data

#### Saving Files in Google Drive

In [1]:
%cd drive/"My Drive"/"Colab Notebooks"/

/content/drive/My Drive/Colab Notebooks


In [None]:
%pwd

'/content/drive/My Drive/Colab Notebooks'

In [None]:
save_data("disney_movies_data_cleaned_v3.json", all_movies_data)

In [None]:
%cd /content

/content


In [None]:
!ls -la

total 1371
-rw------- 1 root root   8417 Mar  1  2020  AutoML_data_preparation_AIA023.ipynb
-rw------- 1 root root 662294 May 12 07:18  Capstone_Project.ipynb
-rw------- 1 root root 151294 May 11 04:11 'Copy of Capstone Project.ipynb'
-rw------- 1 root root  12651 Oct  3 08:06 'Copy of Object detection'
-rw------- 1 root root  38962 Mar 22  2020  crfasrnn_demo.ipynb
-rw------- 1 root root 463474 Oct 23 08:38  disney_movies_data_cleaned_v3.json
-rw------- 1 root root   4559 Jun  8 07:26 'Object Detection.ipynb'
-rw------- 1 root root   1234 May  3 10:19  test.ipynb
-rw------- 1 root root  30834 May 10 05:55 "utf-8''Week 4 Programming Assignment.ipynb"
-rw------- 1 root root  27241 Oct 23 12:45 'Web Scrapper.ipynb'


#### Convert Date Time into datetime objects

In [None]:
# Load data from Drive

all_movies_data = load_data("disney_movies_data_cleaned_v3.json")

In [None]:
from datetime import datetime

def convert_date(date):
    if date == "N/A":
        return None
    
    if isinstance(date, list):
        date = date[0]

    date = date.split("(")[0].strip()
    #print(date)

    formats = ["%B %d, %Y", "%d %B %Y"]
    try:
        return datetime.strptime(date, formats[0]).strftime("%d/%m/%Y")
    except:
        try:
            return datetime.strptime(date, formats[1]).strftime("%d/%m/%Y")
        except:
            return None

In [None]:
for movie in all_movies_data:
    movie["Release date"] = convert_date(movie.get("Release date", "N/A"))

In [None]:
save_data("cleaned_v4.json", all_movies_data)

In [None]:
# Deleting Old variables

del all_movies_data

#### Attaching IMDB/Rotten Tomatoes scores

In [None]:
import os

In [None]:
os.environ['OMDB_API_KEY']="PUT_YOUR_API_KEY_HERE

In [None]:
os.environ['OMDB_API_KEY']

'dbe390ac'

In [None]:
import requests
import urllib

def get_omdb_info(title):
    base_url = "http://www.omdbapi.com/?"
    parameters = {"apikey": os.environ['OMDB_API_KEY'], 't':title}
    para_encoded = urllib.parse.urlencode(parameters)

    full_url = base_url + para_encoded
    return requests.get(full_url).json()

def get_scores_from_list(omdb_ratings):
    res = ["N/A", "N/A"]
    for rating in omdb_ratings:
        if rating["Source"] == "Rotten Tomatoes":
            res[0] = rating["Value"]
        if rating["Source"] == "Metacritic":
            res[1] = rating["Value"]
    return res

In [None]:
get_omdb_info(all_movies_data[213]["Title"])

{'Actors': 'Simon Callow, Richard Dreyfuss, Jane Leeves, Joanna Lumley',
 'Awards': 'Nominated for 1 Oscar. Another 4 wins & 12 nominations.',
 'BoxOffice': 'N/A',
 'Country': 'UK, USA',
 'DVD': 'N/A',
 'Director': 'Henry Selick',
 'Genre': 'Animation, Adventure, Family, Fantasy, Musical',
 'Language': 'English',
 'Metascore': '78',
 'Plot': 'An orphan who lives with his two cruel aunts befriends anthropomorphic bugs who live inside a giant peach, and they embark on a journey to New York City.',
 'Poster': 'https://m.media-amazon.com/images/M/MV5BMTNkNWIwNGUtNTJlOC00NDU3LTk0NWEtNjNjNDM4NzRiNThkXkEyXkFqcGdeQXVyMTQxNzMzNDI@._V1_SX300.jpg',
 'Production': 'Walt Disney Pictures, Skellington Productions',
 'Rated': 'PG',
 'Ratings': [{'Source': 'Internet Movie Database', 'Value': '6.7/10'},
  {'Source': 'Rotten Tomatoes', 'Value': '91%'},
  {'Source': 'Metacritic', 'Value': '78/100'}],
 'Released': '12 Apr 1996',
 'Response': 'True',
 'Runtime': '79 min',
 'Title': 'James and the Giant Peac

In [None]:
for movie in all_movies_data:
    try:
        title = movie["Title"]
        omdb_info = get_omdb_info(title)
        movie["Title"] = omdb_info["Title"]
        movie["Imdb Rating"] = omdb_info.get("imdbRating", "N/A")
        ratings = get_scores_from_list(omdb_info.get("Ratings", "N/A"))
        movie["Rotten Tomatoes"] = ratings[0]
        movie["Metacritics"] = ratings[1]
    except: # When movie details not found
        print(movie["Title"])

Escape from the Dark
The Omega Connection
The Man from Snowy River II
America's Heart and Soul
High School Musical 3: Senior Year
The Book of Masters
Anaganaga O Dheerudu
Tini: The Movie
Jagga Jasoos


In [None]:
# Saving data to disk
save_data("cleaned_v5.json", all_movies_data)

#### Using Pandas for visualisation adn Analysis

In [4]:
# Load data

all_movies_data = load_data("cleaned_v5.json")

In [5]:
import pandas as pd

In [6]:
df = pd.DataFrame(all_movies_data)

In [7]:
df.head()

Unnamed: 0,Box office,Budget,Country,Image,Imdb Rating,Language,Metacritics,Production company,Release date,Rotten Tomatoes,Running time,Title,Based on,Directed by,Distributed by,Music by,Produced by,Starring,Written by,Story by,Cinematography,Narrated by,Edited by,Screenplay by,Audio format,Country of origin,Created by,Distributor,Executive,Genre,No. of episodes,No. of seasons,Original network,Original release,Picture format,Production,Related shows,Production companies,Hepburn,Japanese,Adaptation by,Simplified,Traditional
0,45,,United States,https://upload.wikimedia.org/wikipedia/en/thum...,7.2,English,,Walt Disney Productions,19/05/1937,,41.0,Academy Award Review of Walt Disney Cartoons,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,418000000,1490000.0,United States,https://upload.wikimedia.org/wikipedia/en/thum...,7.6,English,95/100,Walt Disney Productions,21/12/1937,,83.0,Snow White and the Seven Dwarfs,"[Snow White, by The, Brothers Grimm]","[David Hand (supervising), William Cottrell, W...",RKO Radio Pictures,"[Frank Churchill, Paul Smith, Leigh Harline]",Walt Disney,"[Adriana Caselotti, Lucille La Verne, Harry St...","[Ted Sears, Richard Creedon, Otto Englander, D...",,,,,,,,,,,,,,,,,,,,,,,,
2,164000000,2600000.0,United States,https://upload.wikimedia.org/wikipedia/en/thum...,7.4,English,99/100,Walt Disney Productions,07/02/1940,100%,88.0,Pinocchio,"[The Adventures of Pinocchio, by, Carlo Collodi]","[Ben Sharpsteen, Hamilton Luske, Bill Roberts,...",RKO Radio Pictures,"[Leigh Harline, Paul J. Smith]",Walt Disney,"[Cliff Edwards, Dickie Jones, Christian Rub, M...",,"[Ted Sears, Otto Englander, Webb Smith, Willia...",,,,,,,,,,,,,,,,,,,,,,,
3,76400000,2280000.0,United States,https://upload.wikimedia.org/wikipedia/en/thum...,7.7,English,96/100,Walt Disney Productions,13/11/1940,95%,126.0,Fantasia,,"[Samuel Armstrong, James Algar, Bill Roberts, ...","[Walt Disney Productions, RKO Radio Pictures]",See program,"[Walt Disney, Ben Sharpsteen]","[Leopold Stokowski, Deems Taylor]",,"[Joe Grant, Dick Huemer]",James Wong Howe,Deems Taylor,,,,,,,,,,,,,,,,,,,,,
4,960000,600000.0,United States,https://upload.wikimedia.org/wikipedia/en/thum...,6.9,English,,Walt Disney Productions,20/06/1941,67%,74.0,The Reluctant Dragon,,"[Alfred Werker, (live action), Hamilton Luske,...",RKO Radio Pictures,"[Frank Churchill, Larry Morey]",Walt Disney,"[Robert Benchley, Frances Gifford, Buddy Peppe...","[Live-action:, Ted Sears, Al Perkins, Larry Cl...",,Bert Giennon,,Paul Weatherwax,,,,,,,,,,,,,,,,,,,,


In [8]:
# to remove sparsly filled columns

thresh = len(df) * .2
df.dropna(thresh = thresh, axis = 1, inplace = True)

In [9]:
# To fix object type of Running Time

df["Running time"] = df["Running time"].fillna(0).astype(int)

In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 438 entries, 0 to 437
Data columns (total 23 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   Box office          438 non-null    object
 1   Budget              438 non-null    object
 2   Country             432 non-null    object
 3   Image               419 non-null    object
 4   Imdb Rating         429 non-null    object
 5   Language            434 non-null    object
 6   Metacritics         429 non-null    object
 7   Production company  397 non-null    object
 8   Release date        432 non-null    object
 9   Rotten Tomatoes     429 non-null    object
 10  Running time        438 non-null    int64 
 11  Title               438 non-null    object
 12  Based on            215 non-null    object
 13  Directed by         428 non-null    object
 14  Distributed by      433 non-null    object
 15  Music by            426 non-null    object
 16  Produced by         426 no

In [12]:
# to save final data

df.to_json("Disney_movies_data_final.json", orient='records', lines=True)
df.to_csv("Disney_movie_data_final.csv")

### Youtube Link to follow

https://www.youtube.com/watch?v=Ewgy-G9cmbg&t=5300s