## Rotten Tomatoes Web Scraper
### Gresya Angelina Eunike Leman (18220104)
### Tugas Seleksi Calon Asisten Lab Basis Data 

## Importing Necessary Libraries

In [1]:
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
import requests
import time
import json
import os

## Variables, Functions, and Procedures Declaration

In [2]:
# Root of the site; the relative links found while scraping are appended to it.
baseurl: str = "https://www.rottentomatoes.com"
# Accumulates one dict per scraped tv show; later written out with write_json.
series: list = []

def rest(count):
    """Pause after a page has been scraped.

    Always sleeps for 2 seconds. When ``count`` is a multiple of 20, a
    progress message is printed first and an additional 3-second sleep
    precedes the regular one.
    """
    pauses = [2]
    if count % 20 == 0:
        print(f"Done scraping the contents of {count} tv shows.")
        pauses.insert(0, 3)     # the longer pause runs before the regular one
    for seconds in pauses:
        time.sleep(seconds)

def dateformatter(date):
    """Convert a date such as ``'Sep 17, 2021'`` to ISO 8601 (``'2021-09-17'``).

    Args:
        date: Text of the form ``'<Mon> <day>, <year>'`` where ``<Mon>`` is a
            three-letter English month abbreviation.

    Returns:
        The date as ``'<year>-<month>-<day>'`` with month and day zero-padded
        to two digits.

    Raises:
        IndexError: If ``date`` has fewer than three whitespace-separated parts.
        ValueError: If the month abbreviation is not recognised.
    """
    months = {
        'Jan' : '01',
        'Feb' : '02',
        'Mar' : '03',
        'Apr' : '04',
        'May' : '05',
        'Jun' : '06',
        'Jul' : '07',
        'Aug' : '08',
        'Sep' : '09',
        'Oct' : '10',
        'Nov' : '11',
        'Dec' : '12',
    }
    # split() without an argument tolerates repeated spaces between the parts
    parts = date.replace(',', '').split()
    # plain indexing keeps the IndexError that callers treat as "bad data"
    month_name, day, year = parts[0], parts[1], parts[2]

    month = months.get(month_name)
    if month is None:
        raise ValueError(f"unrecognised month {month_name!r} in date {date!r}")

    # ISO 8601 needs a two-digit day, so single-digit days are left-padded
    return '-'.join((year, month, day.zfill(2)))

def formattext(string):
    """Return the ``text`` attribute of ``string`` without surrounding whitespace.

    ``string`` may be any object exposing ``text``; an object without that
    attribute (for example ``None``) raises AttributeError.
    """
    raw_text = string.text
    return raw_text.strip()

def ratingformatter(rating):
    """Turn a percentage element (text such as ``'87%'``) into an integer.

    The element's text is cleaned with formattext, every ``%`` is removed, and
    the remainder is parsed with ``int`` (ValueError if it is not a number).
    """
    percent_text = formattext(rating)
    digits = percent_text.replace('%', '')
    return int(digits)

def removepar(string):
    """Return ``string`` with every ``(`` and ``)`` character removed."""
    parentheses = str.maketrans('', '', '()')   # translation table that deletes both characters
    return string.translate(parentheses)

def write_json(datas, path = None, file_name = 'TvShows.json'):
    """Write the collected data to a UTF-8 encoded, indented JSON file.

    Args:
        datas: JSON-serialisable object to export.
        path: Directory to write into. When omitted, the original
            machine-specific project folder is used; pass a directory
            explicitly when running anywhere else.
        file_name: Name of the file created inside ``path``.

    Raises:
        OSError: If the directory cannot be created or the file cannot be written.
    """
    if path is None:
        # raw string: the backslashes are literal path separators, not escapes
        path = r'D:\seleksi asisten basdat\Seleksi-2022-Tugas-1\Data Scraping\data'

    # create the target folder up front so the export does not fail at the very end
    os.makedirs(path, exist_ok = True)
    file_path = os.path.join(path, file_name)
    with open(file_path, 'w', encoding = 'utf-8') as file:
        # ensure_ascii = False keeps non-ASCII characters readable in the file
        json.dump(datas, file, ensure_ascii = False, indent = 4)
    print("Done exporting json file.")

## Grabbing HTML of the website

In [3]:
# Location of the chromedriver executable. A raw string keeps the Windows
# backslashes literal instead of relying on unrecognised escape sequences.
path = r'C:\Program Files (x86)\Google\Chrome\Application\chromedriver.exe'
service = Service(path)

# Open the browser window. webdriver.Chrome starts the chromedriver service
# itself and stops it again in driver.quit(), so no chromedriver process is
# left running once the notebook is done.
driver = webdriver.Chrome(service=service)

# navigate the driver to the url containing the first page
url = baseurl + '/browse/tv_series_browse/sort:popular?page=1'
driver.get(url)

# absolute XPath of the "load more" button on the listing page
loadmore_xpath = '/html/body/div[3]/main/div[1]/div/div[5]/button'

# click the button lastpage - 1 times to load the listing up to the last desired page
lastpage = 50
for i in range(lastpage-1):
    # Look the button up again on every pass: a reference taken before the
    # first click may no longer be valid if the page re-renders the button.
    buttons = driver.find_elements(By.XPATH, loadmore_xpath)
    if not buttons:
        print(f"Load more button not found after {i} clicks; stopping early.")
        break
    loadmore = buttons[0]
    loadmore.click()
    time.sleep(4)   # rest after each click so the new results can load

# grab the html of the fully expanded page
html = driver.page_source

## Scraping Data

In [4]:
# parse the listing page captured by the browser
soup = BeautifulSoup(html, 'lxml')

# find all of the links to the tv shows; anchors without an href cannot be
# followed, so they are filtered out up front
tvshows = soup.find('div', class_ = 'discovery-grids-container').find_all('a', href = True)

# request settings shared by every page fetch
headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.74 Safari/537.36'}
REQUEST_TIMEOUT = 30    # seconds; keeps one stalled connection from hanging the whole run


def _fetch_soup(url):
    """Download ``url`` and return the response body parsed with lxml.

    Raises requests.RequestException on connection errors, timeouts, and
    non-success HTTP status codes.
    """
    response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    return BeautifulSoup(response.text, 'lxml')


def _scrape_season(season, title):
    """Build the info dict for one season link of the series named ``title``.

    Raises ValueError when the episode count, the airing year, or a rating
    cannot be parsed as an integer.
    """
    # details available on the series page itself
    sinfo = season.find('season-list-item')
    info = sinfo['info'].split(",")     # info[0] holds the airing year, info[2] the episode count

    stitle = title + ': ' + sinfo['seasontitle']
    # strip the plural and the singular label so a season with "1 episode" parses too
    episodes = int(info[2].strip().replace(' episodes', '').replace(' episode', ''))
    airing_year = int(info[0].strip())

    # ratings (tomatometer and audience score) come from the season's own page
    ssoup = _fetch_soup(baseurl + season['href'])
    tomatometer = ratingformatter(ssoup.find('span', {'data-qa' : 'tomatometer'}))
    audiencescore = ratingformatter(ssoup.find('span', {'data-qa' : 'audience-score'}))

    return {
        "season_title" : stitle,
        "airing_year" : airing_year,
        "episodes" : episodes,
        "tomatometer" : tomatometer,
        "audience_score" : audiencescore,
    }


def _scrape_series(link):
    """Fetch the page behind ``link`` and return the info dict for that tv show."""
    subsoup = _fetch_soup(baseurl + link['href'])

    # title, airing years, and synopsis of the tv show
    title = formattext(subsoup.find('h1', class_ = 'mop-ratings-wrap__title mop-ratings-wrap__title--top'))
    airing = removepar(formattext(subsoup.find('span', class_ = 'h3 subtle')))
    synopsis = formattext(subsoup.find('div', class_ = 'tv-series__series-info--synopsis clamp clamp-6 js-clamp clearfix'))

    # average rating over the seasons (tomatometer and audience score)
    avgtm = ratingformatter(subsoup.find('span', {'data-qa' : 'tomatometer'}))
    avgas = ratingformatter(subsoup.find('span', {'data-qa' : 'audience-score'}))

    # tv network, premiere date, and genre
    tvnetwork = formattext(subsoup.find('td', {'data-qa' : 'series-details-network'}))
    pdate = dateformatter(formattext(subsoup.find('td', {'data-qa' : 'series-details-premiere-date'})))
    genre = formattext(subsoup.find('td', {'data-qa' : 'series-details-genre'}))

    # main cast: the anchors that follow the cast label
    casts = subsoup.find('span', {'data-qa' : 'series-info-cast'}).find_next_siblings("a")
    maincasts = [formattext(cast) for cast in casts]

    # data from every season of the series
    seasoninfo = []
    for season in subsoup.find_all('a', {'data-qa' : 'season-link'}):
        try:
            seasoninfo.append(_scrape_season(season, title))
        except ValueError:
            # unparsable number in this season: skip the season, keep the series
            continue
        # rest before the next season request (so the scraper is not blocked)
        rest(1)

    # reverse the list into ascending order
    seasoninfo.reverse()

    return {
        "title" : title,
        "airing" : airing,
        "synopsis" : synopsis,
        "average_tomatometer" : avgtm,
        "average_audience_score" : avgas,
        "tv_network" : tvnetwork,
        "premiere_date" : pdate,
        "genre" : genre,
        "main_casts" : maincasts,
        "num_of_seasons" : len(seasoninfo),
        "seasons_info" : seasoninfo,
    }


# start from an empty list so re-running this cell does not duplicate entries
series.clear()
count = 1
skipped = []    # (href, error name) for every link that could not be scraped

for link in tvshows:
    try:
        new_series = _scrape_series(link)
    except (AttributeError, IndexError, KeyError, TypeError, ValueError, requests.RequestException) as error:
        # Missing elements, malformed values, or a failed request: record the
        # link and move on, so one bad page cannot abort the whole run.
        skipped.append((link['href'], type(error).__name__))
        continue

    # add the new series info to the list
    series.append(new_series)
    # rest before the next series request
    rest(count)
    count += 1      # increase the count of scraped series

print(f"Successfully scraped the contents of {count - 1} tv shows.")
if skipped:
    print(f"Skipped {len(skipped)} links with missing or malformed data (see `skipped`).")

Done scraping the contents of 20 tv shows.
Done scraping the contents of 40 tv shows.
Done scraping the contents of 60 tv shows.
Done scraping the contents of 80 tv shows.
Done scraping the contents of 100 tv shows.
Done scraping the contents of 120 tv shows.
Done scraping the contents of 140 tv shows.
Done scraping the contents of 160 tv shows.
Done scraping the contents of 180 tv shows.
Done scraping the contents of 200 tv shows.
Done scraping the contents of 220 tv shows.
Done scraping the contents of 240 tv shows.
Done scraping the contents of 260 tv shows.
Done scraping the contents of 280 tv shows.
Done scraping the contents of 300 tv shows.
Done scraping the contents of 320 tv shows.
Done scraping the contents of 340 tv shows.
Done scraping the contents of 360 tv shows.
Done scraping the contents of 380 tv shows.
Done scraping the contents of 400 tv shows.
Done scraping the contents of 420 tv shows.
Done scraping the contents of 440 tv shows.
Done scraping the contents of 460 tv shows.

## Writing Data to JSON

In [5]:
# Export the scraped data. The browser is closed in `finally` so it is shut
# down even when writing the file fails.
try:
    write_json(series)
    print("Web scraping is finished!")
finally:
    driver.quit()

Done exporting json file.
Web scraping is finished!
