In [1]:
import os
import time
import datetime
import re
import pandas as pd


import requests
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions
import chromedriver_autoinstaller
from bs4 import BeautifulSoup


# Remember when the cell started; the end of the cell prints the elapsed time.
startTime = time.time()
currentTime = datetime.datetime.now()
print('==============================\n')
print(f'Current time is {currentTime}')


# Both names refer to the current working directory; the chromedriver folder
# is looked up (and created) beneath parentDirectory.
parentDirectory = os.getcwd()
fileDirectory = parentDirectory


class Website:
    """Plain record describing a site to crawl.

    Attributes:
        name: label of the site.
        title: label of the kind of page being crawled.
        main_url: root URL of the site.
        target_url: URL of the page that is actually fetched.
    """

    def __init__(self, name, title, main_url, target_url):
        self.name, self.title = name, title
        self.main_url, self.target_url = main_url, target_url

    def info(self):
        """Print each attribute on its own ``Website.<attr> : <value>`` line."""
        report = [
            f'Website.name : {self.name}',
            f'Website.title : {self.title}',
            f'Website.main_url : {self.main_url}',
            f'Website.target_url : {self.target_url}',
        ]
        print('\n'.join(report))


class WebElement:
    """Plain record describing one page element or scripted step.

    Attributes:
        name: label of the element; other classes in this file read it to
            decide what to do with the element.
        type: locator strategy, compared against 'css' and 'xpath' elsewhere.
        identifier: the selector string itself.
    """

    def __init__(self, name, type, identifier):
        self.name, self.type, self.identifier = name, type, identifier

    def info(self):
        """Print each attribute on its own ``WebElement.<attr> : <value>`` line."""
        report = [
            f'WebElement.name : {self.name}',
            f'WebElement.type : {self.type}',
            f'WebElement.identifier : {self.identifier}',
        ]
        print('\n'.join(report))


class Preprocessor:
    """Convert selected tags into plain values: stripped text, image sources, URLs."""

    def __init__(self) -> None:
        print('class Preprocessor : init')

    def clearText(self, texts):
        """Return the whitespace-stripped ``.text`` of every item in *texts*."""
        return [item.text.strip() for item in texts]

    def extractImage(self, contents):
        """Return the ``src`` attribute of every tag in *contents*."""
        return [tag['src'] for tag in contents]

    def extractURL(self, contents):
        """Return the ``href`` attribute of every tag in *contents*."""
        return [tag['href'] for tag in contents]

    def extractURLfromParent(self, contents):
        """Return the ``href`` attribute of the parent of every tag in *contents*."""
        return [tag.parent['href'] for tag in contents]

    def insertTextInURLs(self, text, location, urls):
        """Insert *text* right after the first occurrence of *location* in each URL.

        A URL that does not contain *location* is returned unchanged, and
        everything after the first occurrence is preserved even when
        *location* appears more than once.
        """
        urls_inserted = []
        for url in urls:
            head, marker, tail = url.partition(location)
            # An empty marker means *location* was not found in this URL.
            urls_inserted.append(head + marker + text + tail if marker else url)
        return urls_inserted

    def preprocessContents(self, website, type, contents):
        """Preprocess *contents* according to *type* ('text', 'image' or 'url').

        For 'url' the href is read from the tag's parent when
        ``website.title`` is 'genre_top_100' and from the tag itself
        otherwise; two sites identified by ``website.name`` get an extra
        URL rewrite.

        Raises:
            ValueError: if *type* is not one of the supported kinds.
        """
        if type == 'text':
            return self.clearText(contents)
        if type == 'image':
            return self.extractImage(contents)
        if type != 'url':
            raise ValueError(f'unsupported content type: {type!r}')

        if website.title == 'genre_top_100':
            urls = self.extractURLfromParent(contents)
        else:
            urls = self.extractURL(contents)

        if website.name == '블랙보드':
            urls = self.insertTextInURLs('courses', 'ultra/', urls)
        elif website.name == '고려대학교 경영대학':
            # Website objects in this file expose main_url/target_url, not url;
            # honour a ``url`` attribute if one is present, else use the
            # crawled page (assumed to be the notice.html page - TODO confirm).
            page_url = getattr(website, 'url', None)
            if page_url is None:
                page_url = website.target_url
            base_url = page_url.replace('notice.html', '')
            urls = [base_url + url for url in urls]
        return urls


class ActionChain:
    def __init__(self):
        print('class ActionChain : init')
        
    def waitAction(self, driver, action, maxWaitTime=30):
        try:
            if action.type == 'css':
                element = WebDriverWait(driver, maxWaitTime).until(expected_conditions.presence_of_element_located((By.CSS_SELECTOR, action.identifier)))
            elif action.type == 'xpath':
                element = WebDriverWait(driver, maxWaitTime).until(expected_conditions.presence_of_element_located((By.XPATH, action.identifier)))
        except Exception as e:
            print('ActionChain.waitAction error :', e)

    def clickAction(self, driver, action):
        chains = webdriver.ActionChains(driver)
        if action.type == 'css':
            button = driver.find_element(By.CSS_SELECTOR, action.identifier)
        elif action.type == 'xpath':
            button = driver.find_element(By.XPATH, action.identifier)
        chains.move_to_element(button).click().perform()
        # chains.context_click(button).perform()

    def inputAction(self, driver, action):
        if action.type == 'css':
            inputElement = driver.find_element(By.CSS_SELECTOR, action.identifier)
        if action.type == 'xpath':
            inputElement = driver.find_element(By.XPATH, action.identifier)
        text =re.findall('\(([^)]+)', action.name)
        inputElement.send_keys(text)

    def enterAction(self, driver, action):
        if action.type == 'css':
            keyElement = driver.find_element(By.CSS_SELECTOR, action.identifier)
        if action.type == 'xpath':
            keyElement = driver.find_element(By.XPATH, action.identifier)
        keyElement.send_keys(Keys.ENTER)

    def doActionChain(self, driver, actions):
        for i, action in enumerate(actions):
            if 'wait' in action.name:
                self.waitAction(driver, action)
                print(f'Step {i} : {action.name} done')
            elif 'time' in action.name:
                driver.implicitly_wait(5)
                print(f'Step {i} : {action.name} done')
            elif 'click' in action.name:
                self.clickAction(driver, action)
                print(f'Step {i} : {action.name} done')
            elif 'input' in action.name:
                self.inputAction(driver, action)
                print(f'Step {i} : {action.name} done')
            elif 'enter' in action.name:
                self.enterAction(driver, action)
                print(f'Step {i} : {action.name} done')
            elif 'back' in action.name:
                driver.back()
                print(f'Step {i} : {action.name} done')
            else:
                print('')
            
    def doActionRoutine(self, driver, iteration, iteration_actions):
        for i, action in enumerate(iteration_actions):
            if 'wait' in action.name:
                self.waitAction(driver, action)
                # print(f'Step {i} : {action.name} done')
            elif 'load' in action.name:
                driver.implicitly_wait(5)
                # print(f'Step {i} : {action.name} done')
            elif 'time' in action.name:
                time.sleep(3)
                # print(f'Step {i} : {action.name} done')
            elif 'click' in action.name:
                webdriver.ActionChains(driver).move_to_element(iteration).click().perform()
                # print(f'Step {i} : {action.name} done')
            elif 'input' in action.name:
                self.inputAction(driver, action)
                # print(f'Step {i} : {action.name} done')
            elif 'enter' in action.name:
                self.enterAction(driver, action)
                # print(f'Step {i} : {action.name} done')
            elif 'back' in action.name:
                driver.back()
                # print(f'Step {i} : {action.name} done')
            else:
                print('')


class WebDriverUpdateChecker:
    """Keep a chromedriver for the installed Chrome under ``parentDirectory``."""

    def __init__(self):
        print('class WebDriverUpdateChecker : init')

    def checkWebdriverVersion(self):
        """Install a chromedriver when no folder for Chrome's major version exists.

        The drivers live in ``<parentDirectory>/chromedriver``; that folder is
        created first if the parent directory has no entry with that name.
        """
        major = chromedriver_autoinstaller.get_chrome_version().split('.')[0]
        driver_root = parentDirectory + '/chromedriver'
        if 'chromedriver' not in os.listdir(parentDirectory):
            os.mkdir(driver_root)
        if major in os.listdir(driver_root):
            print('\ncheckWebdriverVersion : already the latest chromedriver')
            return
        chromedriver_autoinstaller.install(False, driver_root)
        print('\ncheckWebdriverVersion : downloaded the latest chromedriver')

    def getWebdriverDirectory(self):
        """Return ``<parentDirectory>/chromedriver/<major version>/chromedriver``."""
        major = chromedriver_autoinstaller.get_chrome_version().split('.')[0]
        return '/'.join([parentDirectory, 'chromedriver', major, 'chromedriver'])


class Crawler(WebDriverUpdateChecker, ActionChain, Preprocessor):
    """Download pages with requests or Selenium and extract selected contents.

    Content descriptors expose ``name`` and ``identifier``. The part of
    ``name`` before "(" picks the extractor ('text', 'table', 'image' or
    'url'); the text inside the parentheses becomes the key in the returned
    dictionary, e.g. ``'text(genre_names)'`` is stored under ``'genre_names'``.
    """

    # Captures what follows "(" in a content name, up to the closing ")".
    _KEY_PATTERN = re.compile(r'\(([^)]+)')

    def __init__(self):
        super().__init__()
        print('\tclass Crawler : init')

    def getResponse(self, url, timeout=30):
        """GET *url* with requests and return the response.

        A failure is printed and then re-raised so the caller sees the real
        cause. *timeout* (seconds) keeps a stalled server from blocking forever.
        """
        try:
            return requests.get(url, timeout=timeout)
        except Exception as e:
            print('Crawler.getResponse error : ', e)
            raise

    def setChromeOptions(self, headless=True):
        """Build the Chrome options used for every driver (headless by default)."""
        chromeOptions = webdriver.ChromeOptions()
        if headless:
            chromeOptions.add_argument('headless')
        chromeOptions.add_argument('lang=ko_KR')
        chromeOptions.add_argument('disable-gpu')
        # NOTE(review): Chrome documents this switch as "width,height";
        # confirm the "x" separator is honoured.
        chromeOptions.add_argument('window-size=1920x1080')
        chromeOptions.add_argument(
            'user-agent='+'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.80 Safari/537.36')
        return chromeOptions

    def getDriver(self, url, actions):
        """Start Chrome, open *url*, run the optional *actions* and return the driver.

        The caller owns the returned driver and must ``quit()`` it. A page-load
        error is printed and the driver is still returned; if the scripted
        actions raise, the driver is quit before the error propagates.
        """
        self.checkWebdriverVersion()
        webdriverDirectory = self.getWebdriverDirectory()
        chromeOptions = self.setChromeOptions()
        driver = webdriver.Chrome(webdriverDirectory, options=chromeOptions)
        try:
            driver.get(url)
        except Exception as e:
            print('Crawling.getDriver error : ', e)
        if actions is not None:
            try:
                self.doActionChain(driver, actions)
            except Exception:
                driver.quit()
                raise
        return driver

    def getSource(self, url, actions, module):
        """Return the page source of *url* fetched with *module*.

        *module* is 'requests' (returns the response body; *actions* are not
        used) or 'selenium' (returns ``page_source`` after running *actions*).

        Raises:
            ValueError: if *module* is neither of the two.
        """
        if module == 'requests':
            return self.getResponse(url).content
        if module == 'selenium':
            driver = self.getDriver(url, actions)
            try:
                return driver.page_source
            finally:
                # Only the source is needed; close the browser right away.
                driver.quit()
        raise ValueError(f'unsupported module: {module!r}')

    def parseSource(self, source, parser):
        """Parse *source* with BeautifulSoup using *parser*."""
        return BeautifulSoup(source, parser)

    def selectText(self, soup, content):
        """Return the tags matching ``content.identifier`` (empty list on failure)."""
        if soup is None:
            return []
        try:
            return soup.select(content.identifier)
        except Exception as e:
            print('Crawler.selectText error : ', e)
            return []

    def selectTable(self, soup, content):
        """Return the DataFrames pandas parses from the selected tags.

        Returns an empty list when there is no soup, the selector fails or
        nothing matches.
        """
        import io

        if soup is None:
            return []
        try:
            target = soup.select(content.identifier)
        except Exception as e:
            print('Crawler.selectTable error : ', e)
            return []
        if not target:
            return []
        # read_html is given a file-like object; passing literal HTML strings
        # is deprecated in recent pandas releases.
        return pd.read_html(io.StringIO(''.join(str(tag) for tag in target)))

    def selectImage(self, soup, content):
        """Return every ``<img>`` tag in *soup* (``content`` is not consulted)."""
        if soup is None:
            return []
        try:
            return soup.find_all('img')
        except Exception as e:
            print('Crawler.selectImage error : ', e)
            return []

    def _contentKind(self, content):
        """Return the part of the content name that selects the extractor."""
        return content.name.split('(', 1)[0]

    def _contentKey(self, content):
        """Return the parenthesised key of the content name, or the whole name."""
        found = self._KEY_PATTERN.search(content.name)
        return found.group(1) if found else content.name

    def _collectContents(self, website, soup, contents):
        """Extract every descriptor in *contents* from *soup* into a dict."""
        collected = {}
        for content in contents:
            kind = self._contentKind(content)
            if 'text' in kind:
                texts = self.selectText(soup, content)
                collected[self._contentKey(content)] = self.preprocessContents(website, 'text', texts)
            elif 'table' in kind:
                tables = self.selectTable(soup, content)
                # Only the first parsed table is kept.
                collected[self._contentKey(content)] = tables[0] if tables else None
            elif 'image' in kind:
                images = self.selectImage(soup, content)
                collected[self._contentKey(content)] = self.preprocessContents(website, 'image', images)
            elif 'url' in kind:
                urls = self.selectText(soup, content)
                collected[self._contentKey(content)] = self.preprocessContents(website, 'url', urls)
            else:
                print('')
        return collected

    def _findIterations(self, driver, iteration):
        """Locate every element matching the *iteration* descriptor."""
        if iteration.type == 'css':
            return driver.find_elements(By.CSS_SELECTOR, iteration.identifier)
        if iteration.type == 'xpath':
            return driver.find_elements(By.XPATH, iteration.identifier)
        raise ValueError(f'unsupported locator type: {iteration.type!r}')

    def getContents(self, website, contents, actions=None, module='selenium', parser='lxml'):
        """Fetch ``website.target_url`` once and return ``{key: extracted values}``."""
        source = self.getSource(website.target_url, actions, module)
        soup = self.parseSource(source, parser)
        return self._collectContents(website, soup, contents)

    def getContentsInSequence(self, website, contents, iteration, iteration_actions_in, iteration_actions_out, actions=None, module='selenium', parser='lxml', max_iterations=4):
        """Visit elements of a listing page one by one and extract *contents* from each.

        For each element matched by *iteration*: run *iteration_actions_in*,
        extract *contents* from the resulting page, then run
        *iteration_actions_out*. Results are keyed by the element's text as
        read from the initial listing.

        Always drives Selenium; *module* is accepted but not used.
        *max_iterations* caps how many elements are visited (default 4, the
        previously hard-coded limit); pass ``None`` to visit all of them.
        """
        driver = self.getDriver(website.target_url, actions)
        try:
            iterations = self._findIterations(driver, iteration)
            keys = [element.text for element in iterations]
            total = len(iterations)
            if max_iterations is not None:
                total = min(total, max_iterations)

            iteration_dict = {}
            for i in range(total):
                # Elements are looked up again on every pass, presumably because
                # the in/out actions navigate away and back, which leaves
                # previously found elements unusable.
                iterations = self._findIterations(driver, iteration)
                if i >= len(iterations):
                    break

                self.doActionRoutine(driver, iterations[i], iteration_actions_in)

                soup = self.parseSource(driver.page_source, parser)
                iteration_dict[keys[i]] = self._collectContents(website, soup, contents)

                self.doActionRoutine(driver, iterations[i], iteration_actions_out)
                print(f'{website.name} - {i}/{len(iterations)} {keys[i]} Done!                                   ', end='\r')
            return iteration_dict
        finally:
            # Release the browser even when a step fails or the run is interrupted.
            driver.quit()

    
    
# Build the single crawler instance that the following cells reuse.
crawler = Crawler()

# Report how long this cell took; startTime was taken at the top of the cell.
endTime = time.time()
print(f'\nRunning time : {endTime - startTime} \n')
print('==============================\n\n\n')


Current time is 2022-04-26 17:11:13.963547
class WebDriverUpdateChecker : init
	class Crawler : init

Running time : 0.0032939910888671875 






In [2]:
# Crawl {genre_names}, {genre_urls} from 'https://www.rottentomatoes.com/top'

startTime = time.time()
currentTime = datetime.datetime.now()
print('==============================\n')
print(f'Current time is {currentTime}')


rotten_tomatoes = Website(
    name='rotten tomatoes',
    title='genre_top_100',
    main_url='https://www.rottentomatoes.com',
    target_url='https://www.rottentomatoes.com/top',
)
# The same element supplies both the genre label (its text) and the genre link.
genre_selector = '#main_container > div.container-masonry > div:nth-child(3) > section > div > ul > li > a > div'
contents = [
    WebElement(name='text(genre_names)', type='css', identifier=genre_selector),
    WebElement(name='url(genre_urls)', type='css', identifier=genre_selector),
]
actions = []
genre_crawled = crawler.getContents(rotten_tomatoes, contents, module='requests')


endTime = time.time()
print(f'\nRunning time : {endTime - startTime} \n')
print('==============================\n\n\n')



# Dump every crawled list between braces, one entry per line.
for key, value in genre_crawled.items():
    print(key, '{', sep='\n')
    print(*value, sep='\n')
    print('}', end='\n\n')


Current time is 2022-04-26 17:11:15.880395

Running time : 2.752756118774414 




genre_names
{
Top 100 Action & Adventure Movies
Top 100 Animation Movies
Top 100 Art House & International Movies
Top 100 Classics Movies
Top 100 Comedy Movies
Top 100 Documentary Movies
Top 100 Drama Movies
Top 100 Horror Movies
Top 100 Kids & Family Movies
Top 100 Musical & Performing Arts Movies
Top 100 Mystery & Suspense Movies
Top 100 Romance Movies
Top 100 Science Fiction & Fantasy Movies
Top 100 Special Interest Movies
Top 100 Sports & Fitness Movies
Top 100 Television Movies
Top 100 Western Movies
}

genre_urls
{
/top/bestofrt/top_100_action__adventure_movies/
/top/bestofrt/top_100_animation_movies/
/top/bestofrt/top_100_art_house__international_movies/
/top/bestofrt/top_100_classics_movies/
/top/bestofrt/top_100_comedy_movies/
/top/bestofrt/top_100_documentary_movies/
/top/bestofrt/top_100_drama_movies/
/top/bestofrt/top_100_horror_movies/
/top/bestofrt/top_100_kids__family_movies/
/top/bestofrt

In [4]:
# Crawl {detail_label, detail_info} for the movies listed on each page in {genre_urls}.
# Selectors for detail_crew and audience_reviews are kept below, commented out.

startTime = time.time()
currentTime = datetime.datetime.today()
print('==============================\n')
print(f'Current time is {currentTime}')


# One Website per genre page. Names and URLs come from the same selector in the
# genre crawl, so they are paired position by position.
website_list = [
    Website(genre_name, 'top100_movies', rotten_tomatoes.main_url, rotten_tomatoes.main_url + genre_url)
    for genre_name, genre_url in zip(genre_crawled['genre_names'], genre_crawled['genre_urls'])
]


# The descriptors are identical for every genre page, so build them once.
iteration = WebElement('iterations','css','#top_movies_main > div > table > tbody > tr > td:nth-child(3) > a')
# Entering a movie page: click the link, then the 'load' step.
iteration_actions_in = [
    WebElement('click','',''),
    WebElement('load','','')
]
# Leaving a movie page: go back to the listing, then the 'time' step.
iteration_actions_out = [
    WebElement('back','',''),
    WebElement('time','','')
]
contents = [
    WebElement('text(detail_label)', 'css', '#mainColumn > section.panel.panel-rt.panel-box.movie_info.media > div > div > ul > li > div.meta-label.subtle'),
    WebElement('text(detail_info)', 'css', '#mainColumn > section.panel.panel-rt.panel-box.movie_info.media > div > div > ul > li > div.meta-value'),
    # WebElement('text(detail_crew)', 'css', '#movie-cast > div > div > div > div.media-body > a > span'),
    # WebElement('text(audience_reviews)', 'css', '#audience_reviews > ul > li > div.mop-audience-reviews__review-quote > div.mop-audience-reviews__review--comment.clamp.clamp-4.js-clamp')
]

# movies_dict maps genre name -> {movie title -> {content key -> values}}.
movies_dict = {}
for target_website in website_list:  # limit
    movies_top100_dict = crawler.getContentsInSequence(target_website, contents, iteration, iteration_actions_in, iteration_actions_out)
    movies_dict[target_website.name] = movies_top100_dict
    print(f'{target_website.name} -  Done!                                                                ', end='\n\n')
    
    
endTime = time.time()
print(f'\nRunning time : {endTime - startTime} \n')
print('==============================\n\n\n')




# List the crawled movie titles of every genre between braces.
for key, value in movies_dict.items():
    print(key)
    print('{')
    print(*value, sep='\n')
    print('}')
    print()


Current time is 2022-04-26 17:02:56.998191

checkWebdriverVersion : already the latest chromedriver
Top 100 Action & Adventure Movies - 2/100 Mission: Impossible - Fallout (2018) Done!                                   

KeyboardInterrupt: 

In [24]:
# Make DataFrame: one table per genre, one row per movie, one column per detail label.

# Fixed column set; a movie that lacks a label gets None in that column.
columns = ['Rating', 'Genre', 'Original Language', 'Director', 'Producer', 'Writer', 'Release Date (Theaters)', 'Release Date (Streaming)', 'Box Office (Gross USA)', 'Runtime', 'Distributor', 'Sound Mix', 'Aspect Ratio', 'View the collection']

container_genre = {}
for genre, movies in movies_dict.items():
    movie_titles = list(movies.keys())

    container = []
    for details in movies.values():
        detail_label = [label.replace(':','') for label in details['detail_label']]
        detail_info = [info.replace('\n','').replace('  ','').replace('\xa0',' ') for info in details['detail_info']]
        # Look each column up by label instead of relying on the labels
        # arriving in column order: unexpected or reordered labels can no
        # longer shift values into the wrong column or change the row length.
        info_by_label = dict(zip(detail_label, detail_info))
        container.append([info_by_label.get(column) for column in columns])

    df = pd.DataFrame(data=container, index=movie_titles, columns=columns)
    container_genre[genre] = df

container_genre['Top 100 Art House & International Movies']

Unnamed: 0,Rating,Genre,Original Language,Director,Producer,Writer,Release Date (Theaters),Release Date (Streaming),Box Office (Gross USA),Runtime,Distributor,Sound Mix,Aspect Ratio,View the collection
Parasite (Gisaengchung) (2019),R (Sexual Content|Language|Some Violence),"Drama, Mystery & thriller, Comedy",Korean,Bong Joon Ho,"Kwak Sin-ae, Moon Yanggwon","Bong Joon Ho, Han Jinwon","Nov 1, 2019 wide","Oct 11, 2019",$53.4M,2h 12m,Neon,"Dolby Atmos, Dolby Digital",Scope (2.35:1),
The Cabinet of Dr. Caligari (Das Cabinet des Dr. Caligari) (1920),,"Horror, Fantasy, Mystery & thriller",German,Robert Wiene,"Rudolf Meinert, Erich Pommer","Hans Janowitz, Carl Mayer","Mar 19, 1921 wide","Feb 16, 1999",,1h 9m,"Grapevine Video, Kino Video",,,
Seven Samurai (Shichinin no Samurai) (1956),,Action,Japanese,Akira Kurosawa,Sojiro Motoki,"Shinobu Hashimoto, Akira Kurosawa, Hideo Oguni","Nov 19, 1956 wide","Sep 5, 2006",$271.7K,3h 28m,Columbia Pictures,Mono,Flat (1.37:1),
La Grande illusion (Grand Illusion) (1938),,"Drama, War",French (Canada),Jean Renoir,"Albert Pinkovitch, Frank Rollmer",,"Sep 12, 1937 wide","Nov 23, 1999",$516.5K,1h 57m,"Home Vision Entertainment, Barr Entertainment",,,
