# Import all the necessary libraries

In [None]:
## Web Scraping
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.options import Options  
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from bs4 import BeautifulSoup
from collections import defaultdict
from fake_useragent import UserAgent
import requests
import time, os

## Others
import random
import re
import pandas as pd
import pickle

# Scrape the IMDb website for movie information

## 1) Set up the webdriver and options for chrome

In [None]:
# Location of the chromedriver binary on the local machine; edit this path
# when running the notebook elsewhere.
chromedriver = "/Applications/chromedriver" # path to the chromedriver executable
# Also expose the path through the process environment under the
# "webdriver.chrome.driver" key.
os.environ["webdriver.chrome.driver"] = chromedriver

# Chrome options carrying the "--headless" flag. They are prepared here but
# only take effect if passed to a driver (see the commented example below);
# the scraping cells that follow create their drivers without them.
chrome_options = Options()
chrome_options.add_argument("--headless")

## headless driver
## driver = webdriver.Chrome(executable_path=chromedriver, options=chrome_options)

## nonheadless driver
## driver = webdriver.Chrome(chromedriver)

## 2) Set up a sleep timer between requests to avoid being blocked

In [None]:
# Pause length in seconds used between requests: a single value drawn once
# from the 28-48 second range and then reused for every request.
sleep_time = random.uniform(28, 48)

## 3) Miscellaneous customized functions used in the scraping later

In [None]:
# Compiled once at definition time. The raw string keeps "\d" from being
# treated as an (invalid) string escape, and the trailing ".*" also accepts
# links that end right after the ID, e.g. "/title/tt0111161/".
_MOVIE_ID_RE = re.compile(r"^/title/tt(\d+)/.*$")


def movie_id_extract(link):
    """Extract the numeric IMDb title ID from a relative title link.

    Parameters
    ----------
    link : str
        A relative href such as "/title/tt7286456/?ref_=adv_li_tt".

    Returns
    -------
    list of str
        A one-element list holding the digits that follow "tt" (leading
        zeros preserved), or an empty list when ``link`` is not a title link.
    """
    return _MOVIE_ID_RE.findall(link)

## 4) Scrape for the movie IDs and movie titles

In [None]:
# Walk through the IMDb search result pages for 2018 feature films and collect
# every (movie_id, movie_title) pair into `df`, checkpointing to disk after
# each page and writing the de-duplicated result at the end.
url = "https://www.imdb.com/search/title/?year=2018&title_type=feature&"
# NOTE(review): passing the driver path positionally matches older Selenium
# releases -- confirm against the installed Selenium version.
driver = webdriver.Chrome(chromedriver) # nonheadless driver
driver.set_window_size(1120, 1000)
driver.get(url)
time.sleep(.1)

id_columns = ['movie_id', 'movie_title']
df = pd.DataFrame(columns=id_columns)
scraped_rows = []  # (movie_id, movie_title) tuples gathered so far
i = 0
total_pages = 25  # number of result pages to visit

try:
    while i < total_pages:
        # Fetch the page the browser is currently on with a randomised
        # User-Agent; the timeout keeps a stalled connection from hanging
        # the whole run.
        user_agent = {'User-agent': UserAgent().random}
        url = driver.current_url
        page = requests.get(url, headers=user_agent, timeout=30).text
        soup = BeautifulSoup(page, "lxml")

        for h in soup.find_all("h3"):
            a = h.find("a")
            # Skip headings without a link instead of re-recording the
            # previous movie (or failing on the very first heading).
            if a is None or 'href' not in a.attrs:
                continue
            found_ids = movie_id_extract(a.attrs['href'])
            # Skip links that do not point at a title page.
            if not found_ids:
                continue
            scraped_rows.append((found_ids[0], a.text))

        ## Checkpoint once per page so an interrupted run keeps its progress
        df = pd.DataFrame(scraped_rows, columns=id_columns)
        df.to_csv(r'imdb_movie_id.csv', index = False)
        df.to_pickle('./imdb_movie_id.pkl')

        ## Status to prompt on screen
        print("Completed scraping page", str(i+1), "out of", str(total_pages),
              "("+ str(round((i+1)/(total_pages)*100,2))+ "%)")

        i += 1
        if i < total_pages:
            ## Waiting for the "next" link to be clickable, then click!
            WebDriverWait(driver, 10).until(
                EC.element_to_be_clickable(
                    (By.XPATH, "//a[@class='lister-page-next next-page']"))).click()
            # Throttle before requesting the next page.
            time.sleep(sleep_time)
finally:
    # Always release the browser, even if a page fails to load or parse.
    driver.quit()
print("100% of movie ID and moive title has been scraped!")

## Remove duplicates then save it to csv!
df.drop_duplicates(inplace=True)
df.to_csv(r'imdb_movie_id_2018.csv', index = False)
df.to_pickle('./imdb_movie_ids_2018.pkl')

## 5) Scrape for movie information

In [None]:
# Visit the title page of every movie ID in `df` and collect runtime, genres,
# release date, user rating and rating count into `imdb_df`, checkpointing
# to disk after each successfully scraped movie.
driver = webdriver.Chrome(chromedriver) # nonheadless driver
info_columns = ['movie_id', 'runtime', 'genres_list',
                'release_date', 'imdb_user_rating', 'imdb_user_rating_count']
imdb_df = pd.DataFrame(columns=info_columns)
info_rows = []  # one list of column values per successfully scraped movie

# Matches e.g. <strong title="7.5 based on 12,345 user ratings">; the dot in
# the rating is escaped so it only matches a literal decimal point.
rating_title_re = re.compile(
    r'<strong title="(\d+\.\d) based on (.+) user ratings">')
total_movies = len(df)

try:
    for position, movie_id in enumerate(df['movie_id'], start=1):
        try:
            driver.get("https://www.imdb.com/title/tt" + str(movie_id))
            page = driver.execute_script("return document.documentElement.innerHTML;")
            soup = BeautifulSoup(page, "lxml")

            # "runtime" -- falls back to 0 when the page has no <time> tag
            time_tag = soup.find('time')
            runtime = time_tag.text.strip() if time_tag is not None else 0

            # "genres_list" and "release_date": the links in the subtext
            # line are the genres followed by the release date.
            movie_subtext = soup.find('div', attrs={"class": "subtext"})
            a_hrefs_list = [a_href.text for a_href in movie_subtext.find_all('a')]
            genres_list = a_hrefs_list[:-1] # last one is the released date
            release_date = a_hrefs_list[-1].rstrip()

            # "imdb_user_rating" and "imdb_user_rating_count"
            rating_html = str(
                soup.find("div", attrs={"class": "ratingValue"}).find("strong"))
            rating_match = rating_title_re.search(rating_html)
            if rating_match is None:
                raise ValueError("rating information not found")
            imdb_user_rating, imdb_user_rating_count = rating_match.groups()

            info_rows.append([movie_id, runtime, genres_list, release_date,
                              imdb_user_rating, imdb_user_rating_count])

            ## Checkpoint after every movie so progress survives interruption
            imdb_df = pd.DataFrame(info_rows, columns=info_columns)
            imdb_df.to_csv(r'imdb_movie.csv', index = False)
            imdb_df.to_pickle('./imdb_movie.pkl')

            ## Status to prompt on screen
            print("Completed scraping movie #",
                  str(position), "out of", str(total_movies),
                  "("+ str(round(position/total_movies*100,2))+ "%)")
        except Exception as exc:
            # Best effort: a page with missing fields is skipped, but the
            # failure is reported instead of being silently swallowed.
            print("Skipping movie", str(movie_id), "-", repr(exc))

        # Throttle after every request, including failed ones, so a run of
        # bad pages does not turn into rapid-fire requests.
        time.sleep(sleep_time)
finally:
    # Always release the browser, even if the run is interrupted.
    driver.quit()

## Remove duplicates then save it to csv!
# De-duplicate on the ID column: the genres_list cells hold Python lists,
# which are unhashable and make a whole-row drop_duplicates() fail.
imdb_df.drop_duplicates(subset='movie_id', inplace=True)
imdb_df.to_csv(r'imdb_movie_2018.csv', index = False)

## 6) Merge both data frames and save the result to a CSV file

In [None]:
# Join the ID/title table (`df`) with the per-movie details (`imdb_df`) on
# their shared "movie_id" column; the inner join keeps only movies whose
# details were scraped successfully.
all_df = pd.merge(df, imdb_df, how="inner", on="movie_id")
# NOTE(review): this file name is also used for the per-movie checkpoint
# written during the detail scrape, so that checkpoint is overwritten here.
all_df.to_csv(r'imdb_movie.csv', index = False)