# Web Scraping Part-2:

## Obtain every movie's information and write it to a local CSV file

* To run this file successfully, you need to download ChromeDriver:
https://chromedriver.chromium.org/
* Then install the selenium package:
https://selenium-python.readthedocs.io/installation.html

In [None]:
import requests
from bs4 import BeautifulSoup
import re
from selenium import webdriver      
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from multiprocessing import Pool
import pandas as pd

In [None]:
def review_data_key(review_link):
    """Build the AJAX URL of the review page that follows ``review_link``.

    The page at ``review_link`` is downloaded, the ``data-key`` attribute of
    its ``div.load-more-data`` element is read, and that key is appended to
    ``review_link`` as the ``paginationKey`` parameter of the ``_ajax``
    endpoint.

    A non-200 response is only reported with a printed "HTTP Error"; the
    body is parsed regardless. If the page has no ``load-more-data`` div,
    ``find`` returns None and the attribute lookup raises AttributeError.
    """
    page_response = requests.get(review_link)
    if page_response.status_code != 200:
        print("HTTP Error")
    soup = BeautifulSoup(page_response.content, 'lxml')
    # The div carries the key that identifies the next batch of reviews.
    load_more_marker = soup.find('div', class_='load-more-data')
    pagination_key = load_more_marker.get('data-key')
    return review_link + f'_ajax?paginationKey={pagination_key}'

In [None]:
def get_reviews(review_link,driver):
    """Collect review texts for one movie, following pagination when needed.

    The first review page is fetched with ``requests``. If it shows a
    "load more" button, further batches are fetched from the ``_ajax``
    endpoint until at least 30 reviews were gathered or no further batch
    is available; ``driver`` is pointed at the page and its button is
    clicked after each batch, as before.

    Args:
        review_link: URL of the movie's review page (ends with 'reviews/').
        driver: Selenium WebDriver used to open the page and click the
            "load more" button.

    Returns:
        A list of review strings. When pagination was needed, the list is
        capped at 31 entries.
    """
    def fetch_page(url):
        # Download and parse ``url``; a non-200 status is only reported.
        response = requests.get(url)
        if response.status_code != 200:
            print("HTTP Error")
        return BeautifulSoup(response.content,'lxml')

    def extract_reviews(page):
        return [review.get_text() for review in page.find_all('div',class_ = 'text show-more__control')]

    results_page = fetch_page(review_link)
    review_list = extract_reviews(results_page)
    if not results_page.find('button', class_ = 'ipl-load-more__button'):
        return review_list

    driver.get(review_link)
    while len(review_list) < 30:
        # Read the pagination key from the most recently parsed page instead
        # of re-downloading the first page on every iteration.
        # NOTE(review): assumes each AJAX fragment carries its own
        # 'load-more-data' div for the batch after it - confirm; when the
        # div is missing the loop simply stops.
        key_holder = results_page.find('div',class_ = 'load-more-data')
        data_key = key_holder.get('data-key') if key_holder is not None else None
        if not data_key:
            break
        # find_element(By...) works on Selenium 3 and 4; the
        # find_element_by_* helpers were removed in Selenium 4.
        try:
            load_more_button = driver.find_element(By.CLASS_NAME, 'ipl-load-more__button')
        except NoSuchElementException:
            # The browser no longer shows the button; keep paging via requests.
            load_more_button = None
        results_page = fetch_page(review_link + f'_ajax?paginationKey={data_key}')
        new_reviews = extract_reviews(results_page)
        if not new_reviews:
            # An empty batch would otherwise keep the loop spinning forever.
            break
        review_list.extend(new_reviews)
        if load_more_button is not None:
            load_more_button.click()
    # NOTE(review): the loop targets 30 reviews but the cap is 31; kept as-is
    # so new output matches chunks that were already scraped - confirm intent.
    return review_list[:31]

In [None]:
# Matches the "N more credits" link text shown when a credit list is truncated.
_MORE_CREDITS_PATTERN = re.compile(r'\d{1} more credits')


def _fetch_soup(url, failure_message="HTTP Error"):
    # Download ``url`` and parse it with lxml; a non-200 status is only printed.
    response = requests.get(url)
    if response.status_code != 200:
        print(failure_message)
    return BeautifulSoup(response.content,'lxml')


def _credit_names(credit_item, movie_link, table_index):
    # Return the names listed in one credit summary item (director or writers).
    # When the item is truncated ("N more credits"), the linked full-credits
    # page is fetched and the names are read from its table ``table_index``.
    if _MORE_CREDITS_PATTERN.findall(credit_item.get_text()):
        credits_link = movie_link + credit_item.find_all('a')[-1].get('href')
        credits_page = _fetch_soup(credits_link)
        credits_table = credits_page.find_all('table',class_='simpleTable simpleCreditsTable')[table_index]
        return [name_cell.get_text().strip() for name_cell in credits_table.find_all('td',class_='name')]
    return credit_item.get_text().split('\n')[2].strip('|').strip(' ').split(',')


def imdb_film_feature(movie_link_list,driver):
    """Scrape the details of every movie in ``movie_link_list``.

    Args:
        movie_link_list: iterable of ``[name, link]`` pairs.
        driver: Selenium WebDriver passed on to ``get_reviews``; it is
            always quit before this function returns or raises.

    Returns:
        A dict mapping each movie name to a dict that may contain the keys
        'rating_value', 'time', 'genres', 'release_date',
        'release_country', 'director', 'writers', 'stars', 'storyline',
        'keywords' and 'reviews'. Scraping a movie is best-effort: when a
        step fails, the fields gathered so far are kept, the failure is
        printed, and the remaining fields of that movie are skipped.
    """
    dict_ = {}
    try:
        for movie in movie_link_list:
            movie_name = movie[0]
            movie_link = movie[1]
            info = dict_[movie_name] = {}
            try:
                # The page request sits inside the guard so that one failed
                # download does not abort the whole chunk.
                results_page = _fetch_soup(movie_link, "Failure")

                # get rating_value
                info['rating_value'] = results_page.find('div',class_ = 'ratingValue').find('span').get_text()

                subtext = results_page.find('div',class_ = 'subtext')
                # get time
                info['time'] = subtext.find('time').get_text().strip()
                # get genres: every link in the subtext except the last one
                subtext_links = subtext.find_all('a')
                info['genres'] = [link.get_text().strip() for link in subtext_links[:-1]]
                # get release_date: the last link's text without its final token
                info['release_date'] = ' '.join(subtext_links[-1].get_text().strip().split()[:-1])

                # get release_country
                release_country = results_page.find('div',{'class':'article','id':'titleDetails'}).find_all('div',{'class':'txt-block'})[1].get_text().strip().split('\n')
                if '|' in release_country:
                    release_country.remove('|')
                # the first line is dropped, only what follows it is kept
                info['release_country'] = release_country[1:]

                credit_items = results_page.find_all('div',class_ = 'credit_summary_item')
                # get director (first table of the full-credits page)
                info['director'] = _credit_names(credit_items[0], movie_link, 0)
                # get writers (second table); the writers' own link is used,
                # not the director link from this or an earlier movie
                info['writers'] = _credit_names(credit_items[1], movie_link, 1)

                # get stars
                stars_link = movie_link + credit_items[2].find_all('a')[-1].get('href')
                stars_results_page = _fetch_soup(stars_link)
                info['stars'] = [
                    row.find_all('td')[1].get_text().strip()
                    for row in stars_results_page.find('table',class_='cast_list').find_all('tr')
                    if not len(row.find_all('td')) == 1
                ]

                # get storyline
                info['storyline'] = results_page.find('div',class_ = 'inline canwrap').find('span').get_text().strip()

                # get keywords; a movie without a keywords link gets None
                url = 'https://www.imdb.com'
                try:
                    keywords_url = url + results_page.find('div',class_ = 'see-more inline canwrap').find('nobr').find('a').get('href')
                    keywords_results_page = _fetch_soup(keywords_url)
                    info['keywords'] = [item.get_text().strip() for item in keywords_results_page.find_all('div',class_ = 'sodatext')]
                except Exception:
                    info['keywords'] = None

                # get reviews
                # in this part, selenium clicks the "load more" button for us
                review_link = movie_link + 'reviews/'
                info['reviews'] = get_reviews(review_link,driver)
            except Exception as error:
                # Best-effort: keep what was collected, but say what was lost.
                print(f"Skipped remaining fields for {movie_name!r}: {error!r}")
    finally:
        # Close the browser even when the run is interrupted.
        driver.quit()
    return dict_

In [None]:
def big_func(movie_link_list, driver_path='/Users/fancy/Downloads/chromedriver'):
    """Scrape one chunk of movies with its own Chrome driver.

    This single-argument entry point exists so the whole pipeline can be
    handed to ``Pool.map``.

    Args:
        movie_link_list: list of ``[name, link]`` pairs to scrape.
        driver_path: location of the chromedriver executable; override it
            (or change the default) to match your machine.

    Returns:
        The dict produced by ``imdb_film_feature`` for this chunk.
    """
    # NOTE(review): recent Selenium 4 releases no longer accept the driver
    # path positionally - confirm against the installed Selenium version.
    driver = webdriver.Chrome(driver_path)
    return imdb_film_feature(movie_link_list, driver)

In [None]:
# Load the movie list from csv and slice it into the chunks used by multiprocessing.
test = pd.read_csv('movie_unique')
# Build the [name, link] rows once, then slice out each chunk.
name_link_rows = test[['name','link']].values.tolist()
# Change the index ranges below to select the movies you want to scrape.
movie_link_list_1 = name_link_rows[90000:105000]
movie_link_list_2 = name_link_rows[105000:120000]
movie_link_list_3 = name_link_rows[120000:135000]

In [None]:
# This cell is the core of the multiprocessing: each chunk is handed to big_func.
# Change the int passed to Pool() to limit the maximum number of worker processes.
chunks = [movie_link_list_1,movie_link_list_2,movie_link_list_3]
with Pool(processes=5) as p:
    t = p.map(big_func, chunks)

In [None]:
# Take each worker's result dict from the multiprocessing output, turn it into
# a DataFrame (transposed, so each movie becomes a row) and write it to a csv
# file named after the chunk's index range.
d1 = pd.DataFrame(data=t[0]).transpose()
d1.to_csv(path_or_buf='90000-105000')
d2 = pd.DataFrame(data=t[1]).transpose()
d2.to_csv(path_or_buf='105000-120000')
d3 = pd.DataFrame(data=t[2]).transpose()
d3.to_csv(path_or_buf='120000-135000')

### Hint: 
* Because the functions above write the data out in separate chunk files, the code below combines them into one complete dataset.

In [None]:
# Read every separate chunk file obtained so far into its own variable, d1 to d41.
# The column pandas labels 'Unnamed: 0' is used as the row index.
# For example:
d1 = pd.read_csv(filepath_or_buffer='0-20', index_col='Unnamed: 0')

In [None]:
# Concatenate the chunk frames into one DataFrame, in ascending chunk order
# (d6 now precedes d7), so rows follow the scraping order and a later
# drop_duplicates(keep='first') keeps the row from the earliest chunk.
# All of d1 ... d41 must have been loaded before this cell runs.
data = pd.concat([d1,d2,d3,d4,d5,d6,d7,d8,d9,
                 d10,d11,d12,d13,d14,d15,d16,d17,d18,d19,
                 d20,d21,d22,d23,d24,d25,d26,d27,d28,d29,
                 d30,d31,d32,d33,d34,d35,d36,d37,d38,d39,
                 d40,d41])

In [None]:
# Before writing the combined data to a csv file, do some basic cleaning.
# Drop rows in which every value is missing.
data = data.dropna(how='all')
# Drop duplicates, keeping the first row seen for each value of 'index'.
# NOTE(review): this assumes the frames carry a column literally named
# 'index' (e.g. after a reset_index) - confirm against how d1 ... d41 are read.
data = data.drop_duplicates(subset='index',keep='first')
# Set column 'index' as the index of the DataFrame. set_index returns a new
# DataFrame, so the result has to be assigned back to take effect.
data = data.set_index('index')
# Write the data to a csv file.
data.to_csv('data')