In [1]:
import urllib3
from bs4 import BeautifulSoup
import requests
import json
import re
import datetime
import pandas as pd
import numpy as np
from tqdm import tqdm
import os
import time

In [2]:
os.chdir("../")

In [3]:
base_url = "https://www.imdb.com/"

In [4]:
def get_soup(path_url):
    try:
        full_url = base_url + path_url
        headers = {'User-Agent': 'Thunder Client (https://www.thunderclient.com)'}

        time.sleep(1)
        res = requests.get(full_url, headers)
        soup = BeautifulSoup(res.text, 'html.parser')
        return soup
    except Exception as e:
        raise e

def extract_runtime(text):
    for time_format in ('%H hours %M minutes', '%H hour %M minutes', '%H hours %M minute', '%H hour %M minute'):
        try:
            timeobj = datetime.datetime.strptime(text, time_format).time()
            delta = datetime.datetime.combine(datetime.date.min, timeobj) - datetime.datetime.min
            return int(delta.total_seconds()/60)
        except ValueError:
            continue

    return np.nan

def extract_opening_date(text):
    months = ['Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec']
    patterns = "(" + ")|(".join(months) + ")"
    full_patterns = f"(.*)(?={patterns})"
    return datetime.datetime.strptime(re.sub(full_patterns, "", text), '%b %d, %Y')

def extract_release_date(text):
    text = re.sub(" \(.*?\)", "", text)
    for time_format in ('%B %d, %Y', '%Y', '%b %d, %Y'):
        try:
            return datetime.datetime.strptime(text, time_format)
        except ValueError:
            continue

    return pd.NaT

def extract_revenue_usa_opening(text):
    months = ['Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec']
    patterns = "(" + ")|(".join(months) + ")"
    full_patterns = f"(.*)(?={patterns})"
    new_text = re.sub("[\$,]", "", re.sub(re.sub(full_patterns, "", text), "", text))
    if new_text.isnumeric():
        return int(new_text)
    else:
        return np.nan

def get_rating_count(rating_url):
    soup = get_soup(rating_url)
    
    text = soup.select_one("div.allText > div.allText").text.split("\n")[1].strip()
    text_num = "".join(re.findall("[0-9]+", text))
    return text_num
    
def get_review_count(user_review_url):
    soup = get_soup(user_review_url)
    
    text = soup.select("div.header > div > span")[0].text
    text_num = "".join(re.findall("[0-9]+", text))
    return text_num.replace(",", "").strip()

def mapping_key(label):
    res = []

    if label in ("Director", "Directors"):
        res = ["directors"]
    elif label in ("Writer", "Writers"):
        res = ["writers"]
    elif label in ("Star", "Stars"):
        res = ["stars"]
    elif label in ("Genre", "Genres"):
        res = ["genres"]
    elif label in ("Country of origin", "Countries of origin"):
        res = ["country"]
    elif label in ("Language", "Languages"):
        res = ["language"]
    elif label == "Budget":
        res = ["budget"]
    elif label == "Gross US & Canada":
        res = ["revenue_usa"]
    elif label == "Opening weekend US & Canada":
            res = ["revenue_usa_opening", "opening_date"]
    elif label == "Gross worldwide":
        res = ["revenue_world"]
    elif label == "Runtime":
        res = ["runtime"]
    elif label == "Release date":
        res = ["release_date"]

    return res

def extract_list_item(label, content):

    res = None, None

    if label in ("Director", "Directors"):
        res = "directors", re.sub("\(.*?\)", ",", content).strip(",").split(",")
    elif label in ("Writer", "Writers"):
        res = "writers", re.sub("\(.*?\)", ",", content).strip(",").split(",")
    elif label in ("Star", "Stars"):
        res = "stars", re.sub("\(.*?\)", ",", content).strip(",").split(",")
    elif label in ("Genre", "Genres"):
        res = "genres", content
    elif label in ("Country of origin", "Countries of origin"):
        res = "country", content  
    elif label in ("Language", "Languages"):
        res = "language", content    
    elif label == "Budget":
        text = re.sub("(\(.*?\))|(\$)|(,)", "", content).strip()
        if text.isnumeric():
            res = "budget", int(text)
    elif label == "Gross US & Canada":
        text = re.sub("(\(.*?\))|(\$)|(,)", "", content).strip()
        if text.isnumeric():
            res = "revenue_usa", int(text)
    elif label == "Opening weekend US & Canada":
        if "$" in content:
            res = "revenue_usa_opening", extract_revenue_usa_opening(content)
        else:
            res = "opening_date", extract_opening_date(content)
    elif label == "Gross worldwide":
        text = re.sub("(\(.*?\))|(\$)|(,)", "", content).strip()
        if text.isnumeric():
            res = "revenue_world", int(text)
    elif label == "Runtime":
        res = "runtime", extract_runtime(content)
    elif label == "Release date":
        res = "release_date", extract_release_date(content)

    return res

def extract_list(label, inner_content):
    
    res = None, None

    if len(inner_content) == 1:
        res = extract_list_item(label, inner_content[0])
    else:
        label_key = None
        content_list = []
        for content in inner_content:
            label_key, content_value = extract_list_item(label, content)
            if type(content_value) != list:
                content_list.append(content_value)
            else:
                content_list.extend(content_value)
                
        res = label_key, content_list

    return res

def extract_crew_url(label, url_http):
    if url_http:
        crew_urls = [k['href'] for k in url_http]

        if label in ("Director", "Directors"):
            res = "directors_url", crew_urls
        elif label in ("Writer", "Writers"):
            res = "writers_url", crew_urls
        else:
            res = None, []

        return res
    else:
        return None, []

class MovieCrawler():
    
    def __init__(self, path_url):

        self.movie = dict(
            name = None,
            popularity = None,
            rating = None,
            rating_url = None,
            rating_count = None,
            user_review_count = None,
            critic_review_count = None,
            # metascore = None,
            directors = [],
            directors_url = [],
            writers = [],
            writers_url = [],
            top_cast = [],
            top_cast_url = [],
            stars = [],
            genres = [],
            country = [],
            language = [],
            budget = None,
            revenue_usa = None,
            revenue_usa_opening = None,
            revenue_world = None,
            runtime = None,
            opening_date = None,
            release_date = None,
        )

        try:
            self.soup = get_soup(path_url=path_url)
        except Exception as e:
            self.soup = None
            print(repr(e))

    def extract_info(self, c, label_http, url_http):
        if label_http:
            label = label_http[0].text
            all_keys = mapping_key(label)

            if len(all_keys) == 0:
                return
            elif len(all_keys) == 1:
                if  not (self.movie[all_keys[0]] is None or len(self.movie[all_keys[0]]) == 0):
                    return

            else:
                check = any([self.movie[ak] is None or len(self.movie[ak]) == 0 for ak in all_keys])
                if not check:
                    return
            
            if label in ("Director", "Directors", "Writer", "Writers"):
                url_key, url_val = extract_crew_url(label, url_http)
                # print(url_key, url_val)
                if url_key in ('directors_url','writers_url'):
                    if len(self.movie[url_key]) == 0:
                        self.movie[url_key] = url_val

            # content = c.text.replace(label, "")
            http = c.select("a.ipc-metadata-list-item__list-content-item.ipc-metadata-list-item__list-content-item--link")
            inner_content = [s.text for s in http]
            # print(label, inner_content)
            key, value = extract_list(label, inner_content)
            if key is not None:
                if (self.movie[key] is None or len(self.movie[key]) == 0):
                    self.movie[key] = value
                    return
            else:
                http = c.select("span.ipc-metadata-list-item__list-content-item")
                if len(http) != 0:
                    inner_content = [s.text for s in http]

                    for content in inner_content:
                        # print(content)
                        key, value = extract_list_item(label, content)
                        if key is not None and (self.movie[key] is None or len(self.movie[key]) == 0):
                            self.movie[key] = value
                else:
                    http = c.select_one("div.ipc-metadata-list-item__content-container")
                    if http is not None:
                        content = http.text
                        # print(content)
                        key, value = extract_list_item(label, content)
                        if key is not None and (self.movie[key] is None or len(self.movie[key]) == 0):
                            self.movie[key] = value

    def get_movie_info(self):
        if self.soup is None:
            return self.movie
            
        try:
            self.movie['name'] = self.soup.select_one("h1.sc-b73cd867-0.eKrKux").text
        except AttributeError:
            pass
        
        try:
            self.movie['popularity'] = self.soup.select_one("div.sc-edc76a2-1.gopMqI").text
        except AttributeError:
            pass

        self.movie['top_cast'] = [actor.text for actor in self.soup.select("a.sc-11eed019-1.jFeBIw")]
        self.movie['top_cast_url'] = [actor['href'] for actor in self.soup.select("a.sc-11eed019-1.jFeBIw")]
        rating_http = self.soup.select("a.ipc-button ipc-button--single-padding ipc-button--center-align-content ipc-button--default-height ipc-button--core-baseAlt ipc-button--theme-baseAlt ipc-button--on-textPrimary ipc-text-button sc-f6306ea-2 dfHGIi".replace(" ", "."))

        rating = rating_http[0].select("span.sc-7ab21ed2-1.jGRxWM")[0].text
        try:
            self.movie['rating'] = float(rating)
        except ValueError:
            pass

        self.movie['rating_url'] = rating_http[0]['href']

        # rating_count = get_rating_count(rating_http[0]['href'])
        # if rating_count.isnumeric():
        #     self.movie['rating_count'] = int(rating_count)

        user_review_url = self.soup.select("a.ipc-link ipc-link--baseAlt ipc-link--touch-target sc-124be030-2 eshTwQ isReview".replace(" ", "."))[0]['href']
        user_review_count = get_review_count(user_review_url)
        if user_review_count.isnumeric():
            self.movie['user_review_count'] = int(user_review_count)

        critic_review_count = self.soup.select("a.ipc-link ipc-link--baseAlt ipc-link--touch-target sc-124be030-2 eshTwQ isReview".replace(" ", "."))[1].text.replace(",","").replace("Critic reviews", "")
        if critic_review_count.isnumeric():
            self.movie['critic_review_count'] = int(critic_review_count)
        
        # metascore = self.soup.select("a.ipc-link ipc-link--baseAlt ipc-link--touch-target sc-124be030-2 eshTwQ isReview".replace(" ", "."))[2].text.replace(",","").replace("Metascore", "")
        # if metascore.isnumeric():
        #     self.movie['metascore'] = int(metascore)

        http1 = self.soup.select("li.ipc-metadata-list__item")
        http2 = self.soup.select("li.ipc-metadata-list__item.ipc-metadata-list-item--link")

        for c in http1:
            label_http = c.select("span.ipc-metadata-list-item__label")
            url_http = c.select("a.ipc-metadata-list-item__list-content-item.ipc-metadata-list-item__list-content-item--link")
            self.extract_info(c, label_http, url_http)

        for c in http2:
            label_http = c.select("a.ipc-metadata-list-item__label.ipc-metadata-list-item__label--link")
            url_http = c.select("a.ipc-metadata-list-item__list-content-item.ipc-metadata-list-item__list-content-item--link")
            self.extract_info(c, label_http, url_http)

        return self.movie

In [5]:
# movie_path = "/title/tt0118849/"
# movie_url = base_url + movie_path
# headers = {'User-Agent': 'Mozilla/5.0'}
# res = requests.get(movie_url, headers)
# soup = BeautifulSoup(res.text, 'html.parser')

# http2 = soup.select("li.ipc-metadata-list__item.ipc-metadata-list-item--link")

# for c in http2:
#     label_http = c.select("a.ipc-metadata-list-item__label.ipc-metadata-list-item__label--link")
#     if label_http:
#         label = label_http[0].text
#         if label == "Release date":
#             http = c.select("a.ipc-metadata-list-item__list-content-item.ipc-metadata-list-item__list-content-item--link")
#             inner_content = [s.text for s in http]
#             print(inner_content)
    

In [6]:
# crawler = MovieCrawler("/title/tt0118849/")
# crawler.get_movie_info()

In [7]:
def crawl_movie_data(crawler, status, movie_name, movie_rating_count, movie_index, movie_url):
    if status:
        movie_dict = crawler.get_movie_info()
    else:
        movie_dict = crawler.movie

    if movie_dict['name'] is None:
        movie_dict['name'] = movie_name
    if movie_dict['rating_count'] is None:
        if movie_rating_count.isnumeric():
            movie_dict['rating_count'] = movie_rating_count
    movie_dict['id'] = movie_index
    movie_dict['url'] = movie_url

    return movie_dict

top250_url = "/chart/top"
top250_soup = get_soup(top250_url)
urls = [element.attrs.get('href') for element in top250_soup.select('td.titleColumn a')]
movie_names = top250_soup.select('td.titleColumn')
movie_rating_counts = [element.attrs.get('data-value') for element in top250_soup.select('td.posterColumn span[name=nv]')]

movie_list = []
for index, url in tqdm(enumerate(urls), total = len(urls)):
    crawler = MovieCrawler(url)
    # time.sleep(1)
    try:
        movie_dict = crawl_movie_data(crawler, True, movie_names[index], movie_rating_counts[index], index, url)
        movie_list.append(movie_dict)
    except Exception as e:
        movie_dict = crawl_movie_data(crawler, False, movie_names[index], movie_rating_counts[index], index, url)
        movie_list.append(movie_dict)
        print(repr(e))
    # if index == 10:
    #     break

 23%|██▎       | 57/250 [07:35<2:22:31, 44.31s/it]

ConnectionError(MaxRetryError("HTTPSConnectionPool(host='www.imdb.com', port=443): Max retries exceeded with url: //title/tt1853728/reviews?ref_=tt_ov_rt&User-Agent=Thunder+Client+%28https%3A%2F%2Fwww.thunderclient.com%29 (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x7f017ab609d0>: Failed to establish a new connection: [Errno 110] Connection timed out'))"))


 59%|█████▉    | 148/250 [18:15<1:13:47, 43.41s/it]

ConnectionError(MaxRetryError("HTTPSConnectionPool(host='www.imdb.com', port=443): Max retries exceeded with url: //title/tt0057115/?User-Agent=Thunder+Client+%28https%3A%2F%2Fwww.thunderclient.com%29 (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x7f018040ee20>: Failed to establish a new connection: [Errno 110] Connection timed out'))"))


 66%|██████▌   | 165/250 [22:00<1:02:40, 44.24s/it]

ConnectionError(MaxRetryError("HTTPSConnectionPool(host='www.imdb.com', port=443): Max retries exceeded with url: //title/tt0347149/?User-Agent=Thunder+Client+%28https%3A%2F%2Fwww.thunderclient.com%29 (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x7f017b729550>: Failed to establish a new connection: [Errno 110] Connection timed out'))"))


 79%|███████▉  | 197/250 [27:07<38:59, 44.14s/it]  

ConnectionError(MaxRetryError("HTTPSConnectionPool(host='www.imdb.com', port=443): Max retries exceeded with url: //title/tt2119532/reviews?ref_=tt_ov_rt&User-Agent=Thunder+Client+%28https%3A%2F%2Fwww.thunderclient.com%29 (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x7f017bb71160>: Failed to establish a new connection: [Errno 110] Connection timed out'))"))


 84%|████████▍ | 210/250 [30:27<29:01, 43.54s/it]

ConnectionError(MaxRetryError("HTTPSConnectionPool(host='www.imdb.com', port=443): Max retries exceeded with url: //title/tt0097165/?User-Agent=Thunder+Client+%28https%3A%2F%2Fwww.thunderclient.com%29 (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x7f01803c5ac0>: Failed to establish a new connection: [Errno 110] Connection timed out'))"))


 97%|█████████▋| 243/250 [35:39<05:06, 43.76s/it]

ConnectionError(MaxRetryError("HTTPSConnectionPool(host='www.imdb.com', port=443): Max retries exceeded with url: //title/tt0025316/?User-Agent=Thunder+Client+%28https%3A%2F%2Fwww.thunderclient.com%29 (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x7f017b8746a0>: Failed to establish a new connection: [Errno 110] Connection timed out'))"))


100%|██████████| 250/250 [36:18<00:00,  8.71s/it]


In [8]:
file_name = './cache/raw_movie_data.json'
with open(file_name, 'w+') as f:
    f.write(json.dumps(movie_list, default=str))

In [9]:
df = pd.DataFrame.from_dict(movie_list)

In [10]:
writers_df = pd.DataFrame({
    "name": [item for sublist in list(df['writers']) for item in sublist], 
    "url": [item for sublist in list(df['writers_url']) for item in sublist]
})
writers_df['url'] = writers_df['url'].str.replace("(?<=\?)(.*)|(\?)", "", regex=True)
writers_df.drop_duplicates(inplace=True)
writers_df.set_index('url', inplace=True)

directors_df = pd.DataFrame({
    "name": [item for sublist in list(df['directors']) for item in sublist], 
    "url": [item for sublist in list(df['directors_url']) for item in sublist]
})
directors_df['url'] = directors_df['url'].str.replace("(?<=\?)(.*)|(\?)", "", regex=True)
directors_df.drop_duplicates(inplace=True)
directors_df.set_index('url', inplace=True)

actors_df = pd.DataFrame({
    "name": [item for sublist in list(df['top_cast']) for item in sublist], 
    "url": [item for sublist in list(df['top_cast_url']) for item in sublist]
})
actors_df['url'] = actors_df['url'].str.replace("(?<=\?)(.*)|(\?)", "", regex=True)
actors_df.drop_duplicates(inplace=True)
actors_df.set_index('url', inplace=True)


In [11]:
def get_stars_url(top_cast, top_cast_url, stars):
    return [v for k,v in zip(top_cast, top_cast_url) if k in stars]

In [12]:
df['stars_url'] = df.apply(lambda x: get_stars_url(x.top_cast, x.top_cast_url, x.stars), axis=1)

In [13]:
top_cast_df = df[['id', 'top_cast_url']].explode(['top_cast_url'])
top_cast_df.columns = ['movie_id', 'actor_url']
stars_df = df[['id', 'stars_url']].explode(['stars_url'])
stars_df.columns = ['movie_id', 'actor_url']
stars_df['is_star'] = True
actor_movie_df = top_cast_df.merge(stars_df, how='left', on=['movie_id', 'actor_url'])
actor_movie_df['is_star'] = actor_movie_df['is_star'].fillna(False)
director_movie_df = df[['id', 'directors_url']].explode(['directors_url'])
writers_movie_df = df[['id', 'writers_url']].explode(['writers_url'])
genres_movie_df = df[['id', 'genres']].explode(['genres'])
country_movie_df = df[['id', 'country']].explode(['country'])
language_movie_df = df[['id', 'language']].explode(['language'])

In [14]:
df = df[['id','url','name','popularity','rating','rating_url','rating_count','user_review_count','critic_review_count','budget','revenue_usa','revenue_usa_opening','revenue_world','runtime','opening_date','release_date']]

In [15]:
df[df['release_date'].isnull()]

Unnamed: 0,id,url,name,popularity,rating,rating_url,rating_count,user_review_count,critic_review_count,budget,revenue_usa,revenue_usa_opening,revenue_world,runtime,opening_date,release_date
56,56,/title/tt1853728/,Hành Trình Django,178.0,8.4,/title/tt1853728/ratings/?ref_=tt_ov_rt,,,,,,,,,NaT,NaT
147,147,/title/tt0057115/,"[\n 148.\n , [Cuộc Đào Thoát Vĩ Đại]...",,,,240548.0,,,,,,,,NaT,NaT
164,164,/title/tt0347149/,"[\n 165.\n , [Lâu Đài Di Động Của Ho...",,,,378601.0,,,,,,,,NaT,NaT
196,196,/title/tt2119532/,Người Hùng Không Súng,239.0,8.1,/title/tt2119532/ratings/?ref_=tt_ov_rt,,,,,,,,,NaT,NaT
209,209,/title/tt0097165/,"[\n 210.\n , [Dead Poets Society], \...",,,,471368.0,,,,,,,,NaT,NaT
242,242,/title/tt0025316/,"[\n 243.\n , [Chuyện Xảy Ra Trong Đê...",,,,102143.0,,,,,,,,NaT,NaT


In [16]:
writers_df.to_csv('./cache/writers.csv', index=True)
directors_df.to_csv('./cache/directors.csv', index=True)
actors_df.to_csv('./cache/actors.csv', index=True)
actor_movie_df.to_csv('./cache/actor_movie.csv', index=False)
director_movie_df.to_csv('./cache/director_movie.csv', index=False)
writers_movie_df.to_csv('./cache/writers_movie.csv', index=False)
genres_movie_df.to_csv('./cache/genres_movie.csv', index=False)
country_movie_df.to_csv('./cache/country_movie.csv', index=False)
language_movie_df.to_csv('./cache/language_movie.csv', index=False)
df.to_csv('./cache/movies.csv', index=False)

In [18]:
x = "123,1231321"

In [19]:
float(x)

ValueError: could not convert string to float: '123,1231321'