# TL;DR
### Part 1: Crawl Movies
### Part 2: Crawl Additional Information for Movies
### Part 3: Crawl Reviews for Movies
### Part 4: Others
#### - Merge all three Movies related dataset crawled
#### - Extract persons (director, writer, and star) from movies and save as standalone dataset
#### - Extract users from reviews and save as standalone dataset
#### - Data too huge, generate sample for submission

## In summary, 4 csv files are generated:
### - movies.csv
### - review.csv
### - persons.csv
### - users.csv

### dataframe info scroll to the bottom.


In [1]:
import os
import re
import time
import json
import requests
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup

In [2]:
genres = [
    'action', 'adventure', 'animation', 'biography', 'comedy', 'crime', 'drama', 
    'family', 'fantasy', 'film-noir', 'history', 'horror', 'music', 'musical', 
    'mystery', 'romance', 'sci-fi', 'sport', 'thriller', 'war', 'western'
]         

In [3]:
host = 'https://www.imdb.com'
root_url = host + '/search/title/?languages=en&title_type=feature&genres={}&start={}'


In [4]:
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X '
                         '10_14_3) AppleWebKit/537.36 (KHTML, like Gecko) '
                         'Chrome/72.0.3626.109 Safari/537.36'}


# Part 1: crawl movie (general information) from search list by genre

In [5]:
# in case crawling failed by half and need to restart somewhere for a genre 
# note: this will only apply to the first genre in 'genres'
#       remove the genre which has done crawling from 'genres' first
jumpstart = 0

# for each genre
for genre in genres:

    # page (of search list) to start crawling
    start = 1
    
    # in case need to crawl from a particular page (applicable only for the first genre in loop)
    if jumpstart > 1:
        start = jumpstart
        jumpstart = 0
        
    has_next_page = False
    
    # while list is not empty
    while has_next_page:
        
        # open the current page
        genre_url = root_url.format(genre, start)
        genre_bs4 = BeautifulSoup(requests.get(genre_url, headers=headers).text, 'html.parser')
        
        # find if there is next page
        next_page_link = genre_bs4.find_all('a', class_='lister-page-next next-page')
        
        # if there is next page
        if len(next_page_link) > 0:
            
            # get crawl url's parameter for the next page
            next_page_start_arg = next_page_link[0].get('href').split('start=')
            
            if len(next_page_start_arg) > 1:
                start = next_page_start_arg[1].split('&')[0]
            else:
                has_next_page = False
        else:
            has_next_page = False
        
        # crawl movies listed in the current list (page)
        for movie_div in genre_bs4.find_all('div', class_='lister-item-content'):

            # define the empty row with fields expected to be filled up by data from html
            movie = {
                "name": "", "url": "", "id": "", "year": "", "certificate": "", 
                "runtime": "", "genre": "", "rating": "", "metascore": ""
            }
            
            # extracted information from html tags
            movie_header = movie_div.select('.lister-item-header')
            
            movie_link = movie_header[0].select('a')
            movie_year = movie_header[0].select('span.lister-item-year')
            
            movie_certificate = movie_div.find_all('span', class_='certificate')
            movie_runtime = movie_div.find_all('span', class_='runtime')
            movie_genre = movie_div.find_all('span', class_='genre')
            movie_rating = movie_div.find_all('div', class_='ratings-imdb-rating')
            movie_metascore = movie_div.find_all('span', class_='metascore')
            
            # if respected information exists, assign to the row
            if len(movie_link) > 0:
                movie['name'] = movie_link[0].text.strip()
                movie['url'] = movie_link[0].get('href')
                movie['id'] = movie_link[0].get('href').split('/')[2]

            if len(movie_year) > 0:
                movie['year'] = movie_year[0].text.strip()

            if len(movie_certificate) > 0:
                movie['certificate'] = movie_certificate[0].text.strip()

            if len(movie_runtime) > 0:
                movie['runtime'] = movie_runtime[0].text.strip()

            if len(movie_genre) > 0:
                movie['genre'] = movie_genre[0].text.strip().split(', ')

            if len(movie_rating) > 0:
                movie['rating'] = movie_rating[0].get('data-value')

            if len(movie_metascore) > 0:
                movie['metascore'] = movie_metascore[0].text.strip()

            # dump the row into a json file
            with open('data/movies.json', 'a+') as movies_json:
                print(json.dumps(movie), file = movies_json, flush = True)



In [6]:
# read all rows from the crawled json file
movies_json = open('data/movies.json', 'r')
line = movies_json.readline()

movies = list()

while line:
    # read all lines and append each line to the list
    if len(line.strip()) > 0:
        movies.append(json.loads(line))
        
    line = movies_json.readline()
    
# convert the list as dataframe
movies_df = pd.DataFrame(movies)
movies_df.drop_duplicates(['id'], inplace = True)

In [7]:
movies_df

Unnamed: 0,name,url,id,year,certificate,runtime,genre,rating,metascore
0,Tenet,/title/tt6723592/?ref_=adv_li_tt,tt6723592,(2020),PG-13,,"[Action, Sci-Fi, Thriller]",,
1,The Last Days of American Crime,/title/tt1552211/?ref_=adv_li_tt,tt1552211,(2020),,148 min,"[Action, Crime, Thriller]",3.6,15
2,The Hunt,/title/tt8244784/?ref_=adv_li_tt,tt8244784,(II) (2020),R,90 min,"[Action, Horror, Thriller]",6.4,50
3,The Gentlemen,/title/tt8367814/?ref_=adv_li_tt,tt8367814,(2019),R,113 min,"[Action, Comedy, Crime]",7.9,51
4,Avengers: Endgame,/title/tt4154796/?ref_=adv_li_tt,tt4154796,(2019),P13,181 min,"[Action, Adventure, Drama]",8.4,78
...,...,...,...,...,...,...,...,...,...
137268,Three Who Paid,/title/tt0013673/?ref_=adv_li_tt,tt0013673,(1923),,50 min,[Western],,
137269,Huntin' Trouble,/title/tt0135464/?ref_=adv_li_tt,tt0135464,(1924),,,[Western],,
137270,A Fight for Honor,/title/tt0014888/?ref_=adv_li_tt,tt0014888,(1924),,50 min,"[Action, Adventure, Drama]",5.1,
137271,The Phantom Rider,/title/tt0020266/?ref_=adv_li_tt,tt0020266,(1929),,48 min,"[Action, Adventure, Romance]",,


In [8]:
# add additional columns
movies_df['review_url'] = movies_df['url'].str.replace('?ref_=adv_li_tt', 'reviews', regex = False)
movies_df['year'] = movies_df['year'].str.extract('(\d+)').fillna("0").astype(int)
movies_df.to_csv('data/movies_general_full.csv')
movies_df

Unnamed: 0,name,url,id,year,certificate,runtime,genre,rating,metascore,review_url
0,Tenet,/title/tt6723592/?ref_=adv_li_tt,tt6723592,2020,PG-13,,"[Action, Sci-Fi, Thriller]",,,/title/tt6723592/reviews
1,The Last Days of American Crime,/title/tt1552211/?ref_=adv_li_tt,tt1552211,2020,,148 min,"[Action, Crime, Thriller]",3.6,15,/title/tt1552211/reviews
2,The Hunt,/title/tt8244784/?ref_=adv_li_tt,tt8244784,2020,R,90 min,"[Action, Horror, Thriller]",6.4,50,/title/tt8244784/reviews
3,The Gentlemen,/title/tt8367814/?ref_=adv_li_tt,tt8367814,2019,R,113 min,"[Action, Comedy, Crime]",7.9,51,/title/tt8367814/reviews
4,Avengers: Endgame,/title/tt4154796/?ref_=adv_li_tt,tt4154796,2019,P13,181 min,"[Action, Adventure, Drama]",8.4,78,/title/tt4154796/reviews
...,...,...,...,...,...,...,...,...,...,...
137268,Three Who Paid,/title/tt0013673/?ref_=adv_li_tt,tt0013673,1923,,50 min,[Western],,,/title/tt0013673/reviews
137269,Huntin' Trouble,/title/tt0135464/?ref_=adv_li_tt,tt0135464,1924,,,[Western],,,/title/tt0135464/reviews
137270,A Fight for Honor,/title/tt0014888/?ref_=adv_li_tt,tt0014888,1924,,50 min,"[Action, Adventure, Drama]",5.1,,/title/tt0014888/reviews
137271,The Phantom Rider,/title/tt0020266/?ref_=adv_li_tt,tt0020266,1929,,48 min,"[Action, Adventure, Romance]",,,/title/tt0020266/reviews


In [9]:
# reducing the data range by selecting only movies from 2019 for the exam purpose
selected_movies = movies_df[movies_df['year'] >= 2019]
selected_movies.to_csv('data/movies_general.csv')
selected_movies

Unnamed: 0,name,url,id,year,certificate,runtime,genre,rating,metascore,review_url
0,Tenet,/title/tt6723592/?ref_=adv_li_tt,tt6723592,2020,PG-13,,"[Action, Sci-Fi, Thriller]",,,/title/tt6723592/reviews
1,The Last Days of American Crime,/title/tt1552211/?ref_=adv_li_tt,tt1552211,2020,,148 min,"[Action, Crime, Thriller]",3.6,15,/title/tt1552211/reviews
2,The Hunt,/title/tt8244784/?ref_=adv_li_tt,tt8244784,2020,R,90 min,"[Action, Horror, Thriller]",6.4,50,/title/tt8244784/reviews
3,The Gentlemen,/title/tt8367814/?ref_=adv_li_tt,tt8367814,2019,R,113 min,"[Action, Comedy, Crime]",7.9,51,/title/tt8367814/reviews
4,Avengers: Endgame,/title/tt4154796/?ref_=adv_li_tt,tt4154796,2019,P13,181 min,"[Action, Adventure, Drama]",8.4,78,/title/tt4154796/reviews
...,...,...,...,...,...,...,...,...,...,...
136110,Trespass,/title/tt8359690/?ref_=adv_li_tt,tt8359690,2020,,,[Western],,,/title/tt8359690/reviews
136126,Six Guns for Hire,/title/tt11540858/?ref_=adv_li_tt,tt11540858,2020,,,[Western],,,/title/tt11540858/reviews
136356,Turn and Burn,/title/tt11281290/?ref_=adv_li_tt,tt11281290,2021,,,"[Drama, Romance, Western]",,,/title/tt11281290/reviews
136402,Ruthless Reprisal,/title/tt11512180/?ref_=adv_li_tt,tt11512180,2020,,85 min,[Western],,,/title/tt11512180/reviews


# Part 2: crawl other details & box office info of each movie

In [10]:
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

# list of movies which has been successfully crawled with other information
details_done_list = pd.read_csv('data/details_done.csv', names=['movie_id'])

# some issue with crawling process, needs to handle retries
retry_strategy = Retry(
    total = 10,
    status_forcelist = [429, 500, 502, 503, 504],
    method_whitelist = ["HEAD", "GET", "OPTIONS"],
    backoff_factor = 1
)
    
adapter = HTTPAdapter(max_retries=retry_strategy)
http = requests.Session()
http.mount("https://", adapter)
http.mount("http://", adapter)

# for each selected movie
for movie in selected_movies.to_dict('record'):
    
    # if other information of the movie has been successfully crawled before, skip
    if movie['id'] in list(details_done_list['movie_id']):
        print(movie['id'], 'skipped')
        continue
    
    movie_id = movie['id']
    url = movie['url']

    url = host + url
    print(url)
    
    # define empty row for movie's detailed information and movie's box office information
    details = {
        "movie_id": movie_id, "critic_count": "", "review_count": "", 
        "directors": list(), "writers": list(), "stars": list(), "awards": list(), 
    }
    
    box_office = {
        "movie_id": movie_id, "budget": "", "us_opening": "", "us_gross": "", "worldwide_gross": ""
    }
    
    # extracted information from html tags
    # and if respected information exists, assign to the row
    details_bs4 = BeautifulSoup(http.get(url, headers=headers).text, 'html.parser')
    
    for element in details_bs4.find_all("div", class_ = "credit_summary_item"):
        role = element.find_all('h4')[0].text.rstrip(':').rstrip('s').lower()
        links = element.find_all('a')
        for link in links:
            if role + 's' in ["directors", "writers", "stars"]:
                details[role + 's'].append({
                    link.text: link.get('href')
                })
            else:
                print('unknown roles', role, link.text, link.get('href'))
    
    review_div = details_bs4.find_all('div', class_='titleReviewbarItemBorder')
    
    if len(review_div) > 0:
        element_div = review_div[0].find_all('span', class_='subText')
        for element in element_div:
            for link in element.find_all('a'):
                if link.get('href').startswith('reviews'):
                    details['review_count'] = link.text
                if link.get('href').startswith('externalreviews'):
                    details['critic_count'] = link.text
    
    awards_div = details_bs4.find_all('span', class_='awards-blurb')
    
    if len(awards_div) > 0:
        for award in awards_div:
            details['awards'].append(award.text.strip())
    
    
    info_div = details_bs4.find_all('div', class_='txt-block')
    
    box_office_map = {
        'Budget:': 'budget', 
        'Opening Weekend USA:': 'us_opening', 
        'Gross USA:': 'us_gross', 
        'Cumulative Worldwide Gross:': 'worldwide_gross'
    }
    
    if len(info_div) > 0:
        for info in info_div:
            info_title_div = info.find_all('h4', class_='inline')
            
            for info_title in info_title_div:
                if info_title.text.strip() in box_office_map:
                    box_office[box_office_map[info_title.text.strip()]] = info.text.strip()
    
    # dump the movie's detailed information into a json file
    with open('data/movies_details.json', 'a+') as details_json:
        print(json.dumps(details), file = details_json, flush = True)
        
    # dump the movie's box office information into a json file
    with open('data/movies_box_office.json', 'a+') as box_office_json:
        print(json.dumps(box_office), file = box_office_json, flush = True)

    # add the movie as successfully crawled
    with open('data/details_done.csv', 'a+') as details_done:
        print(movie['id'], file = details_done, flush = True)
    
    

tt6723592 skipped
tt1552211 skipped
tt8244784 skipped
tt8367814 skipped
tt4154796 skipped
tt10314450 skipped
tt2382320 skipped
tt8936646 skipped
tt2527338 skipped
tt7713068 skipped
tt4566758 skipped
tt1502397 skipped
tt7126948 skipped
tt12361974 skipped
tt1025100 skipped
tt1877830 skipped
tt7975244 skipped
tt1745960 skipped
tt6048922 skipped
tt1950186 skipped
tt7439064 skipped
tt1634106 skipped
tt3794354 skipped
tt6902676 skipped
tt10308928 skipped
tt10838180 skipped
tt5033998 skipped
tt3480822 skipped
tt6806448 skipped
tt0437086 skipped
tt8851668 skipped
tt5774060 skipped
tt7556122 skipped
tt6320628 skipped
tt5822564 skipped
tt6475714 skipped
tt6146586 skipped
tt5433138 skipped
tt6924650 skipped
tt4154664 skipped
tt6450804 skipped
tt8106534 skipped
tt6264654 skipped
tt8041270 skipped
tt6334354 skipped
tt8688634 skipped
tt5034838 skipped
tt0448115 skipped
tt8629748 skipped
tt6189022 skipped
tt6856242 skipped
tt6565702 skipped
tt1560220 skipped
tt9032400 skipped
tt7456310 skipped
tt1206

tt11882032 skipped
tt3554874 skipped
tt5654204 skipped
tt11890562 skipped
tt3380766 skipped
tt11369420 skipped
tt8272004 skipped
tt12226660 skipped
tt7352264 skipped
tt7914022 skipped
tt3291788 skipped
tt12326812 skipped
tt4741504 skipped
tt6607258 skipped
tt1590031 skipped
tt5133062 skipped
tt6075690 skipped
tt10223850 skipped
tt11128528 skipped
tt9901162 skipped
tt9674196 skipped
tt8442346 skipped
tt5880374 skipped
tt10097958 skipped
tt9910302 skipped
tt11713972 skipped
tt7908426 skipped
tt8385184 skipped
tt6969338 skipped
tt12154030 skipped
tt3968196 skipped
tt10832284 skipped
tt3231352 skipped
tt9893652 skipped
tt10156586 skipped
tt10141026 skipped
tt10064536 skipped
tt7725504 skipped
tt12009802 skipped
tt10712828 skipped
tt10432350 skipped
tt7682204 skipped
tt10540120 skipped
tt8097016 skipped
tt3518152 skipped
tt5905328 skipped
tt11748280 skipped
tt9481630 skipped
tt9635760 skipped
tt10741412 skipped
tt8183762 skipped
tt10645852 skipped
tt11232322 skipped
tt12204316 skipped
tt877

tt9412786 skipped
tt10188156 skipped
tt10613866 skipped
tt12275108 skipped
tt11127324 skipped
tt11202264 skipped
tt12082990 skipped
tt12035576 skipped
tt9614796 skipped
tt10747476 skipped
tt11127358 skipped
tt9811798 skipped
tt5996648 skipped
tt9724056 skipped
tt9468936 skipped
tt10187780 skipped
tt11212530 skipped
tt10837374 skipped
tt10427070 skipped
tt11827400 skipped
tt11088104 skipped
tt11212672 skipped
tt12358176 skipped
tt11197712 skipped
tt11702612 skipped
tt9724076 skipped
tt10558280 skipped
tt12103846 skipped
tt11212484 skipped
tt11737594 skipped
tt12258280 skipped
tt9240352 skipped
tt9724092 skipped
tt8807482 skipped
tt11023504 skipped
tt11481502 skipped
tt4766078 skipped
tt11738338 skipped
tt11626142 skipped
tt11212744 skipped
tt11760836 skipped
tt9368024 skipped
tt9306654 skipped
tt11580360 skipped
tt12391868 skipped
tt12375546 skipped
tt12162552 skipped
tt6298602 skipped
tt1427931 skipped
tt12103836 skipped
tt12422086 skipped
tt9576232 skipped
tt11127340 skipped
tt1241181

tt10560862 skipped
tt10642834 skipped
tt5372528 skipped
tt7749142 skipped
tt6193470 skipped
tt2400447 skipped
tt9198724 skipped
tt6843538 skipped
tt10661180 skipped
tt9140598 skipped
tt8854392 skipped
tt4310022 skipped
tt5979556 skipped
tt4300754 skipped
tt11426538 skipped
tt8865546 skipped
tt8157074 skipped
tt10756160 skipped
tt9112712 skipped
tt7302054 skipped
tt3699702 skipped
tt10547906 skipped
tt10309552 skipped
tt9686350 skipped
tt8226892 skipped
tt5117326 skipped
tt10958102 skipped
tt9737688 skipped
tt8850932 skipped
tt10925838 skipped
tt6411748 skipped
tt3602422 skipped
tt9068272 skipped
tt7315570 skipped
tt9193890 skipped
tt6660238 skipped
tt5174992 skipped
tt8964214 skipped
tt5891062 skipped
tt10740928 skipped
tt8322502 skipped
tt10260692 skipped
tt8883472 skipped
tt7919180 skipped
tt8663446 skipped
tt8485548 skipped
tt10384514 skipped
tt9613316 skipped
tt6385768 skipped
tt8483220 skipped
tt7708376 skipped
tt10307856 skipped
tt10091530 skipped
tt3108448 skipped
tt8737608 skip

tt8649186 skipped
tt5577494 skipped
tt2717682 skipped
tt10640346 skipped
tt5598292 skipped
tt2197936 skipped
tt10937004 skipped
tt7395114 skipped
tt4585910 skipped
tt12326830 skipped
tt5952594 skipped
tt6920356 skipped
tt4622682 skipped
tt9419834 skipped
tt6198946 skipped
tt7557108 skipped
tt9130508 skipped
tt7983894 skipped
tt5769790 skipped
tt6012380 skipped
tt6744360 skipped
tt8323120 skipped
tt1657517 skipped
tt10370380 skipped
tt8526038 skipped
tt8767908 skipped
tt8710596 skipped
tt5843876 skipped
tt9411866 skipped
tt6878306 skipped
tt10095582 skipped
tt7305198 skipped
tt2420124 skipped
tt7313348 skipped
tt9026524 skipped
tt9261218 skipped
tt8123646 skipped
tt6423362 skipped
tt7737734 skipped
tt5919756 skipped
tt6910006 skipped
tt8635092 skipped
tt7939766 skipped
tt10199586 skipped
tt5439812 skipped
tt3344686 skipped
tt9143112 skipped
tt6217926 skipped
tt9608818 skipped
tt8186318 skipped
tt6803212 skipped
tt6598238 skipped
tt8801584 skipped
tt8506500 skipped
tt9257484 skipped
tt64

tt7888574 skipped
tt4943586 skipped
tt11990446 skipped
tt8995188 skipped
tt11135844 skipped
tt9309528 skipped
tt11416126 skipped
tt11568566 skipped
tt10738022 skipped
tt10319298 skipped
tt11703314 skipped
tt9914522 skipped
tt11444364 skipped
tt12053716 skipped
tt11443788 skipped
tt6556874 skipped
tt12183460 skipped
tt12416848 skipped
tt12043022 skipped
tt10349448 skipped
tt12409978 skipped
tt10836906 skipped
tt7778944 skipped
tt12404028 skipped
tt12169946 skipped
tt12410286 skipped
tt11668348 skipped
tt11152104 skipped
tt10873216 skipped
tt10613176 skipped
tt10125676 skipped
tt9738302 skipped
tt11548052 skipped
tt11724538 skipped
tt8438310 skipped
tt7287896 skipped
tt8844076 skipped
tt11616262 skipped
tt10645882 skipped
tt10925850 skipped
tt11750156 skipped
tt10569714 skipped
tt9364572 skipped
tt10408422 skipped
tt10417634 skipped
tt4328796 skipped
tt10974132 skipped
tt8182512 skipped
tt9046518 skipped
tt10911284 skipped
tt10203960 skipped
tt11963466 skipped
tt10458466 skipped
tt101553

tt8508734 skipped
tt8997134 skipped
tt9161168 skipped
tt10160976 skipped
tt7453138 skipped
tt8398892 skipped
tt10675136 skipped
tt5580688 skipped
tt2562146 skipped
tt7611352 skipped
tt6263490 skipped
tt9612368 skipped
tt5092484 skipped
tt11804152 skipped
tt10695464 skipped
tt7995680 skipped
tt10331140 skipped
tt9605894 skipped
tt12422064 skipped
tt8004664 skipped
tt7449376 skipped
tt11953368 skipped
tt11547496 skipped
tt10843306 skipped
tt10327842 skipped
tt10204512 skipped
tt5603620 skipped
tt4278760 skipped
tt8236214 skipped
tt10126434 skipped
tt9357288 skipped
tt6112524 skipped
tt12304094 skipped
tt7612032 skipped
tt8996254 skipped
tt10674930 skipped
tt10198072 skipped
tt7657364 skipped
tt6669466 skipped
tt11263022 skipped
tt11140488 skipped
tt3612008 skipped
tt3723784 skipped
tt8290478 skipped
tt9449952 skipped
tt10887344 skipped
tt7329642 skipped
tt10771232 skipped
tt5956016 skipped
tt12373798 skipped
tt7050896 skipped
tt8095862 skipped
tt2400305 skipped
tt7445682 skipped
tt119719

tt8828316 skipped
tt10862868 skipped
tt10563232 skipped
tt11885070 skipped
tt3442126 skipped
tt9184760 skipped
tt10182228 skipped
tt11074166 skipped
tt3312160 skipped
tt9570072 skipped
tt8202692 skipped
tt7679334 skipped
tt3392348 skipped
tt7976864 skipped
tt6385574 skipped
tt7620598 skipped
tt11212786 skipped
tt11267088 skipped
tt10305256 skipped
tt10172986 skipped
tt5072510 skipped
tt3651846 skipped
tt11751580 skipped
tt8373010 skipped
tt12433266 skipped
tt11694190 skipped
tt12250680 skipped
tt6736488 skipped
tt11362206 skipped
tt4106264 skipped
tt10746716 skipped
tt10800294 skipped
tt8931870 skipped
tt7053940 skipped
tt10224808 skipped
tt1645033 skipped
tt12037490 skipped
tt10193468 skipped
tt11255486 skipped
tt7738704 skipped
tt9052266 skipped
tt6785752 skipped
tt10739160 skipped
tt10223876 skipped
tt6810618 skipped
tt10539172 skipped
tt3917506 skipped
tt9358088 skipped
tt7569226 skipped
tt6787238 skipped
tt4961164 skipped
tt8840224 skipped
tt5778410 skipped
tt4488836 skipped
tt376

tt10290998 skipped
tt6576114 skipped
tt8425648 skipped
tt5096928 skipped
tt6466598 skipped
tt8549744 skipped
tt6692264 skipped
tt3145630 skipped
tt6076208 skipped
tt10400198 skipped
tt7278052 skipped
tt9189966 skipped
tt7631002 skipped
tt8997082 skipped
tt9137572 skipped
tt11165704 skipped
tt5820846 skipped
tt2378840 skipped
tt8898710 skipped
tt10451906 skipped
tt10368680 skipped
tt5007760 skipped
tt9412088 skipped
tt2278524 skipped
tt11968836 skipped
tt12356374 skipped
tt10631206 skipped
tt8912188 skipped
tt10864584 skipped
tt12180204 skipped
tt7874088 skipped
tt12233874 skipped
tt8985294 skipped
tt4727422 skipped
tt5181618 skipped
tt11074676 skipped
tt9079348 skipped
tt5322692 skipped
tt4355116 skipped
tt11401672 skipped
tt10081728 skipped
tt11469636 skipped
tt9834736 skipped
tt8740540 skipped
tt10087630 skipped
tt9703644 skipped
tt6646956 skipped
tt10946474 skipped
tt10417658 skipped
tt4352354 skipped
tt3496000 skipped
tt10905522 skipped
tt11183702 skipped
tt10935766 skipped
tt11723

tt7391272 skipped
tt8806520 skipped
tt3354720 skipped
tt8918562 skipped
tt11147332 skipped
tt6330436 skipped
tt3403700 skipped
tt10524828 skipped
tt11377474 skipped
tt9648806 skipped
tt12537572 skipped
tt8479232 skipped
tt11238116 skipped
tt2359479 skipped
tt12139214 skipped
tt11695082 skipped
tt11859460 skipped
tt10942166 skipped
tt9600960 skipped
tt11231356 skipped
tt10981092 skipped
tt10178846 skipped
tt11191490 skipped
tt11353248 skipped
tt10426006 skipped
tt10972912 skipped
tt11380046 skipped
tt11735608 skipped
tt11496650 skipped
tt11397250 skipped
tt9691846 skipped
tt12117734 skipped
tt12347376 skipped
tt7705106 skipped
tt11731072 skipped
tt12327734 skipped
tt11452612 skipped
tt11093580 skipped
tt12384786 skipped
tt10431294 skipped
tt10477868 skipped
tt2006294 skipped
tt11247696 skipped
tt9260960 skipped
tt9490498 skipped
tt8980476 skipped
tt10332500 skipped
tt8907592 skipped
tt5073172 skipped
tt12274210 skipped
tt8433920 skipped
tt10152264 skipped
tt12228612 skipped
tt12331436 s

tt10985590 skipped
tt7710486 skipped
tt9866208 skipped
tt5571814 skipped
tt11384790 skipped
tt10201328 skipped
tt10869956 skipped
tt7384402 skipped
tt9596692 skipped
tt3238924 skipped
tt11585506 skipped
tt7063474 skipped
tt11457566 skipped
tt11414766 skipped
tt11138294 skipped
tt8461408 skipped
tt4766456 skipped
tt11210084 skipped
tt11025860 skipped
tt10788058 skipped
tt11981274 skipped
tt11943142 skipped
tt11399330 skipped
tt12065910 skipped
tt11434050 skipped
tt8449562 skipped
tt10253466 skipped
tt9302994 skipped
tt9021234 skipped
tt10880084 skipped
tt10303892 skipped
tt12343138 skipped
tt10049848 skipped
tt9135272 skipped
tt10449052 skipped
tt11609692 skipped
tt8770976 skipped
tt5358056 skipped
tt12364630 skipped
tt8290068 skipped
tt10452224 skipped
tt5226618 skipped
tt10622840 skipped
tt12079822 skipped
tt11474972 skipped
tt11050682 skipped
tt10675042 skipped
tt2300212 skipped
tt4916788 skipped
tt11367378 skipped
tt9857850 skipped
tt9740414 skipped
tt4460424 skipped
tt7802198 skipp

In [11]:
# join all detailed information in json files and save as csv

# define the list for detailed information
details = list()

details_json = open('data/movies_details.json', 'r')
line = details_json.readline()

# for each movie details line
while line:
    if len(line.strip()) > 0:
        # append to the list
        details.append(json.loads(line))
        # print('.', end = '')

    # read next line
    line = details_json.readline()

# write into csv file
details_df = pd.DataFrame(details)
details_df.drop_duplicates(subset = ['movie_id'], inplace = True)
details_df.to_csv('data/movies_details.csv')

In [12]:
# join all box office information in json files and save as csv

# define the list for box office information
box_office = list()

box_office_json = open('data/movies_box_office.json', 'r')
line = box_office_json.readline()

# for each movie box office line
while line:
    if len(line.strip()) > 0:
        # append to the list
        box_office.append(json.loads(line))
        # print('.', end = '')

    # read next line
    line = box_office_json.readline()

# write into csv file
box_office_df = pd.DataFrame(box_office)
box_office_df.drop_duplicates(subset = ['movie_id'], inplace = True)
box_office_df.to_csv('data/movies_box_office.csv')

# Part 3: crawl reviews for movies

In [13]:
# comment are loaded via ajax for page 2++, need web driver to crawl
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException
from urllib3.util.retry import Retry
from requests.adapters import HTTPAdapter

# function to extract review details from beautiful soup object (shared by normal crawling & web driver crawling)
def extract_reviews(bs):
    
    reviews = list()
    # the beautiful soup object is a list of review
    for review_div in bs.find_all("div", class_="review-container"):
        # define empty row for review
        review = {
            "rating": "", "title": "", "url": "", "id": "", "user_id": "", "user_name": "", 
            "user_url": "", "date": "", "spoiler": "", "other": "", "comment": ""
        }
            
        # extracted information from html tags
        review_rating = review_div.find_all("span", class_="rating-other-user-rating")
        review_link = review_div.find_all("a", class_="title")
        review_user = review_div.find_all("span", class_="display-name-link")
        review_date = review_div.find_all("span", class_="review-date")
        review_spoiler = review_div.find_all("span", class_="spoiler-warning")
        review_comment = review_div.select("div.content > div.text")

        # and if respected information exists, assign to the row
        if len(review_rating) > 0:
            review['rating'] = review_rating[0].text.strip()

        if len(review_link) > 0:
            review['title'] = review_link[0].text.strip()
            review['url'] = review_link[0].get('href')
            review['id'] = review_link[0].get('href').split('/')[2]

        if len(review_user) > 0:
            review_user_link = review_user[0].select("a")
            if len(review_user_link) > 0:
                review['user_name'] = review_user_link[0].text.strip()
                review['user_url'] = review_user_link[0].get('href')
                review['user_id'] = review_user_link[0].get('href').split('/')[2]
            
        if len(review_date) > 0:
            review['date'] = review_date[0].text.strip()
            
        if len(review_spoiler) > 0:
            review['spoiler'] = review_spoiler[0].text.strip()

        if len(review_comment) > 0:
            review['comment'] = review_comment[0].text.strip()
            
        reviews.append(review)
    
    # return the list of review extraceted from the beautiful soup object
    return(reviews)
        
    
# function to crawl review of the given movie & url
def crawl_reviews(movie_id, review_url):
    
    # some issue with crawling process, needs to handle retries
    retry_strategy = Retry(
        total = 10,
        status_forcelist = [429, 500, 502, 503, 504],
        method_whitelist = ["HEAD", "GET", "OPTIONS"],
        backoff_factor = 1
    )
    
    adapter = HTTPAdapter(max_retries=retry_strategy)
    http = requests.Session()
    http.mount("https://", adapter)
    http.mount("http://", adapter)

    review_url = host + review_url
    print(review_url, end = " - ")
    
    # get the beautifu soup of jects of the review page
    reviews_bs4 = BeautifulSoup(http.get(review_url, headers=headers).text, 'html.parser')
    
    # extract review information from the beautiful soup object
    reviews = extract_reviews(reviews_bs4)
    
    # dump the first page of review into individual json file for movie
    with open('data/reviews_' + movie_id + '.json', 'a+') as reviews_json:
        print(json.dumps(reviews), file = reviews_json, flush = True)
        
    page_count = 0
    
    # if the page contains reviews
    if len(reviews) > 0:
        page_count = 1
        
        # initial the web drive to simulate click to retrieve ajax update
        driver = webdriver.Chrome('/home/kitlim/.wdm/drivers/chromedriver/linux64/80.0.3987.106/chromedriver')
        wait = WebDriverWait(driver,10)
        driver.get(review_url)
        soup = BeautifulSoup(driver.page_source, 'lxml')

        # infinite loop until no more new review pages
        previous_key = ""
        while True:
            try:
                # find if "Load More" button exist
                element = driver.find_element_by_class_name("load-more-data");
                key = element.get_attribute('data-key')
                
            except NoSuchElementException:
                # if not, end the loop
                break
                
            # if the key of the current "Load More" button is not the same with previous key
            if key != previous_key:
                if key is not None:
                    # get the ajax url for the new review page
                    load_more_review_ajax_url = review_url + "/_ajax?ref_=undefined&paginationKey=" + key
                
                    try:
                        # get the beautifu soup of jects of the review page returned by the ajax call
                        reviews_bs4 = BeautifulSoup(http.get(load_more_review_ajax_url, headers=headers).text, 'html.parser')
                    except: 
                        # error handling
                        print('ConnectionError: retry')
                        time.sleep(0.5)
                        continue
                        
                    # extract review information from the beautiful soup object
                    reviews = extract_reviews(reviews_bs4)
                    
                    page_count = page_count + 1
                
                # if same with previous key, do nothing (previous ajax call is not completed successfully yet)
                else: 
                    break
                
                # dump the current page of review into individual json file for movie
                with open('data/reviews_' + movie_id + '.json', 'a+') as reviews_json:
                    print(json.dumps(reviews), file = reviews_json, flush = True)

            try:
                # simulate click on the button to load more review 
                # and update the "Load more" button's key
                driver.find_element_by_css_selector("button#load-more-trigger").click()
                wait.until(EC.invisibility_of_element_located((By.CSS_SELECTOR,".ipl-load-more__load-indicator")))
                soup = BeautifulSoup(driver.page_source, 'lxml')
                
            except Exception:
                break
                
        # close the web driver
        driver.quit()
        
    print(page_count, "pages")

In [14]:
# list of movies which has been successfully crawled with reviews
review_done_list = pd.read_csv('data/review_done.csv', names=['movie_id'])
    
# for each selected movie
for movie in selected_movies.to_dict('record'):
    
    # if other reviews of the movie has been successfully crawled before, skip
    if movie['id'] in list(review_done_list['movie_id']):
        # print(movie['id'], 'skipped')
        continue
    
    # crawl reviews of current movie
    crawl_reviews(movie['id'], movie['review_url'])

    # add the movie as successfully crawled
    with open('data/review_done.csv', 'a+') as review_done:
        print(movie['id'], file = review_done, flush = True)
    

In [15]:
# join all reviews from individual movie's json files into one

# define the list for reviews
reviews = list()
for filename in sorted(os.listdir('data')):
    if filename.startswith('reviews_') == True & filename.endswith('.json') == True :
        # print(filename, end = ' ')
        
        # forgotten to include movie id during review crawling 
        movie_id = filename.split('.json')[0].split('reviews_')[1]
        
        reviews_json = open('data/' + filename, 'r')
        line = reviews_json.readline()

        # each line equal to one review page
        while line:
            if len(line.strip()) > 0:
                current_reviews = json.loads(line)
                
                # for each review in the page, append to the list
                for review in current_reviews:
                    # add into each review before write as csv
                    review['movie_id'] = movie_id
                    reviews.append(review)
                    
                # print('.', end = '')
            
            # read next line (review page)
            line = reviews_json.readline()

# write all review into a single review csv file
reviews_df = pd.DataFrame(reviews)
reviews_df.drop_duplicates(['id'], inplace = True)
reviews_df.to_csv('data/reviews.csv')

In [16]:
reviews_df

Unnamed: 0,rating,title,url,id,user_id,user_name,user_url,date,spoiler,other,comment,movie_id
0,4/10,The franchise has had a lobotomy,/review/rw4751363/?ref_=tt_urv,rw4751363,ur101048846,genebathurst,/user/ur101048846/?ref_=tt_urv,30 March 2019,Warning: Spoilers,,Dinosaurs. Amusement Park. Tourists. Disaster....,tt0369610
1,2/10,There is a plus...,/review/rw3844587/?ref_=tt_urv,rw3844587,ur22419131,vulneri,/user/ur22419131/?ref_=tt_urv,29 October 2017,,,... and it's dinosaurs. Absolutely everything ...,tt0369610
2,7/10,"Spielberg Magic, This Is Not. Still, a Visit t...",/review/rw4200129/?ref_=tt_urv,rw4200129,ur35359466,jaredpahl,/user/ur35359466/?ref_=tt_urv,12 June 2018,Warning: Spoilers,,You may have heard some critics champion Juras...,tt0369610
3,3/10,Another piece of modern trash.,/review/rw3846832/?ref_=tt_urv,rw3846832,ur9357474,marbl-1,/user/ur9357474/?ref_=tt_urv,1 November 2017,Warning: Spoilers,,"There is a young, handsome (I suspect), super ...",tt0369610
4,3/10,Bleah,/review/rw3387151/?ref_=tt_urv,rw3387151,ur33389853,antonioborrani,/user/ur33389853/?ref_=tt_urv,3 January 2016,,,"A dull monster movie without ideas, with all t...",tt0369610
...,...,...,...,...,...,...,...,...,...,...,...,...
482239,1/10,Low budget junk,/review/rw5063745/?ref_=tt_urv,rw5063745,ur38716160,bastian-kreuzer,/user/ur38716160/?ref_=tt_urv,16 August 2019,Warning: Spoilers,,That's the best words I can find to describe i...,tt9904820
482240,2/10,its a surviving story if in the right hands an...,/review/rw4823075/?ref_=tt_urv,rw4823075,ur79950921,ops-52535,/user/ur79950921/?ref_=tt_urv,2 May 2019,,,"This is a c-level horror flick, and like most ...",tt9904820
482241,,It don't add up,/review/rw4842487/?ref_=tt_urv,rw4842487,ur26490810,raindog_mk,/user/ur26490810/?ref_=tt_urv,11 May 2019,,,"Look, I ain't seen this movie, neither I will,...",tt9904820
482242,10/10,Loved this movie!,/review/rw5817238/?ref_=tt_urv,rw5817238,ur119650852,shirleydahlseide,/user/ur119650852/?ref_=tt_urv,12 June 2020,,,Very intense social drama. Realistic character...,tt9913660


# Part 4: Others

### Merging all three dimensions of movie datasets

In [17]:
# merge general info with detailed info
merged_movie_df = pd.merge(movies_df, details_df, how = 'inner', left_on = 'id', right_on = 'movie_id')
merged_movie_df.shape

(5478, 17)

In [18]:
# merge again with box office info
merged_movie_df = pd.merge(merged_movie_df, box_office_df, how = 'inner', on = 'movie_id')
merged_movie_df.shape

(5478, 21)

In [19]:
# drop the duplicated colum after merged
merged_movie_df.drop(['id'], inplace = True, axis = 1)
merged_movie_df

Unnamed: 0,name,url,year,certificate,runtime,genre,rating,metascore,review_url,movie_id,critic_count,review_count,directors,writers,stars,awards,budget,us_opening,us_gross,worldwide_gross
0,Tenet,/title/tt6723592/?ref_=adv_li_tt,2020,PG-13,,"[Action, Sci-Fi, Thriller]",,,/title/tt6723592/reviews,tt6723592,1 critic,,[{'Christopher Nolan': '/name/nm0634240/?ref_=...,[{'Christopher Nolan': '/name/nm0634240/?ref_=...,[{'John David Washington': '/name/nm0913475/?r...,[],"Budget:$205,000,000\n (estimated)",,,
1,The Last Days of American Crime,/title/tt1552211/?ref_=adv_li_tt,2020,,148 min,"[Action, Crime, Thriller]",3.6,15,/title/tt1552211/reviews,tt1552211,55 critic,294 user,[{'Olivier Megaton': '/name/nm0576298/?ref_=tt...,[{'Karl Gajdusek': '/name/nm2244980/?ref_=tt_o...,[{'Neels Clasen': '/name/nm1085134/?ref_=tt_ov...,[],,,,
2,The Hunt,/title/tt8244784/?ref_=adv_li_tt,2020,R,90 min,"[Action, Horror, Thriller]",6.4,50,/title/tt8244784/reviews,tt8244784,209 critic,639 user,[{'Craig Zobel': '/name/nm0957505/?ref_=tt_ov_...,[{'Nick Cuse': '/name/nm7226510/?ref_=tt_ov_wr...,[{'Betty Gilpin': '/name/nm2365811/?ref_=tt_ov...,[],"Budget:$14,000,000\n (estimated)","Opening Weekend USA: $5,304,455,\n15 March 2020","Gross USA: $5,812,500","Cumulative Worldwide Gross: $6,562,393"
3,The Gentlemen,/title/tt8367814/?ref_=adv_li_tt,2019,R,113 min,"[Action, Comedy, Crime]",7.9,51,/title/tt8367814/reviews,tt8367814,247 critic,"1,058 user",[{'Guy Ritchie': '/name/nm0005363/?ref_=tt_ov_...,[{'Guy Ritchie': '/name/nm0005363/?ref_=tt_ov_...,[{'Matthew McConaughey': '/name/nm0000190/?ref...,[],"Budget:$22,000,000\n (estimated)","Opening Weekend USA: $10,651,884,\n26 January ...","Gross USA: $36,296,853","Cumulative Worldwide Gross: $114,996,853"
4,Avengers: Endgame,/title/tt4154796/?ref_=adv_li_tt,2019,P13,181 min,"[Action, Adventure, Drama]",8.4,78,/title/tt4154796/reviews,tt4154796,551 critic,"8,784 user",[{'Anthony Russo': '/name/nm0751577/?ref_=tt_o...,[{'Christopher Markus': '/name/nm1321655/?ref_...,[{'Robert Downey Jr.': '/name/nm0000375/?ref_=...,[Nominated for\n 1\n Osc...,"Budget:$356,000,000\n (estimated)","Opening Weekend USA: $357,115,007,\n28 April 2019","Gross USA: $858,373,000","Cumulative Worldwide Gross: $2,797,800,564"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5473,Trespass,/title/tt8359690/?ref_=adv_li_tt,2020,,,[Western],,,/title/tt8359690/reviews,tt8359690,,,[{'Paul Spadaro': '/name/nm4754362/?ref_=tt_ov...,[{'Paul Spadaro': '/name/nm4754362/?ref_=tt_ov...,[{'Paul Winston': '/name/nm3025848/?ref_=tt_ov...,[],"Budget:$100,000\n (estimated)",,,
5474,Six Guns for Hire,/title/tt11540858/?ref_=adv_li_tt,2020,,,[Western],,,/title/tt11540858/reviews,tt11540858,,,[],[],[],[],,,,
5475,Turn and Burn,/title/tt11281290/?ref_=adv_li_tt,2021,,,"[Drama, Romance, Western]",,,/title/tt11281290/reviews,tt11281290,,,[{'Travis Mills': '/name/nm3524360/?ref_=tt_ov...,[{'Joseph Pevey': '/name/nm10800204/?ref_=tt_o...,[{'Corah Brunson': '/name/nm11523667/?ref_=tt_...,[],,,,
5476,Ruthless Reprisal,/title/tt11512180/?ref_=adv_li_tt,2020,,85 min,[Western],,,/title/tt11512180/reviews,tt11512180,,,[],[],[],[],,,,


### Extract persons in director, writer, and star as standalone dataset

In [20]:
# for the person fields (director, writer, and star), extract and store as new csv
# bad stratergy while crawling, need extra work to extract

credit_list = pd.concat([merged_movie_df['directors'], merged_movie_df['writers'], merged_movie_df['stars']])

persons = list()
for person_list in credit_list:
    for person_dict in person_list:
        for person in person_dict:

            person_info = person_dict[person].split('/')

            if len(person_info) > 2:
                persons.append({
                    'person_id': person_info[2],
                    'person_url': person_dict[person],
                    'person_name': person
                })

persons_df = pd.DataFrame(persons).drop_duplicates()
persons_df.to_csv('data/persons.csv')



In [21]:
# and keep only the list of person id in the respective fields

def extract_person_id(persons):
    person_ids = list()
    
    for person in persons:
        person_info = list(person.values())[0].split('/')
        if len(person_info) > 2:
            person_ids.append(person_info[2])
    
    return person_ids

merged_movie_df['directors'] = merged_movie_df['directors'].apply(extract_person_id)
merged_movie_df['writers'] = merged_movie_df['writers'].apply(extract_person_id)
merged_movie_df['stars'] = merged_movie_df['stars'].apply(extract_person_id)

merged_movie_df.drop_duplicates(['movie_id'], inplace = True)
merged_movie_df.to_csv('data/movies.csv')
merged_movie_df

Unnamed: 0,name,url,year,certificate,runtime,genre,rating,metascore,review_url,movie_id,critic_count,review_count,directors,writers,stars,awards,budget,us_opening,us_gross,worldwide_gross
0,Tenet,/title/tt6723592/?ref_=adv_li_tt,2020,PG-13,,"[Action, Sci-Fi, Thriller]",,,/title/tt6723592/reviews,tt6723592,1 critic,,[nm0634240],[nm0634240],"[nm0913475, nm1500155, nm4456120]",[],"Budget:$205,000,000\n (estimated)",,,
1,The Last Days of American Crime,/title/tt1552211/?ref_=adv_li_tt,2020,,148 min,"[Action, Crime, Thriller]",3.6,15,/title/tt1552211/reviews,tt1552211,55 critic,294 user,[nm0576298],"[nm2244980, nm3559500]","[nm1085134, nm1183149, nm0135610]",[],,,,
2,The Hunt,/title/tt8244784/?ref_=adv_li_tt,2020,R,90 min,"[Action, Horror, Thriller]",6.4,50,/title/tt8244784/reviews,tt8244784,209 critic,639 user,[nm0957505],"[nm7226510, nm0511541]","[nm2365811, nm0005476, nm0054697]",[],"Budget:$14,000,000\n (estimated)","Opening Weekend USA: $5,304,455,\n15 March 2020","Gross USA: $5,812,500","Cumulative Worldwide Gross: $6,562,393"
3,The Gentlemen,/title/tt8367814/?ref_=adv_li_tt,2019,R,113 min,"[Action, Comedy, Crime]",7.9,51,/title/tt8367814/reviews,tt8367814,247 critic,"1,058 user",[nm0005363],"[nm0005363, nm6842463]","[nm0000190, nm0402271, nm1890784]",[],"Budget:$22,000,000\n (estimated)","Opening Weekend USA: $10,651,884,\n26 January ...","Gross USA: $36,296,853","Cumulative Worldwide Gross: $114,996,853"
4,Avengers: Endgame,/title/tt4154796/?ref_=adv_li_tt,2019,P13,181 min,"[Action, Adventure, Drama]",8.4,78,/title/tt4154796/reviews,tt4154796,551 critic,"8,784 user","[nm0751577, nm0751648]","[nm1321655, nm1321656]","[nm0000375, nm0262635, nm0749263]",[Nominated for\n 1\n Osc...,"Budget:$356,000,000\n (estimated)","Opening Weekend USA: $357,115,007,\n28 April 2019","Gross USA: $858,373,000","Cumulative Worldwide Gross: $2,797,800,564"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5473,Trespass,/title/tt8359690/?ref_=adv_li_tt,2020,,,[Western],,,/title/tt8359690/reviews,tt8359690,,,[nm4754362],"[nm4754362, nm4754362]","[nm3025848, nm4754362]",[],"Budget:$100,000\n (estimated)",,,
5474,Six Guns for Hire,/title/tt11540858/?ref_=adv_li_tt,2020,,,[Western],,,/title/tt11540858/reviews,tt11540858,,,[],[],[],[],,,,
5475,Turn and Burn,/title/tt11281290/?ref_=adv_li_tt,2021,,,"[Drama, Romance, Western]",,,/title/tt11281290/reviews,tt11281290,,,[nm3524360],[nm10800204],"[nm11523667, nm11432949, nm11215876]",[],,,,
5476,Ruthless Reprisal,/title/tt11512180/?ref_=adv_li_tt,2020,,85 min,[Western],,,/title/tt11512180/reviews,tt11512180,,,[],[],[],[],,,,


### Rename certain fields for more proper name

In [22]:
# rename rating in movies to average_rating
merged_movie_df.rename(columns={'rating': 'average_rating'}, inplace = True)

# rename id in review to review_id
reviews_df.rename(columns={'id': 'review_id'}, inplace = True)


### Extract user in reviews and save as standalone dataset

In [23]:
# bad stratergy while crawling, need extra work to extract
users_df = reviews_df[['user_id', 'user_name', 'user_url']].drop_duplicates()
users_df.to_csv('data/users.csv')
users_df

Unnamed: 0,user_id,user_name,user_url
0,ur101048846,genebathurst,/user/ur101048846/?ref_=tt_urv
1,ur22419131,vulneri,/user/ur22419131/?ref_=tt_urv
2,ur35359466,jaredpahl,/user/ur35359466/?ref_=tt_urv
3,ur9357474,marbl-1,/user/ur9357474/?ref_=tt_urv
4,ur33389853,antonioborrani,/user/ur33389853/?ref_=tt_urv
...,...,...,...
482234,ur56267831,philippejakko,/user/ur56267831/?ref_=tt_urv
482235,ur118208042,raywatts-72872,/user/ur118208042/?ref_=tt_urv
482237,ur119049573,getkoreandeal,/user/ur119049573/?ref_=tt_urv
482241,ur26490810,raindog_mk,/user/ur26490810/?ref_=tt_urv


In [24]:
# drop user_name and user_url from reviews
reviews_df.drop(['user_name', 'user_url'], axis = 1, inplace = True)

### Generating sample csv for the datasets (for submission)

In [25]:
movies_samples = merged_movie_df#.sample(1000)
movies_samples.to_csv('data/movies.csv')
movies_samples.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5478 entries, 0 to 5477
Data columns (total 20 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   name             5478 non-null   object
 1   url              5478 non-null   object
 2   year             5478 non-null   int64 
 3   certificate      5478 non-null   object
 4   runtime          5478 non-null   object
 5   genre            5478 non-null   object
 6   average_rating   5478 non-null   object
 7   metascore        5478 non-null   object
 8   review_url       5478 non-null   object
 9   movie_id         5478 non-null   object
 10  critic_count     5478 non-null   object
 11  review_count     5478 non-null   object
 12  directors        5478 non-null   object
 13  writers          5478 non-null   object
 14  stars            5478 non-null   object
 15  awards           5478 non-null   object
 16  budget           5478 non-null   object
 17  us_opening       5478 non-null   

In [26]:
reviews_samples = reviews_df#.sample(1000)
reviews_samples.to_csv('data/reviews.csv')
reviews_samples.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 333656 entries, 0 to 482243
Data columns (total 10 columns):
 #   Column     Non-Null Count   Dtype 
---  ------     --------------   ----- 
 0   rating     333656 non-null  object
 1   title      333656 non-null  object
 2   url        333656 non-null  object
 3   review_id  333656 non-null  object
 4   user_id    333656 non-null  object
 5   date       333656 non-null  object
 6   spoiler    333656 non-null  object
 7   other      333656 non-null  object
 8   comment    333656 non-null  object
 9   movie_id   333656 non-null  object
dtypes: object(10)
memory usage: 28.0+ MB


In [27]:
persons_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 23554 entries, 0 to 27745
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   person_id    23554 non-null  object
 1   person_url   23554 non-null  object
 2   person_name  23554 non-null  object
dtypes: object(3)
memory usage: 736.1+ KB


In [28]:
users_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 188191 entries, 0 to 482242
Data columns (total 3 columns):
 #   Column     Non-Null Count   Dtype 
---  ------     --------------   ----- 
 0   user_id    188191 non-null  object
 1   user_name  188191 non-null  object
 2   user_url   188191 non-null  object
dtypes: object(3)
memory usage: 5.7+ MB
