In [24]:
import pandas as pd
import requests
from bs4 import BeautifulSoup as bs
import pymongo
import json

In [2]:
imdb_url_base = 'https://www.imdb.com/chart/top?ref_=tt_awd'

In [16]:
df_rate = pd.read_table('data/title.ratings.tsv')

In [3]:
# get html
def get_html(url):
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status
        #site encoding
        r.encoding = 'utf-8'
        return r.text
    except:
        print('ERROR')
        pass

In [4]:
# get top250 imdb movie data
def get_top250_html_list(url):
    html = get_html(url)
    soup = bs(html, 'lxml')
    data_list = []
    
    try:
        
        m_list = soup.find('tbody', class_='lister-list').find_all('tr')
        for tr in m_list:
            data = {}
            data['rank'] = tr.find('td', class_='titleColumn').contents[0].strip()[:-1]
            data['id'] = tr.find('div', class_='seen-widget')['data-titleid']
            data['movie'] = tr.find('td', class_='titleColumn').contents[1].text
            data['year'] = tr.find('td', class_='titleColumn').find('span', class_='secondaryInfo').text[1:-1]
            #data['rate'] = tr.find('strong').text
            data_list.append(data)
    except:
        print('error')
    
    return data_list

In [5]:
movie_list = get_top250_html_list(imdb_url_base)

In [6]:
# save data into mongo db
client = pymongo.MongoClient('localhost', 27017)
db = client['movie_db']
col = db['imbd_top250']

In [7]:
col.insert_many(movie_list)

<pymongo.results.InsertManyResult at 0x118c9e8c8>

In [21]:
# get details of each movie
def get_movie_detail(movie_id):
    base_url = 'https://www.imdb.com/title/'
    url = base_url + movie_id + '/' 
    data = {}
    # get movie data 
    html = get_html(url)
    soup = bs(html, 'lxml')
    try:
        data['rate'] = soup.find('div', class_='ratings_wrapper').find('span', itemprop="ratingValue").text
        data['rate_num'] = soup.find('div', class_='ratings_wrapper').find('span', itemprop="ratingCount").text
        #data['imdb_id'] = soup.find('div', id='info').find_all('a')[-1].text
    except:
        print('error')
    
    return data

In [32]:
# get details with omdb API call
def request_movie_detail(movie_id):
    url_base = 'http://www.omdbapi.com/?i='
    api_key = '&apikey=6b6ec75b'
    url_call = url_base + movie_id + api_key
    print(url_call)
    data = requests.get(url_call).json()
    
    data_update = {
        'box_office': data['BoxOffice'],
        'country': data['Country'],
        'genre': data['Genre'],
        'lan': data['Language'],
        'imdbRate': data['imdbRating'],
        'imdbVotes': data['imdbVotes'],
        'production': data['Production'],
        'rated': data['Rated'],
        'awards': data['Awards']
    }
    
    return data_update

In [36]:
# add details to database if field does not exist
def udpate_field_mongodb(movie_id, data):
    col.update_one(
        {
            'id': movie_id
        },
        {
            '$set':{
                'box_office': data['box_office'],
                'country': data['country'],
                'genre': data['genre'],
                'imdbRate': data['imdbRate'],
                'imdbVotes': data['imdbVotes'],
                'lan': data['lan'],
                'production': data['production'],
                'rated': data['rated'],
                'awards': data['awards']
            }
        }
    )

In [39]:
# get detais and update in mongodb
for movie in movie_list:
    update_data = request_movie_detail(movie['id'])
    print('movie:', movie['movie'], 'find')
    udpate_field_mongodb(movie['id'], update_data)
    print('movie:', movie['movie'], 'updated in mongoDB')
    print('---------------')

http://www.omdbapi.com/?i=tt0111161&apikey=6b6ec75b
movie: The Shawshank Redemption find
movie: The Shawshank Redemption updated in mongoDB
---------------
http://www.omdbapi.com/?i=tt0068646&apikey=6b6ec75b
movie: The Godfather find
movie: The Godfather updated in mongoDB
---------------
http://www.omdbapi.com/?i=tt0071562&apikey=6b6ec75b
movie: The Godfather: Part II find
movie: The Godfather: Part II updated in mongoDB
---------------
http://www.omdbapi.com/?i=tt0468569&apikey=6b6ec75b
movie: The Dark Knight find
movie: The Dark Knight updated in mongoDB
---------------
http://www.omdbapi.com/?i=tt0050083&apikey=6b6ec75b
movie: 12 Angry Men find
movie: 12 Angry Men updated in mongoDB
---------------
http://www.omdbapi.com/?i=tt0108052&apikey=6b6ec75b
movie: Schindler's List find
movie: Schindler's List updated in mongoDB
---------------
http://www.omdbapi.com/?i=tt0167260&apikey=6b6ec75b
movie: The Lord of the Rings: The Return of the King find
movie: The Lord of the Rings: The Retu

movie: Dr. Strangelove or: How I Learned to Stop Worrying and Love the Bomb find
movie: Dr. Strangelove or: How I Learned to Stop Worrying and Love the Bomb updated in mongoDB
---------------
http://www.omdbapi.com/?i=tt4154756&apikey=6b6ec75b
movie: Avengers: Infinity War find
movie: Avengers: Infinity War updated in mongoDB
---------------
http://www.omdbapi.com/?i=tt0050825&apikey=6b6ec75b
movie: Paths of Glory find
movie: Paths of Glory updated in mongoDB
---------------
http://www.omdbapi.com/?i=tt0081505&apikey=6b6ec75b
movie: The Shining find
movie: The Shining updated in mongoDB
---------------
http://www.omdbapi.com/?i=tt1853728&apikey=6b6ec75b
movie: Django Unchained find
movie: Django Unchained updated in mongoDB
---------------
http://www.omdbapi.com/?i=tt0910970&apikey=6b6ec75b
movie: WALL·E find
movie: WALL·E updated in mongoDB
---------------
http://www.omdbapi.com/?i=tt0119698&apikey=6b6ec75b
movie: Mononoke-hime find
movie: Mononoke-hime updated in mongoDB
------------

movie: Some Like It Hot find
movie: Some Like It Hot updated in mongoDB
---------------
http://www.omdbapi.com/?i=tt0105695&apikey=6b6ec75b
movie: Unforgiven find
movie: Unforgiven updated in mongoDB
---------------
http://www.omdbapi.com/?i=tt0363163&apikey=6b6ec75b
movie: Der Untergang find
movie: Der Untergang updated in mongoDB
---------------
http://www.omdbapi.com/?i=tt0095016&apikey=6b6ec75b
movie: Die Hard find
movie: Die Hard updated in mongoDB
---------------
http://www.omdbapi.com/?i=tt0040897&apikey=6b6ec75b
movie: The Treasure of the Sierra Madre find
movie: The Treasure of the Sierra Madre updated in mongoDB
---------------
http://www.omdbapi.com/?i=tt0113277&apikey=6b6ec75b
movie: Heat find
movie: Heat updated in mongoDB
---------------
http://www.omdbapi.com/?i=tt8108198&apikey=6b6ec75b
movie: Andhadhun find
movie: Andhadhun updated in mongoDB
---------------
http://www.omdbapi.com/?i=tt0044741&apikey=6b6ec75b
movie: Ikiru find
movie: Ikiru updated in mongoDB
----------

movie: Gone Girl find
movie: Gone Girl updated in mongoDB
---------------
http://www.omdbapi.com/?i=tt0892769&apikey=6b6ec75b
movie: How to Train Your Dragon find
movie: How to Train Your Dragon updated in mongoDB
---------------
http://www.omdbapi.com/?i=tt0018455&apikey=6b6ec75b
movie: Sunrise: A Song of Two Humans find
movie: Sunrise: A Song of Two Humans updated in mongoDB
---------------
http://www.omdbapi.com/?i=tt0107290&apikey=6b6ec75b
movie: Jurassic Park find
movie: Jurassic Park updated in mongoDB
---------------
http://www.omdbapi.com/?i=tt3011894&apikey=6b6ec75b
movie: Relatos salvajes find
movie: Relatos salvajes updated in mongoDB
---------------
http://www.omdbapi.com/?i=tt6966692&apikey=6b6ec75b
movie: Green Book find
movie: Green Book updated in mongoDB
---------------
http://www.omdbapi.com/?i=tt0758758&apikey=6b6ec75b
movie: Into the Wild find
movie: Into the Wild updated in mongoDB
---------------
http://www.omdbapi.com/?i=tt0120382&apikey=6b6ec75b
movie: The Truma

movie: Drishyam find
movie: Drishyam updated in mongoDB
---------------
http://www.omdbapi.com/?i=tt0056443&apikey=6b6ec75b
movie: Sanjuro find
movie: Sanjuro updated in mongoDB
---------------
http://www.omdbapi.com/?i=tt1954470&apikey=6b6ec75b
movie: Gangs of Wasseypur find
movie: Gangs of Wasseypur updated in mongoDB
---------------
http://www.omdbapi.com/?i=tt0087884&apikey=6b6ec75b
movie: Paris, Texas find
movie: Paris, Texas updated in mongoDB
---------------
http://www.omdbapi.com/?i=tt0092067&apikey=6b6ec75b
movie: Tenkû no shiro Rapyuta find
movie: Tenkû no shiro Rapyuta updated in mongoDB
---------------
http://www.omdbapi.com/?i=tt0070510&apikey=6b6ec75b
movie: Paper Moon find
movie: Paper Moon updated in mongoDB
---------------
http://www.omdbapi.com/?i=tt0036868&apikey=6b6ec75b
movie: The Best Years of Our Lives find
movie: The Best Years of Our Lives updated in mongoDB
---------------
http://www.omdbapi.com/?i=tt0083922&apikey=6b6ec75b
movie: Fanny och Alexander find
movi