In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup as bs
import pymongo

In [84]:
# Base URL of the Douban Top 250 listing; the scraper appends the item offset after `start=`.
douban_url_base = 'https://movie.douban.com/top250?start='
# Number of listing pages to fetch (the scraper advances the offset by 25 per page).
page = 10

In [82]:
# get html
def get_html(url):
    """Download ``url`` and return the response body decoded as UTF-8.

    Args:
        url: Address of the page to fetch.

    Returns:
        The page text, or ``None`` when the request fails (connection
        problem, timeout, or an HTTP error status).
    """
    try:
        r = requests.get(url, timeout=30)
        # Must be *called*: a bare attribute reference never raises, so HTTP
        # error pages would be returned as if they were valid content.
        r.raise_for_status()
    except requests.RequestException as exc:
        # Best effort: report the failure and let the caller decide what to do.
        print('ERROR', url, exc)
        return None
    # site encoding
    r.encoding = 'utf-8'
    return r.text

In [107]:
# get top250 douban movie data
def get_top250_html_list(url, pages=None):
    """Scrape the Douban Top 250 listing pages into a list of movie records.

    Args:
        url: Listing URL ending in ``start=``; the item offset is appended.
        pages: Number of listing pages to fetch. When omitted, the
            notebook-level ``page`` setting is used.

    Returns:
        A list of dicts with the string fields ``rank``, ``id``, ``movie``
        and ``year``. Pages that cannot be downloaded and entries that
        cannot be parsed are reported with ``print('error')`` and skipped.
    """
    if pages is None:
        pages = page
    # Lookups on unexpected markup fail with one of these exception types.
    parse_errors = (AttributeError, IndexError, KeyError, TypeError)
    data_list = []
    for i in range(pages):
        # the offset advances by 25 items per listing page
        html = get_html(url + str(i * 25))
        if html is None:
            # get_html returns None on failure; parsing None would raise.
            print('error')
            continue
        soup = bs(html, 'lxml')

        try:
            m_list = soup.find('ol', class_='grid_view').find_all('li')
        except parse_errors:
            print('error')
            m_list = []

        for item in m_list:
            # Parse each entry on its own so one malformed entry does not
            # discard the remaining entries of the page.
            try:
                href = item.find('div', class_='pic').find('a')['href']
                data = {
                    'rank': item.find('em').text,
                    # last path segment of .../subject/<id>/
                    'id': href.rstrip('/').rsplit('/', 1)[-1],
                    'movie': item.find('span', class_='title').text,
                    'year': item.find('div', class_='bd').find('p').contents[2].strip()[:4],
                }
            except parse_errors:
                print('error')
                continue
            data_list.append(data)

        print('page: ', i, 'finished')

    return data_list

In [109]:
# Scrape the Top 250 listing pages into a list of dicts (rank, id, movie, year).
movie_list = get_top250_html_list(douban_url_base)

page:  0 finished
page:  1 finished
page:  2 finished
page:  3 finished
page:  4 finished
page:  5 finished
page:  6 finished
page:  7 finished
page:  8 finished
page:  9 finished


In [70]:
# run 10 pages
# NOTE(review): `page` is already assigned the same value (10) in the configuration
# cell near the top of this notebook; this cell repeats that assignment.
page = 10

In [111]:
# connect to the local MongoDB server and select the collection for the scraped movies
client = pymongo.MongoClient(host='localhost', port=27017)
db = client.movie_db
col = db.douban_top250

In [112]:
# insert_many rejects an empty document list, so skip the insert when the
# scrape produced no records (for example when every page download failed).
insert_result = col.insert_many(movie_list) if movie_list else None
insert_result

<pymongo.results.InsertManyResult at 0x11ca54fc8>

In [114]:
# Peek at the first record; the output shows an `_id` key next to the scraped
# fields (presumably added to the dict by the insert above).
movie_list[:1]  

[{'_id': ObjectId('5c4655787d76e005dff9db8f'),
  'id': '1292052',
  'movie': '肖申克的救赎',
  'rank': '1',
  'year': '1994'}]

In [134]:
# get details of 250 movies
def get_movie_detial(movie_id):
    """Scrape rating details for one movie from its Douban subject page.

    Args:
        movie_id: Douban subject id as a string (it is concatenated into
            the page URL).

    Returns:
        A dict that may contain ``rate``, ``review`` and ``imdb_id``.
        Fields parsed before a failure are kept; the dict is empty when the
        page could not be downloaded.
    """
    url = 'https://movie.douban.com/subject/' + movie_id + '/'
    data = {}
    # get movie data
    html = get_html(url)
    if html is None:
        # get_html returns None on failure; parsing None would raise.
        print('error')
        return data
    soup = bs(html, 'lxml')
    try:
        data['rate'] = soup.find('strong', class_='rating_num').text
        data['review'] = soup.find('span', property="v:votes").text
        # NOTE(review): assumes the last link inside the info box is the
        # IMDb link; confirm against the current page markup.
        data['imdb_id'] = soup.find('div', id='info').find_all('a')[-1].text
    except (AttributeError, IndexError, TypeError):
        # A missing element yields None (AttributeError on .text) or an
        # empty link list (IndexError).
        print('error')

    return data

In [140]:
# set the scraped detail fields on the matching movie document
def udpate_field_mongodb(movie_id, data):
    """Write the scraped detail fields onto the movie document in ``col``.

    The document is matched on its ``id`` field and updated with ``$set``,
    so values already stored under these keys are overwritten.

    Args:
        movie_id: Value of the ``id`` field of the document to update.
        data: Dict that may contain ``imdb_id``, ``rate`` and ``review``.
            Missing keys are left out of the update; if none are present,
            no update is issued.
    """
    # The scraper can return a partial or empty dict on failure; indexing
    # the keys directly would raise KeyError.
    fields = {
        key: data[key]
        for key in ('imdb_id', 'rate', 'review')
        if key in data
    }
    if not fields:
        return
    col.update_one({'id': movie_id}, {'$set': fields})

In [144]:
# get details and update in mongodb
REQUIRED_DETAIL_FIELDS = ('imdb_id', 'rate', 'review')
for movie in movie_list:
    update_data = get_movie_detial(movie['id'])
    if any(field not in update_data for field in REQUIRED_DETAIL_FIELDS):
        # The scrape failed or the page lacked a field; skip this movie so
        # the update step does not stop the whole run on a missing key.
        print('movie:', movie['movie'], 'skipped (details incomplete)')
        print('---------------')
        continue
    print('movie:', movie['movie'], 'find')
    udpate_field_mongodb(movie['id'], update_data)
    print('movie:', movie['movie'], 'updated in mongoDB')
    print('---------------')

movie: 肖申克的救赎 find
movie: 肖申克的救赎 updated in mongoDB
---------------
movie: 霸王别姬 find
movie: 霸王别姬 updated in mongoDB
---------------
movie: 这个杀手不太冷 find
movie: 这个杀手不太冷 updated in mongoDB
---------------
movie: 阿甘正传 find
movie: 阿甘正传 updated in mongoDB
---------------
movie: 美丽人生 find
movie: 美丽人生 updated in mongoDB
---------------
movie: 泰坦尼克号 find
movie: 泰坦尼克号 updated in mongoDB
---------------
movie: 千与千寻 find
movie: 千与千寻 updated in mongoDB
---------------
movie: 辛德勒的名单 find
movie: 辛德勒的名单 updated in mongoDB
---------------
movie: 盗梦空间 find
movie: 盗梦空间 updated in mongoDB
---------------
movie: 机器人总动员 find
movie: 机器人总动员 updated in mongoDB
---------------
movie: 忠犬八公的故事 find
movie: 忠犬八公的故事 updated in mongoDB
---------------
movie: 三傻大闹宝莱坞 find
movie: 三傻大闹宝莱坞 updated in mongoDB
---------------
movie: 海上钢琴师 find
movie: 海上钢琴师 updated in mongoDB
---------------
movie: 放牛班的春天 find
movie: 放牛班的春天 updated in mongoDB
---------------
movie: 大话西游之大圣娶亲 find
movie: 大话西游之大圣娶亲 updated in mongoDB
--------

movie: 七武士 find
movie: 七武士 updated in mongoDB
---------------
movie: 岁月神偷 find
movie: 岁月神偷 updated in mongoDB
---------------
movie: 怪兽电力公司 find
movie: 怪兽电力公司 updated in mongoDB
---------------
movie: 7号房的礼物 find
movie: 7号房的礼物 updated in mongoDB
---------------
movie: 真爱至上 find
movie: 真爱至上 updated in mongoDB
---------------
movie: 谍影重重3 find
movie: 谍影重重3 updated in mongoDB
---------------
movie: 电锯惊魂 find
movie: 电锯惊魂 updated in mongoDB
---------------
movie: 萤火虫之墓 find
movie: 萤火虫之墓 updated in mongoDB
---------------
movie: 萤火之森 find
movie: 萤火之森 updated in mongoDB
---------------
movie: 疯狂原始人 find
movie: 疯狂原始人 updated in mongoDB
---------------
movie: 东邪西毒 find
movie: 东邪西毒 updated in mongoDB
---------------
movie: 喜宴 find
movie: 喜宴 updated in mongoDB
---------------
movie: 超能陆战队 find
movie: 超能陆战队 updated in mongoDB
---------------
movie: 贫民窟的百万富翁 find
movie: 贫民窟的百万富翁 updated in mongoDB
---------------
movie: 唐伯虎点秋香 find
movie: 唐伯虎点秋香 updated in mongoDB
---------------
movie: 蝙蝠侠：黑暗骑士崛起 