In [69]:
import pandas as pd
import requests
from bs4 import BeautifulSoup as bs
import pymongo
from pymongo import UpdateOne

In [2]:
# Base URL of the Douban Top 250 listing; the numeric start offset is appended per page.
douban_url_base = 'https://movie.douban.com/top250?start='
# Number of listing pages to request (get_top250_html_list advances the offset by 25 per page).
page = 10

In [3]:
# get html
def get_html(url):
    """Download ``url`` and return its body decoded as UTF-8.

    Args:
        url: Absolute URL to fetch.

    Returns:
        The response text, or ``None`` when the request fails (connection
        problem, timeout, invalid URL) or the server answers with an HTTP
        error status. Failures are reported on stdout with an 'ERROR' line.
    """
    try:
        r = requests.get(url, timeout=30)
        # raise_for_status is a method: it has to be *called* for 4xx/5xx
        # responses to be turned into an exception instead of being returned
        # to the caller as if they were valid pages.
        r.raise_for_status()
        # site encoding
        r.encoding = 'utf-8'
        return r.text
    except requests.RequestException as exc:
        # Best-effort fetch: report and signal failure with None, but do not
        # swallow unrelated errors (e.g. KeyboardInterrupt) as a bare except would.
        print('ERROR', url, exc)
        return None

In [4]:
# get top250 douban movie data
def _parse_top250_page(soup):
    """Extract one dict per movie from a parsed Top 250 listing page."""
    grid = soup.find('ol', class_='grid_view')
    if grid is None:
        # Page did not contain the expected movie list.
        print('error')
        return []

    movies = []
    for item in grid.find_all('li'):
        # Handle failures per entry so that one malformed <li> does not
        # discard the remaining movies on the page.
        try:
            href = item.find('div', class_='pic').find('a')['href']
            movies.append({
                'rank': item.find('em').text,
                # The subject id is the last path segment of the detail link
                # (".../subject/<id>/").
                'id': href.rstrip('/').rsplit('/', 1)[-1],
                'movie': item.find('span', class_='title').text,
                # First four characters of the third child node of the info
                # paragraph are taken as the year.
                'year': item.find('div', class_='bd').find('p').contents[2].strip()[:4],
            })
        except (AttributeError, IndexError, KeyError, TypeError):
            print('error')
    return movies


def get_top250_html_list(url, pages=None):
    """Scrape the Douban Top 250 listing pages into a list of movie dicts.

    Args:
        url: Listing base URL; the start offset (0, 25, 50, ...) is appended.
        pages: Number of listing pages to fetch. Defaults to the module-level
            ``page`` setting when omitted.

    Returns:
        A list of dicts with the string fields 'rank', 'id', 'movie' and
        'year'. Pages that cannot be downloaded or parsed contribute no
        entries; an 'error' line is printed for each failure.
    """
    if pages is None:
        pages = page

    data_list = []
    for i in range(pages):
        html = get_html(url + str(i * 25))
        if html is None:
            # get_html signals a failed download with None; handing that to
            # BeautifulSoup would raise and abort the whole scrape.
            print('error')
        else:
            data_list.extend(_parse_top250_page(bs(html, 'lxml')))

        print('page: ', i, 'finished')

    return data_list

In [5]:
# Scrape every listing page; one progress line is printed per page.
movie_list = get_top250_html_list(douban_url_base)

page:  0 finished
page:  1 finished
page:  2 finished
page:  3 finished
page:  4 finished
page:  5 finished
page:  6 finished
page:  7 finished
page:  8 finished
page:  9 finished


In [65]:
# save data into mongo db
# Connect to a MongoDB server on localhost:27017 and keep handles to the
# 'movie_db' database and its 'douban_top250' collection for the cells below.
client = pymongo.MongoClient('localhost', 27017)
db = client['movie_db']
col = db['douban_top250']

In [66]:
# Store the scraped listing records as documents in the douban_top250 collection.
# NOTE(review): presumably meant to run once per fresh collection; re-running is
# not guarded against here — TODO confirm.
col.insert_many(movie_list)

<pymongo.results.InsertManyResult at 0x1101d6d08>

In [63]:
# get details of 250 movies
def get_movie_detial(movie_id):
    """Scrape rating details for one movie from its Douban subject page.

    Args:
        movie_id: Douban subject id as a string; it is inserted into the
            subject URL.

    Returns:
        A dict with the string fields 'rate', 'review' and 'imdb_id'. Fields
        are filled in that order, so when the page cannot be fetched or an
        element is missing the dict is empty or partial and an 'error' line
        is printed.
    """
    url = 'https://movie.douban.com/subject/' + movie_id + '/'
    data = {}
    # get movie data
    html = get_html(url)
    if html is None:
        # Download failed; parsing None would raise instead of degrading
        # to an empty result.
        print('error')
        return data

    soup = bs(html, 'lxml')
    try:
        data['rate'] = soup.find('strong', class_='rating_num').text
        data['review'] = soup.find('span', property="v:votes").text
        data['imdb_id'] = soup.find('div', id='info').find('span', string='IMDb链接:').find_next('a').text
        print(data['imdb_id'])
    except (AttributeError, TypeError):
        # A find() that matched nothing returned None somewhere in the chain.
        print('error')

    return data

In [57]:
# add the scraped detail fields to the movie's document (existing values are overwritten)
def udpate_field_mongodb(movie_id, data):
    """Write the scraped detail fields onto the document whose 'id' is ``movie_id``.

    Args:
        movie_id: Value of the document's 'id' field to match.
        data: Dict that may contain 'imdb_id', 'rate' and 'review'. Only the
            keys that are present are written, so a partial or empty scrape
            result no longer raises KeyError.

    Returns:
        None. When ``data`` holds none of the expected keys no database call
        is made.
    """
    fields = {
        key: data[key]
        for key in ('imdb_id', 'rate', 'review')
        if key in data
    }
    if not fields:
        # Nothing was scraped; an empty $set document would be rejected by MongoDB.
        return

    col.update_one({'id': movie_id}, {'$set': fields})

In [67]:
# get details and update in mongodb
# For every listed movie: scrape its subject page, then write the scraped
# fields onto the matching document (matched by the Douban id).
for movie in movie_list:
    update_data = get_movie_detial(movie['id'])
    udpate_field_mongodb(movie['id'], update_data)
    # Progress output: one title line plus a separator per processed movie.
    print('movie:', movie['movie'], 'updated in mongoDB')
    print('---------------')

tt0111161
movie: 肖申克的救赎 updated in mongoDB
---------------
tt0106332
movie: 霸王别姬 updated in mongoDB
---------------
tt0110413
movie: 这个杀手不太冷 updated in mongoDB
---------------
tt0109830
movie: 阿甘正传 updated in mongoDB
---------------
tt0118799
movie: 美丽人生 updated in mongoDB
---------------
tt0120338
movie: 泰坦尼克号 updated in mongoDB
---------------
tt0245429
movie: 千与千寻 updated in mongoDB
---------------
tt0108052
movie: 辛德勒的名单 updated in mongoDB
---------------
tt1375666
movie: 盗梦空间 updated in mongoDB
---------------
tt1028532
movie: 忠犬八公的故事 updated in mongoDB
---------------
tt0910970
movie: 机器人总动员 updated in mongoDB
---------------
tt1187043
movie: 三傻大闹宝莱坞 updated in mongoDB
---------------
tt0372824
movie: 放牛班的春天 updated in mongoDB
---------------
tt0120731
movie: 海上钢琴师 updated in mongoDB
---------------
tt0120382
movie: 楚门的世界 updated in mongoDB
---------------
tt0114996
movie: 大话西游之大圣娶亲 updated in mongoDB
---------------
tt0816692
movie: 星际穿越 updated in mongoDB
---------------
tt0096

tt2119532
movie: 血战钢锯岭 updated in mongoDB
---------------
tt0435761
movie: 玩具总动员3 updated in mongoDB
---------------
tt0109688
movie: 东邪西毒 updated in mongoDB
---------------
tt0095327
movie: 萤火虫之墓 updated in mongoDB
---------------
tt1010048
movie: 贫民窟的百万富翁 updated in mongoDB
---------------
tt0414387
movie: 傲慢与偏见 updated in mongoDB
---------------
tt2194499
movie: 时空恋旅人 updated in mongoDB
---------------
tt0947798
movie: 黑天鹅 updated in mongoDB
---------------
tt0209144
movie: 记忆碎片 updated in mongoDB
---------------
tt4078856
movie: 心迷宫 updated in mongoDB
---------------
tt0101020
movie: 纵横四海 updated in mongoDB
---------------
tt3011894
movie: 荒蛮故事 updated in mongoDB
---------------
tt0099674
movie: 教父3 updated in mongoDB
---------------
tt0095953
movie: 雨人 updated in mongoDB
---------------
tt0107808
movie: 完美的世界 updated in mongoDB
---------------
tt0790636
movie: 达拉斯买家俱乐部 updated in mongoDB
---------------
tt0118694
movie: 花样年华 updated in mongoDB
---------------
tt5027774
movie: 三块广告

In [93]:
# Load the whole collection into a DataFrame and cast the scraped string
# columns to numeric dtypes (in memory only; the database is untouched here).
douban_data = pd.DataFrame(list(col.find())).astype(
    {'id': int, 'rate': float, 'review': int}
)

In [95]:
# Cast the rank column to int as well; bracket access is required because
# `.rank` on a DataFrame is a method, not the column.
douban_data['rank'] = douban_data['rank'].astype(int)

In [96]:
# Rename the 'rank' field to 'douban_rank' on every document in the collection.
col.update_many({}, {'$rename': {'rank': 'douban_rank'}})

<pymongo.results.UpdateResult at 0x10cfefb08>

In [98]:
# Write the numeric values back to MongoDB in a single bulk request.
# Documents are matched on '_id' (always present and unique) rather than on
# 'imdb_id', which may be missing for a movie whose detail scrape failed and
# is not guaranteed to be unique.
operations = [
    UpdateOne(
        {'_id': row['_id']},
        {
            '$set': {
                # Explicit casts ensure plain Python scalars are handed to the
                # BSON encoder regardless of how pandas boxes the row values.
                'id': int(row['id']),
                'rate': float(row['rate']),
                'review': int(row['review']),
                'douban_rank': int(row['rank']),
            }
        },
    )
    for index, row in douban_data.iterrows()
]
# bulk_write rejects an empty operation list, so skip it for an empty frame.
if operations:
    col.bulk_write(operations, ordered=False)

In [99]:
# Rename the 'id' field to 'douban_id' on every document in the collection.
col.update_many({}, {'$rename': {'id': 'douban_id'}})

<pymongo.results.UpdateResult at 0x10cfb2908>