In [3]:
import requests
import expanddouban
from bs4 import BeautifulSoup
import csv
import pandas as pd
import numpy as np
from collections import Counter
import operator

In [4]:
"""
part1 : return a string corresponding to the URL of douban movie lists given category and location.
观察豆瓣的url格式，写一个方法用来生成不同类型地区对应的URL
"""


def getMovieUrl(category, location):
    """Build the Douban tag-browser URL for one category and location.

    The URL always filters on the tag "电影" plus ``category``. ``location``
    is appended as an extra tag unless it is the catch-all value '全部地区'.

    Args:
        category: Category tag to insert into the URL.
        location: Location tag, or '全部地区' for no location filter.

    Returns:
        The URL as a string.
    """
    base_url = 'https://movie.douban.com/tag/#/?sort=S&range=9,10&tags=电影,{}'.format(category)
    # The catch-all location means "no location tag at all".
    if location == '全部地区':
        return base_url
    return base_url + ',{}'.format(location)

In [9]:
'''
part 2： a function to help "click" load more button to get more items

'''

from selenium import webdriver
import time 

"""
url: the douban page we will get html from
loadmore: whether or not to click the 'load more' button at the bottom of the page
waittime: seconds the browser will wait after the initial load and after each click
""" 
def getHtml(url, loadmore = False, waittime = 2):
    """Open ``url`` in Chrome and return the page source after it has loaded.

    Args:
        url: Page to open.
        loadmore: When true, keep clicking the element with class "more"
            until it can no longer be found or clicked.
        waittime: Seconds to sleep after the initial load and after every
            successful click, giving the page time to render.

    Returns:
        The browser's page source as a string.

    Raises:
        selenium.common.exceptions.WebDriverException: if the browser cannot
            be started or the initial navigation fails.
    """
    # Local imports keep this cell self-contained; selenium is already a
    # dependency of this cell via `from selenium import webdriver`.
    from selenium.common.exceptions import WebDriverException
    from selenium.webdriver.common.by import By

    browser = webdriver.Chrome('chromedriver')
    try:
        browser.get(url)
        time.sleep(waittime)
        if loadmore:
            while True:
                try:
                    # The generic locator API is used instead of
                    # find_element_by_class_name, which newer Selenium
                    # releases no longer provide.
                    browser.find_element(By.CLASS_NAME, "more").click()
                except WebDriverException:
                    # Button missing or not clickable: nothing more to load.
                    break
                time.sleep(waittime)
        return browser.page_source
    finally:
        # Always release the browser, even if navigation or a click fails.
        browser.quit()

# for test
#url = "https://movie.douban.com/tag/#/?sort=S&range=9,10&tags=电影,剧情,美国"
#html = getHtml(url)
#print(html) 

In [5]:
"""
part3: Movie class
建一个类，接下来每一部电影会被当作一个对象来存储。
"""


class Movie:
    """One scraped movie, stored as a plain attribute holder.

    Attributes:
        name: Movie title.
        rate: Movie rating.
        category: Movie category.
        location: Movie location (region).
        info_link: Link to the movie's page.
        cover_link: Link to the movie's poster image.
    """

    def __init__(self, m_title, m_rate, m_category, m_location, m_url, m_picture_url):
        # Values are stored as given; no validation or conversion happens here.
        self.name, self.rate = m_title, m_rate
        self.category, self.location = m_category, m_location
        self.info_link, self.cover_link = m_url, m_picture_url

In [6]:
"""
part4: return a list of Movie objects with the given category and location.
"""


def getMovies(category, location):
    """Scrape the listing page for ``category`` / ``location`` into Movie objects.

    The page is fetched with ``getHtml`` (clicking "load more" until it is
    exhausted) and every direct ``<a>`` child of the ``list-wp`` container
    inside ``#content`` becomes one ``Movie``.

    Args:
        category: Category tag passed to ``getMovieUrl``; also stored on
            every returned Movie.
        location: Location tag passed to ``getMovieUrl``; also stored on
            every returned Movie.

    Returns:
        A list of ``Movie`` objects. The list is empty when the page has no
        listing container.
    """
    html = getHtml(getMovieUrl(category, location), loadmore=True, waittime=3)

    # Use BeautifulSoup to parse the HTML doc
    soup = BeautifulSoup(html, "html.parser")

    # find() returns None when nothing matches; treat a page without the
    # listing container as "no movies" instead of raising AttributeError.
    content = soup.find(id="content")
    listing = content.find(class_="list-wp") if content is not None else None
    if listing is None:
        return []

    movie_object_list = []
    for mov in listing.find_all("a", recursive=False):
        title = mov.find(class_="title").string
        rate = mov.find(class_="rate").string
        if rate is None:
            # Placeholder stored when the rating element has no text.
            rate = '该电影尚无评分'
        url = mov.get("href")
        if url is None:
            # Placeholder stored when the anchor has no href.
            url = '该电影尚无豆瓣链接'
        cover_url = mov.find("img").get("src")
        movie_object_list.append(Movie(title, rate, category, location, url, cover_url))
    return movie_object_list

In [10]:

"""
part5: Output to .csv files
从网页上选取你最爱的三个电影类型，然后获取每个地区的电影信息后，我们可以获得一个包含三个类型、所有地区，评分超过9分的完整电影对象的列表。
"""

"""
get the location list from webpage
"""


def getlocationlist():
    """Read the selectable locations from the Douban tag-browser page.

    Returns:
        A list with the text of every ``tag`` element found in the chosen
        filter row, excluding the catch-all entry '全部地区'.
    """
    listing_url = getMovieUrl("全部类型", "全部地区")
    page = BeautifulSoup(
        expanddouban.getHtml(listing_url, loadmore=True, waittime=3),
        "html.parser",
    )
    first_filter_row = page.find(id='content').find(class_='tags').find(class_='category')
    # Two sibling hops from the first 'category' element; presumably this
    # lands on the location filter row -- verify against the live page markup.
    location_row = first_filter_row.next_sibling.next_sibling
    tag_names = (entry.find(class_='tag').string for entry in location_row)
    return [tag_name for tag_name in tag_names if tag_name != '全部地区']


# Every location offered by the site, excluding the catch-all entry.
location_list = getlocationlist()

story_list, crime_list, fiction_list = [], [], []

# Scrape the three chosen categories for each location, one location at a time.
for place in location_list:
    crime_batch, story_batch, fiction_batch = (
        getMovies(genre, place) for genre in ("犯罪", "剧情", "科幻")
    )
    crime_list.extend(crime_batch)
    story_list.extend(story_batch)
    fiction_list.extend(fiction_batch)

# Combined result, grouped by category: story first, then crime, then fiction.
all_movie_list = story_list + crime_list + fiction_list


def writeToCsv(m_list, path='movies.csv'):
    """Write movies to a UTF-8 (with BOM) CSV file, one movie per row.

    Rows are written with ``csv.writer`` so that a field containing a comma,
    a double quote or a newline is quoted and the file can be read back with
    ``csv.reader``. Lines end with a single newline character.

    Args:
        m_list: Iterable of objects exposing ``name``, ``rate``, ``category``,
            ``location``, ``info_link`` and ``cover_link``.
        path: Destination file; defaults to 'movies.csv'. An existing file is
            overwritten.
    """
    with open(path, 'w', newline='', encoding='utf-8-sig') as f:
        writer = csv.writer(f, lineterminator='\n')
        writer.writerow(["名称", "评分", "种类", "地区", "链接", "海报"])
        for movie in m_list:
            writer.writerow([
                # Full-width commas in titles are still replaced with '~',
                # so files keep the same titles as before.
                movie.name.replace('，', '~'),
                movie.rate,
                movie.category,
                movie.location,
                movie.info_link,
                movie.cover_link,
            ])


# Persist every scraped movie (all three categories) via writeToCsv.
writeToCsv(all_movie_list)

In [13]:
# Number of Movie objects in the combined story + crime + fiction list.
len(all_movie_list)

378

In [11]:
"""
part6:统计你所选取的每个电影类别中，数量排名前三的地区有哪些，分别占此类别电影总数的百分比为多少？

你可能需要自己把这个任务拆分成多个步骤，统计每个类别的电影个数，统计每个类别每个地区的电影个数，排序找到最大值，做一定的数学运算等等，相信你一定可以的！

请将你的结果输出文件 output.txt

"""
# Column positions in movies.csv, matching the header row
# 名称,评分,种类,地区,链接,海报
CATEGORY_COLUMN = 2
LOCATION_COLUMN = 3
# Categories to report, in the order their lines appear in the output file.
REPORT_CATEGORIES = ("剧情", "科幻", "犯罪")
TOP_N = 3


def _format_category_summary(category, locations, top_n=TOP_N):
    """Build the one-line summary of the most frequent locations in a category.

    Args:
        category: Category name placed at the start of the line.
        locations: One location string per movie in this category.
        top_n: Maximum number of locations to list.

    Returns:
        The summary line without a trailing newline. Fewer than ``top_n``
        locations are listed when the category has fewer distinct locations.
    """
    total = len(locations)
    # Ascending stable sort, then reversed: the same order as reading the
    # ascending list from its end, so ties rank exactly as before.
    by_count = sorted(Counter(locations).items(), key=operator.itemgetter(1))
    top = by_count[::-1][:top_n]
    names = " ".join(name for name, _ in top)
    # `top` is empty when `total` is 0, so the division never runs then.
    shares = " ".join("{}%".format(round(count / total * 100, 2)) for _, count in top)
    return "{}电影类别中，数量排名前三的地区有为{}，分别占此类别电影总数的百分比为{}".format(category, names, shares)


with open('movies.csv', 'r', encoding='utf-8-sig', newline='') as m:
    csv_reader = csv.reader(m)
    # Skip the header row so it is not counted as a movie.
    next(csv_reader, None)
    movies_list = list(csv_reader)

locations_by_category = {category: [] for category in REPORT_CATEGORIES}
for row in movies_list:
    if len(row) <= LOCATION_COLUMN:
        continue  # blank or truncated line
    # Rows whose category is not one of the reported ones are ignored
    # rather than being counted under the last category.
    bucket = locations_by_category.get(row[CATEGORY_COLUMN])
    if bucket is not None:
        bucket.append(row[LOCATION_COLUMN])

summary_lines = [
    _format_category_summary(category, locations_by_category[category])
    for category in REPORT_CATEGORIES
]

with open('output-stats.txt', 'w', newline='', encoding='utf-8-sig') as ff:
    # One line per category, with no trailing newline after the last one.
    ff.write("\n".join(summary_lines))