In [8]:
import csv
import json
import os
import re

import requests
from prettytable import PrettyTable

# 爬取豆瓣电影

In [17]:
url = 'https://movie.douban.com/j/chart/top_list'

# Query-string parameters sent to the chart endpoint.
param = {
    'type': '24',
    'interval_id': '100:90',
    'action': '',
    'start': '60',  # index of the first movie to fetch
    'limit': '20',  # number of movies returned per request
}
# Present a desktop-browser User-Agent instead of the default one.
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36'
}
# Send the request. The timeout keeps the cell from hanging forever, and
# raise_for_status() surfaces HTTP errors before any JSON parsing is attempted.
response = requests.get(url=url, params=param, headers=headers, timeout=10)
response.raise_for_status()
try:
    list_data = response.json()
except ValueError:
    # The body was not JSON; show what actually came back, then re-raise the
    # original exception so the failure type is unchanged.
    print(f'Non-JSON response (HTTP {response.status_code}): {response.text[:200]!r}')
    raise

# Persist the parsed payload; create the target directory on first run.
output_path = 'crawledData/doubanMovie.json'
os.makedirs(os.path.dirname(output_path), exist_ok=True)
with open(output_path, 'w', encoding='utf-8') as fp:
    json.dump(list_data, fp=fp, ensure_ascii=False)
print('over!!!')

JSONDecodeError: Expecting value: line 1 column 1 (char 0)

# 豆瓣电影Top250

In [None]:
def _parse_top250_item(item):
    """Build one movie record from the inner HTML of a ``<li>`` element.

    Args:
        item: Text captured between ``<li>`` and ``</li>``.

    Returns:
        A dict with keys ``id``, ``url``, ``title``, ``starring``, ``year``
        and ``score``, or ``None`` when a required field is missing, which
        means the ``<li>`` is not a movie entry (e.g. a navigation link).
    """
    rank = re.search(r'<em class="">(\d+)</em>', item)
    # [^"]* keeps the capture inside a single href attribute value.
    link = re.search(r'<a href="([^"]*)" class="">', item)
    # Non-greedy so only the first title span is taken even when several
    # spans end up on the same line.
    title = re.search(r'<span class="title">(.*?)</span>', item)
    # First four-digit run that is followed by '&nbsp;' on the same line.
    year = re.search(r'(\d{4}).*&nbsp;', item)
    score = re.search(
        r'<span class="rating_num" property="v:average">(.*?)</span>', item)
    if not (rank and link and title and year and score):
        return None

    # Starring is optional; only the text up to the first space or dot
    # after the label is kept.
    starring = re.search(r'主演: (.*?)[ .].*<br>', item)
    return {
        'id': int(rank.group(1)),
        'url': link.group(1),
        'title': title.group(1),
        'starring': starring.group(1) if starring else '',
        'year': int(year.group(1)),
        'score': float(score.group(1)),
    }


def get_movie_top250_one_page(start):
    """Fetch one page of the Douban Top 250 list and parse its movies.

    Args:
        start: Offset placed in the ``start`` query parameter of the page
            URL (e.g. 0, 25, 50).

    Returns:
        A list of dicts with keys ``id`` (int), ``url``, ``title``,
        ``starring`` (empty string when absent), ``year`` (int) and
        ``score`` (float). ``<li>`` elements that are not movie entries are
        skipped.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    page_url = f'https://movie.douban.com/top250?start={start}&filter='

    # Without headers the fetched content came back empty, so a User-Agent
    # is sent first; if the content is still empty, more headers are needed.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36',
    }
    r = requests.get(page_url, headers=headers, timeout=10)
    r.raise_for_status()
    html = r.text

    # DOTALL lets '.' cross newlines because each <li> spans several lines.
    item_info = re.findall('<li>(.*?)</li>', html, re.DOTALL)

    movies = []
    for item in item_info:
        movie = _parse_top250_item(item)
        if movie is None:
            continue
        print(*movie.values())
        movies.append(movie)
    return movies


def get_movie_top250_all_page(pages=10, page_size=25):
    """Crawl consecutive Top 250 pages and merge their movies into one list.

    Args:
        pages: Number of pages to fetch, starting from the first one. The
            default of 10 reproduces the previously hard-coded loop; pass a
            smaller value (e.g. 1) for a quick trial run.
        page_size: Step between the ``start`` offsets of consecutive pages.

    Returns:
        One list holding the records returned by
        ``get_movie_top250_one_page`` for every offset, in page order.
    """
    movies = []
    for page in range(pages):
        movies.extend(get_movie_top250_one_page(page * page_size))
    return movies


def print_prettytable(movies):
    """Print the first ten movies as a text table.

    Each row shows the ``id``, ``title``, ``starring``, ``year`` and
    ``score`` entries of one movie dict, in that column order.
    """
    # (record key, column heading) pairs, in display order.
    columns = (
        ('id', '序号'),
        ('title', '电影名'),
        ('starring', '主演'),
        ('year', '上映年代'),
        ('score', '豆瓣评分'),
    )
    table = PrettyTable()
    table.field_names = [heading for _, heading in columns]
    for record in movies[:10]:
        table.add_row([record[key] for key, _ in columns])
    print(table)


def print_json(movies):
    """Print the first ten movies as JSON indented by four spaces.

    ``ensure_ascii=False`` keeps non-ASCII text (such as Chinese titles)
    readable instead of escaping it.
    """
    # Uses the module-level ``json`` import; no function-local import needed.
    print(json.dumps(movies[:10], indent=4, ensure_ascii=False))


def save_to_csv(movies, fname='top250.csv'):
    """Write the movie records to a CSV file, one row per movie.

    Args:
        movies: Iterable of dicts; the values of each dict become one row,
            in the dict's own order. No header row is written.
        fname: Destination path; an existing file is overwritten.

    The file is encoded as ``utf_8_sig``, i.e. UTF-8 with a leading BOM
    (presumably so spreadsheet tools detect the encoding). Values are
    quoted by the ``csv`` module when they contain commas, quotes or
    newlines, so such values no longer break the column layout.
    """
    # newline='' hands line-ending control to the csv writer, and
    # lineterminator='\n' keeps the LF row separator the file used before.
    with open(fname, 'w', encoding='utf_8_sig', newline='') as f:
        writer = csv.writer(f, lineterminator='\n')
        for movie in movies:
            writer.writerow([str(value) for value in movie.values()])


def save_to_mysql(movies):
    """Store the movies in a MySQL database.

    Placeholder: nothing is implemented yet, so the call does nothing and
    returns ``None``.
    """
    return None


# Driver: crawl the list, preview it in two formats, then save it to CSV.
if __name__ == '__main__':
    # Crawl with the function's default arguments.
    movies = get_movie_top250_all_page()
    # Table preview on stdout.
    print_prettytable(movies)
    # JSON preview on stdout.
    print_json(movies)
    # Written to save_to_csv's default file name.
    save_to_csv(movies)