In [4]:
import requests
import logging
import re
import pymongo
from pyquery import PyQuery as pq
from urllib.parse import urljoin

# Emit records at INFO and above, each prefixed with a timestamp and its level.
logging.basicConfig(level=logging.INFO, 
                    format='%(asctime)s - %(levelname)s: %(message)s')

# Root of the site being scraped; listing and detail URLs are built from it.
BASE_URL = 'https://ssr1.scrape.center'
# NOTE(review): presumably the number of listing pages to crawl -- it is not
# used by any of the visible cells; confirm against the crawl loop.
TOTAL_PAGE = 10

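'''
Purpose: fetch the given page and return its text
Parameters: url (address of the page to fetch)
Returns: response.text (the page text) when the status code is 200; otherwise None
'''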

def scrape_page(url, timeout=10):
    """Fetch a page over HTTP and return its body text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may block indefinitely on a stalled
            connection; a timeout surfaces as a ``requests.RequestException``
            and is handled like any other request failure.

    Returns:
        The response text when the server answers with status 200.
        ``None`` when the status code is anything else, or when the request
        raises ``requests.RequestException``. Both failure cases are logged.
    """
    logging.info('scraping %s', url)
    try:
        response = requests.get(url, timeout=timeout)
        if response.status_code == 200:
            return response.text
        # A non-200 answer is a failed scrape, so report it at the same level
        # as the exception branch below.
        logging.error('get invalid status code %s while scraping %s',
                      response.status_code, url)
    except requests.RequestException:
        # Best-effort scraper: record the traceback and let the caller see
        # None rather than propagating the error.
        logging.error('error occurred while scraping %s', url, exc_info=True)
    return None

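'''
Purpose: fetch one page of the movie list and return its text
Parameters: page (page number of the movie list)
Returns: result of scrape_page (text of the list page, or None if the fetch failed)
'''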

def scrape_index(page):
    """Fetch one page of the movie listing.

    Args:
        page: Page number, interpolated into ``{BASE_URL}/page/{page}``.

    Returns:
        Whatever ``scrape_page`` returns for that listing URL.
    """
    return scrape_page(f'{BASE_URL}/page/{page}')

'''
Purpose: parse a movie list page and yield the detail-page URL of each movie
Parameters: html (text of the movie list page)
Yields: detail_url (URL of a movie detail page)
'''

def parse_index(html):
    """Yield the absolute detail-page URL of every movie on a listing page.

    Args:
        html: Text of a listing page, as returned by ``scrape_index``. May be
            ``None`` or empty when the fetch failed.

    Yields:
        One absolute URL per ``.el-card .name`` element that carries an
        ``href``, resolved against ``BASE_URL``. Nothing is yielded for
        empty or missing ``html``.
    """
    if not html:
        # scrape_index hands back None when the fetch failed; there is
        # nothing to parse, so finish the generator without touching pq().
        return
    doc = pq(html)
    for link in doc('.el-card .name').items():
        href = link.attr('href')
        if not href:
            # urljoin returns the base unchanged for an empty/None url, which
            # would report the site root as a "detail" page. Skip instead.
            continue
        detail_url = urljoin(BASE_URL, href)
        logging.info('get detail url %s', detail_url)
        yield detail_url

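'''
Purpose: fetch a movie detail page and return its text
Parameters: url (URL of the movie detail page)
Returns: result of scrape_page (text of the detail page, or None if the fetch failed)
'''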

def scrape_detail(url):
    """Fetch a movie detail page.

    Args:
        url: Address of the detail page.

    Returns:
        Whatever ``scrape_page`` returns for ``url``.
    """
    detail_html = scrape_page(url)
    return detail_html

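'''
Purpose: parse a movie detail page and return the movie's fields
Parameters: html (text of the movie detail page)
Returns: dict with the keys cover, name, categories, published_at, drama, score
'''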

def parse_detail(html):
    """Extract the movie fields from a detail page.

    Args:
        html: Text of a movie detail page, as returned by ``scrape_detail``.

    Returns:
        A dict with the keys:
            cover: ``src`` of the ``img.cover`` element.
            name: text of the ``a > h2`` element.
            categories: list with the text of each ``.categories button span``.
            published_at: the ``YYYY-MM-DD`` date found in the info line that
                contains "上映", or ``None`` when no such date is present.
            drama: text of ``.drama p``.
            score: text of ``.score`` converted to ``float``, or ``None`` when
                that text is empty.
    """
    doc = pq(html)
    cover = doc('img.cover').attr('src')
    name = doc('a > h2').text()
    categories = [item.text() for item in doc('.categories button span').items()]

    # The info line looks like "1993-07-26 上映"; keep only the date part.
    # The pattern has no capture group, so the date is the whole match,
    # i.e. group(0). Asking for group(1) raises IndexError.
    info_text = doc('.info:contains(上映)').text()
    date_match = re.search(r'\d{4}-\d{2}-\d{2}', info_text or '')
    published_at = date_match.group(0) if date_match else None

    drama = doc('.drama p').text()
    score_text = doc('.score').text()
    score = float(score_text) if score_text else None
    return {
        'cover': cover,
        'name': name,
        'categories': categories,
        'published_at': published_at,
        'drama': drama,
        'score': score
    }

In [26]:
# Scratch check of the score extraction used in parse_detail.
# NOTE(review): relies on `doc` from the cell that loads detail/1 -- run that
# cell first on a fresh kernel.
score = doc('.score').text()
score = float(score) if score else None  # empty text -> None instead of float('')
print(score)

9.5


In [7]:
# Fetch one detail page and parse it once, so the scratch cells can try
# selectors against `doc` without re-downloading.
detail_html = scrape_detail('https://ssr1.scrape.center/detail/1')
doc = pq(detail_html)

2024-01-20 11:17:54,128 - INFO: scraping https://ssr1.scrape.center/detail/1


In [8]:
# Cover image URL: the `src` attribute of the img.cover element.
cover = doc('img.cover').attr('src')
print(cover)

https://p0.meituan.net/movie/ce4da3e03e655b5b88ed31b5cd7896cf62472.jpg@464w_644h_1e_1c


In [9]:
# Movie title: text of the <h2> that is a direct child of an <a>.
name = doc('a > h2').text()
print(name)

霸王别姬 - Farewell My Concubine


In [10]:
# Category labels: one entry per <span> inside the .categories buttons.
categories = [item.text() for item in doc('.categories button span').items()]
print(categories)

['剧情', '爱情']


In [24]:
# Pull the release date out of the info line that contains "上映".
# The pattern has no capture group, so group(0) (the whole match) is the date;
# the result is None when the text is empty or holds no date.
published_at = doc('.info:contains(上映)').text()
print(published_at)
published_at = re.search(r'\d{4}-\d{2}-\d{2}', published_at).group(0) \
        if published_at and re.search(r'\d{4}-\d{2}-\d{2}', published_at) else None
print(published_at)

1993-07-26 上映
1993-07-26


In [19]:
# groups() returns only the capture groups; this pattern defines none, so the
# result is an empty tuple even though the search itself matched.
published_at = re.search(r'\d{4}', '1993-07-26 上映').groups()
print(published_at)

()


In [22]:
# group(0) is the entire matched text, available even without capture groups.
print(re.search('www', 'www.runoob.com').group(0))

www
