In [None]:
import requests
from bs4 import BeautifulSoup
import time
# *************************************** Begin ******************************************
# Request a web page
def page_request(url, ua, timeout=10):
    """Fetch *url* and return the decoded response body.

    Args:
        url: Address of the page to download.
        ua: Mapping of HTTP request headers (e.g. the User-Agent) passed to
            ``requests.get``.
        timeout: Seconds to wait for the server before giving up, so that a
            stalled connection cannot hang the crawl indefinitely.

    Returns:
        The page text, or ``None`` when the request fails (the error is
        printed).
    """
    try:
        response = requests.get(url, headers=ua, timeout=timeout)
        response.raise_for_status()
        # Decode with the encoding detected from the body.
        response.encoding = response.apparent_encoding
        return response.text
    except requests.RequestException as e:
        print(e)
        return None

# Parse a web page
def page_parse(html):
    """Print rank, title, rating and link for every movie on a list page.

    Args:
        html: HTML text of one Top250 list page. ``None`` or an empty string
            (a failed download) is accepted and simply skipped.

    Returns:
        None. One ``rank-title-rating-link`` line is printed per movie.
    """
    # A failed download yields no HTML; there is nothing to parse then.
    if not html:
        return

    soup = BeautifulSoup(html, 'html.parser')
    grid = soup.find('ol', class_='grid_view')
    if grid is None:
        # The page does not contain the movie list; skip it instead of
        # failing with AttributeError on None.
        print('未找到电影列表，跳过该页')
        return

    for movie in grid.find_all('li'):
        rank = movie.find('em', class_='').text
        title = movie.find('span', class_='title').text
        rating = movie.find('span', class_='rating_num').text
        link = movie.find('a')['href']
        print(f'{rank}-{title}-{rating}-{link}')



# *************************************** End ******************************************
if __name__ == "__main__":
    print('**************开始爬取豆瓣电影**************')
    ua = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4421.5 Safari/537.36'}
    # The Douban Top250 list shows 25 movies per page; `start` is the index
    # of the first movie on each page.
    for start_num in range(0, 250, 25):
        url = f"https://movie.douban.com/top250?start={start_num}"
        html = page_request(url=url, ua=ua)
        if html is None:
            # The request failed (the error was already printed); move on
            # to the next page instead of parsing nothing.
            continue
        page_parse(html=html)
    # Report completion once, after every page has been processed.
    print('**************爬取完成**************')



In [None]:
import os
import re

import docx
import requests
from bs4 import BeautifulSoup
from docx.oxml.ns import qn
# ********************************************** Begin ************************************************
# Request a web page
def page_request(url, ua, timeout=10):
    """Fetch *url* and return the decoded response body.

    Args:
        url: Address of the page to download.
        ua: Mapping of HTTP request headers (e.g. the User-Agent) passed to
            ``requests.get``.
        timeout: Seconds to wait for the server before giving up, so that a
            stalled connection cannot hang the crawl indefinitely.

    Returns:
        The page text, or ``None`` when the request fails (the error is
        printed).
    """
    try:
        response = requests.get(url, headers=ua, timeout=timeout)
        response.raise_for_status()  # raise on HTTP error statuses
        # Decode with the encoding detected from the body.
        response.encoding = response.apparent_encoding
        return response.text
    except requests.RequestException as e:
        print(e)
        return None

# Parse a web page
def page_parse(html, ua):
    """Extract every movie on a list page and process its detail page.

    For each movie the rank, title, link, rating and rating count are read
    from the list page and passed to ``sub_page_requests`` together with the
    movie's link.

    Args:
        html: HTML text of one Top250 list page. ``None`` or an empty string
            (a failed download) is accepted and simply skipped.
        ua: HTTP request headers forwarded to ``sub_page_requests``.

    Returns:
        None.
    """
    # A failed download yields no HTML; there is nothing to parse then.
    if not html:
        return

    soup = BeautifulSoup(html, 'html.parser')
    grid = soup.find('ol', class_='grid_view')
    if grid is None:
        # The page does not contain the movie list; skip it instead of
        # failing with AttributeError on None.
        print('未找到电影列表，跳过该页')
        return

    for movie in grid.find_all('li'):
        rank = movie.find('em', class_='').text
        title = movie.find('span', class_='title').text
        link = movie.find('a')['href']
        rating = movie.find('span', class_='rating_num').text
        rating_count_str = movie.find('div', class_='star').find_all('span')[-1].text
        # Keep accepting the "(12345)" form, but fall back to the first run
        # of digits so text without parentheses does not crash on
        # ``None.group``. With no digits at all the count defaults to 0.
        match = (re.search(r'\((\d+)\)', rating_count_str)
                 or re.search(r'(\d+)', rating_count_str))
        rating_count = int(match.group(1)) if match else 0

        # Visit the movie's own page to collect its detailed information.
        sub_page_requests(link, ua, {
            'Rank': rank,
            'Title': title,
            'Link': link,
            'Rating': rating,
            'RatingCount': rating_count
        })

# Sub-page handler: request and parse a movie's detail page
# to collect the movie's detailed information.
def sub_page_requests(url, ua, data, timeout=10):
    """Download a movie's detail page, extend *data* with it and save it.

    Args:
        url: Address of the movie's detail page.
        ua: HTTP request headers passed to ``requests.get``.
        data: Dict holding the fields already read from the list page. It is
            updated in place with ``Director``, ``Screenwriter``, ``Actors``,
            ``Genre``, ``ReleaseDate``, ``Duration`` and ``Summary`` and then
            handed to ``save``.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        None. Request errors are printed and the movie is skipped; fields
        missing from the page are stored as empty strings.
    """
    # Only the network call can raise RequestException, so keep the try
    # body limited to it.
    try:
        response = requests.get(url, headers=ua, timeout=timeout)
        response.raise_for_status()  # raise on HTTP error statuses
        response.encoding = response.apparent_encoding
    except requests.RequestException as e:
        print(e)
        return

    soup = BeautifulSoup(response.text, 'html.parser')

    # Movie information block.
    info = soup.find('div', id='info')
    if info is None:
        # Without the info block none of the fields below can be read.
        print(f'未找到影片信息，跳过: {url}')
        return

    def stripped_text(node):
        # A field missing from the page becomes '' instead of raising
        # AttributeError on None and aborting the whole crawl.
        return node.text.strip() if node is not None else ''

    director = stripped_text(info.find('span', attrs={'class': 'attrs'}))

    # The screenwriter names are located relative to the "编剧" label; any
    # missing link in that chain means the field is absent.
    writer_label = info.find(text=re.compile('编剧'))
    try:
        screenwriter = writer_label.next_element.next_element.text.strip()
    except AttributeError:
        screenwriter = ''

    actors = [actor.text.strip() for actor in info.find_all('span', attrs={'class': 'actor'})]
    genre = stripped_text(info.find('span', property='v:genre'))
    release_date = stripped_text(info.find('span', property='v:initialReleaseDate'))
    duration = stripped_text(info.find('span', property='v:runtime'))

    # Plot summary.
    summary = stripped_text(soup.find('span', property='v:summary'))

    # Merge the detail fields into the record built from the list page.
    data.update({
        'Director': director,
        'Screenwriter': screenwriter,
        'Actors': ', '.join(actors),
        'Genre': genre,
        'ReleaseDate': release_date,
        'Duration': duration,
        'Summary': summary
    })

    # Persist the movie's information.
    save(data)

def save(data):
    """Write one movie's details to ``/root/result/<Title>.docx``.

    Args:
        data: Dict with the keys ``Rank``, ``Title``, ``Link``, ``Rating``,
            ``RatingCount``, ``Director``, ``Screenwriter``, ``Actors``,
            ``Genre``, ``ReleaseDate``, ``Duration`` and ``Summary``.

    Returns:
        None. The output directory is created when it does not exist.
    """
    out_dir = '/root/result'
    # exist_ok avoids the separate existence check (and its race).
    os.makedirs(out_dir, exist_ok=True)

    # Characters such as "/" would be read as path separators (or are not
    # allowed in file names), so replace them; ordinary titles are unchanged.
    safe_title = re.sub(r'[\\/:*?"<>|]', '_', data['Title'])
    file_path = os.path.join(out_dir, f"{safe_title}.docx")

    # Create the Word document.
    doc = docx.Document()

    # Font setup: Times New Roman as the font name, and 宋体 for the
    # East Asian font slot of the Normal style.
    normal_style = doc.styles['Normal']
    normal_style.font.name = 'Times New Roman'
    normal_style.element.rPr.rFonts.set(qn('w:eastAsia'), '宋体')

    # Write the crawled data into the document, one paragraph per field.
    paragraphs = [
        f"排名: {data['Rank']}",
        f"电影名称: {data['Title']}",
        f"豆瓣链接: {data['Link']}",
        f"评分: {data['Rating']} ({data['RatingCount']}人评价)",
        f"导演: {data['Director']}",
        f"编剧: {data['Screenwriter']}",
        f"主演: {data['Actors']}",
        f"类型: {data['Genre']}",
        f"上映日期: {data['ReleaseDate']}",
        f"片长: {data['Duration']}",
        f"剧情简介: {data['Summary']}",
    ]
    for text in paragraphs:
        doc.add_paragraph(text)

    doc.save(file_path)


# ********************************************** End ************************************************

if __name__ == "__main__":
    print('**************开始爬取豆瓣电影**************')
    ua = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4421.5 Safari/537.36'}
    # The Douban Top250 list shows 25 movies per page; `start` is the index
    # of the first movie on each page, so the last valid value is 225.
    for start_num in range(0, 250, 25):
        url = f"https://movie.douban.com/top250?start={start_num}"
        html = page_request(url=url, ua=ua)
        if html is None:
            # The request failed (the error was already printed); move on
            # to the next page instead of parsing nothing.
            continue
        # Collect the information of every movie on this page.
        page_parse(html=html, ua=ua)
    print('**************爬取完成**************')


In [None]:

# 爬虫相关模块
import requests
from bs4 import BeautifulSoup

# 发送邮箱相关模块
import smtplib
from email.mime.text import MIMEText
from email.header import Header

# 定时模块
import schedule
import time

# *********************************** Begin ********************************************
# Request a web page
def page_request(url, header, timeout=10):
    """Fetch *url* and return the decoded response body.

    Args:
        url: Address of the page to download.
        header: Mapping of HTTP request headers passed to ``requests.get``.
        timeout: Seconds to wait for the server before giving up, so that a
            stalled connection cannot block the scheduler indefinitely.

    Returns:
        The page text, or ``None`` when the request fails (the error is
        printed).
    """
    try:
        response = requests.get(url, headers=header, timeout=timeout)
        response.raise_for_status()
        # Decode with the encoding detected from the body.
        response.encoding = response.apparent_encoding
        return response.text
    except requests.RequestException as e:
        print(e)
        return None

# Parse a web page
def page_parse(html):
    """Extract the hot-search entries from the page.

    Args:
        html: HTML text of the hot-search page. ``None`` or an empty string
            (a failed download) yields an empty result.

    Returns:
        A list of dicts with the keys ``name``, ``link`` and ``heat``, one
        per ``<td class="td-02">`` cell that contains a link.
    """
    # A failed download yields no HTML; report "no entries" instead of
    # crashing inside the parser.
    if not html:
        return []

    soup = BeautifulSoup(html, 'html.parser')
    news = []

    for cell in soup.find_all('td', class_='td-02'):
        anchor = cell.find('a')
        if anchor is None:
            # A cell without a link carries no entry to report.
            continue

        # Keep the original class-restricted lookup, but fall back to any
        # <span> in the cell: "td-02" is the class of the cell itself, so
        # the heat span may not carry it.
        heat_tag = cell.find('span', class_='td-02')
        if heat_tag is None:
            heat_tag = cell.find('span')
        # An entry without a heat span gets an empty heat value.
        heat = heat_tag.text if heat_tag is not None else ''

        news.append({
            'name': anchor.text,
            'link': anchor.get('href', ''),
            'heat': heat,
        })

    return news

def sendMail(news):
    """E-mail the hot-search entries as a plain-text message.

    Args:
        news: Iterable of dicts with the keys ``name``, ``link`` and
            ``heat``.

    Returns:
        None. Success or failure is reported with a printed message; send
        errors are not propagated to the caller.
    """
    # Placeholder account settings; replace them with real values.
    sender = 'your_email@example.com'
    receiver = 'receiver_email@example.com'
    smtp_server = 'smtp.example.com'
    smtp_port = 25
    password = 'your_password'

    # Build the body in one pass instead of repeated string concatenation.
    message = '热搜信息如下：\n\n' + ''.join(
        f'热搜名称: {n["name"]}\n链接: {n["link"]}\n实时热度: {n["heat"]}\n\n'
        for n in news
    )

    msg = MIMEText(message, 'plain', 'utf-8')
    msg['From'] = sender
    msg['To'] = receiver
    msg['Subject'] = Header('微博热搜榜', 'utf-8')

    try:
        # NOTE(review): this logs in over a plain SMTP connection on port 25;
        # confirm whether the provider requires STARTTLS or SMTP over SSL.
        # The context manager closes the connection even when login or
        # sending fails; the timeout keeps a dead server from blocking.
        with smtplib.SMTP(smtp_server, smtp_port, timeout=10) as server:
            server.login(sender, password)
            server.sendmail(sender, [receiver], msg.as_string())
        print("邮件发送成功！")
    except (smtplib.SMTPException, OSError) as e:
        # Covers connection problems (OSError) and SMTP protocol errors.
        print(f"邮件发送失败：{e}")
# *********************************** End ********************************************

def job():
    """Crawl the Weibo hot-search list once and e-mail the entries found."""
    print('**************开始爬取微博热搜**************')
    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.60 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'zh-CN,zh-Hans;q=0.9',
        # NOTE(review): "br" is only decodable when a Brotli package is
        # installed alongside requests — confirm for the target environment.
        'Accept-Encoding': 'gzip, deflate, br',
        # Placeholder: replace with the Cookie value shown by your own
        # browser. It must be a string; the bare identifier used before
        # raised NameError as soon as the job ran.
        'Cookie': 'YOUR_COOKIE_HERE',
    }

    url = 'https://s.weibo.com/top/summary'
    html = page_request(url=url, header=header)
    if html is None:
        # The request failed (the error was already printed); try again on
        # the next scheduled run.
        return

    news = page_parse(html)
    if not news:
        print('未解析到热搜数据，本次不发送邮件')
        return

    # Deliver the crawl result; previously it was parsed and then discarded.
    sendMail(news)

if __name__ == "__main__":
    # Scheduled crawl: every 20 seconds fetch the Weibo hot-search list and
    # send the result to the configured mailbox.
    # The interval (20) can be changed to another value.
    schedule.every(20).seconds.do(job)
    while True:
        schedule.run_pending()
        # Sleep between checks so the loop does not spin at full CPU.
        time.sleep(1)







