In [None]:
from curl_cffi import requests
from fake_useragent import UserAgent
import numpy as np
from rich.progress import (
    BarColumn,
    MofNCompleteColumn,
    Progress,
    TextColumn,
    TimeElapsedColumn,
    TimeRemainingColumn,
)
import pandas as pd
import parsel

import asyncio
import random


# Request headers sent with every page fetch. The User-Agent value is drawn
# once from fake_useragent when this cell runs and then reused.
headers = {"Accept": "*/*"}
headers["User-Agent"] = UserAgent().random

# Shared progress display for the scraper. Columns, left to right:
# task description, percentage, bar, completed/total, elapsed, remaining.
progress = Progress(
    "[progress.description]{task.description}",
    TextColumn("[progress.percentage]{task.percentage:>3.0f}%"),
    BarColumn(),
    MofNCompleteColumn(),
    TimeElapsedColumn(),
    TimeRemainingColumn(),
)

# Page numbers that failed to download or produced no rows; filled in by
# get_data and printed after the scrape finishes.
miss_pages = list()


async def get_data(session, page, task, max_retries=3):
    """Fetch one result page of the QUT academic-profile search and parse it.

    Args:
        session: Object whose ``get(url, headers=...)`` returns an awaitable
            response exposing ``.text`` (the notebook passes a curl_cffi
            ``AsyncSession``).
        page: Page number substituted into the ``result_1066507_result_page``
            query parameter.
        task: Task id on the module-level ``progress`` bar; it is advanced by
            one on every return path.
        max_retries: Maximum number of request attempts before the page is
            given up on.

    Returns:
        A DataFrame with one row per parsed profile plus a ``page`` column, or
        an empty DataFrame when the page could not be fetched or parsed.

    Side effects:
        Appends ``page`` to the module-level ``miss_pages`` list whenever the
        page fails or yields no rows, and prints retry diagnostics.
    """
    url = (
        "https://www.qut.edu.au/about/our-people/academic-profiles/search"
        "?SQ_ASSET_CONTENTS&query=&profilename=&discipline=&divfac=&school="
        f"&school_all=1&result_1066507_result_page={page}&ajax=true"
    )

    # Stays None unless one of the attempts succeeds.
    response = None
    for attempt in range(max_retries):
        try:
            response = await session.get(url, headers=headers)
            if attempt > 0:
                print(f"第{page}页第{attempt}次重试,成功连接")
            break
        except requests.errors.RequestsError:
            print(f"第{page}页连接失败,重试中")
            # range(max_retries) ends at max_retries - 1, so that index is the
            # final attempt: stop here instead of sleeping for nothing.
            if attempt >= max_retries - 1:
                break
            sleep_time = random.randint(1, 5)
            print(f"随机等待{sleep_time}秒")
            await asyncio.sleep(sleep_time)
        except Exception as e:
            print(f"页面 {page} 遇到未知错误: {e}")
            miss_pages.append(page)
            progress.update(task, advance=1)
            return pd.DataFrame()

    if response is None:
        # Every attempt failed (or max_retries was 0): record the page as
        # missed rather than falling through with no response to parse.
        progress.update(task, advance=1)
        miss_pages.append(page)
        return pd.DataFrame()

    s = parsel.Selector(response.text)

    names = s.xpath("//div[2]/div/a/text()").getall()
    faculty = s.xpath('//dd[@class="col-xs-12"]/text()').getall()
    email = s.xpath('//a[@class="email"]/text()').getall()
    position = s.xpath(
        '//dd[@class="col-xs-12 font-stack-headings-small"]/text()'
    ).getall()

    try:
        df = pd.DataFrame(
            {
                "University": "Queensland University of Technology",
                "Faculties": faculty,
                "Departments/Centre": np.nan,
                "Position Level/Title": position,
                "Name": names,
                "Email": email,
            }
        )
    except ValueError:
        # pandas raises ValueError when the extracted lists differ in length.
        # Treat the page as missed so one bad page does not abort the whole
        # asyncio.gather() run.
        print(f"页面 {page} 各字段数量不一致,已记录为缺失页")
        miss_pages.append(page)
        progress.update(task, advance=1)
        return pd.DataFrame()

    df["page"] = page

    if df.empty:
        miss_pages.append(page)

    progress.update(task, advance=1)

    return df


pages = 174


task = progress.add_task("[red]Downloading...", total=174)

with progress:
    async with requests.AsyncSession(max_clients=25) as session:
        tasks = [get_data(session, page, task) for page in range(1, pages + 1)]

        results = await asyncio.gather(*tasks)
        df = pd.concat(results)

        df.to_csv(f"qut{pages}_.csv", index=False)
        if miss_pages:
            print(miss_pages)

In [None]:
# Filter the scraped staff list by surname and position
import re

import numpy as np
import pandas as pd

# NOTE(review): the scraping cell in this notebook writes f"qut{pages}_.csv"
# with columns already named "Faculties" / "Position Level/Title", whereas
# this cell reads "./qut174.csv" and expects "Faculty" / "Position" columns.
# Confirm that this input file is the intended one.
df = pd.read_csv("./qut174.csv")

df = pd.DataFrame(
    {
        "University": "Queensland University of Technology",
        "Faculties": df["Faculty"],
        "Departments/Centre": np.nan,
        "Position Level/Title": df["Position"],
        "Name": df["Name"],
        "Email": df["Email"],
    }
)
# Tidy the Name, Position and Email columns.
# Any prefix ending in "Professor " is collapsed to "dr ", then everything
# after the first whitespace is kept, which drops the leading title token.
# expand=False yields a Series instead of a one-column DataFrame.
df["Name"] = (
    df["Name"]
    .str.replace(r".*Professor ", "dr ", regex=True)
    .str.extract(r"\s(.*)", expand=False)
)
# Collapse newlines, tabs and repeated spaces in the position to one space.
df["Position Level/Title"] = df["Position Level/Title"].str.replace(
    r"\s+", " ", regex=True
)
df["Email"] = df["Email"] + "@qut.edu.au"

df_condition = pd.read_csv(
    "../姓氏列表/常见姓氏列表.csv",
)


# Keep rows whose Name contains one of the listed surnames as a whole word.
surnames = df_condition["surname"].str.strip().dropna().unique().tolist()
# Drop blank entries: an empty alternative would match at every word boundary
# and therefore select every row.
surnames = [surname for surname in surnames if surname]
# Build one alternation pattern separated by '|'. Surnames are escaped so
# characters such as '.' or '(' are matched literally.
name_pattern = r"\b(?:" + "|".join(re.escape(surname) for surname in surnames) + r")\b"

df_filter = df[df["Name"].str.contains(name_pattern, case=False, na=False)]

# Filter by position. na=False treats a missing position as "no match";
# without it the mask contains NaN and boolean indexing raises.
position_pattern = df_condition["in position"].str.cat(sep="|")
df_filter = df_filter[
    df_filter["Position Level/Title"].str.contains(
        position_pattern, regex=True, na=False
    )
]

# Rows whose position exactly matches one of the "out position" patterns.
# NOTE(review): df1 is computed but never written to the workbook below.
# Confirm whether these rows were meant to be excluded from df_filter.
df1 = df_filter[
    df_filter["Position Level/Title"].str.contains(
        pat=("^" + df_condition["out position"] + "$").str.cat(sep="|"),
        regex=True,
        na=False,
    )
]

# The workbook name carries the full and the filtered row counts.
with pd.ExcelWriter(
    path=f"{df.shape[0]}_{df_filter.shape[0]}_Queensland University of Technology.xlsx"
) as writer:
    df.to_excel(writer, index=False, sheet_name="Full Staff List")
    df_filter.to_excel(writer, index=False, sheet_name="Selected Staff List")

In [None]:
from rich.progress import (
    BarColumn,
    MofNCompleteColumn,
    Progress,
    TextColumn,
    TimeElapsedColumn,
    TimeRemainingColumn,
)
import time

# Number of simulated work steps. It is also passed as the task total so the
# bar does not depend on rich's default total happening to equal the loop
# count.
total_steps = 100

# Custom progress bar layout: description, bar, percentage, completed/total,
# elapsed time and remaining time, with bullet separators.
progress_bar = Progress(
    "[progress.description]{task.description}",
    BarColumn(),
    TextColumn("[progress.percentage]{task.percentage:>3.0f}%"),
    MofNCompleteColumn(),
    TextColumn("•"),
    TimeElapsedColumn(),
    TextColumn("•"),
    TimeRemainingColumn(),
)

task1 = progress_bar.add_task("Task 1", total=total_steps)
with progress_bar:
    for _ in range(total_steps):
        # Placeholder for real work.
        time.sleep(0.1)
        progress_bar.update(task1, advance=1)