In [None]:
from curl_cffi import requests
from rich import print
from fake_useragent import UserAgent
import parsel
import pandas as pd
import asyncio
from rich.progress import Progress, BarColumn, TimeElapsedColumn


class cityu:
    """Asynchronous scraper for the CityU academic staff directory.

    Directory pages are requested concurrently (bounded by a semaphore),
    parsed into DataFrames, concatenated, and written to a CSV file.
    Pages that yield no rows, or whose request/parse raised, are collected
    in ``null_pages`` so the caller can inspect or retry them.
    """

    DIRECTORY_URL = 'https://www.cityu.edu.hk/directories/people/academic'

    def __init__(self, page_list, max_concurrency=10):
        """Set up the concurrency limit, progress display, and page list.

        Args:
            page_list: Iterable of page numbers to request.
            max_concurrency: Maximum number of requests in flight at once.
        """
        self.semaphore = asyncio.Semaphore(max_concurrency)
        self.progress = Progress(
            "[progress.description]{task.description}",
            BarColumn(),
            "[progress.percentage]{task.percentage:>3.1f}%",
            TimeElapsedColumn(),
        )
        self.page_list = page_list
        self.null_pages = []
        # Built once and reused; only `.random` is evaluated per request.
        self._user_agent = UserAgent()

    def fetch_data(self, response, page):
        """Parse one directory page into a DataFrame.

        Args:
            response: HTML text of the page.
            page: Page number, stored in the ``page`` column of every row.

        Returns:
            DataFrame with columns ``zh_name``, ``en_name``,
            ``title_department``, ``email`` and ``page``; one row per child
            ``div`` of the ``view-content`` container. Fields that are not
            found in a row are ``None``.
        """
        s = parsel.Selector(response)
        divs = s.xpath('//div[@class="view-content"]/div')
        data = []
        for div in divs:
            zh_name = div.xpath('.//div[@class="zh"]/text()').get()
            en_name = div.xpath('.//div[@class="en"]/text()').get()
            title_dep = div.xpath('.//div[@class="scholar-positions"]/text()').get()
            email = div.xpath('.//div[@class="email"]/a/text()').get()
            data.append([zh_name, en_name, title_dep, email])

        df = pd.DataFrame(
            data, columns=['zh_name', 'en_name', 'title_department', 'email']
        )
        df['page'] = page
        return df

    async def fetch_page(self, session, page, task_id):
        """Fetch and parse a single directory page.

        Args:
            session: Async HTTP session whose ``get`` is awaited.
            page: Page number sent as the ``page`` query parameter.
            task_id: Progress task to advance once this page is done.

        Returns:
            The DataFrame produced by ``fetch_data`` for this page. If it is
            empty, ``page`` is appended to ``null_pages``.

        Raises:
            Whatever the request or the parser raises; the progress bar is
            advanced in either case.
        """
        params = {
            'page': f'{page}',
        }
        headers = {
            'User-Agent': self._user_agent.random,
        }
        try:
            # Only the network call is throttled; parsing runs outside the
            # semaphore so a slot is freed as soon as the response arrives.
            async with self.semaphore:
                response = await session.get(
                    self.DIRECTORY_URL, params=params, headers=headers
                )
            df = self.fetch_data(response.text, page)
        finally:
            # Count the page as processed even when it failed, so the bar
            # still reaches 100%.
            self.progress.update(task_id, advance=1)
        if df.empty:
            self.null_pages.append(page)
        return df

    async def main(self, output_path='cityu3.csv'):
        """Fetch every page in ``page_list`` and write the result to CSV.

        A page whose fetch raised does not abort the run: it is recorded in
        ``null_pages`` and reported on the progress console, and the pages
        that succeeded are still saved.

        Args:
            output_path: Destination CSV file.
        """
        # Materialise once so len() and iteration agree for any iterable.
        pages = list(self.page_list)
        data_task = self.progress.add_task("fetching data", total=len(pages))
        with self.progress:
            async with requests.AsyncSession() as session:
                tasks = [
                    self.fetch_page(session, page, data_task) for page in pages
                ]
                # return_exceptions keeps one failed page from discarding
                # the results of all the others.
                results = await asyncio.gather(*tasks, return_exceptions=True)

        frames = []
        for page, result in zip(pages, results):
            if isinstance(result, BaseException):
                self.null_pages.append(page)
                self.progress.console.print(f"page {page} failed: {result!r}")
            else:
                frames.append(result)

        # pd.concat raises on an empty list, so skip writing in that case.
        if not frames:
            return
        pd.concat(frames).to_csv(output_path, index=False)


# Run the scraper over pages 0-99. Top-level `await` is only valid inside a
# notebook / IPython cell, not in a plain Python script.
cityu1 = cityu(range(100))
await cityu1.main()
# Last expression of the cell: displays the page numbers collected in null_pages.
cityu1.null_pages

In [158]:
# Combine the two partial scrape results and list every page number in
# 0-140 that does not appear in the combined 'page' column.
df1 = pd.read_csv('./cityu1.csv')
df2 = pd.read_csv('./cityu2.csv')
df_city = pd.concat([df1, df2])
# Build the lookup set once instead of recomputing unique() for every i.
scraped_pages = set(df_city['page'].unique())
[i for i in range(141) if i not in scraped_pages]

[4, 14, 50, 54, 67, 104]

In [None]:
# Data cleaning: merge the two scraped CSV files, drop incomplete rows,
# split the combined 'title_department' string into 'title' and
# 'department', and export the result to Excel.

df1 = pd.read_csv('./cityu1.csv')
df2 = pd.read_csv('./cityu2.csv')

# ignore_index gives every row a unique label, so the index-aligned column
# assignments below do not depend on duplicate labels from the two files.
df_city = pd.concat([df1, df2], ignore_index=True)

# Keep only rows that have both a zh_name and an email. The .copy() makes
# the later column assignments operate on an independent frame rather than
# on a slice of the concatenated one.
df_city = df_city[
    df_city['zh_name'].notnull() & df_city['email'].notnull()
].copy()

# Remove the literal 'Affiliate, ' marker, then split on ', ' into one
# column per piece (columns are numbered 0, 1, 2, ...).
df_td = (
    df_city['title_department']
    .str.replace('Affiliate, ', "", regex=False)
    .str.split(', ', expand=True)
)

# Keep everything after the first whitespace character of en_name.
df_city['en_name'] = df_city['en_name'].str.extract(r'\s(.*)', expand=False)

# Even-numbered pieces are joined into 'title', odd-numbered pieces into
# 'department'. The column lists are derived from the actual split result
# so the code works for any number of pieces, not only exactly 16.
columns0 = list(df_td.columns[0::2])
columns1 = list(df_td.columns[1::2])

# Join the non-missing pieces of each row with newlines.
df_city['title'] = df_td[columns0].apply(
    lambda x: '\n'.join(x.dropna().astype(str)), axis=1
)
df_city['department'] = df_td[columns1].apply(
    lambda x: '\n'.join(x.dropna().astype(str)), axis=1
)

df_city.to_excel('cityu.xlsx', index=False)

In [None]:
import pandas as pd

# Filter the full staff list down to the positions listed in the
# 'in position' column of the filter file, then write both the full and
# the selected lists to one workbook whose name carries the two row counts.
university = 'City University of Hong Kong'
df = pd.read_excel('../cityu.xlsx')
df_filter = pd.read_csv('../姓氏列表/常见姓氏列表.csv')

# One anchored alternative per entry: '^a$|^b$|...'. Entries are used as
# regular expressions as-is (not escaped).
in_pattern = ('^' + df_filter['in position'] + '$').str.cat(sep='|')

# Disabled: analogous pattern built from the 'out position' column.
# out_pattern = ('^' + df_filter['out position'] + '$').str.cat(sep='|')

# na=False treats rows with a missing title as non-matching; without it the
# mask contains NaN and boolean indexing raises.
in_df = df[
    df['Position Level/Title'].str.contains(
        regex=True, pat=in_pattern, na=False
    )
]

# Disabled: selection of rows matching out_pattern.
# out = df[df['Position Level/Title'].str.contains(regex=True, pat=out_pattern)]

# File name format: <rows in full list>_<rows in in_df>_<university>.xlsx
with pd.ExcelWriter(
    f'{df.shape[0]}_{in_df.shape[0]}_{university}.xlsx'
) as writer:
    df.to_excel(writer, sheet_name='Full Staff List', index=False)
    in_df.to_excel(writer, sheet_name='Selected Staff List', index=False)

Index(['University', 'Faculties ', 'Departments/Centre',
       'Position Level/Title', 'Name', 'Email'],
      dtype='object')