In [5]:
import pandas as pd
import re

def extract_additional_details(row):
    details = {}
    if pd.notna(row['信息']):
        # Basic details
        birth_search = re.search(r'(\d{4}年\d{1,2}月)', row['信息'])
        gender_search = re.search(r'男|女', row['信息'])
        birth_place_search = re.search(r'，(\w+人)，', row['信息'])

        # details['出生年月'] = birth_search.group(1) if birth_search else None
        details['性别'] = gender_search.group(0) if gender_search else None
        # details['出生地'] = birth_place_search.group(1) if birth_place_search else None

        # Educational background
        edu_search = re.search(r'(\w+大学)', row['信息'])
        details['教育背景'] = edu_search.group(1) if edu_search else None
        
        # Degree
        degree_search = re.search(r'(\w+博士|\w+硕士|\w+学士)', row['信息'])
        details['学位'] = degree_search.group(1) if degree_search else None
        
        # Position
        position_search = re.search(r'(\w+博士后|博士生导师|教授|副教授)', row['信息'])
        details['职位'] = position_search.group(1) if position_search else None

        # Research direction
        research_dir_search = re.search(r'研究方向为(\w+)', row['信息'])
        details['研究方向'] = research_dir_search.group(1) if research_dir_search else None

        # Research projects
        project_search = re.search(r'科研项目(\w+)', row['信息'])
        details['科研项目'] = project_search.group(1) if project_search else None

        # Contact and email
        contact_search = re.search(r'联系方式[:：](\d+)', row['信息'])
        email_search = re.search(r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}', row['信息'])

        details['联系方式'] = contact_search.group(1) if contact_search else None
        details['email'] = email_search.group(0) if email_search else None
        
    return details

data = pd.read_csv('teacher_info.csv')
# Apply the function to extract additional details
additional_detailed_info = data.apply(extract_additional_details, axis=1, result_type='expand')
additional_combined_data = pd.concat([data[['姓名', 'title', 'dept', 'url']], additional_detailed_info], axis=1)

# Convert to JSON
additional_json_result = additional_combined_data.to_json(orient='records', force_ascii=False)

# json to dict
import json
additional_dict_result = json.loads(additional_json_result)
additional_dict_result

[{'姓名': '张靖',
  'title': '教授',
  'dept': '电器工程及其自动化',
  'url': 'http://ee.gzu.edu.cn/2017/0918/c3595a23870/page.htm',
  '性别': None,
  '教育背景': '贵州大学',
  '学位': '做出突出贡献的工程硕士',
  '职位': '教授',
  '研究方向': None,
  '科研项目': None,
  '联系方式': None,
  'email': 'zhangjing@gzu.edu.cn'},
 {'姓名': '刘敏',
  'title': '教授',
  'dept': '电器工程及其自动化',
  'url': 'http://ee.gzu.edu.cn/2017/0918/c3595a23869/page.htm',
  '性别': None,
  '教育背景': None,
  '学位': None,
  '职位': None,
  '研究方向': None,
  '科研项目': None,
  '联系方式': None,
  'email': None},
 {'姓名': '韩松',
  'title': '教授',
  'dept': '电器工程及其自动化',
  'url': 'http://ee.gzu.edu.cn/2017/0914/c3595a23864/page.htm',
  '性别': '男',
  '教育背景': '浙江大学',
  '学位': '浙江大学博士',
  '职位': '加拿大阿尔伯特大学博士后',
  '研究方向': '电力系统与综合能源',
  '科研项目': None,
  '联系方式': None,
  'email': 'shan@gzu.edu.cn'},
 {'姓名': '郝正航',
  'title': '教授',
  'dept': '电器工程及其自动化',
  'url': 'http://ee.gzu.edu.cn/2017/0918/c3595a23868/page.htm',
  '性别': '男',
  '教育背景': '先后在天津大学',
  '学位': '清华大学及许继集团完成博士及博士',
  '职位': '清华大学及许继集团完成博士及博士后',


In [6]:
def merge_dicts_with_list_values(dict_list):
    # 确定所有字典中所有唯一键的集合
    all_keys = set(key for d in dict_list for key in d.keys())

    # 初始化最终的字典，每个键对应一个空列表
    merged_dict = {key: [] for key in all_keys}

    # 遍历每个字典和所有键
    for key in all_keys:
        for d in dict_list:
            # 如果当前字典有这个键，则添加它的值；否则，添加一个空值
            merged_dict[key].append(d.get(key, None))  # 用None作为空值，也可以用""或其他

    return merged_dict


result_df = pd.DataFrame(
    merge_dicts_with_list_values(additional_dict_result)
)

for col in result_df.columns:
    not_missing = result_df[col].notnull().sum()
    if not_missing <= 3:
        result_df.drop(columns=col, inplace=True)

result_df.to_csv('teacher_detail.csv', index=False, encoding='utf-8-sig')