In [2]:
# File paths shared by the cells below.
input_file = '../data/details.json'  # raw (large) JSON file
output_file = '../data/details_cleaned.json'  # cleaned, pretty-printed copy of the raw file
classroom_json = '../data/classroom_data.json'  # JSON read by the classroom-related cells

<h1>清理类</h1>

In [27]:
import json

# 读取并清理大 JSON 文件的指定字段
def clean_large_json(input_file, output_file, keys_to_remove):
    """Delete the given keys from every dict in a JSON file and save the result.

    The whole document is loaded into memory, every dict at any nesting depth
    loses the keys listed in ``keys_to_remove``, and the result is written to
    ``output_file`` pretty-printed (4-space indent, non-ASCII kept as-is).

    Args:
        input_file: Path of the JSON file to read.
        output_file: Path of the JSON file to write.
        keys_to_remove: Iterable of key names to delete wherever they occur.
    """
    with open(input_file, 'r', encoding='utf-8') as source:
        document = json.load(source)

    # Walk the tree with an explicit work list; dicts are edited in place.
    pending = [document]
    while pending:
        node = pending.pop()
        if isinstance(node, dict):
            for unwanted in keys_to_remove:
                node.pop(unwanted, None)
            pending.extend(node.values())
        elif isinstance(node, list):
            pending.extend(node)

    with open(output_file, 'w', encoding='utf-8') as target:
        json.dump(document, target, indent=4, ensure_ascii=False)



# Keys to strip from every object of the raw JSON. Each key is listed once;
# removal is by key name, so repeating an entry has no additional effect.
keys_to_remove = [
    "additionalLinks",
    "bookstore",
    "cfg",
    "catalog_descr",
    "materials",
    "enrollment_information",
    "reserve_caps",
    "messages",
    "notes",
]

# Read input_file, drop the keys above, and write the result to output_file.
clean_large_json(input_file, output_file, keys_to_remove)
print("指定字段已删除，清理后的 JSON 文件已保存为", output_file)


指定字段已删除，清理后的 JSON 文件已保存为 ../data/details_cleaned.json


In [28]:
import json

def remove_class_capacity_999(path=None):
    """Remove every JSON object whose ``class_capacity`` is 999 and rewrite the file.

    An object (dict) is removed when its own ``class_capacity`` equals ``999``
    or ``"999"``, at any nesting depth. A removed object disappears from its
    parent list, or its key disappears from its parent dict. If the root
    object itself matches, ``null`` is written.

    Note: ``None`` doubles as the internal "remove" marker, so JSON ``null``
    values (dict fields and list items) are dropped too. That side effect is
    deliberately preserved; other cells test for key presence and may depend
    on null fields being absent.

    Args:
        path: JSON file to clean in place. Defaults to the notebook-level
            ``output_file``.
    """
    if path is None:
        path = output_file

    with open(path, 'r', encoding='utf-8') as infile:
        data = json.load(infile)

    def prune(obj):
        """Return the pruned copy of ``obj``, or None when it must be dropped."""
        if isinstance(obj, dict):
            if obj.get("class_capacity") in ("999", 999):
                return None
            pruned = {}
            for key, value in obj.items():
                kept = prune(value)
                if kept is not None:
                    pruned[key] = kept
            return pruned
        if isinstance(obj, list):
            # Each item is pruned exactly once; pruning it a second time just
            # to test the result doubles the work at every nested list level.
            return [kept for kept in map(prune, obj) if kept is not None]
        return obj

    cleaned_data = prune(data)

    with open(path, 'w', encoding='utf-8') as outfile:
        json.dump(cleaned_data, outfile, indent=4, ensure_ascii=False)

    print("class_capacity 为 999 的项已删除，清理后的 JSON 文件已保存。")

# Apply the capacity-999 filter to output_file in place.
remove_class_capacity_999()


class_capacity 为 999 的项已删除，清理后的 JSON 文件已保存。


In [29]:
import json

def remove_online_instruction_mode(path=None):
    """Remove every JSON object whose ``instruction_mode`` is ``'Online'`` and rewrite the file.

    The check applies to dicts at any nesting depth. A removed object
    disappears from its parent list, or its key disappears from its parent
    dict. If the root object itself matches, ``null`` is written.

    Note: ``None`` doubles as the internal "remove" marker, so JSON ``null``
    values are dropped too. That side effect is deliberately preserved; other
    cells test for key presence and may depend on null fields being absent.

    Args:
        path: JSON file to clean in place. Defaults to the notebook-level
            ``output_file``.
    """
    if path is None:
        path = output_file

    with open(path, 'r', encoding='utf-8') as infile:
        data = json.load(infile)

    def prune(obj):
        """Return the pruned copy of ``obj``, or None when it must be dropped."""
        if isinstance(obj, dict):
            if obj.get("instruction_mode") == 'Online':
                return None
            pruned = {}
            for key, value in obj.items():
                kept = prune(value)
                if kept is not None:
                    pruned[key] = kept
            return pruned
        if isinstance(obj, list):
            # Prune each item once; re-running the recursion only to test the
            # result doubles the work at every nested list level.
            return [kept for kept in map(prune, obj) if kept is not None]
        return obj

    cleaned_data = prune(data)

    with open(path, 'w', encoding='utf-8') as outfile:
        json.dump(cleaned_data, outfile, indent=4, ensure_ascii=False)

    print("instruction_mode 为 'Online' 的项已删除，清理后的 JSON 文件已保存。")

# Apply the 'Online' filter to output_file in place.
remove_online_instruction_mode()


instruction_mode 为 'Online' 的项已删除，清理后的 JSON 文件已保存。


In [42]:
import json

def remove_tba_instructors(path=None):
    """Strip instructors named "To Be Announced" from every meeting and rewrite the file.

    For each dict that has a ``meetings`` list, every meeting's
    ``instructors`` list is filtered so that entries whose ``name`` is exactly
    ``"To Be Announced"`` are removed. The meeting itself is kept, possibly
    with an empty ``instructors`` list.

    Args:
        path: JSON file to clean in place. Defaults to the notebook-level
            ``output_file``.
    """
    if path is None:
        path = output_file

    with open(path, 'r', encoding='utf-8') as infile:
        data = json.load(infile)

    def is_tba(instructor):
        return isinstance(instructor, dict) and instructor.get("name") == "To Be Announced"

    def clean_instructors(obj):
        if isinstance(obj, dict):
            meetings = obj.get("meetings")
            # Only touch the expected shape (list of dicts with a list of
            # instructors); anything else is left as it is instead of crashing.
            if isinstance(meetings, list):
                for meeting in meetings:
                    if isinstance(meeting, dict) and isinstance(meeting.get("instructors"), list):
                        meeting["instructors"] = [
                            instructor for instructor in meeting["instructors"]
                            if not is_tba(instructor)
                        ]
            for value in obj.values():
                clean_instructors(value)
        elif isinstance(obj, list):
            for item in obj:
                clean_instructors(item)

    clean_instructors(data)

    with open(path, 'w', encoding='utf-8') as outfile:
        json.dump(data, outfile, indent=4, ensure_ascii=False)

    print("已删除 instructors 名为 'To Be Announced' 的条目，清理后的 JSON 文件已保存。")

# Remove "To Be Announced" instructors from output_file in place.
remove_tba_instructors()


已删除 instructors 名为 'To Be Announced' 的条目，清理后的 JSON 文件已保存。


In [48]:
import json

def clean_empty_instructors_tba_meets_and_empty_times(path=None):
    """Remove objects with no instructors, a "TBA" schedule, or an empty time, and rewrite the file.

    A dict at any nesting depth is removed when any of these holds:
      * it has an ``instructors`` list that is empty;
      * its ``meets`` value is ``"TBA"``;
      * its ``meeting_time_start`` or ``meeting_time_end`` is ``""``.
    A removed object disappears from its parent list, or its key disappears
    from its parent dict. If the root object itself matches, ``null`` is
    written.

    Note: ``None`` doubles as the internal "remove" marker, so JSON ``null``
    values are dropped too. That side effect is deliberately preserved; other
    cells test for key presence and may depend on null fields being absent.

    Args:
        path: JSON file to clean in place. Defaults to the notebook-level
            ``output_file``.
    """
    if path is None:
        path = output_file

    with open(path, 'r', encoding='utf-8') as infile:
        data = json.load(infile)

    def should_drop(obj):
        instructors = obj.get("instructors")
        if isinstance(instructors, list) and not instructors:
            return True
        if obj.get("meets") == "TBA":
            return True
        # .get() yields None for a missing key, which never equals "".
        return obj.get("meeting_time_start") == "" or obj.get("meeting_time_end") == ""

    def prune(obj):
        """Return the pruned copy of ``obj``, or None when it must be dropped."""
        if isinstance(obj, dict):
            if should_drop(obj):
                return None
            pruned = {}
            for key, value in obj.items():
                kept = prune(value)
                if kept is not None:
                    pruned[key] = kept
            return pruned
        if isinstance(obj, list):
            # Prune each item once; re-running the recursion only to test the
            # result doubles the work at every nested list level.
            return [kept for kept in map(prune, obj) if kept is not None]
        return obj

    cleaned_data = prune(data)

    with open(path, 'w', encoding='utf-8') as outfile:
        json.dump(cleaned_data, outfile, indent=4, ensure_ascii=False)

    print("已删除 instructors 为空、meets 为 'TBA'、以及 time 为空的项，清理后的 JSON 文件已保存。")

# Drop meetings without instructors, with "TBA" schedules, or with empty times (in place on output_file).
clean_empty_instructors_tba_meets_and_empty_times()


已删除 instructors 为空、meets 为 'TBA'、以及 time 为空的项，清理后的 JSON 文件已保存。


In [71]:
import json

def clean_classroom_names(path=None):
    """Remove classroom objects whose ``Name`` contains '/' and rewrite the file.

    A dict at any nesting depth is removed when its ``Name`` is a string
    containing ``'/'``. A removed object disappears from its parent list, or
    its key disappears from its parent dict. Non-string ``Name`` values are
    left alone instead of raising.

    Note: ``None`` doubles as the internal "remove" marker, so JSON ``null``
    values are dropped too; that side effect is preserved.

    Note: a later cell in this notebook defines another function with this
    same name (it normalizes the name format), which replaces this one in the
    kernel namespace once that cell has run.

    Args:
        path: JSON file to clean in place. Defaults to the notebook-level
            ``classroom_json``.
    """
    if path is None:
        path = classroom_json

    with open(path, 'r', encoding='utf-8') as infile:
        data = json.load(infile)

    def prune(obj):
        """Return the pruned copy of ``obj``, or None when it must be dropped."""
        if isinstance(obj, dict):
            name = obj.get("Name")
            if isinstance(name, str) and '/' in name:
                return None
            pruned = {}
            for key, value in obj.items():
                kept = prune(value)
                if kept is not None:
                    pruned[key] = kept
            return pruned
        if isinstance(obj, list):
            # Prune each item once; re-running the recursion only to test the
            # result doubles the work at every nested list level.
            return [kept for kept in map(prune, obj) if kept is not None]
        return obj

    cleaned_data = prune(data)

    with open(path, 'w', encoding='utf-8') as outfile:
        json.dump(cleaned_data, outfile, indent=4, ensure_ascii=False)

    print("已删除包含 '/' 的教室名称项，清理后的 JSON 文件已保存。")

# Drop classroom entries whose Name contains '/' (rewrites classroom_json in place).
clean_classroom_names()


已删除包含 '/' 的教室名称项，清理后的 JSON 文件已保存。


In [77]:
import json
import re

def clean_classroom_names(path=None):
    """Normalize every ``Name`` field in the classroom JSON and rewrite the file.

    For each dict at any nesting depth whose ``Name`` is a string:
      * every ``'-'`` is replaced by a space;
      * everything from the first ``'('`` to the end is removed;
      * surrounding whitespace is stripped.
    Non-string ``Name`` values are left untouched instead of raising.

    Note: an earlier cell in this notebook defines a different function with
    this same name (it removes names containing '/'); this definition
    replaces it in the kernel namespace.

    Args:
        path: JSON file to clean in place. Defaults to the notebook-level
            ``classroom_json``.
    """
    if path is None:
        path = classroom_json

    with open(path, 'r', encoding='utf-8') as infile:
        data = json.load(infile)

    def process_classroom_names(obj):
        if isinstance(obj, dict):
            name = obj.get("Name")
            if isinstance(name, str):
                without_dashes = name.replace('-', ' ')
                obj["Name"] = re.sub(r'\(.*$', '', without_dashes).strip()
            for value in obj.values():
                process_classroom_names(value)
        elif isinstance(obj, list):
            for item in obj:
                process_classroom_names(item)

    process_classroom_names(data)

    with open(path, 'w', encoding='utf-8') as outfile:
        json.dump(data, outfile, indent=4, ensure_ascii=False)

    print("已清理 Name 字段中的格式，清理后的 JSON 文件已保存。")

# Normalize the Name fields (dashes to spaces, strip "(...)" suffixes) in classroom_json in place.
clean_classroom_names()


已清理 Name 字段中的格式，清理后的 JSON 文件已保存。


In [111]:
import json

def clean_none_and_no_room_classrooms(path=None):
    """Drop every course that has a meeting without a usable room, and rewrite the file.

    The file is expected to hold a list of course dicts. A course is dropped
    when any entry of ``course["section_info"]["meetings"]`` has a ``room``
    that is missing, ``null``, or exactly ``"NO ROOM"``. Courses without
    ``section_info`` or without meetings are kept; a ``null`` value for either
    is treated the same as a missing one.

    Args:
        path: JSON file to clean in place. Defaults to the notebook-level
            ``output_file``.
    """
    if path is None:
        path = output_file

    with open(path, 'r', encoding='utf-8') as infile:
        data = json.load(infile)

    def has_unusable_room(course):
        section_info = course.get("section_info") or {}
        meetings = section_info.get("meetings") or []
        # .get("room") is None both for a missing key and for an explicit null.
        return any(meeting.get("room") in [None, "NO ROOM"] for meeting in meetings)

    cleaned_data = [course for course in data if not has_unusable_room(course)]

    with open(path, 'w', encoding='utf-8') as outfile:
        json.dump(cleaned_data, outfile, indent=4, ensure_ascii=False)

    print("已删除包含 room 为 None 或 'NO ROOM' 的项，清理后的 JSON 文件已保存。")

# Drop courses that have a meeting with no usable room (in place on output_file).
clean_none_and_no_room_classrooms()


已删除包含 room 为 None 或 'NO ROOM' 的项，清理后的 JSON 文件已保存。


In [103]:
import json

def clean_empty_meetings(path=None):
    """Drop every course whose meetings list is missing or empty, and rewrite the file.

    The file is expected to hold a list of course dicts. A course is kept
    only when ``course["section_info"]["meetings"]`` is a non-empty value;
    a missing or ``null`` ``section_info`` / ``meetings`` counts as empty.

    Args:
        path: JSON file to clean in place. Defaults to the notebook-level
            ``output_file``.
    """
    if path is None:
        path = output_file

    with open(path, 'r', encoding='utf-8') as infile:
        data = json.load(infile)

    cleaned_data = [
        course for course in data
        if (course.get("section_info") or {}).get("meetings")
    ]

    with open(path, 'w', encoding='utf-8') as outfile:
        json.dump(cleaned_data, outfile, indent=4, ensure_ascii=False)

    print("已删除 meetings 为空的项，清理后的 JSON 文件已保存。")

# Drop courses left without any meetings (in place on output_file).
clean_empty_meetings()


已删除 meetings 为空的项，清理后的 JSON 文件已保存。


<h1>查询类</h1>

In [8]:
import json

def count_and_print_top_ten_class_capacities(path=None):
    """Print the first ten ``class_capacity`` values found and how many equal 999.

    Every dict at any nesting depth is inspected; each ``class_capacity`` that
    converts to ``int`` is collected in traversal order. Values that cannot be
    converted (non-numeric strings, ``null``, containers) are skipped. Despite
    the function name, the values printed are the first ten encountered, not
    the ten largest.

    Args:
        path: JSON file to inspect. Defaults to the notebook-level
            ``output_file``.
    """
    if path is None:
        path = output_file

    with open(path, 'r', encoding='utf-8') as infile:
        data = json.load(infile)

    capacities = []

    def find_class_capacities(obj):
        if isinstance(obj, dict):
            if "class_capacity" in obj:
                try:
                    capacities.append(int(obj["class_capacity"]))
                except (ValueError, TypeError):
                    # ValueError: non-numeric string; TypeError: null or a container.
                    pass
            for value in obj.values():
                find_class_capacities(value)
        elif isinstance(obj, list):
            for item in obj:
                find_class_capacities(item)

    find_class_capacities(data)
    capacity_999_count = capacities.count(999)

    print("前十节课的 class_capacity:", capacities[:10])
    print("class_capacity 为 999 的数量:", capacity_999_count)

# Print the first ten class capacities in output_file and the number of 999 entries.
count_and_print_top_ten_class_capacities()


前十节课的 class_capacity: [110, 23, 31, 2, 21, 5, 50, 5, 10, 30]
class_capacity 为 999 的数量: 0


In [9]:
import json

def count_unique_instruction_modes(path=None):
    """Print how many distinct ``instruction_mode`` values the file contains, and the values.

    Every dict at any nesting depth is inspected; each ``instruction_mode``
    value is added to a set, which is then printed together with its size.

    Args:
        path: JSON file to inspect. Defaults to the notebook-level
            ``output_file``.
    """
    if path is None:
        path = output_file

    # The file is only read here, so the handle is named as an input.
    with open(path, 'r', encoding='utf-8') as infile:
        data = json.load(infile)

    instruction_modes = set()

    def find_instruction_modes(obj):
        if isinstance(obj, dict):
            if "instruction_mode" in obj:
                instruction_modes.add(obj["instruction_mode"])
            for value in obj.values():
                find_instruction_modes(value)
        elif isinstance(obj, list):
            for item in obj:
                find_instruction_modes(item)

    find_instruction_modes(data)

    print("不同的 instruction_mode 数量:", len(instruction_modes))
    print("不同的 instruction_mode 值:", instruction_modes)

# Report the distinct instruction_mode values remaining in output_file.
count_unique_instruction_modes()


不同的 instruction_mode 数量: 1
不同的 instruction_mode 值: {'In-Person'}


In [10]:
# Preview the first ten (name, id) pairs of professor_mapping.
# NOTE(review): professor_mapping is assigned by a cell further down this notebook,
# so this cell fails on a fresh top-to-bottom run.
print(list(professor_mapping.items())[:10])

[('Min Ye', 0), ('Sorcha Martin', 1), ('Vidhya Kumaresan', 2), ('Max Anzede', 3), ('Roberto Tron', 4), ('Libang Wang', 5), ('Kathleen Corriveau', 6), ('Ava Greene', 7), ('Jeffrey Markuns', 8), ('Aiman Abilova', 9)]


In [11]:
def display_sample_professor_schedule(professor_schedule, sample_size=10):
    """Print the schedule entries of professor ids ``0 .. sample_size - 1``.

    Args:
        professor_schedule: Mapping from professor id to an iterable of
            ``(start_time, end_time, capacity)`` triples.
        sample_size: Number of consecutive ids, starting at 0, to print.
            Ids missing from the mapping get a "No schedule available" line.
    """
    print(f"Displaying schedule for the first {sample_size} professors:")
    for pid in range(sample_size):
        # Guard clause: report ids without any entry and move on.
        if pid not in professor_schedule:
            print(f"\nProfessor ID {pid}: No schedule available")
            continue

        print(f"\nProfessor ID {pid}:")
        for start_time, end_time, capacity in professor_schedule[pid]:
            line = (
                f"  Start Time (in 5-min units): {start_time}, "
                f"End Time (in 5-min units): {end_time}, "
                f"Capacity: {capacity}"
            )
            print(line)

# Assumes professor_schedule has already been built.
# NOTE(review): professor_schedule is assigned by a cell further down this notebook,
# so this cell fails on a fresh top-to-bottom run.
display_sample_professor_schedule(professor_schedule)


Displaying schedule for the first 10 professors:

Professor ID 0:
  Start Time (in 5-min units): 1014, End Time (in 5-min units): 1047, Capacity: 23
  Start Time (in 5-min units): 402, End Time (in 5-min units): 417, Capacity: 36
  Start Time (in 5-min units): 978, End Time (in 5-min units): 993, Capacity: 36
  Start Time (in 5-min units): 402, End Time (in 5-min units): 417, Capacity: 10
  Start Time (in 5-min units): 978, End Time (in 5-min units): 993, Capacity: 10

Professor ID 1:
  Start Time (in 5-min units): 150, End Time (in 5-min units): 168, Capacity: 31
  Start Time (in 5-min units): 96, End Time (in 5-min units): 117, Capacity: 90
  Start Time (in 5-min units): 672, End Time (in 5-min units): 693, Capacity: 90
  Start Time (in 5-min units): 122, End Time (in 5-min units): 143, Capacity: 31
  Start Time (in 5-min units): 186, End Time (in 5-min units): 204, Capacity: 30
  Start Time (in 5-min units): 168, End Time (in 5-min units): 186, Capacity: 29
  Start Time (in 5-min un

In [22]:
def decode_time(value):
    """Convert a week-relative 5-minute slot index into a "weekday HH:MM" string.

    Slot 0 is Monday 00:00; each day has ``24 * 60 // 5`` (288) slots, so
    valid indices run from 0 to 2015.

    Args:
        value: Slot index within the week.

    Returns:
        A string such as ``"周四 09:30"`` (weekday name, then zero-padded time).

    Raises:
        IndexError: If ``value`` is outside ``0 .. 2015``. Negative values are
            rejected explicitly because negative list indexing would otherwise
            silently map them onto the wrong weekday.
    """
    slots_per_day = 24 * 60 // 5  # number of 5-minute slots in one day
    day_names = ["周一", "周二", "周三", "周四", "周五", "周六", "周日"]

    total_slots = slots_per_day * len(day_names)
    if not 0 <= value < total_slots:
        raise IndexError(f"time slot {value} is outside the week range 0..{total_slots - 1}")

    day, slot_in_day = divmod(value, slots_per_day)
    hours, minutes = divmod(slot_in_day * 5, 60)

    return f"{day_names[day]} {hours:02}:{minutes:02}"

# Example call: decode slot 978.
print(decode_time(978))

周四 09:30


In [21]:
import json

def find_courses_for_instructor(instructor_name, path=None):
    """Find and print every JSON object with a meeting taught by ``instructor_name``.

    A dict at any nesting depth matches when it has a ``meetings`` list in
    which some meeting lists an instructor whose ``name`` equals
    ``instructor_name``. A matching object is collected once and its children
    are not searched further. Unexpected shapes (non-list meetings or
    instructors, non-dict entries) are ignored.

    Args:
        instructor_name: Exact instructor name to look for.
        path: JSON file to search. Defaults to the notebook-level
            ``output_file``.

    Returns:
        List of the matching dicts, in traversal order. Each one is also
        pretty-printed to stdout.
    """
    if path is None:
        path = output_file

    with open(path, 'r', encoding='utf-8') as infile:
        data = json.load(infile)

    instructor_courses = []

    def taught_by_target(meeting):
        if not isinstance(meeting, dict):
            return False
        instructors = meeting.get("instructors")
        if not isinstance(instructors, list):
            return False
        return any(
            isinstance(instructor, dict) and instructor.get("name") == instructor_name
            for instructor in instructors
        )

    def search_for_instructor(obj):
        if isinstance(obj, dict):
            meetings = obj.get("meetings")
            if isinstance(meetings, list) and any(taught_by_target(m) for m in meetings):
                instructor_courses.append(obj)
                return  # a matched object is recorded once; do not descend further
            for value in obj.values():
                search_for_instructor(value)
        elif isinstance(obj, list):
            for item in obj:
                search_for_instructor(item)

    search_for_instructor(data)

    print(f"包含 '{instructor_name}' 的课程 JSON 对象:")
    for course in instructor_courses:
        print(json.dumps(course, indent=4, ensure_ascii=False))

    return instructor_courses

# Example call: list the course objects taught by "Min Ye".
find_courses_for_instructor("Min Ye")


包含 'Min Ye' 的课程 JSON 对象:
{
    "class_details": {
        "institution": "BU001",
        "subject": "CASIR",
        "catalog_nbr": "564",
        "status": "Open",
        "class_number": 3298,
        "component": "IND",
        "course_offer_nbr": 1,
        "session": "Regular Academic Session",
        "session_code": "1",
        "class_section": "A1",
        "acad_org": "PAR",
        "section_descr": "CASIR 564 - A1",
        "units": "4 units",
        "acad_career": "UGRD",
        "acad_career_descr": "Undergraduate",
        "course_id": "105828",
        "course_title": "Political Economy of Rising Powers",
        "course_status": "A",
        "instruction_mode": "In-Person",
        "grading_basis": "Grades",
        "campus": "Boston University",
        "campus_code": "MAIN",
        "location": "Charles River",
        "topic": "",
        "class_components": "<table class=\"PSTEXT\"><tr><td>Independent Required</td></tr></table>"
    },
    "meetings": [
        {


[{'class_details': {'institution': 'BU001',
   'subject': 'CASIR',
   'catalog_nbr': '564',
   'status': 'Open',
   'class_number': 3298,
   'component': 'IND',
   'course_offer_nbr': 1,
   'session': 'Regular Academic Session',
   'session_code': '1',
   'class_section': 'A1',
   'acad_org': 'PAR',
   'section_descr': 'CASIR 564 - A1',
   'units': '4 units',
   'acad_career': 'UGRD',
   'acad_career_descr': 'Undergraduate',
   'course_id': '105828',
   'course_title': 'Political Economy of Rising Powers',
   'course_status': 'A',
   'instruction_mode': 'In-Person',
   'grading_basis': 'Grades',
   'campus': 'Boston University',
   'campus_code': 'MAIN',
   'location': 'Charles River',
   'topic': '',
   'class_components': '<table class="PSTEXT"><tr><td>Independent Required</td></tr></table>'},
  'meetings': [{'meets': 'Th 12:30PM - 3:15PM',
    'days': 'Th',
    'show_days': True,
    'meeting_time_start': '12:30PM',
    'meeting_time_end': '3:15PM',
    'bldg_cd': 'IRC',
    'bldg_h

In [75]:
import json

def find_room(keyword, path=None):
    """Collect every meeting whose ``room`` string contains ``keyword``.

    Every dict at any nesting depth that has a ``meetings`` list is inspected.
    A meeting matches when its ``room`` is a string containing ``keyword``
    (substring match). Meetings whose ``room`` is missing, ``null`` or not a
    string are skipped instead of raising.

    Args:
        keyword: Substring to look for in the ``room`` field.
        path: JSON file to search. Defaults to the notebook-level
            ``output_file``.

    Returns:
        List of the matching meeting dicts, in traversal order. The list is
        also printed to stdout.
    """
    if path is None:
        path = output_file

    with open(path, 'r', encoding='utf-8') as infile:
        data = json.load(infile)

    rooms_with_keyword = []

    def search_meetings(obj):
        if isinstance(obj, dict):
            meetings = obj.get("meetings")
            if isinstance(meetings, list):
                for meeting in meetings:
                    if not isinstance(meeting, dict):
                        continue
                    room = meeting.get("room")
                    if isinstance(room, str) and keyword in room:
                        rooms_with_keyword.append(meeting)
            for value in obj.values():
                search_meetings(value)
        elif isinstance(obj, list):
            for item in obj:
                search_meetings(item)

    search_meetings(data)

    print(f"包含 '{keyword}' 的房间信息:", rooms_with_keyword)
    return rooms_with_keyword

# Example call: collect the meetings whose room contains "CFA 154".
rooms = find_room("CFA 154")


包含 'CFA 154' 的房间信息: [{'meets': 'Tu 9:00AM - 10:45AM', 'days': 'Tu', 'show_days': True, 'meeting_time_start': '9:00AM', 'meeting_time_end': '10:45AM', 'bldg_cd': 'CFA', 'bldg_has_coordinates': False, 'room': '855 Commonwealth Ave CFA 154', 'meeting_topic': 'TBA', 'instructors': [{'name': 'Samuel Bradley', 'email': ''}], 'topic': 'TBA', 'show_topic': False, 'date_range': '09/03/2024 - 12/10/2024'}, {'meets': 'TuTh 12:30PM - 1:45PM', 'days': 'TuTh', 'show_days': True, 'meeting_time_start': '12:30PM', 'meeting_time_end': '1:45PM', 'bldg_cd': 'CFA', 'bldg_has_coordinates': False, 'room': '855 Commonwealth Ave CFA 154', 'meeting_topic': 'TBA', 'instructors': [{'name': 'Victor Coelho', 'email': ''}], 'topic': 'TBA', 'show_topic': False, 'date_range': '09/03/2024 - 12/10/2024'}, {'meets': 'MoWeTh 8:00AM - 8:50AM', 'days': 'MoWeTh', 'show_days': True, 'meeting_time_start': '8:00AM', 'meeting_time_end': '8:50AM', 'bldg_cd': 'CFA', 'bldg_has_coordinates': False, 'room': '855 Commonwealth Ave CFA 

<h1>数据提取类</h1>

<h2>capacities: 所有教室容量的一维整数数组</h2><br>

In [3]:
import json

def extract_capacity_from_additional_info(path=None):
    """Collect every ``AdditionalInfo.Capacity`` value of the classroom JSON as integers.

    Every dict at any nesting depth is inspected. When it has an
    ``AdditionalInfo`` dict with a ``Capacity`` entry that converts to
    ``int``, the value is appended in traversal order. Entries whose
    ``AdditionalInfo`` is not a dict, or whose ``Capacity`` cannot be
    converted (non-numeric string, ``null``), are skipped.

    Args:
        path: JSON file to read. Defaults to the notebook-level
            ``classroom_json``.

    Returns:
        List of capacities as ``int``.
    """
    if path is None:
        path = classroom_json

    with open(path, 'r', encoding='utf-8') as infile:
        data = json.load(infile)

    capacities = []

    def find_capacity(obj):
        if isinstance(obj, dict):
            info = obj.get("AdditionalInfo")
            if isinstance(info, dict) and "Capacity" in info:
                try:
                    capacities.append(int(info["Capacity"]))
                except (ValueError, TypeError):
                    # ValueError: non-numeric string; TypeError: null or a container.
                    pass
            for value in obj.values():
                find_capacity(value)
        elif isinstance(obj, list):
            for item in obj:
                find_capacity(item)

    find_capacity(data)

    return capacities

# Flat list of every classroom capacity found in classroom_json.
capacities = extract_capacity_from_additional_info()

<h2>name_capacity_dict: 教室名与容量的key-value pair</h2><br>

In [4]:
import json

def extract_name_capacity_dict(path=None):
    """Build a ``{classroom name: capacity}`` dict from the classroom JSON.

    Every dict at any nesting depth is inspected. When it has both a ``Name``
    and an ``AdditionalInfo`` dict with a ``Capacity`` entry that converts to
    ``int``, the pair is recorded; a later entry with the same name
    overwrites an earlier one. Entries whose ``AdditionalInfo`` is not a
    dict, or whose ``Capacity`` cannot be converted, are skipped.

    Args:
        path: JSON file to read. Defaults to the notebook-level
            ``classroom_json``.

    Returns:
        Dict mapping each ``Name`` to its capacity as ``int``.
    """
    if path is None:
        path = classroom_json

    with open(path, 'r', encoding='utf-8') as infile:
        data = json.load(infile)

    name_capacity_dict = {}

    def find_name_capacity(obj):
        if isinstance(obj, dict):
            info = obj.get("AdditionalInfo")
            if "Name" in obj and isinstance(info, dict) and "Capacity" in info:
                try:
                    name_capacity_dict[obj["Name"]] = int(info["Capacity"])
                except (ValueError, TypeError):
                    # ValueError: non-numeric string; TypeError: null capacity
                    # (or a Name that cannot be used as a dict key).
                    pass
            for value in obj.values():
                find_name_capacity(value)
        elif isinstance(obj, list):
            for item in obj:
                find_name_capacity(item)

    find_name_capacity(data)

    return name_capacity_dict

# Classroom name -> capacity lookup built from classroom_json.
name_capacity_dict = extract_name_capacity_dict()


<h2>professor_mapping: 教授-id的字典</h2><br>

In [5]:
import json

def extract_professor_mapping(path=None):
    """Assign a consecutive integer id to every distinct instructor name in the file.

    Every dict at any nesting depth that has a ``meetings`` list is inspected;
    instructor names are numbered from 0 in order of first appearance. Empty
    or missing names are ignored, as are entries that do not have the
    expected shape (non-list meetings or instructors, non-dict items).

    Args:
        path: JSON file to read. Defaults to the notebook-level
            ``output_file``.

    Returns:
        Dict mapping instructor name to its integer id.
    """
    if path is None:
        path = output_file

    with open(path, 'r', encoding='utf-8') as infile:
        data = json.load(infile)

    professor_mapping = {}

    def find_instructors(obj):
        if isinstance(obj, dict):
            meetings = obj.get("meetings")
            if isinstance(meetings, list):
                for meeting in meetings:
                    instructors = meeting.get("instructors") if isinstance(meeting, dict) else None
                    if not isinstance(instructors, list):
                        continue
                    for instructor in instructors:
                        name = instructor.get("name") if isinstance(instructor, dict) else None
                        if name and name not in professor_mapping:
                            # The next free id is the number of names seen so far.
                            professor_mapping[name] = len(professor_mapping)
            for value in obj.values():
                find_instructors(value)
        elif isinstance(obj, list):
            for item in obj:
                find_instructors(item)

    find_instructors(data)

    return professor_mapping

# Instructor name -> integer id lookup built from output_file.
professor_mapping = extract_professor_mapping()


<h2>professor_schedule: 教授上课信息 词典</h2><br>

In [6]:
import json
from datetime import datetime

def build_professor_schedule(path=None, mapping=None):
    """Build each professor's weekly teaching slots from the cleaned course JSON.

    Every dict at any nesting depth that has both ``meetings`` and
    ``class_availability`` is processed. For each meeting that has
    ``instructors``, ``days``, ``meeting_time_start`` and
    ``meeting_time_end``, and for each instructor whose name is in
    ``mapping``, one ``(start, end, capacity)`` tuple is appended per meeting
    day.

    ``start`` and ``end`` are 5-minute slot indices within the week:
    ``minutes_since_midnight // 5 + day * 288`` with Monday as day 0.
    ``capacity`` is ``class_availability["class_capacity"]`` converted to
    ``int``, or ``None`` when that field is absent.

    Args:
        path: JSON file to read. Defaults to the notebook-level
            ``output_file``.
        mapping: Dict of instructor name -> professor id. Defaults to the
            notebook-level ``professor_mapping``.

    Returns:
        Dict mapping professor id to a list of ``(start, end, capacity)``
        tuples.

    Raises:
        ValueError: If a time string does not match ``"%I:%M%p"`` (for
            example ``"9:30AM"``) or a capacity string is not numeric.
    """
    if path is None:
        path = output_file
    if mapping is None:
        mapping = professor_mapping

    with open(path, 'r', encoding='utf-8') as infile:
        data = json.load(infile)

    professor_schedule = {}

    # Two-letter weekday abbreviations -> day index (Monday = 0).
    day_mapping = {
        "Mo": 0, "Tu": 1, "We": 2, "Th": 3, "Fr": 4, "Sa": 5, "Su": 6
    }
    slots_per_day = 24 * 60 // 5
    required_fields = ("instructors", "days", "meeting_time_start", "meeting_time_end")

    def parse_time(time_str, day):
        """Convert a 12-hour clock string plus a day index into a weekly slot index."""
        time_obj = datetime.strptime(time_str, "%I:%M%p")
        minutes = time_obj.hour * 60 + time_obj.minute
        return (minutes // 5) + day * slots_per_day

    def find_meetings(obj):
        if isinstance(obj, dict):
            if "meetings" in obj and "class_availability" in obj:
                capacity = obj["class_availability"].get("class_capacity")
                if capacity is not None:
                    capacity = int(capacity)

                for meeting in obj["meetings"]:
                    if not all(field in meeting for field in required_fields):
                        continue

                    # "TuTh" -> [1, 3]; unknown two-letter chunks are ignored.
                    days_str = meeting["days"]
                    days = [
                        day_mapping[days_str[i:i + 2]]
                        for i in range(0, len(days_str), 2)
                        if days_str[i:i + 2] in day_mapping
                    ]

                    for instructor in meeting["instructors"]:
                        professor_id = mapping.get(instructor.get("name"))
                        if professor_id is None:
                            continue
                        slots = professor_schedule.setdefault(professor_id, [])
                        for day in days:
                            start_time = parse_time(meeting["meeting_time_start"], day)
                            end_time = parse_time(meeting["meeting_time_end"], day)
                            slots.append((start_time, end_time, capacity))

            for value in obj.values():
                find_meetings(value)
        elif isinstance(obj, list):
            for item in obj:
                find_meetings(item)

    find_meetings(data)

    return professor_schedule

# Assumes professor_mapping from the earlier cell already exists.
professor_schedule = build_professor_schedule()

<h2>classroom_mapping: 教室mapping</h2><br>

In [7]:
import json

def create_classroom_mapping(path=None):
    """Assign a consecutive integer id to every distinct ``Name`` in the classroom JSON.

    Every dict at any nesting depth that has a ``Name`` key is inspected;
    names are numbered from 0 in order of first appearance. Names that
    cannot be used as dictionary keys (lists or dicts) are skipped.

    Args:
        path: JSON file to read. Defaults to the notebook-level
            ``classroom_json``.

    Returns:
        Dict mapping classroom name to its integer id.
    """
    if path is None:
        path = classroom_json

    with open(path, 'r', encoding='utf-8') as infile:
        data = json.load(infile)

    classroom_mapping = {}

    def find_classrooms(obj):
        if isinstance(obj, dict):
            if "Name" in obj:
                classroom_name = obj["Name"]
                is_usable_key = not isinstance(classroom_name, (list, dict))
                if is_usable_key and classroom_name not in classroom_mapping:
                    # The next free id is the number of names seen so far.
                    classroom_mapping[classroom_name] = len(classroom_mapping)
            for value in obj.values():
                find_classrooms(value)
        elif isinstance(obj, list):
            for item in obj:
                find_classrooms(item)

    find_classrooms(data)

    return classroom_mapping

# Build the classroom name -> integer id lookup from classroom_json.
classroom_mapping = create_classroom_mapping()


In [8]:
import numpy as np
import json
from datetime import datetime

# Assumes output_file (JSON path), professor_mapping and classroom_mapping
# have already been created by the earlier cells, e.g.
# professor_mapping = {"Professor A": 0, "Professor B": 1, ...}
# classroom_mapping = {"IRC 220": 0, "COM 111": 1, ...}

# Dimensions of the indicator array: N professors, M classrooms, T weekly slots.
N = len(professor_mapping)
M = len(classroom_mapping)
SLOTS_PER_DAY = 24 * 60 // 5  # 5-minute slots in one day
T = 7 * 24 * 60 // 5  # 5-minute slots in one week

# professor_courses[p, r, k] == 1 means professor p teaches in classroom r during slot k.
# NOTE(review): dtype=int is normally 64-bit, so at the shape printed later in
# this notebook, (2786, 624, 2016), the array is on the order of 28 GB once
# materialised (for example when it is pickled). A narrower dtype such as
# np.uint8 or a sparse structure would shrink it, if downstream consumers
# allow it (TODO confirm).
professor_courses = np.zeros((N, M, T), dtype=int)

# Two-letter weekday abbreviations -> day index (Monday = 0).
day_mapping = {"Mo": 0, "Tu": 1, "We": 2, "Th": 3, "Fr": 4, "Sa": 5, "Su": 6}


def parse_time_to_5_min_units(time_str):
    """Convert a 12-hour clock string such as "9:30AM" into a 5-minute slot of the day."""
    time_obj = datetime.strptime(time_str, "%I:%M%p")
    minutes = time_obj.hour * 60 + time_obj.minute
    return minutes // 5


# Load the cleaned course list, then mark every taught slot.
with open(output_file, 'r', encoding='utf-8') as infile:
    data = json.load(infile)

for course in data:
    section_info = course.get("section_info", {})
    meetings = section_info.get("meetings", [])
    for meeting in meetings:
        # The classroom name is taken to be the last two whitespace-separated
        # tokens of the room field (e.g. "855 Commonwealth Ave CFA 154" -> "CFA 154").
        # A null room is treated like an empty one.
        room_field = meeting.get("room") or ""
        room_parts = room_field.split()
        room_name = " ".join(room_parts[-2:]) if len(room_parts) >= 2 else None

        # Skip meetings without a usable room.
        if room_name in [None, "NO ROOM"]:
            continue

        # The room is the same for every instructor of this meeting.
        room_id = classroom_mapping.get(room_name)

        for instructor in meeting.get("instructors", []):
            professor_name = instructor.get("name")
            professor_id = professor_mapping.get(professor_name)

            # Report lookups that failed (once per instructor of the meeting).
            if professor_id is None:
                print(f"教授 '{professor_name}' 未找到对应的 ID")
            if room_id is None:
                print(f"Room '{room_name}' can't find ID")

            if professor_id is None or room_id is None:
                continue

            days_str = meeting.get("days", "")
            start_time = parse_time_to_5_min_units(meeting["meeting_time_start"])
            end_time = parse_time_to_5_min_units(meeting["meeting_time_end"])

            # "TuTh" is read in two-letter chunks; unknown chunks are ignored.
            for i in range(0, len(days_str), 2):
                day = day_mapping.get(days_str[i:i + 2])
                if day is None:
                    continue
                offset = day * SLOTS_PER_DAY
                # One slice assignment marks the half-open slot range [start, end).
                professor_courses[professor_id, room_id, start_time + offset:end_time + offset] = 1


Room 'REL 404' can't find ID
Room 'CGS 427' can't find ID
Room 'WED 411' can't find ID
Room 'CFA 354' can't find ID
Room 'CFA 352' can't find ID
Room 'Health Ctr/Underserved' can't find ID
Room 'Health Ctr/Underserved' can't find ID
Room 'LAW 508' can't find ID
Room 'LAW 508' can't find ID
Room 'SAR 236' can't find ID
Room 'CGS 427' can't find ID
Room 'YAW 419' can't find ID
Room 'HAR 658' can't find ID
Room 'REL 404' can't find ID
Room 'PHO 207' can't find ID
Room 'Medical Center' can't find ID
Room 'Medical Center' can't find ID
Room 'CFA 352' can't find ID
Room 'INS 212x4' can't find ID
Room 'LAW 513' can't find ID
Room 'LSE 904' can't find ID
Room 'MCH 102' can't find ID
Room 'CGS 417' can't find ID
Room 'HAR 419' can't find ID
Room 'HAR 419' can't find ID
Room 'PLS 512' can't find ID
Room 'LAW 203' can't find ID
Room 'LAW 420' can't find ID
Room 'LAW 420' can't find ID
Room 'STH 541' can't find ID
Room 'PHO 207' can't find ID
Room 'Auburn Hospital' can't find ID
Room 'Auburn Hospi

In [47]:
# Spot-check one entry: professor id 0, classroom id 94, weekly slot 978.
# NOTE(review): the output saved under this cell does not look like the result of
# this expression (possibly stale from another cell); re-run to refresh it.
professor_courses[0][94][978]

ModuleNotFoundError: No module named 'ace_tools'

array([[ 0.,  0.,  0., ..., inf, inf, inf],
       [ 0.,  0.,  0., ..., inf, inf, inf],
       [ 0.,  0.,  0., ..., inf, inf, inf],
       ...,
       [inf, inf, inf, ...,  0.,  0.,  0.],
       [inf, inf, inf, ...,  0.,  0.,  0.],
       [inf, inf, inf, ...,  0.,  0.,  0.]])

In [52]:
# Display the whole classroom name -> id mapping as the cell output.
classroom_mapping

In [56]:
# Display the whole instructor name -> id mapping as the cell output.
# NOTE(review): the output saved under this cell does not look like the result of
# this expression (possibly stale from another cell); re-run to refresh it.
professor_mapping

Buildings with no recorded distances: {'BCN', 'HOU', 'OSW', 'INS', 'EVN', 'BAB', 'ALB', 'XBG', 'GDS', 'ABG', 'CTC', 'FPH'}


In [45]:
# Display the whole professor id -> schedule mapping as the cell output.
# NOTE(review): the output saved under this cell does not look like the result of
# this expression (possibly stale from another cell); re-run to refresh it.
professor_schedule

数据已成功导出到 data_export.pkl 文件中


In [127]:
import pickle

# The following variables are expected to exist already:
# professor_schedule, professor_mapping, classroom_mapping, professor_courses, capacities

# Bundle them into a single dict keyed by variable name.
data_to_export = dict(
    professor_schedule=professor_schedule,
    professor_mapping=professor_mapping,
    classroom_mapping=classroom_mapping,
    professor_courses=professor_courses,
    capacities=capacities,
)

# Serialize the bundle into one pickle file in the current working directory.
with open("data_export.pkl", "wb") as file:
    pickle.dump(data_to_export, file)

print("数据已成功导出到 data_export.pkl 文件中")


数据已成功导出到 data_export.pkl 文件中


In [128]:
# Print the shape of professor_courses.
print("professor_courses 矩阵的形状:", professor_courses.shape)


professor_courses 矩阵的形状: (2786, 624, 2016)
