In [1]:
# Imports required by cells that run before the later import cell: those cells
# call json.load / json.dump and datetime.strptime, so the names must exist from
# the first cell for the notebook to survive Restart Kernel -> Run All.
import json
from datetime import datetime

# File locations used throughout the notebook (relative paths).
input_file = '../data/details.json'            # raw course details, read by the first cleaning step
output_file = '../data/details_cleaned.json'   # cleaned course details, rewritten by each cleaning step
classroom_json = '../data/classroom_data.json' # classroom records, cleaned in place

<h1>Data Cleaning</h1>

In [None]:
def clean_large_json(input_file, output_file, keys_to_remove):
    """Strip unwanted keys from a JSON document and write the result to another file.

    Every dict found anywhere in the loaded document loses each key listed in
    ``keys_to_remove``; lists are walked element by element. The loaded tree is
    edited in place before being dumped.

    Args:
        input_file: Path of the JSON file to read (UTF-8).
        output_file: Path the cleaned JSON is written to (UTF-8, 4-space indent,
            non-ASCII characters kept as-is).
        keys_to_remove: Key names to delete from every dict.
    """
    with open(input_file, 'r', encoding='utf-8') as source:
        document = json.load(source)

    def strip_keys(node, unwanted):
        if isinstance(node, list):
            for element in node:
                strip_keys(element, unwanted)
            return
        if not isinstance(node, dict):
            return
        for name in unwanted:
            node.pop(name, None)
        # Recurse only after popping, so removed subtrees are never visited.
        for child in node.values():
            strip_keys(child, unwanted)

    strip_keys(document, keys_to_remove)

    with open(output_file, 'w', encoding='utf-8') as sink:
        json.dump(document, sink, indent=4, ensure_ascii=False)
keys_to_remove = ["additionalLinks", "bookstore", "cfg", "catalog_descr", "materials", "enrollment_information", "reserve_caps", "catalog_descr", "messages", "notes"]
clean_large_json(input_file, output_file, keys_to_remove)


In [None]:
def remove_class_capacity_999(path=None):
    """Remove every JSON object whose ``class_capacity`` is 999 and rewrite the file.

    A dict is dropped from its parent (list element or dict value) when its
    ``class_capacity`` equals the string "999" or the number 999. Because a
    ``None`` result means "remove", dict entries whose value is JSON ``null``
    are dropped as well.

    Args:
        path: JSON file to clean in place. Defaults to the notebook-level
            ``output_file``.
    """
    target = output_file if path is None else path
    with open(target, 'r', encoding='utf-8') as infile:
        data = json.load(infile)

    def remove_capacity_999(obj):
        if isinstance(obj, dict):
            if obj.get("class_capacity") == "999" or obj.get("class_capacity") == 999:
                return None
            new_obj = {}
            for key, value in obj.items():
                result = remove_capacity_999(value)
                if result is not None:
                    new_obj[key] = result
            return new_obj
        elif isinstance(obj, list):
            # Recurse exactly once per item; calling the helper twice per item
            # doubles the work at every nesting level.
            kept = []
            for item in obj:
                result = remove_capacity_999(item)
                if result is not None:
                    kept.append(result)
            return kept
        else:
            return obj

    cleaned_data = remove_capacity_999(data)

    with open(target, 'w', encoding='utf-8') as outfile:
        json.dump(cleaned_data, outfile, indent=4, ensure_ascii=False)
remove_class_capacity_999()


In [None]:
def remove_online_instruction_mode(path=None):
    """Remove every JSON object whose ``instruction_mode`` is 'Online' and rewrite the file.

    A dict with ``instruction_mode == 'Online'`` is dropped from its parent
    (list element or dict value). Because a ``None`` result means "remove",
    dict entries whose value is JSON ``null`` are dropped as well.

    Args:
        path: JSON file to clean in place. Defaults to the notebook-level
            ``output_file``.
    """
    target = output_file if path is None else path
    with open(target, 'r', encoding='utf-8') as infile:
        data = json.load(infile)

    def remove_online_mode(obj):
        if isinstance(obj, dict):
            if obj.get("instruction_mode") == 'Online':
                return None
            new_obj = {}
            for key, value in obj.items():
                result = remove_online_mode(value)
                if result is not None:
                    new_obj[key] = result
            return new_obj
        elif isinstance(obj, list):
            # Recurse exactly once per item; calling the helper twice per item
            # doubles the work at every nesting level.
            kept = []
            for item in obj:
                result = remove_online_mode(item)
                if result is not None:
                    kept.append(result)
            return kept
        else:
            return obj

    cleaned_data = remove_online_mode(data)

    with open(target, 'w', encoding='utf-8') as outfile:
        json.dump(cleaned_data, outfile, indent=4, ensure_ascii=False)
remove_online_instruction_mode()


In [None]:
def remove_tba_instructors():
    """Drop placeholder instructors from every meeting in ``output_file``.

    For each dict that has a ``meetings`` entry, every meeting's ``instructors``
    list is replaced by a copy without the entries named "To Be Announced".
    The file is rewritten in place.
    """
    with open(output_file, 'r', encoding='utf-8') as source:
        document = json.load(source)

    def is_announced(person):
        return person.get("name") != "To Be Announced"

    def prune(node):
        if isinstance(node, list):
            for element in node:
                prune(element)
            return
        if not isinstance(node, dict):
            return
        if "meetings" in node:
            for session in node["meetings"]:
                if "instructors" in session:
                    session["instructors"] = list(filter(is_announced, session["instructors"]))
        for child in node.values():
            prune(child)

    prune(document)

    with open(output_file, 'w', encoding='utf-8') as sink:
        json.dump(document, sink, indent=4, ensure_ascii=False)
remove_tba_instructors()


In [None]:
def clean_empty_instructors_tba_meets_and_empty_times(path=None):
    """Remove JSON objects with no instructors, TBA meeting info, or blank times.

    A dict is dropped from its parent (list element or dict value) when any of
    these holds:

    * ``instructors`` is an empty list,
    * ``meets`` equals "TBA",
    * ``meeting_time_start`` or ``meeting_time_end`` is the empty string.

    Because a ``None`` result means "remove", dict entries whose value is JSON
    ``null`` are dropped as well. The file is rewritten in place.

    Args:
        path: JSON file to clean. Defaults to the notebook-level ``output_file``.
    """
    target = output_file if path is None else path
    with open(target, 'r', encoding='utf-8') as infile:
        data = json.load(infile)

    def clean_data(obj):
        if isinstance(obj, dict):
            if "instructors" in obj and isinstance(obj["instructors"], list) and not obj["instructors"]:
                return None
            if obj.get("meets") == "TBA":
                return None
            if "meeting_time_start" in obj and obj["meeting_time_start"] == "":
                return None
            if "meeting_time_end" in obj and obj["meeting_time_end"] == "":
                return None
            new_obj = {}
            for key, value in obj.items():
                result = clean_data(value)
                if result is not None:
                    new_obj[key] = result
            return new_obj
        elif isinstance(obj, list):
            # Recurse exactly once per item; calling the helper twice per item
            # doubles the work at every nesting level.
            kept = []
            for item in obj:
                result = clean_data(item)
                if result is not None:
                    kept.append(result)
            return kept
        else:
            return obj

    cleaned_data = clean_data(data)
    with open(target, 'w', encoding='utf-8') as outfile:
        json.dump(cleaned_data, outfile, indent=4, ensure_ascii=False)
clean_empty_instructors_tba_meets_and_empty_times()


In [None]:
def clean_classroom_names(path=None):
    """Remove classroom records whose ``Name`` contains a '/' and rewrite the file.

    A dict with a ``Name`` containing '/' is dropped from its parent (list
    element or dict value). Because a ``None`` result means "remove", dict
    entries whose value is JSON ``null`` are dropped as well.

    NOTE: a later cell in this notebook defines another function with the same
    name (it normalises the names instead of filtering them); that definition
    replaces this one once its cell has run.

    Args:
        path: JSON file to clean in place. Defaults to the notebook-level
            ``classroom_json``.
    """
    target = classroom_json if path is None else path
    with open(target, 'r', encoding='utf-8') as infile:
        data = json.load(infile)

    def remove_invalid_classrooms(obj):
        if isinstance(obj, dict):
            if "Name" in obj and '/' in obj["Name"]:
                return None
            new_obj = {}
            for key, value in obj.items():
                result = remove_invalid_classrooms(value)
                if result is not None:
                    new_obj[key] = result
            return new_obj
        elif isinstance(obj, list):
            # Recurse exactly once per item; calling the helper twice per item
            # doubles the work at every nesting level.
            kept = []
            for item in obj:
                result = remove_invalid_classrooms(item)
                if result is not None:
                    kept.append(result)
            return kept
        else:
            return obj

    cleaned_data = remove_invalid_classrooms(data)
    with open(target, 'w', encoding='utf-8') as outfile:
        json.dump(cleaned_data, outfile, indent=4, ensure_ascii=False)
clean_classroom_names()


In [None]:
import re
def clean_classroom_names():
    """Normalise every ``Name`` in ``classroom_json`` and rewrite the file in place.

    For each dict with a ``Name`` key: hyphens become spaces, everything from
    the first '(' to the end of the line is removed, and surrounding whitespace
    is stripped.

    NOTE: this shares its name with the function defined in the previous cell
    and replaces it once this cell has run.
    """
    with open(classroom_json, 'r', encoding='utf-8') as source:
        document = json.load(source)

    def normalise(node):
        if isinstance(node, list):
            for element in node:
                normalise(element)
            return
        if not isinstance(node, dict):
            return
        if "Name" in node:
            spaced = node["Name"].replace('-', ' ')
            node["Name"] = re.sub(r'\(.*$', '', spaced).strip()
        for child in node.values():
            normalise(child)

    normalise(document)

    with open(classroom_json, 'w', encoding='utf-8') as sink:
        json.dump(document, sink, indent=4, ensure_ascii=False)
clean_classroom_names()


In [None]:
def clean_none_and_no_room_classrooms():
    """Drop courses that have any meeting without a usable room.

    A course is removed when at least one entry of ``section_info.meetings`` has
    a ``room`` that is missing/None or equal to "NO ROOM". ``output_file`` is
    read as a list of courses and rewritten in place.
    """
    with open(output_file, 'r', encoding='utf-8') as source:
        courses = json.load(source)

    def has_unusable_room(course):
        sessions = course.get("section_info", {}).get("meetings", [])
        return any(session.get("room") in [None, "NO ROOM"] for session in sessions)

    kept = [course for course in courses if not has_unusable_room(course)]

    with open(output_file, 'w', encoding='utf-8') as sink:
        json.dump(kept, sink, indent=4, ensure_ascii=False)
clean_none_and_no_room_classrooms()


In [None]:
def clean_empty_meetings():
    """Keep only the courses that still have at least one meeting.

    ``output_file`` is read as a list of courses; a course is kept when its
    ``section_info.meetings`` value is truthy. The file is rewritten in place.
    """
    with open(output_file, 'r', encoding='utf-8') as source:
        courses = json.load(source)

    kept = [
        course
        for course in courses
        if course.get("section_info", {}).get("meetings", [])
    ]

    with open(output_file, 'w', encoding='utf-8') as sink:
        json.dump(kept, sink, indent=4, ensure_ascii=False)
clean_empty_meetings()


<h1>Debugging helpers (optional — safe to skip)</h1>

In [None]:
def count_and_print_top_ten_class_capacities():
    """Print the first ten ``class_capacity`` values in ``output_file`` and how many equal 999.

    Every dict in the document is inspected; values that ``int()`` rejects with
    ``ValueError`` are skipped.
    """
    with open(output_file, 'r', encoding='utf-8') as source:
        document = json.load(source)

    seen = []

    def collect(node):
        if isinstance(node, list):
            for element in node:
                collect(element)
        elif isinstance(node, dict):
            # Check for and record class_capacity.
            if "class_capacity" in node:
                try:
                    seen.append(int(node["class_capacity"]))
                except ValueError:
                    pass
            for child in node.values():
                collect(child)

    collect(document)

    print("前十节课的 class_capacity:", seen[:10])
    print("class_capacity 为 999 的数量:", seen.count(999))
count_and_print_top_ten_class_capacities()


In [None]:
def count_unique_instruction_modes():
    """Print how many distinct ``instruction_mode`` values occur in ``output_file``, and the values."""
    with open(output_file, 'r', encoding='utf-8') as source:
        document = json.load(source)

    modes = set()

    def collect(node):
        if isinstance(node, list):
            for element in node:
                collect(element)
        elif isinstance(node, dict):
            if "instruction_mode" in node:
                modes.add(node["instruction_mode"])
            for child in node.values():
                collect(child)

    collect(document)

    print("不同的 instruction_mode 数量:", len(modes))
    print("不同的 instruction_mode 值:", modes)
count_unique_instruction_modes()


In [None]:
def display_sample_professor_schedule(professor_schedule, sample_size=10):
    """Print the schedule entries of professor ids ``0 .. sample_size - 1``.

    Args:
        professor_schedule: Mapping from professor id to a list of
            ``(start_time, end_time, capacity)`` tuples.
        sample_size: Number of consecutive ids, starting at 0, to display.
    """
    print(f"Displaying schedule for the first {sample_size} professors:")
    for professor_id in range(sample_size):
        if professor_id in professor_schedule:
            print(f"\nProfessor ID {professor_id}:")
            for schedule in professor_schedule[professor_id]:
                start_time, end_time, capacity = schedule
                print(f"  Start Time (in 5-min units): {start_time}, "
                      f"End Time (in 5-min units): {end_time}, "
                      f"Capacity: {capacity}")
        else:
            print(f"\nProfessor ID {professor_id}: No schedule available")


# `professor_schedule` is only assigned by a later cell, so on a fresh kernel an
# unconditional call here raises NameError and aborts Run All.
if "professor_schedule" in globals():
    display_sample_professor_schedule(professor_schedule)
else:
    print("professor_schedule is not defined yet; run the 'professor_schedule' cell first.")


In [None]:
def decode_time(value):
    """Turn a week-relative 5-minute slot index into a "<weekday> HH:MM" label.

    Slot 0 is the first label's 00:00; each day spans 288 slots. The weekday
    labels are the Chinese names from "周一" through "周日".

    Args:
        value: Slot index in 5-minute units counted from the start of the week.

    Returns:
        The weekday label followed by the zero-padded time of day.
    """
    slots_per_day = 24 * 60 // 5
    day_index, slot_in_day = divmod(value, slots_per_day)
    hours, minutes = divmod(slot_in_day * 5, 60)
    weekday_labels = ["周一", "周二", "周三", "周四", "周五", "周六", "周日"]
    label = weekday_labels[day_index]
    return f"{label} {hours:02}:{minutes:02}"
print(decode_time(978))

In [None]:
def find_courses_for_instructor(instructor_name):
    """Collect and print every object in ``output_file`` taught by the given instructor.

    A dict matches when one of its ``meetings`` lists an instructor whose
    ``name`` equals ``instructor_name``. A matching dict is recorded once and
    not searched any deeper.

    Args:
        instructor_name: Exact instructor name to look for.

    Returns:
        The list of matching dicts, in the order they were found.
    """
    with open(output_file, 'r', encoding='utf-8') as source:
        document = json.load(source)

    matches = []

    def teaches(node):
        for session in node["meetings"]:
            if "instructors" in session:
                for person in session["instructors"]:
                    if person.get("name") == instructor_name:
                        return True
        return False

    def walk(node):
        if isinstance(node, list):
            for element in node:
                walk(element)
        elif isinstance(node, dict):
            if "meetings" in node and teaches(node):
                matches.append(node)
                return
            for child in node.values():
                walk(child)

    walk(document)

    print(f" '{instructor_name}' ")
    for match in matches:
        print(json.dumps(match, indent=4, ensure_ascii=False))

    return matches
find_courses_for_instructor("Min Ye")


In [None]:
def find_room(keyword):
    """Collect and print every meeting in ``output_file`` whose ``room`` contains ``keyword``.

    Args:
        keyword: Substring to look for in each meeting's ``room`` value.

    Returns:
        The list of matching meeting dicts, in the order they were found.
    """
    with open(output_file, 'r', encoding='utf-8') as source:
        document = json.load(source)

    hits = []

    def walk(node):
        if isinstance(node, list):
            for element in node:
                walk(element)
        elif isinstance(node, dict):
            if "meetings" in node:
                hits.extend(
                    session
                    for session in node["meetings"]
                    if "room" in session and keyword in session["room"]
                )
            for child in node.values():
                walk(child)

    walk(document)
    print(f"包含 '{keyword}' 的房间信息:", hits)
    return hits
rooms = find_room("CFA 154")


<h1>Data Processing</h1>

<h2>capacities: int[]</h2>

In [None]:
def extract_capacity_from_additional_info():
    """Return every ``AdditionalInfo.Capacity`` in ``classroom_json`` as a list of ints.

    Values are collected in traversal order; entries that ``int()`` rejects with
    ``ValueError`` are skipped.

    NOTE(review): dicts without a parsable capacity contribute nothing, so list
    positions are not guaranteed to line up with ids assigned elsewhere from
    ``Name`` — confirm before indexing this list by classroom id.

    Returns:
        List of integer capacities.
    """
    with open(classroom_json, 'r', encoding='utf-8') as source:
        document = json.load(source)

    found = []

    def walk(node):
        if isinstance(node, list):
            for element in node:
                walk(element)
        elif isinstance(node, dict):
            if "AdditionalInfo" in node and "Capacity" in node["AdditionalInfo"]:
                try:
                    seats = int(node["AdditionalInfo"]["Capacity"])
                except ValueError:
                    pass
                else:
                    found.append(seats)
            for child in node.values():
                walk(child)

    walk(document)
    return found
capacities = extract_capacity_from_additional_info()

<h2>name_capacity_dict: dict</h2>

In [None]:
def extract_name_capacity_dict():
    """Map each classroom ``Name`` in ``classroom_json`` to its integer capacity.

    Only dicts that have both ``Name`` and ``AdditionalInfo.Capacity`` are used;
    capacities that ``int()`` rejects with ``ValueError`` are skipped. A name
    seen again overwrites the earlier entry.

    Returns:
        Dict of classroom name to capacity.
    """
    with open(classroom_json, 'r', encoding='utf-8') as source:
        document = json.load(source)

    capacity_by_name = {}

    def walk(node):
        if isinstance(node, list):
            for element in node:
                walk(element)
        elif isinstance(node, dict):
            if "Name" in node and "AdditionalInfo" in node and "Capacity" in node["AdditionalInfo"]:
                try:
                    seats = int(node["AdditionalInfo"]["Capacity"])
                except ValueError:
                    pass
                else:
                    capacity_by_name[node["Name"]] = seats
            for child in node.values():
                walk(child)

    walk(document)
    return capacity_by_name
name_capacity_dict = extract_name_capacity_dict()

<h2>professor_mapping: dict</h2>

In [None]:
def extract_professor_mapping():
    """Assign a sequential integer id to every instructor name in ``output_file``.

    Names are taken from ``meetings[*].instructors[*].name``; ids start at 0 and
    follow first-appearance order. Missing or empty names are ignored.

    Returns:
        Dict of instructor name to id.
    """
    with open(output_file, 'r', encoding='utf-8') as source:
        document = json.load(source)

    id_by_name = {}

    def walk(node):
        if isinstance(node, list):
            for element in node:
                walk(element)
        elif isinstance(node, dict):
            if "meetings" in node:
                for session in node["meetings"]:
                    if "instructors" not in session:
                        continue
                    for person in session["instructors"]:
                        name = person.get("name")
                        if name and name not in id_by_name:
                            # The next free id equals the number of names recorded so far.
                            id_by_name[name] = len(id_by_name)
            for child in node.values():
                walk(child)

    walk(document)
    return id_by_name
professor_mapping = extract_professor_mapping()


<h2>professor_schedule: dict</h2>

In [None]:
def build_professor_schedule():
    """Build each professor's weekly teaching slots from ``output_file``.

    Every dict that has both ``meetings`` and ``class_availability`` is used.
    For each meeting carrying ``instructors``, ``days``, ``meeting_time_start``
    and ``meeting_time_end``, one ``(start, end, capacity)`` tuple is appended
    per meeting day for each instructor found in the notebook-level
    ``professor_mapping``. Times are 5-minute slot indices counted from Monday
    00:00; ``capacity`` is ``class_availability.class_capacity`` as an int, or
    None when that key is absent.

    Returns:
        Dict of professor id to list of ``(start, end, capacity)`` tuples.
    """
    with open(output_file, 'r', encoding='utf-8') as source:
        document = json.load(source)

    schedule_by_professor = {}
    day_index = {"Mo": 0, "Tu": 1, "We": 2, "Th": 3, "Fr": 4, "Sa": 5, "Su": 6}
    required_keys = ("instructors", "days", "meeting_time_start", "meeting_time_end")

    def to_week_slot(time_str, day):
        parsed = datetime.strptime(time_str, "%I:%M%p")
        minute_of_day = parsed.hour * 60 + parsed.minute
        return (minute_of_day // 5) + day * (24 * 60 // 5)

    def walk(node):
        if isinstance(node, list):
            for element in node:
                walk(element)
            return
        if not isinstance(node, dict):
            return

        if "meetings" in node and "class_availability" in node:
            seats = node["class_availability"].get("class_capacity")
            if seats is not None:
                seats = int(seats)

            for session in node["meetings"]:
                if not all(key in session for key in required_keys):
                    continue

                # `days` is read as consecutive two-letter codes; unknown codes are skipped.
                day_codes = session["days"]
                chunks = (day_codes[i:i + 2] for i in range(0, len(day_codes), 2))
                days = [day_index[code] for code in chunks if code in day_index]

                for person in session["instructors"]:
                    professor_id = professor_mapping.get(person.get("name"))
                    if professor_id is None:
                        continue
                    slots = schedule_by_professor.setdefault(professor_id, [])
                    for day in days:
                        begin = to_week_slot(session["meeting_time_start"], day)
                        finish = to_week_slot(session["meeting_time_end"], day)
                        slots.append((begin, finish, seats))

        for child in node.values():
            walk(child)

    walk(document)
    return schedule_by_professor
professor_schedule = build_professor_schedule()

<h2>classroom_mapping: dict</h2>

In [None]:
def create_classroom_mapping():
    with open(classroom_json, 'r', encoding='utf-8') as infile:
        data = json.load(infile)

    classroom_mapping = {}
    classroom_id_counter = 0


    def find_classrooms(obj):
        nonlocal classroom_id_counter
        if isinstance(obj, dict):
            if "Name" in obj:
                classroom_name = obj["Name"]

                if classroom_name not in classroom_mapping:
                    classroom_mapping[classroom_name] = classroom_id_counter
                    classroom_id_counter += 1

            for value in obj.values():
                find_classrooms(value)
        elif isinstance(obj, list):
            for item in obj:
                find_classrooms(item)


    find_classrooms(data)


    return classroom_mapping
classroom_mapping = create_classroom_mapping()

<h2>professor_courses: int[][][]</h2>

In [None]:
import numpy as np
import json
from datetime import datetime
# Occupancy tensor dimensions: professors x classrooms x 5-minute slots in one week.
N, M = len(professor_mapping), len(classroom_mapping)
T = 7 * (24 * 60 // 5)
professor_courses = np.zeros(shape=(N, M, T), dtype=int)
def parse_time_to_5_min_units(time_str):
    """Convert a 12-hour clock string such as "9:30AM" into a 5-minute slot of the day.

    Args:
        time_str: Time in ``%I:%M%p`` format.

    Returns:
        Minutes since midnight divided by 5, rounded down.
    """
    parsed = datetime.strptime(time_str, "%I:%M%p")
    return (parsed.hour * 60 + parsed.minute) // 5
# Fill professor_courses: set [professor_id][room_id][slot] = 1 for every 5-minute
# slot of every meeting, using the cleaned course list in output_file.
with open(output_file, 'r', encoding='utf-8') as infile:
    data = json.load(infile)
    for course in data:
        section_info = course.get("section_info", {})
        meetings = section_info.get("meetings", [])
        for obj in meetings:
            room_field = obj.get("room", "")
            room_parts = room_field.split()
            # Room key = last two whitespace-separated tokens of the room field
            # (presumably "<building> <room number>" — confirm against classroom names).
            room_name = room_parts[-2] + " " + room_parts[-1] if len(room_parts) >= 2 else None
            if room_name in [None, "NO ROOM"]:
                continue
            instructors = obj.get("instructors", [])
            for instructor in instructors:
                professor_name = instructor.get("name")
                professor_id = professor_mapping.get(professor_name)
                room_id = classroom_mapping.get(room_name)
                # Report unknown professors / rooms; the message is printed once per instructor.
                if professor_id is None:
                    print(f"教授 '{professor_name}' 未找到对应的 ID")
                if room_id is None:
                    print(f"Room '{room_name}' can't find ID")
                if professor_id is not None and room_id is not None:
                    days_str = obj.get("days", "")
                    # Times are parsed only when both ids resolved, so a missing or malformed
                    # time on an unmatched meeting does not raise.
                    start_time = parse_time_to_5_min_units(obj["meeting_time_start"])
                    end_time = parse_time_to_5_min_units(obj["meeting_time_end"])
                    day_mapping = {"Mo": 0, "Tu": 1, "We": 2, "Th": 3, "Fr": 4, "Sa": 5, "Su": 6}
                    # `days` is read as consecutive two-letter codes; unknown codes are skipped.
                    for day_abbr in [days_str[i:i+2] for i in range(0, len(days_str), 2)]:
                        day = day_mapping.get(day_abbr)
                        if day is not None:
                            # Shift the time-of-day slot by the day offset (24 * 60 // 5 slots per day).
                            start_k = start_time + day * (24 * 60 // 5)
                            end_k = end_time + day * (24 * 60 // 5)
                            # Half-open range: the slot at end_k itself is not marked.
                            for k in range(start_k, end_k):
                                professor_courses[professor_id][room_id][k] = 1


<h2>walking_cost: float[][], time cost matrix</h2>

In [None]:
import pandas as pd
# Build walking_cost[i][j]: cost of walking between classroom ids i and j.
# Unknown pairs stay at infinity; classrooms sharing a building prefix cost 0;
# pairs listed in the building-to-building CSV get that row's distance (both directions).
b2b_distance = pd.read_csv("../data/b2b_walking_distance.csv")
# NOTE(review): `buildings` is not read anywhere in this cell and keeps only one
# classroom id per building prefix; kept in case another cell relies on it.
buildings = {name.split()[0]: idx for name, idx in classroom_mapping.items()}
num_classrooms = len(classroom_mapping)
walking_cost = np.full((num_classrooms, num_classrooms), np.inf)

# Building prefix (first whitespace-separated token) of each classroom, in mapping
# order. Computed once instead of rebuilding list(classroom_mapping.keys()) for
# every (i, j) pair.
classroom_buildings = [name.split()[0] for name in classroom_mapping]
for i in range(num_classrooms):
    for j in range(num_classrooms):
        if i == j or classroom_buildings[i] == classroom_buildings[j]:
            walking_cost[i][j] = 0

# Index classroom ids by building prefix so each CSV row only touches the
# classrooms of its two buildings instead of rescanning every classroom pair.
rooms_by_building = {}
for classroom, idx in classroom_mapping.items():
    rooms_by_building.setdefault(classroom.split()[0], []).append(idx)

for _, row in b2b_distance.iterrows():
    building_a, building_b, distance = row['abbreviationA'], row['abbreviationB'], row['distance']
    for idx_a in rooms_by_building.get(building_a, ()):
        for idx_b in rooms_by_building.get(building_b, ()):
            walking_cost[idx_a][idx_b] = distance
            walking_cost[idx_b][idx_a] = distance

<h1>Export</h1>

In [None]:
import pickle
# Bundle the processed structures and pickle them to data_export.pkl in the
# current working directory. name_capacity_dict is not part of the bundle.
data_to_export = dict(
    professor_schedule=professor_schedule,
    professor_mapping=professor_mapping,
    classroom_mapping=classroom_mapping,
    professor_courses=professor_courses,
    capacities=capacities,
    walking_cost=walking_cost,
)

with open("data_export.pkl", "wb") as file:
    pickle.dump(data_to_export, file)
