# Collect Course Data from explorecourses

In [2]:
from explorecourses import *
from explorecourses import filters

import json
import csv
import pandas as pd
from datetime import datetime
from uuid import uuid4
import os


In [3]:
# What to pull from ExploreCourses: one department for one academic year.
year = "2022-2023"
department = "CS"

# Open the ExploreCourses connection and fetch every course the department
# lists for that year.
connect = CourseConnection()
cs_courses = connect.get_courses_by_department(department, year=year)

### Prepare Pandas DFs

In [4]:
# --- course table -----------------------------------------------------------
course_headers = [
    "id",  # primary key, referred to as `course_id` by Stanford
    "year",
    "name",
    "subject",
    "title",
    "description",
    "units_min",
    "units_max",
    "repeatable",
    "grading_basis",
    "final_exam",
    "academic_group",
    "academic_org",
    # one `way_*` flag column per entry below
    *(f"way_{code}" for code in ("aii", "aqr", "ce", "edp", "er", "fr", "sma", "si")),
]

# --- instructor table -------------------------------------------------------
instructor_headers = [
    "id",  # primary key: the instructor's SUNet id (tristans, jksun, etc.)
    "name",
    "first_name",
    "last_name",
    "email",
]

# --- offering table ---------------------------------------------------------
offering_headers = [
    "id",  # primary key uuid
    "course_id",  # foreign key linked to `course` table
    "quarter",
    "start_date",  # date object
    "end_date",
    "start_time",  # time object
    "end_time",
    "monday",
    "tuesday",
    "wednesday",
    "thursday",
    "friday",
    "location",
    "primary_instructor",  # foreign key linked to `instructor` table
    "instructors",  # list of instructor foreign keys
]

# Column names are always stored lower-cased.
course_headers = list(map(str.lower, course_headers))
instructor_headers = list(map(str.lower, instructor_headers))
offering_headers = list(map(str.lower, offering_headers))

# Start every table as an empty frame that already carries its final columns,
# and echo the layout so it can be checked at a glance.
course_df = pd.DataFrame(columns=course_headers)
print(f"course_df columns:\n{course_df.columns.to_list()}\n")

instructor_df = pd.DataFrame(columns=instructor_headers)
print(f"instructor_df columns:\n{instructor_df.columns.to_list()}\n")

offering_df = pd.DataFrame(columns=offering_headers)
print(f"offering_df columns:\n{offering_df.columns.to_list()}\n")

course_df columns:
['id', 'year', 'name', 'subject', 'title', 'description', 'units_min', 'units_max', 'repeatable', 'grading_basis', 'final_exam', 'academic_group', 'academic_org', 'way_aii', 'way_aqr', 'way_ce', 'way_edp', 'way_er', 'way_fr', 'way_sma', 'way_si']

instructor_df columns:
['id', 'name', 'first_name', 'last_name', 'email']

offering_df columns:
['id', 'course_id', 'quarter', 'start_date', 'end_date', 'start_time', 'end_time', 'monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'location', 'primary_instructor', 'instructors']



### `strip_course` collects all the data needed for the three separate tables

In [6]:
def strip_course(curCourse, new_courses, new_offerings, new_instructors, verbose=False):
    """Flatten one course into course, offering and instructor records.

    Nothing is returned; results are written into the caller-supplied
    collections so they can be shared across many calls.

    Parameters
    ----------
    curCourse : dict or object
        The course to process. Anything that is not a ``dict`` is converted
        with ``vars()``. The resulting mapping must provide ``course_id``,
        ``year``, ``subject``, ``code``, ``title``, ``description``,
        ``units_min``, ``units_max``, ``repeatable``, ``grading_basis``,
        ``final_exam``, ``academic_group``, ``academic_org``, ``gers`` and
        ``sections``.
    new_courses : list
        Receives one dict for the course, unless its id is already stored in
        the module-level ``course_df``.
    new_offerings : list
        Receives one dict per schedule of every ``"LEC"`` section, provided
        the schedule has scheduling data and at least one instructor.
    new_instructors : set
        Receives one JSON string per instructor encountered. Storing the
        records as strings lets the set de-duplicate identical instructors.
    verbose : bool, optional
        When true, print a line for every record that is added. The
        "skipping..." messages are printed regardless of this flag.

    Notes
    -----
    * Reads the global ``course_df``. Courses whose id is already stored
      there are not emitted again, so the frame must be (re)initialised
      before a run that is meant to collect every course.
    * Instructors are recorded before the offering is validated, so an
      instructor attached to a schedule that ends up skipped is still added
      to ``new_instructors``.
    """
    if not isinstance(curCourse, dict):
        curCourse = vars(curCourse)

    course_id = curCourse["course_id"]
    # Human-readable label (e.g. "CS 105") shared by the record and the logs.
    course_label = f"{curCourse['subject'].upper()} {curCourse['code']}"

    # NOTE: `value in series` tests the Series *index*, not its values, so
    # the membership test must be made against the column's values.
    already_stored = course_df["id"].isin([course_id]).any()

    if not already_stored:
        course_record = {
            "id": course_id,
            "year": curCourse["year"],
            "name": course_label,
            "subject": curCourse["subject"].upper(),
            "title": curCourse["title"],
            "description": curCourse["description"],
            "units_min": curCourse["units_min"],
            "units_max": curCourse["units_max"],
            "repeatable": curCourse["repeatable"],
            "grading_basis": curCourse["grading_basis"],
            "final_exam": curCourse["final_exam"],
            "academic_group": curCourse["academic_group"],
            "academic_org": curCourse["academic_org"],
        }

        # One boolean column per `way-*` tag: True when the tag appears in
        # the course's `gers` (compared case-insensitively).
        course_gers = {ger.lower() for ger in curCourse["gers"]}
        for way in (
            "way-aii",
            "way-aqr",
            "way-ce",
            "way-edp",
            "way-er",
            "way-fr",
            "way-sma",
            "way-si",
        ):
            course_record[way.replace("-", "_")] = way in course_gers

        new_courses.append(course_record)

        if verbose:
            print(f"Added course {course_record['name']}")

    # Schedule and instructor data hang off the course's sections.
    for section in curCourse["sections"]:
        section = vars(section)

        # Only lectures are collected for now.
        # TODO handle other classes? Introsems? etc...
        if section["component"] != "LEC":
            continue

        # The quarter is the last whitespace-separated token of the term.
        quarter = section["term"].split()[-1]

        # !!! EACH schedule is considered a different offering, so a new
        # offering is added for each schedule.
        for schedule in section["schedules"]:
            schedule = vars(schedule)

            primary_instructor = None
            instructor_ids = []

            for instructor in schedule["instructors"]:
                instructor = vars(instructor)
                sunet_id = instructor["sunet_id"]

                if instructor["is_primary_instructor"]:
                    primary_instructor = sunet_id
                instructor_ids.append(sunet_id)

                instructor_record = {
                    "id": sunet_id,
                    "name": f'{instructor["first_name"]} {instructor["last_name"]}',
                    "first_name": instructor["first_name"],
                    "last_name": instructor["last_name"],
                    "email": f"{sunet_id}@stanford.edu",
                }
                # Dicts are unhashable; the JSON string is what goes in the set.
                new_instructors.add(json.dumps(instructor_record))

                if verbose:
                    print(f"Added instructor {instructor_record['id']}")

            days = schedule["days"]

            # If a schedule has no days, start_time or end_time assigned, it
            # probably isn't offered, so skip it.
            if len(days) == 0 and schedule["start_time"] == "" and schedule["end_time"] == "":
                print(f"Offering for course {course_label} doesn't have a schedule skipping...")
                continue
            if len(instructor_ids) == 0:
                print(f"Offering for course {course_label} doesn't have an instructor skipping...")
                continue

            offering = {
                # TODO need to handle updating values while leaving the id the same?
                "id": str(uuid4()),
                "course_id": course_id,
                "quarter": quarter,
                "start_date": schedule["start_date"],
                "end_date": schedule["end_date"],
                "start_time": schedule["start_time"],
                "end_time": schedule["end_time"],
                "monday": False,
                "tuesday": False,
                "wednesday": False,
                "thursday": False,
                "friday": False,
                "location": schedule["location"],
                "primary_instructor": primary_instructor,
                "instructors": instructor_ids,
            }

            # Flip on the weekdays this schedule meets.
            for day in days:
                offering[day.lower()] = True

            new_offerings.append(offering)

            if verbose:
                print(
                    f"Added offering for {course_label} start_date: {offering['start_date']}"
                )

### Running the stripping and saving the data

In [7]:
# Accumulators filled in by strip_course. Instructors go into a set of JSON
# strings so that repeated instructors collapse to a single entry.
new_courses, new_offerings = [], []
new_instructors = set()

for course in cs_courses:
    strip_course(course, new_courses, new_offerings, new_instructors)

# Decode the de-duplicated JSON strings back into plain dicts.
new_instructors = list(map(json.loads, new_instructors))

# Build the three tables, forcing the column order declared in the headers.
course_df = pd.DataFrame(data=new_courses, columns=course_headers)
instructor_df = pd.DataFrame(data=new_instructors, columns=instructor_headers)
offering_df = pd.DataFrame(data=new_offerings, columns=offering_headers)



Offering for course CS 105 doesn't have a schedule skipping...
Offering for course CS 173A doesn't have a schedule skipping...
Offering for course CS 197 doesn't have a schedule skipping...
Offering for course CS 197C doesn't have an instructor skipping...
Offering for course CS 197C doesn't have an instructor skipping...
Offering for course CS 198 doesn't have an instructor skipping...
Offering for course CS 198 doesn't have an instructor skipping...
Offering for course CS 198 doesn't have an instructor skipping...
Offering for course CS 198B doesn't have a schedule skipping...
Offering for course CS 198B doesn't have a schedule skipping...
Offering for course CS 207 doesn't have a schedule skipping...
Offering for course CS 247B doesn't have a schedule skipping...
Offering for course CS 348C doesn't have a schedule skipping...
Offering for course CS 371 doesn't have a schedule skipping...


In [8]:
# Each export goes into its own timestamped snapshot directory under `data/`.
current_time = datetime.now().strftime("%Y_%m_%d_%H_%M_%S")
output_dir = os.path.join("data", current_time)

# makedirs also creates the parent `data/` directory; a plain mkdir raises
# FileNotFoundError when it is missing (e.g. in a fresh checkout).
os.makedirs(output_dir, exist_ok=True)

# One CSV per table; the positional index carries no information, so drop it.
course_df.to_csv(os.path.join(output_dir, "course.csv"), index=False)
offering_df.to_csv(os.path.join(output_dir, "offering.csv"), index=False)
instructor_df.to_csv(os.path.join(output_dir, "instructor.csv"), index=False)
