# Collect Course Data from explorecourses

In [1]:
from explorecourses import *
from explorecourses import filters

import json
import csv
import pandas as pd
from datetime import datetime
from uuid import uuid4
import os


### Prepare Pandas DFs

In [2]:
# Column layouts for the three tables. Each list is lower-cased before use so
# the DataFrame columns are lower-case regardless of how a name is typed here.

# `course` table
course_headers = [
    name.lower()
    for name in (
        "course_id",  # primary key, referred to as `course_id` by Stanford
        "year",
        "name",
        "subject",
        "title",
        "description",
        "units_min",
        "units_max",
        "repeatable",
        "grading_basis",
        "final_exam",
        "academic_group",
        "academic_org",
        "way_a_ii",
        "way_aqr",
        "way_ce",
        "way_edp",
        "way_er",
        "way_fr",
        "way_sma",
        "way_si",
    )
]

# `instructor` table
instructor_headers = [
    name.lower()
    for name in (
        "id",  # primary key: the instructor's sunet id (tristans, jksun, etc.)
        "name",
        "first_name",
        "last_name",
        "email",
    )
]

# `offering` table
offering_headers = [
    name.lower()
    for name in (
        "id",  # primary key uuid
        "course_id",  # foreign key linked to `course` table
        "quarter",
        "start_time",  # time object
        "end_time",
        "monday",
        "tuesday",
        "wednesday",
        "thursday",
        "friday",
        "location",
        "primary_instructor_id",  # foreign key linked to `instructor` table
        "primary_instructor_name",
        "instructors",  # list of instructor foreign keys
    )
]

# Empty frames carrying only the column layout; they are replaced once the
# real data has been collected.
course_df = pd.DataFrame(columns=course_headers)
instructor_df = pd.DataFrame(columns=instructor_headers)
offering_df = pd.DataFrame(columns=offering_headers)

print(f"course_df columns:\n{course_df.columns.to_list()}\n")
print(f"instructor_df columns:\n{instructor_df.columns.to_list()}\n")
print(f"offering_df columns:\n{offering_df.columns.to_list()}\n")

course_df columns:
['course_id', 'year', 'name', 'subject', 'title', 'description', 'units_min', 'units_max', 'repeatable', 'grading_basis', 'final_exam', 'academic_group', 'academic_org', 'way_a_ii', 'way_aqr', 'way_ce', 'way_edp', 'way_er', 'way_fr', 'way_sma', 'way_si']

instructor_df columns:
['id', 'name', 'first_name', 'last_name', 'email']

offering_df columns:
['id', 'course_id', 'quarter', 'start_time', 'end_time', 'monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'location', 'primary_instructor_id', 'primary_instructor_name', 'instructors']



### strip_course collects all the needed data for the three separate tables

In [6]:
def _course_record(course):
    """Build the `course` table row (a dict keyed like `course_headers`) for one course dict.

    Copies the scalar course fields and adds one boolean `way_*` entry per
    code listed below, True when that code appears (case-insensitively) in
    `course["gers"]`.
    """
    subject = course["subject"].upper()
    record = {
        "course_id": course["course_id"],
        "year": course["year"],
        "name": f"{subject} {course['code']}",
        "subject": subject,
        "title": course["title"],
        "description": course["description"],
        "units_min": course["units_min"],
        "units_max": course["units_max"],
        "repeatable": course["repeatable"],
        "grading_basis": course["grading_basis"],
        "final_exam": course["final_exam"],
        "academic_group": course["academic_group"],
        "academic_org": course["academic_org"],
    }

    gers = {item.lower() for item in course["gers"]}
    for way in (
        "way-a-ii",
        "way-aqr",
        "way-ce",
        "way-edp",
        "way-er",
        "way-fr",
        "way-sma",
        "way-si",
    ):
        # column names use underscores, the `gers` codes use dashes
        record[way.replace("-", "_")] = way in gers

    return record


def _instructor_record(instructor):
    """Build the `instructor` table row for one instructor dict (keyed by sunet id)."""
    sunet_id = instructor["sunet_id"]
    return {
        "id": sunet_id,
        "name": f'{instructor["first_name"]} {instructor["last_name"]}',
        "first_name": instructor["first_name"],
        "last_name": instructor["last_name"],
        "email": f"{sunet_id}@stanford.edu",
    }


def strip_course(
    curCourse,
    new_courses,
    new_offerings,
    new_instructors,
    other_components,
    verbose=False,
):
    """Extract course, offering and instructor rows from one course.

    Only sections whose component is "LEC" are turned into offerings, and
    every schedule of such a section becomes its own offering. An offering is
    skipped (with a printed message) when its schedule has no days, start
    time and end time at all, or when it lists no instructors. The course
    itself is recorded only if at least one offering was kept.

    Args:
        curCourse: a dict, or an object whose `__dict__` is used, holding the
            course fields read below (`course_id`, `subject`, `code`,
            `gers`, `sections`, ...). Sections, schedules and instructors
            are read through their `__dict__`.
        new_courses: list; the course row dict is appended at most once.
        new_offerings: set; each kept offering is added as a JSON string.
            Every offering gets a fresh uuid4 `id`, so these strings never
            collide.
        new_instructors: set; each instructor seen on a lecture schedule is
            added as a JSON string. This happens before the skip checks, so
            instructors of skipped offerings are still recorded.
        other_components: set; receives the component code of every non-LEC
            section.
        verbose: when True, print a line for every instructor, offering and
            course that is added.

    Returns:
        None. All results are delivered through the collections passed in.
    """
    if not isinstance(curCourse, dict):  # accept course objects as well as dicts
        curCourse = curCourse.__dict__

    curCourseDict = _course_record(curCourse)
    course_name = curCourseDict["name"]

    # Don't add the course until there is at least one valid offering
    course_added = False

    for section in curCourse["sections"]:
        section = section.__dict__

        # only lectures become offerings for now
        # TODO handle other classes? Introsems? etc...
        if section["component"] != "LEC":
            other_components.add(section["component"])
            continue

        quarter = section["term"].split()[-1].lower()
        if quarter == "autumn":
            quarter = "fall"  # no one calls it autumn, it's fall...

        # !!! EACH schedule is considered a different offering
        for schedule in section["schedules"]:
            curSchedule = schedule.__dict__

            offering_primary_instructor = None
            offering_primary_instructor_name = None
            offering_instructors = []

            for instructor in curSchedule["instructors"]:
                curInstructor = instructor.__dict__
                curInstructorDict = _instructor_record(curInstructor)

                if curInstructor["is_primary_instructor"]:
                    offering_primary_instructor = curInstructorDict["id"]
                    offering_primary_instructor_name = curInstructorDict["name"]
                offering_instructors.append(curInstructorDict["id"])

                # stored as JSON text so the set can de-duplicate instructors
                new_instructors.add(json.dumps(curInstructorDict))

                if verbose:
                    print(f"Added instructor {curInstructorDict['id']}")

            # If a course doesn't have days, start_time or end_time assigned,
            # it probably isn't offered so we skip it
            if (
                len(curSchedule["days"]) == 0
                and curSchedule["start_time"] == ""
                and curSchedule["end_time"] == ""
            ):
                print(
                    f"Offering for course {course_name} doesn't have a schedule skipping..."
                )
                continue
            if len(offering_instructors) == 0:
                print(
                    f"Offering for course {course_name} doesn't have an instructor skipping..."
                )
                continue

            curOfferingDict = {
                "id": str(uuid4()),  # TODO need to handle updating values while leaving the id the same?
                "course_id": curCourse["course_id"],
                "quarter": quarter,
                "start_time": curSchedule["start_time"],
                "end_time": curSchedule["end_time"],
                "monday": False,
                "tuesday": False,
                "wednesday": False,
                "thursday": False,
                "friday": False,
                "location": curSchedule["location"],
                "primary_instructor_id": offering_primary_instructor,
                "primary_instructor_name": offering_primary_instructor_name,
                "instructors": offering_instructors,
            }

            # Flag the days of the week the schedule is set for
            for day in curSchedule["days"]:
                curOfferingDict[day.lower()] = True

            new_offerings.add(json.dumps(curOfferingDict))

            if not course_added:
                new_courses.append(curCourseDict)
                course_added = True
                if verbose:
                    print(f"Added course {course_name}")

            if verbose:
                # the offering dict has `start_time`; there is no `start_date` key
                print(
                    f"Added offering for {course_name} start_time: {curOfferingDict['start_time']}"
                )

    if not course_added:
        print(
            f"Course {course_name} was not added because it has no valid offerings"
        )

### Running the stripping and saving the data

In [44]:
# TESTING WITH JUST CS CLASSES
# new_courses = []
# new_instructors = set()
# new_offerings = []
# other_components = set()

# for course in cs_courses:
#     strip_course(course, new_courses, new_offerings, new_instructors, other_components)

# new_instructors = [json.loads(instructor) for instructor in list(new_instructors)]

# # create new df using headers
# course_df = pd.DataFrame(new_courses, columns=course_headers)
# instructor_df = pd.DataFrame(new_instructors, columns=instructor_headers)
# offering_df = pd.DataFrame(new_offerings, columns=offering_headers)
# offering_df['instructors'] = offering_df['instructors'].apply(json.dumps) # fixes issue with exporting an array of text

In [5]:
# Accumulators filled in by strip_course
new_courses = []
new_instructors = set()
new_offerings = set()
other_components = set()

# Department codes that are left out of the crawl entirely
skip_list = [
    'ESF', 'ITALIC', 'SOAR', 'ORALCOMM',
    'OSPGEN', 'OSPAUSTL', 'OSPBARCL', 'OSPBEIJ', 'OSPBER', 'OSPCPTWN',
    'OSPFLOR', 'OSPHONGK', 'OSPISTAN', 'OSPKYOTO', 'OSPKYOCT', 'OSPMADRD',
    'OSPOXFRD', 'OSPPARIS', 'OSPSANTG',
    'RESPROG', 'ROTCAF', 'ROTCARMY', 'ROTCNAVY', 'SINY', 'SLE',
    'LAW', 'LAWGEN',
    'ANES', 'BIOC', 'BIODS', 'BIOMEDIN', 'BMP', 'BIOS', 'CBIO', 'CTS', 'CSB',
    'CHPR', 'COMPMED', 'DERM', 'DBIO', 'EMED', 'EPI', 'FAMMED', 'GENE', 'HRP',
    'IMMUNOL', 'LEAD', 'LIFE', 'MED', 'INDE', 'MI', 'MCP', 'NBIO', 'NENS',
    'NEPR', 'NSUR', 'OBGYN', 'OPHT', 'ORTHO', 'OTOHNS', 'PATH', 'PEDS', 'PAS',
    'PSYC', 'RADO', 'RAD', 'SOMGEN', 'STEMREM', 'SBIO', 'SURG', 'UROL',
]

year = "2022-2023"
connect = CourseConnection()

# Walk every department of every school and strip each of its courses
for school in connect.get_schools(year):
    for dept in school.departments:
        if dept.code not in skip_list:
            courses = connect.get_courses_by_department(dept.code, year=year)
            for course in courses:
                strip_course(
                    course,
                    new_courses,
                    new_offerings,
                    new_instructors,
                    other_components,
                )

# The sets hold JSON strings; turn them back into dicts
new_instructors = list(map(json.loads, new_instructors))
new_offerings = list(map(json.loads, new_offerings))

print(f"Other components which were not added: {other_components}")

# Build the final frames using the header lists as the column layout
course_df = pd.DataFrame(new_courses, columns=course_headers).drop_duplicates()
instructor_df = pd.DataFrame(new_instructors, columns=instructor_headers)
offering_df = pd.DataFrame(new_offerings, columns=offering_headers)

# fixes issue with exporting an array of text: store the list as a JSON string
offering_df['instructors'] = offering_df['instructors'].apply(json.dumps)



Course ATHLETIC 10 was not added because it has no valid offerings
Course ATHLETIC 10 was not added because it has no valid offerings
Course ATHLETIC 10 was not added because it has no valid offerings
Course ATHLETIC 10 was not added because it has no valid offerings
Course ATHLETIC 10 was not added because it has no valid offerings
Course ATHLETIC 10 was not added because it has no valid offerings
Course ATHLETIC 10 was not added because it has no valid offerings
Course ATHLETIC 10 was not added because it has no valid offerings
Course ATHLETIC 10 was not added because it has no valid offerings
Course ATHLETIC 10 was not added because it has no valid offerings
Course ATHLETIC 10 was not added because it has no valid offerings
Course ATHLETIC 10 was not added because it has no valid offerings
Course ATHLETIC 10 was not added because it has no valid offerings
Course ATHLETIC 10 was not added because it has no valid offerings
Course ATHLETIC 10 was not added because it has no valid offer

In [7]:
# Each export goes into its own folder named after the current timestamp.
current_time = datetime.now().strftime("%Y_%m_%d_%H_%M_%S")
output_dir = f"data/{current_time}"

# makedirs instead of mkdir: also creates the parent `data/` folder, so the
# export does not fail with FileNotFoundError when `data/` doesn't exist yet.
os.makedirs(output_dir, exist_ok=True)

# QUOTE_ALL wraps every field in quotes in the written files.
course_df.to_csv(f"{output_dir}/course.csv", index=False, quoting=csv.QUOTE_ALL)
offering_df.to_csv(f"{output_dir}/offering.csv", index=False, quoting=csv.QUOTE_ALL)
instructor_df.to_csv(f"{output_dir}/instructor.csv", index=False, quoting=csv.QUOTE_ALL)


In [None]:
import pandas as pd

# NOTE(review): absolute path to one specific export on one machine — adjust
# before running elsewhere.
file_path = "/Users/tristansinclair/Code/explore-courses-api/data/2023_07_08_22_36_16/course.csv"

# Read the CSV file into a DataFrame
data = pd.read_csv(file_path)

# Keep every row whose 'course_id' occurs more than once (keep=False marks
# all members of a duplicate group, not only the repeats)
duplicates = data[data.duplicated('course_id', keep=False)]

# Group the duplicated rows by 'course_id' so each id is reported once
duplicate_groups = duplicates.groupby('course_id')

# Print each duplicated course_id with its row numbers and course names
if not duplicates.empty:
    print("Duplicate IDs:")
    for id_num, group in duplicate_groups:
        row_numbers = ', '.join(map(str, group.index))
        # str() so a missing (NaN) name cannot break the join
        names = ', '.join(map(str, group['name']))
        print(f"ID: {id_num}")
        print(f"Row numbers: {row_numbers}")
        print(f"Names: {names}")
        print()
else:
    # the column being checked is 'course_id', not 'id'
    print("No duplicates found in the 'course_id' column.")


Duplicate IDs:
ID: 101403
Row numbers: 893, 942, 987, 1089, 1100, 1128, 1167, 1223, 1244, 1308
Names: ACCT 691, MGTECON 691, FINANCE 691, GSBGEN 691, HRMGT 691, MKTG 691, OIT 691, OB 691, POLECON 691, STRAMGT 691

ID: 101406
Row numbers: 894, 943, 988, 1090, 1101, 1129, 1168, 1224, 1245, 1309
Names: ACCT 692, MGTECON 692, FINANCE 692, GSBGEN 692, HRMGT 692, MKTG 692, OIT 692, OB 692, POLECON 692, STRAMGT 692

ID: 101410
Row numbers: 897, 946, 991, 1093, 1102, 1132, 1171, 1227, 1248, 1310
Names: ACCT 802, MGTECON 802, FINANCE 802, GSBGEN 802, HRMGT 802, MKTG 802, OIT 802, OB 802, POLECON 802, STRAMGT 802

ID: 101486
Row numbers: 1363, 1467
Names: EDUC 149, EDUC 249

ID: 101503
Row numbers: 1366, 1505, 3682, 7676
Names: EDUC 165, EDUC 265, AMSTUD 165, HISTORY 158C

ID: 101551
Row numbers: 1389, 4977
Names: EDUC 193B, CHILATST 193B

ID: 101553
Row numbers: 1390, 4581
Names: EDUC 193F, ASNAMST 193F

ID: 101562
Row numbers: 1395, 7044, 11083
Names: EDUC 197, FEMGEN 297, SOC 134

ID: 101572


In [None]:
import csv
import psycopg2

# Connect to the PostgreSQL database
# NOTE(review): credentials are hard-coded; this looks like a local
# development database — confirm before pointing it at anything shared.
conn = psycopg2.connect(
    database="postgres",
    user="postgres",
    password="postgres",
    host="localhost",
    port="54322"
)
cursor = conn.cursor()

# One UPDATE per CSV row, matched on both the course name and the course_id.
update_course_sql = """
    UPDATE course
    SET
        year = %s,
        subject = %s,
        title = %s,
        description = %s,
        units_min = %s,
        units_max = %s,
        repeatable = %s,
        grading_basis = %s,
        final_exam = %s,
        academic_group = %s,
        academic_org = %s,
        way_a_ii = %s,
        way_aqr = %s,
        way_ce = %s,
        way_edp = %s,
        way_er = %s,
        way_fr = %s,
        way_sma = %s,
        way_si = %s
    WHERE name = %s AND course_id = %s
"""

# try/finally so the cursor and connection are released even when a row fails
try:
    # newline='' lets the csv module handle line breaks inside quoted fields
    # itself; utf-8 matches the default encoding pandas uses when writing CSV.
    with open('./data/2023_07_09_13_54_47/course.csv', 'r', newline='', encoding='utf-8') as file:
        reader = csv.DictReader(file)
        for row in reader:
            course_name = row['name']
            course_id = row['course_id']

            # an empty CSV field is stored as NULL rather than ''
            final_exam = row['final_exam']
            if final_exam == '':
                final_exam = None

            # Update the course values in the database table based on the matching criteria
            cursor.execute(update_course_sql, (
                row['year'],
                row['subject'],
                row['title'],
                row['description'],
                row['units_min'],
                row['units_max'],
                row['repeatable'],
                row['grading_basis'],
                final_exam,
                row['academic_group'],
                row['academic_org'],
                row['way_a_ii'],
                row['way_aqr'],
                row['way_ce'],
                row['way_edp'],
                row['way_er'],
                row['way_fr'],
                row['way_sma'],
                row['way_si'],
                course_name,
                course_id
            ))
            # committed row by row, so rows updated before a failure stay updated
            conn.commit()
finally:
    # Close the database connection
    cursor.close()
    conn.close()


In [None]:

# Connect to the PostgreSQL database
# NOTE(review): credentials are hard-coded; this looks like a local
# development database — confirm before pointing it at anything shared.
conn = psycopg2.connect(
    database="postgres",
    user="postgres",
    password="postgres",
    host="localhost",
    port="54322"
)
cursor = conn.cursor()

# try/finally so the cursor and connection are released even when reading the
# file or running the update fails
try:
    # Read the CSV file and prepare the data for batch updates
    rows = []
    # newline='' lets the csv module handle line breaks inside quoted fields
    # itself; utf-8 matches the default encoding pandas uses when writing CSV.
    with open('./data/2023_07_09_13_54_47/course.csv', 'r', newline='', encoding='utf-8') as file:
        reader = csv.DictReader(file)
        for row in reader:
            # an empty CSV field is stored as NULL rather than ''
            final_exam = row['final_exam']
            if final_exam == '':
                final_exam = None

            # parameter order must match the %s placeholders in the UPDATE below
            rows.append((
                row['year'],
                row['subject'],
                row['title'],
                row['description'],
                row['units_min'],
                row['units_max'],
                row['repeatable'],
                row['grading_basis'],
                final_exam,
                row['academic_group'],
                row['academic_org'],
                row['way_a_ii'],
                row['way_aqr'],
                row['way_ce'],
                row['way_edp'],
                row['way_er'],
                row['way_fr'],
                row['way_sma'],
                row['way_si'],
                row['name'],
                row['course_id']
            ))

    # Update the course values in the database table using batch updates;
    # rows are matched on both the course name and the course_id.
    cursor.executemany("""
        UPDATE course
        SET
            year = %s,
            subject = %s,
            title = %s,
            description = %s,
            units_min = %s,
            units_max = %s,
            repeatable = %s,
            grading_basis = %s,
            final_exam = %s,
            academic_group = %s,
            academic_org = %s,
            way_a_ii = %s,
            way_aqr = %s,
            way_ce = %s,
            way_edp = %s,
            way_er = %s,
            way_fr = %s,
            way_sma = %s,
            way_si = %s
        WHERE name = %s AND course_id = %s
    """, rows)
    # single commit: the whole batch is committed together
    conn.commit()
finally:
    cursor.close()
    conn.close()
