In [17]:
import csv
import glob
import os
from datetime import datetime

import numpy as np
import pandas as pd
import psycopg2
from psycopg2 import sql
from psycopg2.extras import execute_values

In [18]:
# Upsert every row of one instructor.csv snapshot into public.instructor.
df = pd.read_csv('./data/2023_07_09_14_46_39/instructor.csv')
# Empty CSV cells are read as NaN; map them to None so they are written as
# SQL NULL instead of a float NaN (the course loader does the same).
df = df.replace({np.nan: None})

# Connection settings for the Postgres instance on localhost:54322.
params = {
    'dbname': 'postgres',
    'user': 'postgres',
    'password': 'postgres',
    'host': 'localhost',
    'port': 54322
}

# Insert new instructors; when the id already exists, overwrite the other
# columns with the incoming values.
query_template = sql.SQL("""
    INSERT INTO public.instructor (id, name, first_name, last_name, email)
    VALUES %s
    ON CONFLICT (id) DO UPDATE SET
        name = excluded.name,
        first_name = excluded.first_name,
        last_name = excluded.last_name,
        email = excluded.email
""")

# One plain-Python tuple per CSV row.
# NOTE(review): values are matched to the INSERT column list by position, so
# this assumes the CSV columns are (id, name, first_name, last_name, email)
# in that order - confirm against the file.
records = df.to_records(index=False).tolist()

conn = psycopg2.connect(**params)
try:
    # `with conn` commits when the block succeeds and rolls back when it
    # raises; `with conn.cursor()` closes the cursor. Neither closes the
    # connection, hence the `finally`.
    with conn, conn.cursor() as cur:
        execute_values(cur, query_template, records)
finally:
    conn.close()

In [19]:
def get_most_recent_folder(parent_dir='./data/'):
    """Return the path of the most recently modified sub-folder of `parent_dir`.

    Args:
        parent_dir: Directory whose immediate children are inspected.
            Defaults to './data/'.

    Returns:
        The path (as produced by `glob`) of the child directory with the
        latest modification time.

    Raises:
        FileNotFoundError: If `parent_dir` contains no sub-directories.
    """
    # Only directories qualify: a stray file next to the snapshot folders
    # must never be returned as "the folder".
    folders = [
        path
        for path in glob.glob(os.path.join(parent_dir, '*'))
        if os.path.isdir(path)
    ]
    if not folders:
        raise FileNotFoundError(f"no folders found in {parent_dir!r}")
    return max(folders, key=os.path.getmtime)

In [33]:
# Inspect one course.csv snapshot for rows that share a `name`.
# Swap the two lines below to inspect the most recently modified snapshot
# instead of the pinned one.
# folder_path = get_most_recent_folder()
folder_path = "./data/2023_07_09_14_46_39"
df = pd.read_csv(f"{folder_path}/course.csv")
# `np.nan` rather than `np.NaN`: the capitalised alias was removed in NumPy 2.0.
df = df.replace({np.nan: None})
# keep=False marks every member of a duplicate group, not only the repeats.
duplicates = df[df.duplicated(['name'], keep=False)]
# duplicates = duplicates.drop_duplicates()

# print the duplicate values
print("Duplicate values in 'name' column:")
print(duplicates)
# Keep a copy on disk for inspection outside the notebook.
duplicates.to_csv('duplicates.csv', index=False)

Duplicate values in 'name' column:
       course_id       year          name   subject  \
932       210756  2022-2023   MGTECON 617   MGTECON   
933       217303  2022-2023   MGTECON 617   MGTECON   
2820      223195  2022-2023        CS 523        CS   
2821      223196  2022-2023        CS 523        CS   
4036      130984  2022-2023    AMSTUD 163    AMSTUD   
...          ...        ...           ...       ...   
12408     222710  2022-2023      TAPS 396      TAPS   
12409     223191  2022-2023      TAPS 460      TAPS   
12410     125319  2022-2023      TAPS 802      TAPS   
12538     224886  2022-2023  WELLNESS 256  WELLNESS   
12539     224858  2022-2023  WELLNESS 256  WELLNESS   

                                                   title  \
932           Heterogeneity in Macroeconomics (ECON 237)   
933                      Heterogeneity in Macroeconomics   
2820             Research Seminar in Computer Vision + X   
2821   Research Seminar in Computer Vision and Health...   
4036

In [34]:
def csv_to_postgres_course(folder_path=None):
    """
    Load `<folder_path>/course.csv` and upsert its rows into public.course.

    Rows are inserted; when a row conflicts on `name`, every other listed
    column of the existing row is overwritten with the incoming values.

    Args:
        folder_path: Folder that contains `course.csv`. When None (the
            default), the folder returned by `get_most_recent_folder()`
            is used.

    NOTE(review): `ON CONFLICT (name)` needs a unique index or constraint on
    `name`; the schema below only shows the primary key on `id` - confirm
    one exists.
    NOTE(review): the 2023_07_09_14_46_39 snapshot contains several rows
    with the same `name` (see the duplicate check in this notebook).
    PostgreSQL refuses an ON CONFLICT DO UPDATE statement that would touch
    the same row twice, so such rows landing in one batch make the insert
    fail - decide whether to de-duplicate first or to use a wider key.

    postgres table:
    course (
        id uuid not null default gen_random_uuid (),
        course_id integer null,
        year text not null,
        name text not null,
        subject text not null,
        title text not null,
        description text null,
        units_min integer not null,
        units_max integer not null,
        repeatable boolean not null,
        grading_basis text not null,
        final_exam boolean null,
        academic_group text not null,
        academic_org text not null,
        way_a_ii boolean not null,
        way_aqr boolean not null,
        way_ce boolean not null,
        way_edp boolean not null,
        way_er boolean not null,
        way_fr boolean not null,
        way_sma boolean not null,
        way_si boolean not null,
        constraint course_pkey primary key (id)
    ) tablespace pg_default;
    """
    if folder_path is None:
        folder_path = get_most_recent_folder()
    df = pd.read_csv(f"{folder_path}/course.csv")
    # Empty CSV cells are read as NaN; map them to None so they are written
    # as SQL NULL. `np.nan` rather than `np.NaN`: the capitalised alias was
    # removed in NumPy 2.0.
    df = df.replace({np.nan: None})

    # Connection settings for the Postgres instance on localhost:54322.
    params = {
        'dbname': 'postgres',
        'user': 'postgres',
        'password': 'postgres',
        'host': 'localhost',
        'port': 54322
    }

    query_template = sql.SQL("""
        INSERT INTO public.course (course_id, year, name, subject, title, description, units_min, units_max, repeatable, grading_basis, final_exam, academic_group, academic_org, way_a_ii, way_aqr, way_ce, way_edp, way_er, way_fr, way_sma, way_si)
        VALUES %s
        ON CONFLICT (name) DO UPDATE SET
            course_id = excluded.course_id,
            year = excluded.year,
            subject = excluded.subject,
            title = excluded.title,
            description = excluded.description,
            units_min = excluded.units_min,
            units_max = excluded.units_max,
            repeatable = excluded.repeatable,
            grading_basis = excluded.grading_basis,
            final_exam = excluded.final_exam,
            academic_group = excluded.academic_group,
            academic_org = excluded.academic_org,
            way_a_ii = excluded.way_a_ii,
            way_aqr = excluded.way_aqr,
            way_ce = excluded.way_ce,
            way_edp = excluded.way_edp,
            way_er = excluded.way_er,
            way_fr = excluded.way_fr,
            way_sma = excluded.way_sma,
            way_si = excluded.way_si
    """)

    # One plain-Python tuple per CSV row.
    # NOTE(review): values are matched to the INSERT column list by
    # position, so the CSV column order must equal that list - confirm.
    records = df.to_records(index=False).tolist()

    conn = psycopg2.connect(**params)
    try:
        # `with conn` commits when the block succeeds and rolls back when it
        # raises; `with conn.cursor()` closes the cursor. Neither closes the
        # connection, hence the `finally`.
        with conn, conn.cursor() as cur:
            execute_values(cur, query_template, records)
    finally:
        conn.close()

In [36]:
# Run the course upsert; with no arguments it loads course.csv from the
# folder chosen by get_most_recent_folder().
csv_to_postgres_course()

In [None]:
# Batch-UPDATE existing course rows from one course.csv snapshot, matching
# rows on (name, course_id). Rows without a match in the table are left out:
# this cell never inserts.

# CSV columns in the order of the %s placeholders of the UPDATE below:
# first the SET clause, then the two WHERE keys.
update_columns = (
    'year',
    'subject',
    'title',
    'description',
    'units_min',
    'units_max',
    'repeatable',
    'grading_basis',
    'final_exam',
    'academic_group',
    'academic_org',
    'way_a_ii',
    'way_aqr',
    'way_ce',
    'way_edp',
    'way_er',
    'way_fr',
    'way_sma',
    'way_si',
    'name',
    'course_id',
)

# Read the CSV file and prepare the data for batch updates. This happens
# before connecting so that a missing or malformed file cannot leave a
# connection open.
rows = []
# newline='' lets the csv module do its own line-ending handling, which is
# required for quoted fields that contain line breaks.
with open('./data/2023_07_09_13_54_47/course.csv', 'r', newline='') as file:
    reader = csv.DictReader(file)
    for row in reader:
        values = {column: row[column] for column in update_columns}
        # An empty final_exam cell is written as NULL rather than ''.
        if values['final_exam'] == '':
            values['final_exam'] = None
        rows.append(tuple(values.values()))

conn = psycopg2.connect(
    database="postgres",
    user="postgres",
    password="postgres",
    host="localhost",
    port="54322"
)
try:
    # `with conn` commits when the block succeeds and rolls back when it
    # raises; `with conn.cursor()` closes the cursor. Neither closes the
    # connection, hence the `finally`.
    with conn, conn.cursor() as cursor:
        # Update the course values in the database table using batch updates
        cursor.executemany("""
            UPDATE course
            SET
                year = %s,
                subject = %s,
                title = %s,
                description = %s,
                units_min = %s,
                units_max = %s,
                repeatable = %s,
                grading_basis = %s,
                final_exam = %s,
                academic_group = %s,
                academic_org = %s,
                way_a_ii = %s,
                way_aqr = %s,
                way_ce = %s,
                way_edp = %s,
                way_er = %s,
                way_fr = %s,
                way_sma = %s,
                way_si = %s
            WHERE name = %s AND course_id = %s
        """, rows)
finally:
    conn.close()