Here is the final scraper that we will use to populate the database. We scrape the PDF of the UCLA catalog because technical limitations (likely anti-scraping protections from UCLA) prevented us from scraping the course website directly.

In [2]:
#importing necessary packages
import PyPDF2
import re

In [3]:
#putting the pdf of the UCLA catalog into a string
#takes a while to run, ~1 minute depending on device
def extract_text_from_pdf(pdf_path, start_page=1, end_page=890):
    """Return the concatenated text of a range of pages of a PDF file.

    Parameters
    ----------
    pdf_path : str
        Path of the PDF file to open.
    start_page : int, optional
        First page to read, 1-based and inclusive. Values below 1 are
        treated as 1.
    end_page : int, optional
        Last page to read, 1-based and inclusive. Values beyond the end of
        the document are clamped to the last page.

    Returns
    -------
    str
        The text of every page in the range, joined with no separator.
        Empty when the range contains no pages.
    """
    page_texts = []
    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        # Convert the 1-based inclusive range into 0-based range() bounds.
        first_index = max(0, start_page - 1)
        stop_index = min(end_page, len(reader.pages))
        for page_number in range(first_index, stop_index):
            page = reader.pages[page_number]
            # Fall back to "" so a page that yields no text cannot break the join.
            page_texts.append(page.extract_text() or "")
    # One join instead of repeated string concatenation inside the loop.
    return "".join(page_texts)

# Read the whole catalog (pages 1-890) into a single string.
pdf_text = extract_text_from_pdf(
    '/Users/dayfamily/Desktop/UCLA_Catalog_2023-24.pdf',
    start_page=1,
    end_page=890,
)


In [8]:
# Extract only pages 180-890, the part of the catalog used below to find course
# codes and course names (and, potentially, the majors associated with them,
# although that runs into the difficulties demonstrated later).
# Also slow: ~1 minute depending on device.
class_text = extract_text_from_pdf(
    '/Users/dayfamily/Desktop/UCLA_Catalog_2023-24.pdf',
    start_page=180,
    end_page=890,
)

How we are going to input what we scrape into the database:

INSERT INTO classes VALUES ('course_code', 'dept_code', 'course_num', 'course_name', 'prof', 'discussion_code');

ex:  INSERT INTO classes VALUES ('CS000331A', 'COM SCI', '33', 'Introduction to Computer Organization', 'Reinman, G.D.', '1A');

So, we will need to make an array that has the following elements: course_code, dept_code, course_num, course_name, prof, and discussion_code.  Unfortunately, because we cannot scrape the registrar, we will have to use dummy values for prof and discussion_code: both vary by quarter and cannot be confirmed from the PDF version of the course catalog.  

In [9]:
# Pattern for a course entry as it appears in the extracted catalog text:
#   group 1 - optional letters, one or more digits, at most one trailing letter
#   then a literal ".", one whitespace character,
#   group 2 - the shortest run of characters up to the next "."
# It is not completely perfect, especially for entries such as Fiat Lux or research courses.
course_pattern = r'(\b[A-Za-z]*\d+[A-Za-z]?)\.\s(.+?)\.'
course_regex = re.compile(course_pattern)
# Each match is a (group 1, group 2) tuple.
course_matches = course_regex.findall(class_text)



In [None]:
#the line below is commented out because its output is huge; it also takes ~2 minutes to run, depending on your device
#BEWARE: printing the full list sometimes crashes the notebook
#print(course_matches) 

In [10]:
# Preliminary table: one six-column row per regex match. Only the two scraped
# values are filled in (columns 2 and 3); the other four columns start as "".
courses_array = []
for course_code, course_name in course_matches:
    courses_array.append(["", "", course_code, course_name, "", ""])




In [None]:
# Print up to the first 12000 rows, numbered from 1 (that covers every row);
# use a lower limit if the output crashes the notebook.
row_limit = 12000
for position, course_row in enumerate(courses_array[:row_limit], 1):
    print(f"Row {position}: {course_row}")

There are over 10,000 classes, and there is no easy way to assign a major to each of them... I attempted to use different regular expressions to scrape major names on a given page and then I could subsequently assign the classes on that page to that major, but to no avail with the given formatting.  Thus, the only way to assign majors to each course, would be to do so manually, and I will not be doing all 11,000 courses.  Instead, I can do it "semi-manually", where I look at which range each class is, and assign the dept_code to those courses.  I could also write a program to do that based on perhaps if the next course number is lower than the previous one, and to change the dept_code accordingly, but that is prone to errors and having incorrect information seems worse than having incomplete information.  So, I will input dept_code information "manually" for the time being.  I will do the first couple of majors, and then prioritize popular majors like computer science, mathematics, and political science.  

In [5]:
# Complete collection of dept_codes, found on https://github.com/nnhien/uclacatalog/wiki
# Stored as a set; the codes are grouped by line only for readability.
dept_abbr = {
    "AERO ST", "AF AMER", "AFRC ST", "AM IND", "ASL", "AN N EA", "ANES",
    "ANTHRO", "APPLING", "ARABIC", "ARCHEOL", "ARCH&UD", "ARMENIA", "ART",
    "ART HIS", "ART&ARC", "ARTS ED", "ASIAN", "ASIA AM", "ASTR", "A&O SCI",
    "BIOENGR", "BIOINFO", "BIOINFR", "BIOL CH", "BIOMATH", "BMD RES", "BIOSTAT",
    "C&EE ST", "CH ENGR", "CHEM", "CHICANO", "CHIN", "C&EE", "CLASSIC", "CLUSTER",
    "COMM", "CESC", "COM HLT", "COM LIT", "C&S BIO", "COM SCI", "CAEM",
    "DANCE", "DENT", "DESMA", "DGT HUM", "DIS STD", "DUTCH",
    "EPS SCI", "EA STDS", "EE BIOL", "ECON", "EDUC", "EC ENGR", "ENGR",
    "ENGL", "ESL", "ENGCOMP", "ENVIRON", "ENV HLT", "EPIDEM", "ETHNMUS",
    "FILIPNO", "FILM TV", "FOOD ST", "FRNCH",
    "GENDER", "GEOG", "GERMAN", "GRNTLGY", "GLB HLT", "GJ STDS", "GLBL ST",
    "GRAD PD", "GREEK",
    "HLT POL", "HEBREW", "HIN-URD", "HIST", "HNRS", "HUM GEN", "HNGAR",
    "IL AMER", "I E STD", "INDO", "INF STD", "I A STD", "INTL DV", "I M STD",
    "IRANIAN", "ISLM ST", "ITALIAN",
    "JAPAN", "JEWISH",
    "KOREA",
    "LBR STD", "LATIN", "LATN AM", "LAW", "UG-LAW", "LGBTQS", "LIFESCI", "LING",
    "MGMT", "MGMTEX", "MGMTFT", "MGMTFE", "MGMTGEX", "MGMTMFE", "MGMTMSA",
    "MGMTPHD", "MAT SCI", "MATH", "MECH&AE", "MED", "MIMG", "M E STD",
    "MIL SCI", "M PHARM", "MOL BIO", "MOL TOX", "MCD BIO", "MC&IP", "MUSC",
    "MSC IND", "MUSCLG",
    "NAV SCI", "NR EAST", "NEURBIO", "NEURLGY", "NEURO", "NEUROSC", "NEURSGY",
    "NURSING",
    "OBGYN", "OPTH", "ORL BIO", "ORTHPDC",
    "PATH", "PEDS", "PHILOS", "PHYSICS", "PBMED", "PHYSCI", "PHYSIOL", "POLSH",
    "POL SCI", "PORTGSE", "COMPTNG", "PSYCTRY", "PSYCH", "PUB AFF", "PUB HLT",
    "PUB PLC",
    "RAD ONC", "RELIGN", "ROMANIA", "RUSSN",
    "SCAND", "SCI EDU", "SEMITIC", "SRB CRO", "SLAVC", "SOC SC", "SOC THT",
    "SOC WLF", "SOC GEN", "SOCIOL", "S ASIAN", "SEASIAN", "SPAN", "STATS",
    "SURGERY", "SWAHILI",
    "THAI", "THEATER", "TURKIC",
    "UNIV ST", "URBN PL", "UROLOGY",
    "VIETMSE",
    "WL ARTS",
    "YIDDSH",
}

This is the beginning of "semi-manually" assigning majors to classes - I will do the first 3 majors, and then the ones I specified above.  I may also fill in the remaining classes with "dummy major abbreviations", i.e. fill them out with dept_codes even if they are inaccurate, so that they populate the database.  

In [22]:
# Template for the semi-manual assignment: this cell is copied once per major,
# with dept_code and the index range replaced. The range is determined by
# looking at the last class listed for each major in the general catalog PDF.

# First department: AERO ST, rows 0-4 of courses_array.
dept_code = "AERO ST"

# The dept_code lives in the second column (index 1) of each row.
for position in range(5):
    courses_array[position][1] = dept_code



In [None]:
# Show every row of courses_array, including the dept_code values assigned so far.
for course_row in courses_array:
    print(course_row)

In [None]:
# AF AMER: rows 5-101 of courses_array.
dept_code = "AF AMER"

# Write the dept_code into the second column (index 1) of each row in the range.
for position in range(5, 102):
    courses_array[position][1] = dept_code

# Show the whole array after the update.
for course_row in courses_array:
    print(course_row)

In [None]:
# Political science: rows 8997-9196 of courses_array.
# NOTE(review): dept_abbr spells this department "POL SCI"; the "POLI SCI"
# spelling used here is the one the course_code cell further down matches on,
# so the two must be changed together.
dept_code = "POLI SCI"

# Write the dept_code into the second column (index 1) of each row in the range.
for position in range(8997, 9197):
    courses_array[position][1] = dept_code

# Show the whole array after the update.
for course_row in courses_array:
    print(course_row)

In [None]:
# COM SCI: rows 2793-2852 of courses_array.
dept_code = "COM SCI"

# Write the dept_code into the second column (index 1) of each row in the range.
for position in range(2793, 2853):
    courses_array[position][1] = dept_code

# Show the whole array after the update.
for course_row in courses_array:
    print(course_row)

In [None]:
# MATH: rows 6975-7206 of courses_array.
dept_code = "MATH"

# Write the dept_code into the second column (index 1) of each row in the range.
for position in range(6975, 7207):
    courses_array[position][1] = dept_code

# Show the whole array after the update.
for course_row in courses_array:
    print(course_row)

Okay, using the classes that I have filled out a dept_code for, I will now make a new array and fill out dummy discussion sections for them.

In [None]:
# Collect the rows whose dept_code (second column) has been filled in.
# The rows are not copied: the new list holds the same row objects as courses_array.
discussions_template_array = []
for candidate_row in courses_array:
    if candidate_row[1] != "":
        discussions_template_array.append(candidate_row)

# Show the selected rows.
for selected_row in discussions_template_array:
    print(selected_row)

In [None]:
# Give every selected class the dummy discussion section "1A" (sixth column)
# and show each row once it has been updated.
for template_row in discussions_template_array:
    template_row[5] = "1A"
    print(template_row)

In [None]:
# Expand every filled-out class into four dummy discussion sections.
# The loop reads discussions_template_array, whose rows were given section "1A"
# by the cell above; `discussions_array` is not defined anywhere in this notebook.
discussions_updated_array = []

for row in discussions_template_array:
    # Keep the original row itself (not a copy) as the first section ...
    discussions_updated_array.append(row)
    # ... followed by one shallow copy per additional section code.
    for section_code in ("1B", "1C", "1D"):
        section_row = row[:]  # copy so the original row keeps its own section
        section_row[5] = section_code  # sixth column holds the discussion_code
        discussions_updated_array.append(section_row)

for row in discussions_updated_array:
    print(row)

Now that each class has 4 discussion sections, a dept code, a course code, and a course name, I will assign a dummy variable to be their professor - I will update it with more apt names when adding it to the website.  

In [None]:
# Fill the prof column (fifth column) of every section with the placeholder
# professor and show each row once it has been updated.
for section_row in discussions_updated_array:
    section_row[4] = "Eggert, P.R."
    print(section_row)

Now, all I need to have some classes completely filled out and ready to populate the database is to give each class a unique course_code.  This will look like the following:

2 letters representing the dept_code, e.g. CS for COM SCI
5 characters representing the course_num, left-padded with zeros, e.g. 00001 for Introduction to Black Studies
2 characters representing the discussion_code, e.g. 1C


In [46]:
# Two-letter course_code prefix for each dept_code handled so far.
# Both spellings of political science are accepted: the assignment cell above
# writes "POLI SCI", while the dept_abbr list spells the department "POL SCI".
dept_prefixes = {
    'COM SCI': 'CS',
    'AERO ST': 'AS',
    'POLI SCI': 'PS',
    'POL SCI': 'PS',
    'MATH': 'MT',
    'AF AMER': 'AA',
}

for row in discussions_updated_array:
    # Extract relevant information from the row
    dept_code = row[1]
    course_num = row[2]
    discussion_code = row[5]

    # Fail loudly for a department without a prefix instead of silently
    # reusing the course_code computed for a previous row.
    if dept_code not in dept_prefixes:
        raise ValueError(
            f"No course_code prefix defined for dept_code {dept_code!r}; "
            "add it to dept_prefixes"
        )

    # prefix + course_num left-padded with zeros to 5 characters + discussion_code
    course_code = dept_prefixes[dept_code] + course_num.rjust(5, '0') + discussion_code

    # Update the first column of the row with the constructed course code
    row[0] = course_code


In [47]:
# Show every section row of discussions_updated_array.
for finished_row in discussions_updated_array:
    print(finished_row)

['AS0001C1A', 'AERO ST', '1C', 'Heritage and Values', 'Eggert, P.R.', '1A']
['AS0001C1B', 'AERO ST', '1C', 'Heritage and Values', 'Eggert, P.R.', '1B']
['AS0001C1C', 'AERO ST', '1C', 'Heritage and Values', 'Eggert, P.R.', '1C']
['AS0001C1D', 'AERO ST', '1C', 'Heritage and Values', 'Eggert, P.R.', '1D']
['AS000191A', 'AERO ST', '19', 'Fiat Lux Freshman Seminars', 'Eggert, P.R.', '1A']
['AS000191B', 'AERO ST', '19', 'Fiat Lux Freshman Seminars', 'Eggert, P.R.', '1B']
['AS000191C', 'AERO ST', '19', 'Fiat Lux Freshman Seminars', 'Eggert, P.R.', '1C']
['AS000191D', 'AERO ST', '19', 'Fiat Lux Freshman Seminars', 'Eggert, P.R.', '1D']
['AS0020C1A', 'AERO ST', '20C', 'Team and Leadership Fundamentals', 'Eggert, P.R.', '1A']
['AS0020C1B', 'AERO ST', '20C', 'Team and Leadership Fundamentals', 'Eggert, P.R.', '1B']
['AS0020C1C', 'AERO ST', '20C', 'Team and Leadership Fundamentals', 'Eggert, P.R.', '1C']
['AS0020C1D', 'AERO ST', '20C', 'Team and Leadership Fundamentals', 'Eggert, P.R.', '1D']
['AS

The next two cells remove rows with duplicate course codes. Below them are failed attempts to form regular expressions that capture majors so that I can associate them with courses. 

In [48]:
# De-duplicate on course_code (first column), keeping the first row seen for each code.
unique_first_column_values = {}  # course codes already encountered
unique_rows = []  # rows kept, in their original order

for candidate_row in discussions_updated_array:
    code = candidate_row[0]
    if code in unique_first_column_values:
        # A row with this course code was already kept; skip the repeat.
        continue
    unique_first_column_values[code] = True
    unique_rows.append(candidate_row)

# The de-duplicated list is the final table.
final_courses_array = unique_rows

In [49]:
# Show every row of the de-duplicated table.
for final_row in final_courses_array:
    print(final_row)

['AS0001C1A', 'AERO ST', '1C', 'Heritage and Values', 'Eggert, P.R.', '1A']
['AS0001C1B', 'AERO ST', '1C', 'Heritage and Values', 'Eggert, P.R.', '1B']
['AS0001C1C', 'AERO ST', '1C', 'Heritage and Values', 'Eggert, P.R.', '1C']
['AS0001C1D', 'AERO ST', '1C', 'Heritage and Values', 'Eggert, P.R.', '1D']
['AS000191A', 'AERO ST', '19', 'Fiat Lux Freshman Seminars', 'Eggert, P.R.', '1A']
['AS000191B', 'AERO ST', '19', 'Fiat Lux Freshman Seminars', 'Eggert, P.R.', '1B']
['AS000191C', 'AERO ST', '19', 'Fiat Lux Freshman Seminars', 'Eggert, P.R.', '1C']
['AS000191D', 'AERO ST', '19', 'Fiat Lux Freshman Seminars', 'Eggert, P.R.', '1D']
['AS0020C1A', 'AERO ST', '20C', 'Team and Leadership Fundamentals', 'Eggert, P.R.', '1A']
['AS0020C1B', 'AERO ST', '20C', 'Team and Leadership Fundamentals', 'Eggert, P.R.', '1B']
['AS0020C1C', 'AERO ST', '20C', 'Team and Leadership Fundamentals', 'Eggert, P.R.', '1C']
['AS0020C1D', 'AERO ST', '20C', 'Team and Leadership Fundamentals', 'Eggert, P.R.', '1D']
['AS

In [40]:
# Extract only pages 8-15, the part of the catalog used to get the names of the majors.
major_text = extract_text_from_pdf(
    '/Users/dayfamily/Desktop/UCLA_Catalog_2023-24.pdf',
    start_page=8,
    end_page=15,
)

In [None]:
# Extract pages 185-200 as a small sample for testing the major pattern.
major_testing = extract_text_from_pdf(
    '/Users/dayfamily/Desktop/UCLA_Catalog_2023-24.pdf',
    start_page=185,
    end_page=200,
)

In [None]:
# Pattern for how the PDF stores info about majors (not completely perfect):
# lazily capture text up to optional whitespace followed by either a
# "College of Letters and Science" line or the end of the text.
major_pattern = r'(.+?)\s*(?:\nCollege of Letters and Science|$)'

# re.DOTALL lets "." match newlines, so one capture can span several lines.
major_regex = re.compile(major_pattern, re.DOTALL)
major_matches = major_regex.findall(major_testing)

print(major_matches)