In [1]:
import pandas as pd
import numpy as np
import re

In [2]:
# Load the raw major-requirements workbook; the relative path is resolved
# against the kernel's current working directory.
CS_df = pd.read_excel('Computer Science Program Major Requirements only.xlsx')

FileNotFoundError: [Errno 2] No such file or directory: 'Computer Science Program Major Requirements only.xlsx'

In [3]:
# Display the raw frame exactly as it was loaded.
CS_df

Unnamed: 0,Course List#,Effective Date,Course List Description,Course ID/ Wildcard,Subject/Catalog,Course Title,Topic ID,Single Term Only,Term Description,Associated Class Code,Valid Begin Date,Valid End Date
0,452,1901-01-01 00:00:00,EECS,Wildcard,EECS,,,,,,NaT,NaT
1,672,1901-01-01 00:00:00,CMPTRSC,Wildcard,CMPTRSC,,,,,,NaT,NaT
2,2295,1901-01-01 00:00:00,INTERID,Wildcard,INTERID,,,,,,NaT,NaT
3,2612,1901-01-01 00:00:00,300 - 600 Level,Wildcard,3##,,,,,,NaT,NaT
4,,,,Wildcard,4##,,,,,,NaT,NaT
...,...,...,...,...,...,...,...,...,...,...,...,...
175,,,,`012129,EECS 398,Special Topics,12 Sys Design of a Search Engine,2220.0,Winter 2019,,NaT,NaT
176,,,,,EECS 398,,12 Sys Design of a Search Engine,2260.0,Fall 2019,,NaT,NaT
177,,,,`007780,EECS 498,Special Topics,60 Conversational AI,2260.0,Fall 2019,,NaT,NaT
178,,,,,EECS 498,,63 Accelerators for AI & Health,2260.0,Fall 2019,,NaT,NaT


In [4]:
def clean_df(df):
    """Build the cleaned course table from the raw major-requirements frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame. It must contain the columns 'Subject/Catalog',
        'Course List Description', 'Course Title', 'Topic ID' and
        'Course ID/ Wildcard'.

    Returns
    -------
    pandas.DataFrame
        A new frame with the columns 'Subject', 'Catalog',
        'Course List Description', 'Course Title', 'Topic ID',
        'Course ID/ Wildcard', 'Workload' and the nine track columns
        (all initialised to 0). The input frame is not modified.
    """
    # The 9 tracks: see page 7 of the pdf file, under CS-LSA tracks (0 = no, 1 = yes).
    track_cols = [
        'AI track',
        'Bio track',
        'Data track',
        'Econ track',
        'Robot track',
        'Security track',
        'Software dev track',
        'Software sys track',
        'Theory track',
    ]

    cols = [
        'Subject',                  # Subject/Catalog column, the alphabetical part
        'Catalog',                  # Subject/Catalog column, the numerical part
        'Course List Description',  # forward-filled
        'Course Title',
        'Topic ID',
        'Course ID/ Wildcard',      # leading ` removed from numerical IDs
        'Workload',                 # one of 4 workload levels, "No information" otherwise
    ] + track_cols

    # Workload levels (Computer Science Program pdf file, page 4, highlighted part),
    # keyed by catalog number.
    workload_dict = {
        'Extremely heavy workload': [381, 467, 470, 473, 482, 494],
        'Heavy workload': [281, 373, 445, 483, 487, 489, 491],
        'Moderate workload': [203, 280, 285, 370, 376, 388, 442, 475, 477, 478,
                              481, 484, 485, 490, 492],
        'Light workload': [183, 441, 486, 493, 496, 497],
    }

    # Work on a copy so the caller's raw frame stays pristine and the cell
    # that calls this function can be re-run safely.
    out = df.copy()

    # First run of digits -> Catalog, first run of letters -> Subject.
    # expand=False yields a Series rather than a one-column DataFrame.
    out['Catalog'] = out['Subject/Catalog'].str.extract(r'(\d+)', expand=False)
    out['Subject'] = out['Subject/Catalog'].str.extract(r'([a-zA-Z]+)', expand=False)

    # Treat single-space cells as missing, then carry the last description forward.
    out['Course List Description'] = (
        out['Course List Description'].replace(' ', np.nan).ffill()
    )

    # Remove ` from Course ID/ Wildcard (literal replacement, not a regex).
    out['Course ID/ Wildcard'] = out['Course ID/ Wildcard'].str.replace('`', '', regex=False)

    # Workload defaults to "No information" and is overwritten per level.
    out['Workload'] = 'No information'
    catalog_number = out['Catalog'].astype(float)
    for level, catalog_numbers in workload_dict.items():
        out.loc[catalog_number.isin(catalog_numbers), 'Workload'] = level

    # Initialise every track flag with 0.
    for track_col in track_cols:
        out[track_col] = 0

    return out[cols]

In [5]:
def _parse_track_courses(track_text):
    """Parse the track description text into ``{track name: [entries]}``.

    Each entry is either a ``(subject, catalog)`` tuple of strings or the
    literal string ``'ULCS'``. A course number written without a subject
    (or joined with the word "or") inherits the most recent subject seen.

    Raises
    ------
    ValueError
        If a course/ULCS item appears before any track heading, or a bare
        course number appears before any subject has been seen.
    """
    flat_text = track_text.replace('\n', ' ').replace('Required:', '')

    # First alternative: a numbered item such as "1) EECS 492, ...".
    # Second alternative: a heading that ends with a colon.
    chunks = re.findall(r'[0-9]\)\s[\w\s’:,]+|[A-Z][\w\s]+:', flat_text)
    course_pattern = re.compile(r'([A-Za-z]*)\s([0-9]{2,})')

    parsed = {}
    current_track = None
    last_subject = None

    for chunk in chunks:
        # A chunk made only of letters, spaces and colons is a track heading.
        if chunk.replace(':', '').replace(' ', '').isalpha():
            current_track = chunk.replace(':', '')
            parsed[current_track] = []
            continue

        course_matches = course_pattern.findall(chunk)
        if not course_matches and 'ULCS' not in chunk:
            continue  # nothing usable in this chunk

        if current_track is None:
            raise ValueError(
                'Found a course item before any track heading: {!r}'.format(chunk)
            )

        if not course_matches:
            parsed[current_track].append('ULCS')
            continue

        for letters, digits in course_matches:
            # Only the standalone conjunction "or" is dropped; subjects that
            # merely contain "or" are kept intact.
            if letters and letters != 'or':
                last_subject = letters
            elif last_subject is None:
                raise ValueError(
                    'Course number {!r} has no subject to inherit in {!r}'.format(digits, chunk)
                )
            parsed[current_track].append((last_subject, digits))

    return parsed


def tracks(df, file_path):
    """Return a copy of ``df`` with track columns set to 1 for listed courses.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with 'Subject', 'Catalog' (compared as strings),
        'Course List Description' and the nine track columns.
    file_path : str
        Path of the text file describing the tracks.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df``; ``df`` itself is not modified.
    """
    track_cols = [
        'AI track',
        'Bio track',
        'Data track',
        'Econ track',
        'Robot track',
        'Security track',
        'Software dev track',
        'Software sys track',
        'Theory track'
    ]

    # NOTE(review): the file is read with the platform default encoding while
    # the parser looks for the ’ character -- confirm the file's encoding.
    with open(file_path) as f:
        track_text = f.read()

    track_dict = _parse_track_courses(track_text)

    # Headings are paired with columns purely by position, so the order of the
    # headings in the text file has to match the order of track_cols.
    track_col_name = dict(zip(track_dict.keys(), track_cols))

    new_df = df.copy()

    for track_name, entries in track_dict.items():
        for entry in entries:
            if entry == 'ULCS':
                # ULCS flags every row of the upper-level course list.
                mask = new_df['Course List Description'] == 'Computer Science Upper Level T'
            else:
                subject, catalog = entry
                mask = (new_df['Subject'] == subject.upper()) & (new_df['Catalog'] == catalog)
            new_df.loc[mask, track_col_name[track_name]] = 1

    return new_df

In [6]:
# Derive the cleaned course table from the raw frame.
cleaned_CS_df = clean_df(CS_df)

In [7]:
# Flag track membership using the track descriptions read from track_text.txt.
final_CS_df = tracks(cleaned_CS_df, 'track_text.txt')

In [8]:
# Persist the track-annotated frame as a pickle file.
final_CS_df.to_pickle('cleaned_CS.pkl')

Unnamed: 0,Subject,Catalog,Course List Description,Course Title,Topic ID,Course ID/ Wildcard,Workload,AI track,Bio track,Data track,Econ track,Robot track,Security track,Software dev track,Software sys track,Theory track
0,EECS,,EECS,,,Wildcard,No information,0,0,0,0,0,0,0,0,0
1,CMPTRSC,,CMPTRSC,,,Wildcard,No information,0,0,0,0,0,0,0,0,0
2,INTERID,,INTERID,,,Wildcard,No information,0,0,0,0,0,0,0,0,0
3,,3,300 - 600 Level,,,Wildcard,No information,0,0,0,0,0,0,0,0,0
4,,4,300 - 600 Level,,,Wildcard,No information,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
175,EECS,398,Computer Science Capstone,Special Topics,12 Sys Design of a Search Engine,012129,No information,0,0,0,0,0,0,0,0,0
176,EECS,398,Computer Science Capstone,,12 Sys Design of a Search Engine,,No information,0,0,0,0,0,0,0,0,0
177,EECS,498,Computer Science Capstone,Special Topics,60 Conversational AI,007780,No information,0,0,0,0,0,0,0,0,0
178,EECS,498,Computer Science Capstone,,63 Accelerators for AI & Health,,No information,0,0,0,0,0,0,0,0,0
