In [297]:
import glob
from collections import defaultdict

import pandas as pd
import numpy as np
from easydict import EasyDict

In [264]:
excel_files = glob.glob('*.xlsx')

In [265]:
def create_course_col(df):
    subject = df['Subject']
    df['Subject'] = df['Subject'].astype(str)
    df['subject'] = [s[s.find("(") + 1:s.find(")")] for s in subject]
    df['Catalog Nbr'] = df['Catalog Nbr'].astype(str)
    df['course'] = df['subject'] + ' ' + df['Catalog Nbr']
    df.sort_values(by=['course'], inplace=True)
    return df

In [266]:
for file in excel_files:
    df = pd.read_excel(file)    
    if 'Catalog Nbr' not in df.columns:
        df.rename(columns={'Catalog': 'Catalog Nbr',
                           'Descr': 'description'}, inplace=True)
        create_course_col(df)
        df.to_csv(f'{file}.csv', index=False)
    else: print(file)

In [267]:
csv_files = glob.glob('*.csv')

In [268]:
csv_files

['RE CLST 4621.xlsx.csv',
 'RE CLST 4620.xlsx.csv',
 'QR1 CLST 1817.xlsx.csv',
 'Lang Req CLST 0179.xlsx.csv',
 'Lang Req CLST 3474.xlsx.csv',
 'RE CLST 4609.xlsx.csv',
 'QR1 CLST 1331.xlsx.csv',
 'ID Courses Offered FA21.xlsx.csv',
 'QR1 CLST 9561 Exclusions.xlsx.csv',
 'w_22_des.csv',
 'f_21_des.csv',
 'CE Courses Offered FA21.xlsx.csv',
 'Courses with ULWR Course Attributes.xlsx.csv',
 'MSA Courses Offered FA21.xlsx.csv',
 'NS Courses Offered FA21.xlsx.csv',
 'FYWR CLST 20.xlsx.csv',
 'SS Courses Offered FA21.xlsx.csv',
 'FYWR CLST 3509.xlsx.csv',
 'Lang Req CLST 0176.xlsx.csv',
 'Lang Req CLST 8140.xlsx.csv',
 'QR2 CLST 1329.xlsx.csv',
 'HU Courses Offered FA21.xlsx.csv']

### Concat Each Requirement

In [269]:
def concat_each_requirement(files):
    
    file_list = []
    
    for file in files:
        df = pd.read_csv(file)
        file_list.append(df)
    
    df = pd.concat(file_list, axis=0, ignore_index=True)
    
    return df

### Read Scraped Course Description Files

In [270]:
def read_scraped_course_des_files(file):
    df = pd.read_csv(file)
    df = df.fillna('')
    df['requirements_distribution'] = df['requirements_distribution'] + ', ' + df['other']
    df['requirements_distribution'] = [x.split(', ') for x in df['requirements_distribution']]
    return df

In [271]:
f_21 = read_scraped_course_des_files('f_21_des.csv')
w_22 = read_scraped_course_des_files('w_22_des.csv')

### Create dataframes for each requirement

In [272]:
# Look at all the requirements listed
{x for l in w_22['requirements_distribution'] for x in l}

{'',
 'BS',
 'CBL',
 'CE',
 'Experiential',
 'FYSem',
 'FYWR',
 'HU',
 'Honors',
 'ID',
 'Independent',
 'Lang Req',
 'MSA',
 'Minicourse',
 'NS',
 'QR/1',
 'QR/2',
 'RE',
 'SS',
 'Sustain',
 'ULWR'}

In [273]:
fywr = concat_each_requirement(['FYWR CLST 20.xlsx.csv', 'FYWR CLST 3509.xlsx.csv'])

In [274]:
ulwr = pd.read_csv('Courses with ULWR Course Attributes.xlsx.csv')

In [275]:
qr1 = concat_each_requirement(['QR1 CLST 9561 Exclusions.xlsx.csv', 'QR1 CLST 1817.xlsx.csv', 'QR1 CLST 1331.xlsx.csv'])

In [276]:
qr2 = pd.read_csv('QR2 CLST 1329.xlsx.csv')
re = concat_each_requirement(['RE CLST 4621.xlsx.csv', 'RE CLST 4620.xlsx.csv', 'RE CLST 4609.xlsx.csv'])
lr = concat_each_requirement(['Lang Req CLST 0179.xlsx.csv', 'Lang Req CLST 3474.xlsx.csv', 'Lang Req CLST 0176.xlsx.csv', 'Lang Req CLST 8140.xlsx.csv'])
ns = pd.read_csv('NS Courses Offered FA21.xlsx.csv')
ss = pd.read_csv('SS Courses Offered FA21.xlsx.csv')
hu = pd.read_csv('HU Courses Offered FA21.xlsx.csv')
msa = pd.read_csv('MSA Courses Offered FA21.xlsx.csv')
ce = pd.read_csv('CE Courses Offered FA21.xlsx.csv')
interdisciplinary = pd.read_csv('ID Courses Offered FA21.xlsx.csv')

In [277]:
def comprehensive_list_merge(requirement, requirement_df):
    
    w_22_requirement_df = w_22[w_22['requirements_distribution'].map(lambda x: requirement in x)]
    f_21_requirement_df = f_21[f_21['requirements_distribution'].map(lambda x: requirement in x)]
    
    # Merge requirement_df with w_22_requirement_df
    merge1 = w_22_requirement_df.merge(requirement_df[['course', 'description']], on=['course', 'description'], how='outer')
    
    # Concat this df with f_21_requirement_df
    merge2 = pd.concat([merge1, f_21_requirement_df], axis=0, ignore_index=True)
    
    # Drop duplicate courses
    merge2.drop_duplicates(subset='course', inplace=True)
    
    # Keep only necessary columns
    merge2 = merge2[['course', 'title', 'description', 'requirements_distribution']]
    
    # Fill na with the requirement
    merge2['requirements_distribution'] = merge2['requirements_distribution'].fillna(requirement)
    
    return merge2

In [278]:
fywr_df = comprehensive_list_merge('FYWR', fywr)
ulwr_df = comprehensive_list_merge('ULWR', ulwr)
qr1_df = comprehensive_list_merge('QR/1', qr1)
qr2_df = comprehensive_list_merge('QR/2', qr2)
re_df = comprehensive_list_merge('RE', re)
lr_df = comprehensive_list_merge('Lang Req', lr)
ns_df = comprehensive_list_merge('NS', ns)
ss_df = comprehensive_list_merge('SS', ss)
hu_df = comprehensive_list_merge('HU', hu)
msa_df = comprehensive_list_merge('MSA', msa)
ce_df = comprehensive_list_merge('CE', ce)
id_df = comprehensive_list_merge('ID', interdisciplinary)

In [279]:
bs_df = pd.concat([w_22[w_22['requirements_distribution'].map(lambda x: 'BS' in x)], f_21[f_21['requirements_distribution'].map(lambda x: 'BS' in x)]], axis=0, ignore_index=True)
cbl_df = pd.concat([w_22[w_22['requirements_distribution'].map(lambda x: 'CBL' in x)], f_21[f_21['requirements_distribution'].map(lambda x: 'CBL' in x)]], axis=0, ignore_index=True)
ex_df = pd.concat([w_22[w_22['requirements_distribution'].map(lambda x: 'Experiential' in x)], f_21[f_21['requirements_distribution'].map(lambda x: 'Experiential' in x)]], axis=0, ignore_index=True)
honors_df = pd.concat([w_22[w_22['requirements_distribution'].map(lambda x: 'Honors' in x)], f_21[f_21['requirements_distribution'].map(lambda x: 'Honors' in x)]], axis=0, ignore_index=True)
ind_df = pd.concat([w_22[w_22['requirements_distribution'].map(lambda x: 'Independent' in x)], f_21[f_21['requirements_distribution'].map(lambda x: 'Independent' in x)]], axis=0, ignore_index=True)
mini_df = pd.concat([w_22[w_22['requirements_distribution'].map(lambda x: 'Minicourse' in x)], f_21[f_21['requirements_distribution'].map(lambda x: 'Minicourse' in x)]], axis=0, ignore_index=True)
sus_df = pd.concat([w_22[w_22['requirements_distribution'].map(lambda x: 'Sustain' in x)], f_21[f_21['requirements_distribution'].map(lambda x: 'Sustain' in x)]], axis=0, ignore_index=True)

In [282]:
re_df

Unnamed: 0,course,title,description,requirements_distribution
0,AAS 208,Introduction to African Art,Through the study of a selected group of Afric...,"[HU, RE, ]"
1,AAS 247,Modern Africa,This course offers students the tools they nee...,"[SS, RE, ]"
2,AAS 271,Introduction to Afro-American Literature,In this course we will study the emergence and...,"[HU, RE, ]"
3,AAS 303,Race and Ethnic Relations,This course examines the central tensions unde...,"[SS, RE, ]"
4,AAS 384,Caribbean Literature,This Race & Ethnicity course introduces studen...,"[HU, RE, ]"
...,...,...,...,...
1758,SPANISH 375,Topics in Hispano/Islamic/Sephardic Cultures,This course will involve a comparative study o...,"[RE, HU, ]"
1760,WGS 215,"Contested Spaces: Art, Architecture, Politics",This course encourages students to think criti...,"[HU, RE, ]"
1763,WGS 250,"Race, Gender and Nation",This interdisciplinary course introduces stude...,"[SS, RE, ]"
1764,WGS 354,Race and Identity in Music,This course explores the parameters of racial ...,"[HU, RE, ]"


### TO-DO: Create `core_reqs` dictionary that maps numbers of classes to the DataFrames that correspond

In [None]:
core_reqs = {
    1: [fywr_df, ce_df],
    2: [...],
    3: [...],
}

In [286]:
cs_df.head()

Unnamed: 0_level_0,Course List Description,Course ID/ Wildcard,Subject/Catalog,Course Title,Require,Credits
Course ID/ Wildcard,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1794,CS-LSA major declaration Cal I,1794,MATH 115,Calculus I,One,4.0
21317,CS-LSA major declaration Cal I,21317,MATH 120,Exam Calc Credit I,One,4.0
5723,CS-LSA major declaration Cal I,5723,MATH 185,Honors Calc I,One,4.0
5725,CS-LSA major declaration Cal I,5725,MATH 295,Honors Math I,One,4.0
21318,CS-LSA major declaration Cal II,21318,MATH 121,Exam Calc Credit II,One,4.0


In [283]:
cs_df = pd.read_clipboard()

Unnamed: 0,Course List Description,Course ID/ Wildcard,Subject/Catalog,Course Title,Require,Credits
0,CS-LSA major declaration Cal I,1794,MATH 115,Calculus I,One,4.0
1,CS-LSA major declaration Cal I,21317,MATH 120,Exam Calc Credit I,One,4.0
2,CS-LSA major declaration Cal I,5723,MATH 185,Honors Calc I,One,4.0
3,CS-LSA major declaration Cal I,5725,MATH 295,Honors Math I,One,4.0
4,CS-LSA major declaration Cal II,21318,MATH 121,Exam Calc Credit II,One,4.0
...,...,...,...,...,...,...
87,Computer Science Capstone,45440,EECS 443,Senior Thesis,One,3.0
88,Computer Science Capstone,44359,EECS 441,Mbl App Dev Entrprnr,One,4.0
89,Computer Science Capstone,45917,EECS 473,Adv Embedded Sys,One,4.0
90,Computer Science Capstone,48402,EECS 495,Software for Access,One,4.0


In [285]:
cs_df = cs_df.set_index('Course ID/ Wildcard', drop=False)

In [292]:
req_dd = defaultdict(lambda: {'ids': []})

In [293]:
for course_id, row in cs_df.iterrows():
    desc = row['Course List Description']
    req = row['Require']
    req_dd[desc]['req'] = req
    req_dd[desc]['ids'].append(course_id)

In [301]:
req_d = EasyDict(req_dd)

In [303]:
req_d

{'CS-LSA major declaration Cal I': {'ids': ['1794', '21317', '5723', '5725'],
  'req': 'One'},
 'CS-LSA major declaration Cal II': {'ids': ['21318', '20037', '5724', '1795'],
  'req': 'Must'},
 'CS-LSA major declaration ': {'ids': ['12118', '12112'], 'req': 'Must'},
 'Core Probability': {'ids': ['6305',
   '21016',
   '1957',
   '9079',
   '3760',
   '45401',
   '4093',
   '44142',
   '5045'],
  'req': 'One'},
 'Core CS': {'ids': ['4297', '12126', '4323'], 'req': 'All'},
 'Upper Level CS Technical Electives (ULCS) ': {'ids': ['48880',
   '12127',
   '39380',
   '40784',
   '14742',
   '14743',
   '4322',
   '40794',
   '16836',
   '4324',
   '40795',
   '10311',
   '12156',
   '12836',
   '4327',
   '22432',
   '12157',
   '15869',
   '41262',
   '12158',
   '13862',
   '20371',
   '12159',
   '12172',
   '14747',
   '14748',
   '21033',
   ' ',
   '12187',
   ' ',
   ' ',
   ' ',
   '12189',
   '17276',
   '20372',
   '4341',
   '39779',
   '20373',
   '12841',
   '41098',
   '12842',

In [304]:
rng = np.random.default_rng(seed=1)

def gen_student(n_credits=60):
    total_credits = 0
    courses_taken = []
    
    deg_keys = list(req_d.keys())
    deg_keys_taken = rng.choice(deg_keys)
    
    while total_credits < n_credits:
        deg_key = rng.choice(deg_keys, size=1, replacement=False)
        

IndentationError: expected an indented block (689438572.py, line 8)

In [305]:
rng = np.random.default_rng(seed=1)
rng.choice()