## course_info.ipynb - Gather course information from the UR website and convert it to JSON or Excel (.xlsx) files.

This notebook uses the `requests` package to fetch HTML from the UR website. It then uses BeautifulSoup to parse the HTML text and regular expressions to match the text patterns.

Finally, the extracted details are converted to JSON or Excel files.

The following URL is about course details: 
https://banner.uregina.ca:17023/ssbprod/bwckctlg.p_disp_dyn_ctlg

### Part 1 - Functions for getting the HTML from the UR website

In [1]:
import requests
from bs4 import BeautifulSoup
import bs4
import re
import xlwt
import pandas as pd
import os
import copy
!pip install ipynb
from ipynb.fs.full.Param_SETUP import *

All parameter are set up


In [2]:
def getHTMLText(url, params = None):
    """Fetch a page and return its HTML text.

    Sends a GET request to ``url`` with a browser-like user agent and a
    30-second timeout.

    Args:
        url: Web URL to request.
        params: Optional query-string parameters, as a dictionary.

    Returns:
        The response body as text, decoded with the encoding detected from
        the content. If the request fails (connection problem, timeout,
        invalid URL or an HTTP error status), the string
        "Connection error" is returned instead.

    Raises:
        Request failures are not raised; they are reported through the
        "Connection error" return value.
    """
    kv = {'user-agent': 'Mozilla/5.0'}
    try:
        r = requests.get(url, timeout=30, headers=kv, params=params)
        r.raise_for_status()  # Turn 4xx/5xx responses into an exception
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        # Only request-related failures are converted to the sentinel string;
        # KeyboardInterrupt and programming errors are no longer swallowed.
        return "Connection error"

# detail_url = "https://banner.uregina.ca:17023/ssbprod/bwckctlg.p_disp_course_detail"
# detail_para = {'cat_term_in':'202110',
#         'subj_code_in':'ENSE',
#         'crse_numb_in':'350'}
# html = getHTMLText(detail_url,params=detail_para)
# html

In [3]:
# Eliminate '*' characters and surrounding whitespace (such as '\n') from the text
def eliminate(text):
    """Strip asterisks and surrounding whitespace from a string.

    Every '*' in the text is removed (not only leading or trailing ones),
    then leading and trailing whitespace such as spaces and newlines is
    stripped.

    Args:
        text: The string to clean up.

    Returns:
        The cleaned string.
    """
    return text.replace('*', '').strip()

In [4]:
def parsePage(detail_dict, html):
    """Parse a course catalogue page and append its courses to detail_dict.

    Each course is read from a pair of table cells: a ``td.nttitle`` cell
    holding "<short name> - <title>" and a ``td.ntdefault`` cell holding the
    description, an optional "***Prerequisite...: ...***" note and the
    credit hours.

    Args:
        detail_dict: Dictionary of lists with the keys "short_name",
            "title", "faculty", "course_num", "credit", "prerequisite" and
            "description". One value is appended to every list per course
            that parses successfully.
        html: The HTML text to parse.

    Returns:
        No returns; detail_dict is modified in place.

    Raises:
        Nothing per course: a course that cannot be parsed is reported with
        print() and skipped, and nothing is appended for it, so the lists
        stay the same length.
    """
    soup = BeautifulSoup(html, "html.parser") # Make the soup for the html

    # Find the course name and information tags
    name_tags = soup.find_all('td', class_='nttitle')
    info_tags = soup.find_all('td', class_='ntdefault')

    for name_tag, info_tag in zip(name_tags, info_tags):
        # Fallback label so the error report below never uses an unbound
        # name or the short name of the previous course.
        short_name = '<unknown course>'
        try:
            # Eg. 'ENSE 271 - People-Centred Design'
            #     -> ['ENSE 271', 'People-Centred Design']
            name_parts = name_tag.string.split(' - ')
            short_name = name_parts[0]

            # Re-join with the separator so a title that itself contains
            # ' - ' is kept intact; None when the cell has no title part.
            title = ' - '.join(name_parts[1:]) if len(name_parts) > 1 else None

            faculty, course_num = short_name.split(' ') # Eg. ENSE 271

            info_text = info_tag.text
            prere_matches = re.findall(r'\*{3}.*\*{3}', info_text)
            if not prere_matches:
                # No ***...*** note: the description is the first line
                # fragment that ends with a period.
                prerequisite = ''
                description = re.findall(r'.*\.', info_text)[0]
            else:
                # Eg. ***Prerequisites: CS 115*** -> CS 115
                prerequisite = eliminate(prere_matches[0]).split(': ')[1]
                # The description is everything before the first '***'
                description_text = re.findall(r'[^\*]*\*{3}', info_text)[0]
                description = eliminate(description_text)
            credit_text = re.findall(r'\d\.0{3} Credit hours', info_text)[0]
            credit = credit_text.split('.')[0]

            # Append only after every field parsed, so the lists stay aligned
            detail_dict["short_name"].append(short_name)
            detail_dict["title"].append(title)
            detail_dict["faculty"].append(faculty)
            detail_dict["course_num"].append(course_num)
            detail_dict["credit"].append(credit)
            detail_dict["prerequisite"].append(prerequisite)
            detail_dict["description"].append(description)
        except Exception as e:
            print(short_name, "get errors")
            print(e)

# Example of the function:
# detail_dict = {
#         "short_name" : [],
#         "title" : [],
#         "faculty" : [],
#         "course_num" : [],
#         "credit" : [],
#         "prerequisite" : [],
#         "description" : []
#     }

# parsePage(detail_dict, open('Course_info.html').read())
# detail_dict["description"]

In [5]:
def storeInfo(detail_dict, file_name="Course DB"):
    """Write a dictionary of columns to an Excel workbook.

    The dictionary is turned into a pandas DataFrame and saved as
    "<file_name>.xlsx" in the current working directory, without the index
    column.

    Args:
        detail_dict: Dictionary of equally long lists, one list per column.
        file_name: Name of the output workbook, without the extension.

    Returns:
        No returns
    """
    # Current working directory + file name + extension
    target = '/'.join((os.getcwd(), file_name)) + '.xlsx'
    pd.DataFrame(detail_dict).to_excel(target, index=False)
    # print(list(detail_dict.keys()))

# storeInfo(detail_dict)

In [14]:

def crseNumLoop(url, detail_para, course_num_list, detail_dict, parse_method=parsePage):
    """Fetch and parse the page of every course number in a list.

    For each course number, the third key of detail_para is set to that
    number, the page is downloaded with getHTMLText and handed to
    parse_method, which appends what it finds to detail_dict.

    Args:
        url: UoR url to request.
        detail_para: The url parameters as a dictionary. Its third key (in
            insertion order) is overwritten with each course number.
        course_num_list: The list of course numbers.
        detail_dict: The dictionary that the parsed details are appended to.
        parse_method: Callable taking (detail_dict, html); defaults to
            parsePage.

    Returns:
        No returns

    Raises:
        Nothing from parse_method: its exceptions are printed together with
        the current parameters and the loop continues.
    """
    total = len(course_num_list)
    num_key = list(detail_para)[2]
    for count, course_num in enumerate(course_num_list, start=1):
        detail_para[num_key] = course_num
        html = getHTMLText(url, detail_para)
        try:
            parse_method(detail_dict, html)
        except Exception as e:
            # Report the failing request and keep going with the next course
            print("Error at ", detail_para)
            print(e)

        print("Course number Progress: ",count, "/",total, " ",course_num)
    # print(detail_dict)
    

In [7]:
def subjCodeLoop(url, detail_para, subjNum_dict, detail_dict, parse_method=parsePage):
    """Run crseNumLoop for every subject code.

    For each subject code, the second key of detail_para is set to that
    code and crseNumLoop is called with the subject's course numbers.

    Args:
        url: UoR url to request.
        detail_para: The url parameters as a dictionary. Its second key (in
            insertion order) is overwritten with each subject code.
        subjNum_dict: Dictionary mapping a subject code to its list of
            course numbers.
        detail_dict: The dictionary that the parsed details are appended to.
        parse_method: Callable taking (detail_dict, html), passed on to
            crseNumLoop; defaults to parsePage.

    Returns:
        No returns
    """
    total = len(subjNum_dict)
    subj = list(detail_para)[1]
    for count, (subj_code, crse_num_list) in enumerate(subjNum_dict.items(), start=1):
        print("Subject code loop :", subj_code, " ", count, "/", total)
        detail_para[subj] = subj_code
        crseNumLoop(url, detail_para, crse_num_list, detail_dict, parse_method=parse_method)
    

Store the course information of the software program (and the other programs) into tables

In [8]:
# One independent deep copy of TempCou per course table, so appending to one
# table never touches another (TempCou comes from Param_SETUP - presumably
# the empty course-detail template; confirm there).
(
    SSE_course,
    PSE_course,
    ISE_course,
    EVSE_course,
    ESE_course,
    ALL_course,
) = (copy.deepcopy(TempCou) for _ in range(6))

cou_dict_list = [SSE_course, PSE_course, ISE_course, EVSE_course, ESE_course, ALL_course]

In [9]:
# Crawl the course details of every subject/number dictionary into its own table
for subj_num_dict, course_table in (
    (software_dict, SSE_course),
    (petroleum_dict, PSE_course),
    (industrial_dict, ISE_course),
    (environmental_dict, EVSE_course),
    (electronic_dict, ESE_course),
    (allCourse_dict, ALL_course),
):
    subjCodeLoop(detail_url, detail_para, subj_num_dict, course_table)

Subject code loop : CHEM   1 / 12
Course number Progress:  1 / 1   104
Subject code loop : ENGG   2 / 12
Course number Progress:  1 / 5   123
Course number Progress:  2 / 5   140
Course number Progress:  3 / 5   100
Course number Progress:  4 / 5   303
Course number Progress:  5 / 5   401
Subject code loop : MATH   3 / 12
Course number Progress:  1 / 4   110
Course number Progress:  2 / 4   122
Course number Progress:  3 / 4   111
Course number Progress:  4 / 4   213
Subject code loop : CS   4 / 12
Course number Progress:  1 / 12   110
Course number Progress:  2 / 12   115
Course number Progress:  3 / 12   210
Course number Progress:  4 / 12   215
Course number Progress:  5 / 12   340
Course number Progress:  6 / 12   205
Course number Progress:  7 / 12   315
Course number Progress:  8 / 12   330
Course number Progress:  9 / 12   375
Course number Progress:  10 / 12   405
Course number Progress:  11 / 12   425
Course number Progress:  12 / 12   427
Subject code loop : ENGL   5 / 12
Cou

In [43]:
# SSE_file = 'software course'
# PSE_file = 'petroleum course'
# ISE_file = 'industrial course'
# EVSE_file = 'environmental course'
# ESE_file = 'electronic course'
# AllCourse_file = 'all course'

# Save every course table to its own Excel workbook
for course_table, workbook_name in (
    (SSE_course, SSE_file),
    (PSE_course, PSE_file),
    (ISE_course, ISE_file),
    (EVSE_course, EVSE_file),
    (ESE_course, ESE_file),
    (ALL_course, AllCourse_file),
):
    storeInfo(course_table, workbook_name)

In [16]:
def parseSecPage(sec_dict,html):
    """Parse a class-schedule (section) page and append its rows to sec_dict.

    The page is expected to contain one 'datadisplaytable' layout table in
    which every section occupies four child nodes: a header row, a newline,
    a detail row with the meeting-time table, and another newline. Every
    row of a meeting-time table becomes one entry in sec_dict.

    Args:
        sec_dict: Dictionary of lists with the keys 'name', 'course_code',
            'title', 'section_num', 'term', 'type', 'time', 'days', 'loc',
            'date_range', 'schedule_type' and 'instructors'. Values are
            appended in place.
        html: The HTML text to parse.

    Returns:
        No returns. If the page reports that no classes were found, a
        message is printed and nothing is appended.

    Raises:
        IndexError: If the layout table is missing or a section header has
            fewer ' - ' separated parts than expected.
    """
    soup = BeautifulSoup(html, "html.parser") # Make the soup for the html

    # Check whether the class is offered in this semester at all
    if re.search(r'No classes were found that meet your search criteria', soup.text):
        print("The class is not available for this semester")
        return

    tables = soup.find_all('table',{'class':'datadisplaytable','summary':'This layout table is used to present the sections found'})[0]
    table_list = list(tables.children)[2:] # First two elements are not needed for parsing the section page

    table_feature = ['type','time','days','loc','date_range','schedule_type','instructors']
    # Loop four elements as a group: the first is the header, the third is
    # the detail row, the others are '\n'
    for title_tag,time_table in zip(table_list[0::4],table_list[2::4]):

        # Split the header text on ' - '. When the second part is not all
        # digits it is treated as a continuation of the name, and the
        # remaining parts shift by one.
        extra_index = 0
        titles = title_tag.text.split(' - ')
        name = eliminate(titles[0])
        if not eliminate(titles[1]).isdigit():
            name = name + '-' + eliminate(titles[1])
            extra_index = 1
        course_code = eliminate(titles[1 + extra_index])
        title = eliminate(titles[2 + extra_index])
        section_num = eliminate(titles[3+ extra_index])

        # Extract the term. Reset it for every section so a section without
        # an "Associated Term:" line gets None instead of raising NameError
        # or inheriting the term of the previous section.
        term = None
        term_text = re.findall(r'Associated\ Term:.*', time_table.text)
        if term_text:
            term = eliminate(term_text[0].split(':')[-1])

        # Extract the meeting-time cells; each run of len(table_feature)
        # cells is one row of the meeting-time table (the first 'dddefault'
        # cell is skipped).
        cells = time_table.find_all('td',class_='dddefault')[1:]
        for index, cell in enumerate(cells):
            feat_index = index % len(table_feature)
            if feat_index == 0:
                # Start of a new row: repeat the section-level fields
                sec_dict['name'].append(name)
                sec_dict['course_code'].append(course_code)
                sec_dict['title'].append(title)
                sec_dict['section_num'].append(section_num)
                sec_dict['term'].append(term)
            feature = table_feature[feat_index]
            sec_dict[feature].append(cell.text)


Parse the course time information page for 202110

In [17]:
# term_in = '202110'
# section_dict_202110={
#     'name':[],
#     'course_code':[],
#     'title':[],
#     'section_num':[],
#     'term':[],
#     'type':[],
#     'time':[],
#     'days':[],
#     'loc':[],
#     'date_range':[],
#     'schedule_type':[],
#     'instructors':[]
# }

# One independent deep copy of TempSec per term (TempSec comes from
# Param_SETUP - presumably the empty section template; confirm there).
sec_202110, sec_202030, sec_202020 = (copy.deepcopy(TempSec) for _ in range(3))
sec_list = [sec_202110, sec_202030, sec_202020]

# Crawl the section pages of every term and save each term to its own workbook
for section, term_in in zip(sec_list, term_ins):
    section_para['term_in'] = term_in
    subjCodeLoop(
        section_url,
        section_para,
        allCourse_dict,
        section,
        parse_method=parseSecPage,
    )
    storeInfo(section, term_in)
# file_name = "setion 202110"
# storeInfo(sec_202110, file_name)

able for this semester
Course number Progress:  6 / 15   205
The class is not available for this semester
Course number Progress:  7 / 15   315
Course number Progress:  8 / 15   330
Course number Progress:  9 / 15   375
Course number Progress:  10 / 15   405
The class is not available for this semester
Course number Progress:  11 / 15   425
The class is not available for this semester
Course number Progress:  12 / 15   427
Course number Progress:  13 / 15   335
Course number Progress:  14 / 15   350
Course number Progress:  15 / 15   372
Subject code loop : ENGL   5 / 17
Course number Progress:  1 / 1   100
Subject code loop : PHYS   6 / 17
Course number Progress:  1 / 3   119
Course number Progress:  2 / 3   112
Course number Progress:  3 / 3   201
Subject code loop : ENEL   7 / 17
Course number Progress:  1 / 21   280
The class is not available for this semester
Course number Progress:  2 / 21   281
The class is not available for this semester
Course number Progress:  3 / 21   282
Co

In [22]:
import json

# Empty section table: one separate list per column
section_dict_202030 = {
    column: []
    for column in (
        'name',
        'course_code',
        'title',
        'section_num',
        'term',
        'type',
        'time',
        'days',
        'loc',
        'date_range',
        'schedule_type',
        'instructors',
    )
}

# Empty course table: one separate list per column
software_course = {
    column: []
    for column in (
        "short_name",
        "title",
        "faculty",
        "course_num",
        "credit",
        "prerequisite",
        "description",
    )
}

In [23]:
def course_JSON(course_dict, section_dict, file_loc):
    """Write one JSON file per course, combining course and section data.

    For every short name in course_dict, the matching sections and term are
    looked up with exctSectionTerm, the remaining fields are copied from
    course_dict, and the record is written with genJSON to
    "<file_loc><short name>.json".

    Args:
        course_dict: Dictionary of lists holding the course details; must
            provide "short_name", "title", "faculty", "credit",
            "description" and "prerequisite".
        section_dict: Dictionary of lists holding the section rows, as
            expected by exctSectionTerm.
        file_loc: Prefix (e.g. a directory ending in '/') for the output
            file names.

    Returns:
        No returns
    """
    # Fixed key order, so every JSON file lists its fields the same way
    course_json = dict.fromkeys((
        "term",
        "short_name",
        "title",
        "faculty",
        "credit",
        "description",
        "prerequisite",
        'section',
    ))
    course_json['section'] = []

    # Fields copied straight from course_dict; 'section' and 'term' come
    # from the section table instead
    copied_keys = [key for key in course_json if key not in ('section', 'term')]

    for index, short_name in enumerate(course_dict["short_name"]):
        sections, term = exctSectionTerm(short_name, section_dict)
        course_json['section'] = sections
        course_json['term'] = term
        for key in copied_keys:
            course_json[key] = course_dict[key][index]
        genJSON(course_json, file_loc + short_name + '.json')

In [24]:
def exctSectionTerm(short_name, section_dict):
    """Collect the sections and the term of one course from the section table.

    Rows are matched by comparing section_dict['title'] with short_name.
    Rows whose schedule type is 'Examination' do not become sections of
    their own; instead, the exam row that directly follows a 'Lecture' row
    supplies that lecture's exam days, date and time.

    Args:
        short_name: The value to look for in section_dict['title'].
        section_dict: Dictionary of equally long lists with the keys
            'title', 'course_code', 'section_num', 'instructors', 'days',
            'time', 'loc', 'date_range', 'schedule_type' and 'term'.

    Returns:
        A tuple (section_list, term). section_list holds one dictionary per
        non-exam row; term is taken from the first matching row. When no
        row matches, ([], 'No class for the term') is returned.
    """
    section_list = []
    sect_short_list = section_dict['title']
    sect_indexs = [i for i, sect_short_name in enumerate(sect_short_list) if sect_short_name == short_name]

    # In case this semester has no sections for the class
    if not sect_indexs:
        return [], 'No class for the term'

    schedule_types = section_dict['schedule_type']
    for sect_index in sect_indexs:
        if schedule_types[sect_index] == 'Examination':
            continue

        section = {
            'course_ID': section_dict['course_code'][sect_index],
            'section_num': section_dict['section_num'][sect_index],
            'instructors': section_dict['instructors'][sect_index],
            'days': section_dict['days'][sect_index],
            'time': section_dict['time'][sect_index],
            'loc': section_dict['loc'][sect_index],
            'course_type': schedule_types[sect_index],
            'exam_days': None,
            'exam_date': None,
            'exam_time': None
        }

        # Only use the following row as the exam when it exists and really
        # is an exam row; otherwise a lecture in the last row would raise
        # IndexError and a lecture followed by e.g. a lab would get that
        # lab's schedule as its exam.
        exam_index = sect_index + 1
        if (
            section['course_type'] == 'Lecture'
            and exam_index < len(schedule_types)
            and schedule_types[exam_index] == 'Examination'
        ):
            section['exam_days'] = section_dict['days'][exam_index]
            section['exam_date'] = section_dict['date_range'][exam_index]
            section['exam_time'] = section_dict['time'][exam_index]
        section_list.append(section)

    term = section_dict['term'][sect_indexs[0]]
    return section_list, term


In [25]:
def genJSON(course_json, file_loc):
    """Serialize an object to JSON and write it to a file.

    Args:
        course_json: The JSON-serializable object to write.
        file_loc: Path of the output file; an existing file is overwritten.

    Returns:
        No returns
    """
    # Serialize first, so a failure here never leaves a truncated file behind
    payload = json.dumps(course_json)
    with open(file_loc, 'w') as out_file:
        out_file.write(payload)

In [26]:
# One output directory prefix per term, e.g. '202030/'
file_locs = [term + '/' for term in term_ins]
# file_loc = 'JSON/202030/'

# Write the per-course JSON files of every term
for file_loc, section in zip(file_locs, sec_list):
    course_JSON(ALL_course, section, file_loc)
# course_JSON(software_course,section_dict_202030, file_loc)

# Printed once after the loop, so it shows the last directory only
print(file_loc, "Finished")

202020/ Finished


In [27]:
def faculty_JSON(course_dict, file_loc, faculty_name):
    """Write all courses of one faculty to a single JSON file.

    Builds one record per entry of course_dict["short_name"] and writes the
    list of records with genJSON to "<file_loc><faculty_name>.json".

    Args:
        course_dict: Dictionary of lists holding the course details; must
            provide "short_name", "title", "faculty", "credit",
            "description" and "prerequisite".
        file_loc: Prefix (e.g. a directory ending in '/') for the output
            file name.
        faculty_name: Name of the output file, without the extension.

    Returns:
        No returns
    """
    fields = (
        "short_name",
        "title",
        "faculty",
        "credit",
        "description",
        "prerequisite",
    )
    # A fresh dictionary per course, with the fields in a fixed order
    json_list = [
        {field: course_dict[field][index] for field in fields}
        for index in range(len(course_dict["short_name"]))
    ]
    genJSON(json_list, file_loc + faculty_name + '.json')

In [29]:
# Write the faculty files next to the notebook (empty prefix)
file_loc = ''

# zip() stops at the shorter list, so warn when a table or a name would be skipped
if len(cou_dict_list) != len(fau_names):
    print('Two list have diff length')

for cou_dict, faculty in zip(cou_dict_list, fau_names):
    faculty_JSON(cou_dict, file_loc, faculty)
# faculty_JSON(software_course, file_loc, 'software')