## course_info.ipynb - Gathering the course information from the UR website and convert to the JSON or xlwt file.

In this file, using function in the requests packet to get html from the UR website. Then, using the methods in BeautifulSoup handle html text and match the text pattern by the regular expression.

Last, Converting the details to the JSON or xlwt files.

The following URL is about course details: 
https://banner.uregina.ca:17023/ssbprod/bwckctlg.p_disp_dyn_ctlg

### Part 1 - Functions about the get the html from the UR website

In [1]:
import requests
from bs4 import BeautifulSoup
import bs4
import re
import xlwt
import pandas as pd

In [2]:
def getHTMLText(url, params = None):
    """Get HTML Text 

    Geting the HTML text form the url with params

    Args:
        url: web url
        detail_para: The parameters for the url as dirctory format.

    Returns:
        return the html text

    Raises:
        IOError: Error message will print out
    """
    try:
        kv = {'user-agent': 'Mozilla/5.0'}
        r = requests.get(url, timeout=30, headers=kv, params = params)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except:
        return "Connection error"

# detail_url = "https://banner.uregina.ca:17023/ssbprod/bwckctlg.p_disp_course_detail"
# detail_para = {'cat_term_in':'202110',
#         'subj_code_in':'ENSE',
#         'crse_numb_in':'350'}
# html = getHTMLText(detail_url,params=detail_para)
# html

In [3]:
# elimiate '\n' and '*' in the text 
def eliminate(text):
    """elimiate the space, \n and *

    Remove the front and end space, \n and *

    Args:
        text: text that need to be modified

    Returns:
        return the modified text

    Raises:
        IOError:
    """
    # Remove the *
    text = text.replace('*','')

    # Remove the \n and space
    text = text.strip()
    
    return text

In [34]:
def parsePage(detail_dict, html):
    """Parsing the page

    Parsing the UoR course infomation page, and append the detail into
    the detail_dict

    Args:
        detail_para: The parameters for the url as dirctory format.
        html: the html text that need to be parsed

    Returns:
        No returns

    Raises:
        IOError: 
    """
    soup = BeautifulSoup(html, "html.parser") # Make the soup for the html

    # Find the course name and information tag
    name_tags = soup.find_all('td',class_='nttitle')
    info_tags =soup.find_all('td', class_='ntdefault')

    for name_tag, info_tag in zip(name_tags, info_tags):
        try:
            # Get the string for facutly, course number and full name
            short_name = name_tag.string.split(' - ')[0] # Eg. ['ENSE 271', 'People-Centred Design']

            title = None 
            for part in name_tag.string.split(' - ')[1:]:
                if title == None: title = part
                else: title += part
            
            faculty, course_num = short_name.split(' ') # Eg. ENSE 271

            if len(re.findall(r'\*{3}.*\*{3}', info_tag.text)) == 0:
                prerequisite = ''
                description = re.findall(r'.*\.', info_tag.text)[0]
            else:
                # Get the string of course description and prerequisit
                prere_text = re.findall(r'\*{3}.*\*{3}', info_tag.text)[0] # Eg. ***Prerequisites: CS 115***
                prerequisite = eliminate(prere_text).split(': ')[1] # Eg. CS 115
                description_text = re.findall(r'[^\*]*\*{3}', info_tag.text)[0]
                description = eliminate(description_text)
            credit_text = re.findall(r'\d\.0{3} Credit hours',info_tag.text)[0]
            credit = credit_text.split('.')[0]

            # Store all the detail into the direction
            detail_dict["short_name"].append(short_name)
            detail_dict["title"].append(title)
            detail_dict["faculty"].append(faculty)
            detail_dict["course_num"].append(course_num)
            detail_dict["credit"].append(credit)
            detail_dict["prerequisite"].append(prerequisite)
            detail_dict["description"].append(description)
        except Exception as e:
            print(short_name, "get errors")
            print(e)

# Example of the funciton:
# detail_dict = {
#         "short_name" : [],
#         "title" : [],
#         "faculty" : [],
#         "course_num" : [],
#         "credit" : [],
#         "prerequisite" : [],
#         "description" : []
#     }

# parsePage(detail_dict, open('Course_info.html').read())
# detail_dict["description"]

In [35]:
def storeInfo(detail_dict, file_name="Course DB"):
    """Store the course infomation as the excel 

    Convert the course detail dictionary to the excel

    Args:
        detail_para: The parameters for the url as dirctory format.
        file_name: The name of the output excel's name

    Returns:
        No returns

    Raises:
        IOError:
    """
    file_path = os.getcwd()+'/' + file_name +'.xlsx' # Get the file's path and name
    df = pd.DataFrame(detail_dict)
    df.to_excel(file_path, index=False)
    # print(list(detail_dict.keys()))

# storeInfo(detail_dict)

In [36]:

def crseNumLoop(url, detail_para, course_num_list, detail_dict, parse_method=parsePage):
    """Course number loop

    Go to loop the course number in the list under the same fauctly,
    and, store course infomation in the detail_dict with appending method

    Args:
        url: UoR url for this funciton.
        detail_para: The parameters for the url as dirctory format.
        course_num_list: The list of course number.
        detail_dict: The dirctory that course detail will be stored in.

    Returns:
        No returns

    Raises:
        IOError: Error course number will print out
    """
    count = 0
    total = len(course_num_list)
    num_key = list(detail_para.keys())[2]
    for course_num in course_num_list:
        detail_para[num_key] = course_num
        html = getHTMLText(url, detail_para)
        if 1 :
            parse_method(detail_dict, html)
        # except Exception as e:
        #     print("Error at ", detail_para)
        #     print(e)
        #     continue
        count += 1
        print("Course number Progress: ",count, "/",total, " ",course_num)
    # print(detail_dict)
    

In [37]:
def subjCodeLoop(url, detail_para, subjNum_dict, detail_dict, parse_method=parsePage):
    """subject code loop

    Loop the subject code with the course number list

    Args:
        url: UoR url for this funciton.
        detail_para: The parameters for the url as dirctory format.
        course_num_list: The list of course number.
        subj_code_list: The list of the subject code.
        detail_dict: The dirctory that course detail will be stored in.

    Returns:
        No returns

    Raises:
        IOError: Error course number will print out
    """
    count = 0
    total = len(subjNum_dict)
    subj = list(detail_para.keys())[1]
    for subj_code, crse_num_list in subjNum_dict.items():
        count += 1
        print("Subject code loop :", subj_code, " ", count, "/", total)
        detail_para[subj] = subj_code
        crseNumLoop(url, detail_para, crse_num_list, detail_dict,parse_method=parse_method)
    

Store the facult of software information into table

In [39]:
detail_url = "https://banner.uregina.ca:17023/ssbprod/bwckctlg.p_disp_course_detail"
detail_para = {'cat_term_in':'202110',
        'subj_code_in':'ENSE',
        'crse_numb_in':'400'}
software_dict ={
    'CHEM':['104'],
    'ENGG':['123','140','100','303','410'],
    'MATH':['110','122','111','213'],
    'CS':['110','115','210','215','340','205','315','330','350','375','405','425','427'],
    'ENGL':['100'],
    'PHYS':['119'],
    'ENEL':['281','282','384','380','387','487','489'],
    'STAT':['289'],
    'ENSE':['352','374','353','470','471','475','400','472','350','477','473','474','479','480','481','482','483'],
    'BUS':['260'],
    'ECON':['201']
}
software_course = {
        "short_name" : [],
        "title" : [],
        "faculty" : [],
        "course_num" : [],
        "credit" : [],
        "prerequisite" : [],
        "description" : []
    }
test_software_dict={
    'ENGG':['123']
}
test_file_name = 'test'
file_name = "software course"
subjCodeLoop(detail_url,detail_para,software_dict,software_course)
storeInfo(software_course, file_name)

Subject code loop : CHEM   1 / 11
Course number Progress:  1 / 1   104
Subject code loop : ENGG   2 / 11
Course number Progress:  1 / 5   123
Course number Progress:  2 / 5   140
Course number Progress:  3 / 5   100
Course number Progress:  4 / 5   303
Course number Progress:  5 / 5   410
Subject code loop : MATH   3 / 11
Course number Progress:  1 / 4   110
Course number Progress:  2 / 4   122
Course number Progress:  3 / 4   111
Course number Progress:  4 / 4   213
Subject code loop : CS   4 / 11
Course number Progress:  1 / 13   110
Course number Progress:  2 / 13   115
Course number Progress:  3 / 13   210
Course number Progress:  4 / 13   215
Course number Progress:  5 / 13   340
Course number Progress:  6 / 13   205
Course number Progress:  7 / 13   315
Course number Progress:  8 / 13   330
Course number Progress:  9 / 13   350
Course number Progress:  10 / 13   375
Course number Progress:  11 / 13   405
Course number Progress:  12 / 13   425
Course number Progress:  13 / 13   42

In [9]:
def parseSecPage(sec_dict,html):
    soup = BeautifulSoup(html, "html.parser") # Make the soup for the html

    # Check the class if in the this semester
    result = re.findall(r'No classes were found that meet your search criteria',soup.text)
    if len(result)==1: 
        print("The class is not available for this semester")
        return
        # raise Exception('The class is not available for this semester')

    tables = soup.find_all('table',{'class':'datadisplaytable','summary':'This layout table is used to present the sections found'})[0]
    table_list = list(tables.children)[2:] # First two elements is not needed for the parsing the section page
    
    table_feature = ['type','time','days','loc','date_range','schedule_type','instructors']
    # Looping four elements as a group, picking first and third one as title and time, other is '/n'
    for title_tag,time_table in zip(table_list[0::4],table_list[2::4]):

        # Extract titles in the page
        titles = title_tag.text.split(' - ')
        name = eliminate(titles[0])
        course_code = eliminate(titles[1])
        title = eliminate(titles[2])
        section_num = eliminate(titles[3])

        # Extract term info
        term_text = re.findall(r'Associated\ Term:.*', time_table.text)
        if len(term_text):
            term_text = term_text[0]
            term = term_text.split(':')[-1]
            term = eliminate(term)

        # Extract time info for the lecture, exam and lab from each table
        cells = time_table.find_all('td',class_='dddefault')[1:]
        for index, cell in enumerate(cells):
            feat_index = index % len(table_feature)
            if feat_index == 0: 
                sec_dict['name'].append(name)
                sec_dict['course_code'].append(course_code)
                sec_dict['title'].append(title)
                sec_dict['section_num'].append(section_num)
                sec_dict['term'].append(term)
            feature = table_feature[feat_index]
            sec_dict[feature].append(cell.text)


Parse the course time information page for 202110

In [10]:
section_url = 'https://banner.uregina.ca:17023/ssbprod/bwckctlg.p_disp_listcrse'
section_para ={
    'term_in':'202110',
    'subj_in':'ENSE',
    'crse_in':'271',
    'schd_in':''
}
section_dict_202110={
    'name':[],
    'course_code':[],
    'title':[],
    'section_num':[],
    'term':[],
    'type':[],
    'time':[],
    'days':[],
    'loc':[],
    'date_range':[],
    'schedule_type':[],
    'instructors':[]
}
subjCodeLoop(section_url, section_para, software_dict, section_dict_202110, parse_method=parseSecPage)
file_name = "software 202110"
storeInfo(section_dict_202110, file_name)

Subject code loop : CHEM   1 / 11
Course number Progress:  1 / 1   104
Subject code loop : ENGG   2 / 11
Course number Progress:  1 / 5   123
The class is not available for this semester
Course number Progress:  2 / 5   140
Course number Progress:  3 / 5   100
The class is not available for this semester
Course number Progress:  4 / 5   303
The class is not available for this semester
Course number Progress:  5 / 5   410
Subject code loop : MATH   3 / 11
Course number Progress:  1 / 4   110
Course number Progress:  2 / 4   122
Course number Progress:  3 / 4   111
Course number Progress:  4 / 4   213
Subject code loop : CS   4 / 11
Course number Progress:  1 / 13   110
Course number Progress:  2 / 13   115
Course number Progress:  3 / 13   210
Course number Progress:  4 / 13   215
Course number Progress:  5 / 13   340
Course number Progress:  6 / 13   205
Course number Progress:  7 / 13   315
Course number Progress:  8 / 13   330
Course number Progress:  9 / 13   350
The class is not av

In [11]:
section_url = 'https://banner.uregina.ca:17023/ssbprod/bwckctlg.p_disp_listcrse'
section_para ={
    'term_in':'202030',
    'subj_in':'ENSE',
    'crse_in':'271',
    'schd_in':''
}
section_dict_202030={
    'name':[],
    'course_code':[],
    'title':[],
    'section_num':[],
    'term':[],
    'type':[],
    'time':[],
    'days':[],
    'loc':[],
    'date_range':[],
    'schedule_type':[],
    'instructors':[]
}
subjCodeLoop(section_url, section_para, software_dict, section_dict_202030, parse_method=parseSecPage)
file_name = "software 202030"
storeInfo(section_dict_202030, file_name)

Subject code loop : CHEM   1 / 11
Course number Progress:  1 / 1   104
Subject code loop : ENGG   2 / 11
Course number Progress:  1 / 5   123
Course number Progress:  2 / 5   140
Course number Progress:  3 / 5   100
Course number Progress:  4 / 5   303
The class is not available for this semester
Course number Progress:  5 / 5   410
Subject code loop : MATH   3 / 11
Course number Progress:  1 / 4   110
Course number Progress:  2 / 4   122
Course number Progress:  3 / 4   111
Course number Progress:  4 / 4   213
Subject code loop : CS   4 / 11
Course number Progress:  1 / 13   110
Course number Progress:  2 / 13   115
Course number Progress:  3 / 13   210
Course number Progress:  4 / 13   215
Course number Progress:  5 / 13   340
The class is not available for this semester
Course number Progress:  6 / 13   205
The class is not available for this semester
Course number Progress:  7 / 13   315
Course number Progress:  8 / 13   330
Course number Progress:  9 / 13   350
Course number Progr

In [19]:
import json
import copy

section_dict_202030={
    'name':[],
    'course_code':[],
    'title':[],
    'section_num':[],
    'term':[],
    'type':[],
    'time':[],
    'days':[],
    'loc':[],
    'date_range':[],
    'schedule_type':[],
    'instructors':[]
}
software_course = {
        "short_name" : [],
        "title" : [],
        "faculty" : [],
        "course_num" : [],
        "credit" : [],
        "prerequisite" : [],
        "description" : []
    }

In [13]:
def course_JSON(course_dict, section_dict, file_loc):
    course_json = {
        "term": None,
        "short_name" : None,
        "title" : None,
        "faculty" : None,
        "credit" : None,
        "description" : None,
        "prerequisite" : None,
        'section':[]
    }

    short_name_list = course_dict["short_name"]
    for index, short_name in enumerate(short_name_list):
        # print(index, short_name)
        course_json['section'], course_json['term'] = exctSectionTerm(short_name, section_dict)
        # course_json['term'] = exctTerm(short_name, section_dict)
        key_list = list(course_json.keys())
        key_list.remove('section')
        key_list.remove('term')
        for key in key_list:
            course_json[key] = course_dict[key][index]
        genJSON(course_json, file_loc + short_name + '.json')

In [14]:
def exctSectionTerm(short_name, section_dict):
    section_list = []
    sect_short_list = section_dict['title']
    sect_indexs = [i for i, sect_short_name in enumerate(sect_short_list) if sect_short_name == short_name]

    # In case the this semester has no sections for the class
    if len(sect_indexs) == 0: return [], 'No class for the term'

    for sect_index in sect_indexs:
        if section_dict['schedule_type'][sect_index] == 'Examination':
            continue

        section={
            'course_ID': section_dict['course_code'][sect_index],
            'section_num': section_dict['section_num'][sect_index],
            'instructors':section_dict['instructors'][sect_index],
            'days': section_dict['days'][sect_index],
            'time': section_dict['time'][sect_index],
            'loc': section_dict['loc'][sect_index],
            'course_type': section_dict['schedule_type'][sect_index],
            'exam_days': None,
            'exam_date': None,
            'exam_time': None 
        }
        if section['course_type'] == 'Lecture':
           section['exam_days'] = section_dict['days'][sect_index + 1]
           section['exam_date'] = section_dict['date_range'][sect_index + 1]
           section['exam_time'] = section_dict['time'][sect_index + 1]
        section_list.append(section)

    term_index = sect_indexs[0]
    term  = section_dict['term'][term_index]
    return section_list, term


In [15]:
def genJSON(course_json, file_loc):
    j = json.dumps(course_json)
    with open(file_loc,'w') as f:
        f.write(j)
        f.close()

In [16]:
file_loc = 'JSON/202030/'
course_JSON(software_course,section_dict_202030, file_loc)
print(file_loc, "Finished")

JSON/202030/ Finished


In [17]:
file_loc = 'JSON/202110/'
course_JSON(software_course,section_dict_202110, file_loc)
print(file_loc, "Finished")

JSON/202110/ Finished


In [40]:
def faculty_JSON(course_dict, file_loc, faculty_name):
    course_json = {
        "short_name" : None,
        "title" : None,
        "faculty" : None,
        "credit" : None,
        "description" : None,
        "prerequisite" : None,
    }
    short_name_list = course_dict["short_name"]
    json_list = []
    for index, short_name in enumerate(short_name_list):
        for key in course_json.keys():
            course_json[key] = course_dict[key][index]
        json_list.append(course_json)
        course_json = copy.deepcopy(course_json)
    genJSON(json_list, file_loc + faculty_name + '.json')

In [41]:
file_loc = 'JSON/'
faculty_JSON(software_course, file_loc, 'software')

In [971]:
#!/usr/bin/python
# -*- coding: UTF-8 -*-

import pymysql

connection = pymysql.connect(host = 'localhost' ,#host属性
                             user = 'root', #用户名 
                             password = 'Xl11611',  #此处填登录数据库的密码
                             db = 'URcourse' #数据库名
                             )
#创建光标对象，一个连接可以有很多光标，一个光标跟踪一种数据状态。
#光标对象作用是：、创建、删除、写入、查询等等
cur = connection.cursor()
#查看有哪些数据库，通过cur.fetchall()获取查询所有结果
print(cur.fetchall())

OperationalError: (2003, "Can't connect to MySQL server on 'localhost' ([Errno 61] Connection refused)")

Rest part is for test 

In [433]:
# Set the parameters for the functions
detail_url = "https://banner.uregina.ca:17023/ssbprod/bwckctlg.p_disp_course_detail"
detail_para = {'cat_term_in':'202110',
        'subj_code_in':'ENSE',
        'crse_numb_in':'400'}
subj_list = ['ENGL','GEOL','STAT','BUS','ECON','BIOL']

subjNum_dirct = {
    'ENGL': ['100'],
    'GEOL': ['102','270'],
    'STAT': ['289'],
    'BUS': ['260','210','250','285','302'],
    'ECON': ['201'],
    'BIOL': ['223']
}

detail_dict = {
        "short_name" : [],
        "title" : [],
        "faculty" : [],
        "course_num" : [],
        "credit" : [],
        "prerequisite" : [],
        "description" : []
    }
file_name = "/Course DB.xlsx"

In [438]:
# Main functions
parsePage(detail_dict, open('Course_info.html').read())
subjCodeLoop(detail_url,detail_para,subjNum_dirct,detail_dict)
comp_dict = {}
for subj, num in zip(detail_dict['faculty'],detail_dict['course_num']):
    if subj in comp_dict.keys():
        comp_dict[subj].append(num)
    else:
        comp_dict[subj] = [num]
storeInfo(detail_dict, file_name)

CHEM 210 get errors
CHEM 252 get errors
CHEM 360 get errors
CHEM 461 get errors
CHEM 490AJ get errors
CHEM 491AG get errors
CHEM 492AB get errors
CHEM 492AB get errors
CHEM 492AB get errors
CS 280 get errors
CS 290AL get errors
CS 290AL get errors
CS 290AL get errors
CS 290AL get errors
CS 290AL get errors
CS 290AL get errors
CS 375 get errors
CS 390AR get errors
CS 390AR get errors
CS 390AR get errors
CS 390AR get errors
CS 390AR get errors
CS 476 get errors
CS 490DG get errors
CS 491AL get errors
CS 491AL get errors
CS 491AL get errors
CS 491AL get errors
ENEL 495 get errors
ENEV 322 get errors
ENEV 484 get errors
ENGG 123 get errors
ENGG 123 get errors
ENGG 123 get errors
ENGG 411 get errors
ENIN 463 get errors
ENPE 492 get errors
ENSE 483 get errors
MATH 382 get errors
MATH 395AB get errors
MATH 395AB get errors
MATH 395AB get errors
MATH 395AB get errors
MATH 395AB get errors
MATH 426 get errors
MATH 442 get errors
MATH 485 get errors
MATH 495AE get errors
PHYS 471 get errors
PHYS

In [761]:
section_url = 'https://banner.uregina.ca:17023/ssbprod/bwckctlg.p_disp_listcrse'
section_para ={
    'term_in':'202110',
    'subj_in':'ENSE',
    'crse_in':'271',
    'schd_in':''
}
section_dict={
    'title':[],
    'term':[],
    'type':[],
    'time':[],
    'days':[],
    'loc':[],
    'date_range':[],
    'schedule_type':[],
    'instructors':[]
}
exa_dict = {
    'ENSE':['496AC','350','496AD'],
    'ENEL':['387','489']
}


In [765]:
subjCodeLoop(section_url, section_para, comp_dict, section_dict, parse_method=parseSecPage)

s:  48 / 186   411
Course number Progress:  49 / 186   412
The class is not available for this semester
Error at  {'term_in': '202110', 'subj_in': 'MATH', 'crse_in': '416', 'schd_in': ''}
The class is not available for this semester
Error at  {'term_in': '202110', 'subj_in': 'MATH', 'crse_in': '418', 'schd_in': ''}
The class is not available for this semester
Error at  {'term_in': '202110', 'subj_in': 'MATH', 'crse_in': '420', 'schd_in': ''}
The class is not available for this semester
Error at  {'term_in': '202110', 'subj_in': 'MATH', 'crse_in': '421', 'schd_in': ''}
The class is not available for this semester
Error at  {'term_in': '202110', 'subj_in': 'MATH', 'crse_in': '422', 'schd_in': ''}
The class is not available for this semester
Error at  {'term_in': '202110', 'subj_in': 'MATH', 'crse_in': '423', 'schd_in': ''}
The class is not available for this semester
Error at  {'term_in': '202110', 'subj_in': 'MATH', 'crse_in': '424', 'schd_in': ''}
The class is not available for this se

In [766]:
file_name = '20210Course time'
storeInfo(section_dict, file_name = file_name)

ValueError: arrays must all be same length

In [51]:
def postHTMLText(url, header = None data = None):
    """Post HTML Text 

    Post the HTML text form the url with params

    Args:
        url: web url
        detail_para: The parameters for the url as dirctory format.

    Returns:
        return the html text

    Raises:
        IOError: Error message will print out
    """
    try:
        kv = {'user-agent': 'Mozilla/5.0'}
        r = requests.post(url, timeout=30, data = params)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except:
        return "Connection error"

list_url = "https://banner.uregina.ca:17023/ssbprod/bwckctlg.p_display_courses"
list1_url = "https://banner.uregina.ca:17023/ssbprod/bwckctlg.p_disp_cat_term_date"

header = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
'Accept-Encoding': 'gzip, deflate, br',
'Accept-Language': 'zh-CN,zh;q=0.9,en-CA;q=0.8,en;q=0.7',
'Cache-Control': 'max-age=0',
'Connection': 'keep-alive',
'Content-Length': '317',
'Content-Type': 'application/x-www-form-urlencoded',
'Cookie': 'TESTID=set; SESSID=UVI2RklMOTAxNTA1OTE=; ADMSESSID=TjJZMUlYNDMwMDkw; _ga=GA1.2.440599669.1580419316; optimizelyEndUserId=oeu1600921596326r0.6447979592683208; amplitude_id_9f6c0bb8b82021496164c672a7dc98d6_edmuregina.ca=eyJkZXZpY2VJZCI6IjgyNGNhYTQwLTRhZWMtNGE1Ny04NmMzLTc2MTVkYjZkNTE0YVIiLCJ1c2VySWQiOm51bGwsIm9wdE91dCI6ZmFsc2UsInNlc3Npb25JZCI6MTYwMDk2MjM4MDA5OSwibGFzdEV2ZW50VGltZSI6MTYwMDk2MjM4MDEwNiwiZXZlbnRJZCI6MCwiaWRlbnRpZnlJZCI6Miwic2VxdWVuY2VOdW1iZXIiOjJ9; amplitude_id_408774472b1245a7df5814f20e7484d0uregina.ca=eyJkZXZpY2VJZCI6IjA4NjU1MmIzLTE5ZTYtNDI1OS04N2I2LWU3NWY1MDQxMDEzZCIsInVzZXJJZCI6bnVsbCwib3B0T3V0IjpmYWxzZSwic2Vzc2lvbklkIjoxNjAwOTYyMzgzMDI4LCJsYXN0RXZlbnRUaW1lIjoxNjAwOTYyMzgzOTc3LCJldmVudElkIjoyLCJpZGVudGlmeUlkIjo4LCJzZXF1ZW5jZU51bWJlciI6MTB9; AMCVS_8E929CC25A1FB2B30A495C97%40AdobeOrg=1; __gads=ID=7d10d3bee108919d:T=1602036465:S=ALNI_Mavq06sa1zxYibYCqeXal0btD0a6A; s_cc=true; s_fid=4ECFC879EC4080AE-26B653769F45139C; s_sq=%5B%5BB%5D%5D; BIGipServerBanner_17023=2811762860.32578.0000; ezproxy=X9NbSGn6cGjcHPs; AMCV_8E929CC25A1FB2B30A495C97%40AdobeOrg=1687686476%7CMCIDTS%7C18543%7CMCMID%7C47337776946694685623073671934143224008%7CMCAAMLH-1607393952%7C9%7CMCAAMB-1607393952%7CRKhpRz8krg2tLO6pguXWp5olkAcUniQYPHaMWWgdJ3xzPWQmdj0y%7CMCOPTOUT-1606796352s%7CNONE%7CMCAID%7CNONE%7CvVersion%7C3.0.0; utag_main=v_id:017500d022ff001da811348e36d603079004107100838$_sn:9$_se:10$_ss:0$_st:1606791599032$vapi_domain:uregina.ca$ses_id:1606789144050%3Bexp-session$_pn:8%3Bexp-session; _gid=GA1.2.1174615499.1609283412',
'Host': 'banner.uregina.ca:17023',
'Origin': 'https://banner.uregina.ca:17023',
'Referer': 'https://banner.uregina.ca:17023/ssbprod/bwckctlg.p_disp_cat_term_date',
'Sec-Fetch-Dest': 'document',
'Sec-Fetch-Mode': 'navigate',
'Sec-Fetch-Site': 'same-origin',
'Sec-Fetch-User': '?1',
'Upgrade-Insecure-Requests': '1',
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36'
}

list_para = {
'term_in': '202110',
'call_proc_in': 'bwckctlg.p_disp_dyn_ctlg',
'sel_subj': 'dummy',
'sel_levl': 'dummy',
'sel_schd': 'dummy',
'sel_coll': 'dummy',
'sel_divs': 'dummy',
'sel_dept': 'dummy',
'sel_attr': 'dummy',
'sel_subj': 'ENSE',
'sel_subj': 'ENEL',
'sel_subj': 'CS'
# 'sel_crse_strt': '',
# 'sel_crse_end': '',
# 'sel_title': '',
# 'sel_levl': '%',
# 'sel_schd': '%',
# 'sel_coll': '%',
# 'sel_divs': '%',
# 'sel_dept': '%',
# 'sel_from_cred': '',
# 'sel_to_cred': '',
# 'sel_attr': '%'
}
list1_para = {
    'call_proc_in':'bwckctlg.p_disp_dyn_ctlg',
    'cat_term_in':'202110'
}
rely = requests.post(list_url, timeout=30, headers=header data = list_para)
# rely.text

TypeError: request() got an unexpected keyword argument 'header'

In [47]:
soup = BeautifulSoup(rely.text, 'html.parser')
# print(soup.prettify())
print(soup)


<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/transitional.dtd">

<html lang="en">
<head>
<meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>
<meta content="no-cache" http-equiv="Pragma" name="Cache-Control"/>
<meta content="no-cache" http-equiv="Cache-Control" name="Cache-Control"/>
<link href="/css/web_defaultapp.css" rel="stylesheet" type="text/css"/>
<link href="/css/web_defaultprint.css" media="print" rel="stylesheet" type="text/css"/>
<title>Catalogue Entries</title>
<meta content="text/javascript" http-equiv="Content-Script-Type" name="Default_Script_Language"/>
<script language="JavaScript" type="text/javascript">
<!-- Hide JavaScript from older browsers 
window.onunload = function() {submitcount=0;}
var submitcount=0;
function checkSubmit() {
if (submitcount == 0)
   {
   submitcount++;
   return true;
   }
else
   {
alert("Your changes have already been submitted.");
   return false;
   }
}
//  End script hiding --

In [376]:
rely.request.body

'term_in=202110&call_proc_in=bwckctlg.p_disp_dyn_ctlg&sel_subj=NSLI&sel_levl=dummy&sel_schd=dummy&sel_coll=dummy&sel_divs=dummy&sel_dept=dummy&sel_attr=dummy'

Using the local course_info.html to get the list of course and infomation and prerequesties

In [385]:
course_soup=BeautifulSoup(open('course_info.html',encoding='utf-8'),features='html.parser')  #features值可为lxml

In [395]:
title_tag = course_soup.find_all('td',class_='nttitle')[2]
title_tag

<td class="nttitle" scope="colgroup"><a href="https://banner.uregina.ca:17023/ssbprod/bwckctlg.p_disp_course_detail?cat_term_in=202110&amp;subj_code_in=CHEM&amp;crse_numb_in=104">CHEM 104 - General Chemistry I</a></td>