# KeanWISE Watchdog - Crawler Demo
This is a **interactive demo** of the **web crawler** part of the project. The following code mainly solves the problem of `logging in KeanWISE`, `request http interface`, and `securely logging out the system`.

In [None]:
UserName = None
Password = None

## Prepare Environment

In [1]:
# Import libraries
import requests
import warnings
from bs4 import BeautifulSoup

In [2]:
# Disable SSL verification
# Used with verify=False
# Need to be solved: KeanWISE will cause SSL error when accessed by requests lib
import urllib3
# urllib3.disable_warnings()

In [3]:
# Urls
host = 'selfservice.kean.edu'
origin = 'https://{}'.format(host)
student_url = '{}/Student'.format(origin)
login_url = '{}/Account/Login'.format(student_url)
logoff_url = '{}/Account/LogOff'.format(student_url)
planning_url = '{}/Planning'.format(student_url)
degree_plans_url = '{}/DegreePlans'.format(planning_url)

# The basic headers
basic_headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.5112.102 Safari/537.36 Edg/104.0.1293.63',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
}

## Logging in KeanWISE

In [4]:
def check_status_code(code:int|requests.Response, expection:int|list[int]=200, msg:str='Status code checking failed!', raise_exception:bool=True):
    # Get status code
    if isinstance(code, requests.Response): code = code.status_code
    # Check
    if isinstance(expection, list):
        if code not in expection: 
            if raise_exception:
                raise Exception('{} Status code: {}'.format(msg, code))
            else:
                return False
    else:
        if code != expection: 
            if raise_exception:
                raise Exception('{} Status code: {}'.format(msg, code))
            else:
                return False
    return True

# Extract the request verification token from the page source code
# Used to pass the CSRF verification
def get_request_verification_token(content:str|requests.Response, index:int=None):
    # get Response --> extract HTML doc
    if isinstance(content, requests.Response): content = content.content
    # Parse content as HTML doc
    soup = BeautifulSoup(content, 'lxml')
    token_attrs = {
        'name': '__RequestVerificationToken',
        'value': True
    }
    tokens = soup.find_all('input', attrs=token_attrs, recursive=True)
    if len(tokens) == 0: 
        warnings.warn('Found no request verification tokens')
        return None
    # Extract tokens
    tokens = [token['value'] for token in tokens]
    # Return tokens
    if index is None: return tokens if len(tokens) != 1 else tokens[0]
    else: return tokens[index]

# Perform the logging in action
# The foundation of all other functions
def get_logged_session(UserName:str=UserName, Password:str=Password, login_url:str=login_url, headers:dict[str:str]=basic_headers, returnUrl:str='', performSamlLogin:str=''):
    # Create a new requests session
    # KeanWISE uses cookies to verify user authorization
    req_session = requests.Session()
    req_session.verify = False
    # Request the login page to get Request Verification Token
    res = req_session.get(login_url, headers=headers)
    check_status_code(res, 200, msg='Request login page failed!')
    # Login request:
    #   type: POST
    #   fields: {__RequestVerificationToken, returnUrl, performSamlLogin, UserName, Password}
    data = {
        '__RequestVerificationToken': get_request_verification_token(res, index=-1),
        'returnUrl': returnUrl,
        'performSamlLogin': performSamlLogin,
        'UserName': UserName,
        'PassWord': Password,
    }
    # Request the login interface
    res = req_session.post(login_url, data=data, headers=headers)
    check_status_code(res, 200, msg='Request login interface failed!')
    if res.url == login_url: raise Exception('Login Failed! Wrong user name or password.')
    # Return the logged in session and page
    return (req_session, res)

In [5]:
def generate_headers(response:requests.Response=None, Host:bool|str=True, Origin:bool|str=True, Referer:bool|str=True, XHR:bool=False, token:bool|str=False):
    # Use the basic headers as the base
    headers = basic_headers.copy()
    # Add elements to headers
    if Host != False:
        headers['Host'] = Host if isinstance(Host, str) else host
    if Origin != False:
        headers['Origin'] = Origin if isinstance(Origin, str) else origin
    if XHR: headers['X-Requested-With'] = 'XMLHttpRequest'
    if Referer != False:
        if isinstance(Referer, str): headers['Referer'] = Referer
        elif Referer and response is not None: headers['Referer'] = response.url
        else: warnings.warn('Unable to generate referer')
    if token != False:
        if isinstance(token, str): headers['__RequestVerificationToken'] = token
        elif token and response is not None: headers['__RequestVerificationToken'] = get_request_verification_token(response, 0)
        else: warnings.warn('Unable to generate __RequestVerificationToken')
    # Return headers
    return headers

# Control the session to log off
# Prevent the possible multi-logging problem
def session_log_off(session:requests.Session, last_response:requests.Response, logoff_url:str=logoff_url):
    pass

In [6]:
req_session, res = get_logged_session()



In [7]:
get_request_verification_token(res)

['8K63dPkA5LIRmPfYVorNQV3zUf0vgJ4gQXQgT_qphbvjTGJPWD9wHR6PIMKC84GMo0K8mEAp4ot05itwwgdfZQHsfu22nrtcLfPai8PJtpk1',
 'sDKmVCAmCiSOiBEYCDwazV-0tQEfRjV4sIQlR2pNYm5DFPbUsmNMDWAV0sIMc2yzCGg-29to7mpbLkqjICByPbqKo5lL8qwyfqvmUNu7QTM1']

In [8]:
res.url

'https://selfservice.kean.edu/Student/Account/Login'