## Search Engine

### a) Create SCHEMA for Whoosh and add (pickled) documents

In [1]:
def add_doc(writer, course_information, course_tag):
    """Add a single course to the index through ``writer``.

    Args:
        writer: object exposing ``add_document(**fields)``; in this notebook
            it is the Whoosh index writer.
        course_information: dict with the course data. Every key is optional;
            the ones read are 'name', 'semester', 'lecturer', 'contact_hours',
            'study_time', 'credits', 'language', 'masters', 'keywords',
            'situating', 'content' and 'link'.
        course_tag: value written to the ``tag`` field of the document.

    A key that is absent from ``course_information`` is passed to
    ``add_document`` as ``None``. This includes 'link', which previously was
    left unbound and raised ``UnboundLocalError`` when missing.
    """
    # (document field name, key in the course dict). Only three names differ:
    # contact <- contact_hours, study <- study_time, keywords <- keywords.
    field_to_key = (
        ('name', 'name'),
        ('semester', 'semester'),
        ('lecturer', 'lecturer'),
        ('contact', 'contact_hours'),
        ('study', 'study_time'),
        ('credits', 'credits'),
        ('language', 'language'),
        ('masters', 'masters'),
        ('keywords', 'keywords'),
        ('situating', 'situating'),
        ('content', 'content'),
        ('link', 'link'),
    )

    # .get() yields None for missing keys, matching the old per-key defaults.
    fields = {field: course_information.get(key) for field, key in field_to_key}

    writer.add_document(tag=course_tag, **fields)

In [2]:
import os, sys, pickle, re

from whoosh.index import create_in, exists_in, open_dir
from whoosh.fields import Schema, TEXT, ID, KEYWORD, STORED
from whoosh.writing import CLEAR
from whoosh.analysis import StemmingAnalyzer, CompoundWordFilter
 
# Directory holding one pickled course dict per file (<course_tag>.pkl).
root = 'courses'

if not os.path.exists("indexdir"):
    os.mkdir("indexdir")

# 'keywords' should really be a KEYWORD field, but the data is dirty, so it is
# indexed as stemmed TEXT instead (positional search).
# 'content' is stored so it can be used for more_like_this.

stemming = StemmingAnalyzer()

schema = Schema(tag=STORED,name=TEXT(stored=True, field_boost=1.5),semester=ID,
                lecturer=TEXT(stored=True, field_boost=15.0),contact=STORED,study=STORED,
                credits=STORED,language=ID,masters=ID(stored=True),
                keywords=TEXT(analyzer=stemming),
                situating=TEXT(analyzer=stemming),content=TEXT(analyzer=stemming, stored=True),link=STORED)

# Open the index (emptying it first when it already exists) or create it.
# NOTE(review): when the index already exists, `schema` above is not applied;
# the schema stored on disk is used. Delete 'indexdir' after a schema change.
if exists_in("indexdir"):
    ix = open_dir("indexdir")
    writer = ix.writer()
    writer.commit(mergetype=CLEAR)
else:
    ix = create_in("indexdir",schema)

# Only pickles are course files; skip anything else in the directory
# (sub-directories, checkpoints, ...) instead of crashing on it.
filepaths = [os.path.join(root,i) for i in os.listdir(root) if i.endswith('.pkl')]

# The writer is used as a context manager: it commits when the loop finishes
# and cancels (releasing the index lock) if any file fails.
with ix.writer() as writer:
    for path in filepaths:
        # SECURITY: pickle.load executes arbitrary code from the file; only
        # index pickles that were produced by a trusted scraper run.
        with open(path,'rb') as fp:
            course_dict = pickle.load(fp)

        print("%s\n%s\n" % (path,course_dict))

        # Tag = file name without extension. Derived with os.path so it works
        # with both '\\' and '/' path separators.
        course_tag = os.path.splitext(os.path.basename(path))[0]

        add_doc(writer, course_dict, course_tag)

courses\1006244BNR.pkl
{'name': 'Applied Geomorphology', 'link': 'https://caliweb.cumulus.vub.ac.be/caliweb/?page=course-offer&id=002263&anchor=1&target=pr&year=1819&language=en&output=html', 'masters': ['VUBLand'], 'credits': '5', 'study_time': '150.0', 'semester': '1', 'language': 'EN', 'lecturer': 'DE MEERENDRE', 'contact_hours': '24', 'content': 'HOC This course aims at providing an in depth knowledge of the geomorphologic processes responsible for the genesis of the morphology in fluvial and arid environments. It deals with the study of geomorphologic processes through rationalising the interrelationships between environmental conditions, rock/sediment properties, transport agents and landscape forms. Focus is put on quantifying and modeling geomorphologic processes in order to understand the behaviour of complex geomorphologic systems. The lectures serve as a basis for the applications during practical’s, lab session and for understanding specific case studies from the literature

courses\4016308ENR.pkl
{'name': 'Electric Power Systems', 'link': 'https://caliweb.cumulus.vub.ac.be/caliweb/?page=course-offer&id=006928&anchor=1&target=pr&year=1819&language=en&output=html', 'masters': ['VUBElecMech'], 'credits': '5', 'study_time': '150.0', 'semester': '1', 'language': 'EN', 'lecturer': 'Claude MAUN', 'contact_hours': '30', 'content': 'The valid fiche can be found at the following link : ELEC-H413. Change the language to English in the dropdown menu on top of the page.- Electric energy systems - An overview- Steady-state single-phase models of power system components- Load flow- State estimation- Economics of electricity generation- Optimal and secure operation of transmission systemsExercisesImplementation of a load-flow algorithmProject focusing on one method of the courseAmount of hours for : Lectures : 2.5 ECTS (30h), exercises/labs/seminars : 2.5 ECTS (30h)'}

courses\4016310FNR.pkl
{'name': 'Nuclear Energy and Reactors', 'link': 'https://caliweb.cumulus.vub.ac.

courses\4017282FNR.pkl
{'name': 'Robustness of structures and reliability of materials', 'link': 'https://caliweb.cumulus.vub.ac.be/caliweb/?page=course-offer&id=007505&anchor=1&target=pr&year=1819&language=en&output=html', 'masters': ['VUBArch', 'VUBCivil', 'VUBCivil'], 'credits': '4', 'study_time': '120.0', 'semester': '1', 'language': 'EN', 'lecturer': 'Decaan IR', 'contact_hours': '24', 'content': 'The valid fiche can be found at the following link: CNST - H409. Change the language to English in the dropdown menu on top of the page.Introducing the concepts of uncertainties, robustness, and reliability in structural design, and providing a scientific basis to the determination of the actions on buildings.'}

courses\4017284FNR.pkl
{'name': 'Mechanics of Geomaterials', 'link': 'https://caliweb.cumulus.vub.ac.be/caliweb/?page=course-offer&id=007506&anchor=1&target=pr&year=1819&language=en&output=html', 'masters': ['VUBCivil', 'VUBCivil'], 'credits': '4', 'study_time': '120.0', 'semest

courses\C000819.pkl
{'name': 'Quantum electrodynamics [nl]', 'semester': '2', 'lecturer': 'Dimitri Van Neck', 'contact_hours': '52.5', 'study_time': '180', 'credits': '6', 'link': 'https://studiegids.ugent.be/2018/NL/studiefiches/C000819.pdf', 'masters': ['EMPHYS'], 'language': 'NL', 'keywords': 'Quantum mechanics, Electromagnetism, Modern Physics, Quantum Electrodynamics, Diractheorie', 'situating': 'This course aims to give a non-relativistic introduction to quantum electrodynamics. In addition, it is also intended to study electromagnetic interactions in the context of the Dirac equation. This is in line with the training competences: M.1.1, M.1.4, M.2.6, M.3.5, M.4.1', 'content': 'Quantum theory of the free em field: Maxwell equations, global and local gauge symmetry, quantization of the em field, state vectors for the em field, coherent states. Interaction between radiation and matter, dipole radiation, photon scattering of electrons, Thompson cross-section, natural line width. Se

courses\E004241.pkl
{'name': 'Modeling and optimization of industrial systems', 'semester': '1', 'lecturer': 'El-Houssaine Aghezzaf', 'contact_hours': '60', 'study_time': '180', 'credits': '6', 'link': 'https://studiegids.ugent.be/2018/NL/studiefiches/E004241.pdf', 'masters': ['EMIEOR'], 'language': 'EN', 'keywords': 'Advanced methods in Operational Research, Decomposition Techniques, Stochastic Optimization Techniques', 'situating': 'This course provides the students with a wide range of advanced methods and techniques to deepen the complex, large-scale optimization problems. After the course &quot;operational research models and methods&quot; and this course &quot;Modeling and optimization of industrial systems&quot;, students should be able to model, resolve and analyze the various decision-making problems that occur in all kinds of industrial production systems. This course further elaborates the mathematical optimization techniques and focuses on the application of these technique

courses\E039110.pkl
{'name': 'Technical thermodynamics [nl]', 'semester': '2', 'lecturer': 'Michel De Paepe', 'contact_hours': '60', 'study_time': '180', 'credits': '6', 'link': 'https://studiegids.ugent.be/2018/NL/studiefiches/E039110.pdf', 'masters': ['EMCHEM'], 'language': 'NL', 'keywords': '', 'situating': '', 'content': ''}

courses\E039161.pkl
{'name': 'Thermodynamics, heat and mass transfer', 'semester': '1', 'lecturer': 'Ivana Stankovic', 'contact_hours': '60', 'study_time': '180', 'credits': '6', 'link': 'https://studiegids.ugent.be/2018/NL/studiefiches/E039161.pdf', 'masters': ['EMFIRE', 'EMFSEN'], 'language': 'EN', 'keywords': 'first main law, state comparison, combustion, conduction, convection, radiation, mass transfer', 'situating': 'This course continues in the first year of the program. The course provides a scientific basis for thermodynamic processes, combustion, heat transfer and mass transfer in case of fire. The course supports the basic competences of the training

courses\E064920.pkl
{'name': 'Chemical and physical textile technology [nl]', 'semester': '2', 'lecturer': 'Paul Kiekens', 'contact_hours': '67.5', 'study_time': '180', 'credits': '6', 'link': 'https://studiegids.ugent.be/2018/NL/studiefiches/E064920.pdf', 'masters': ['EMCHEM', 'EMMAEN'], 'language': 'NL', 'keywords': 'Breeding / chemical and physical finishing.', 'situating': 'To familiarize the students with the chemical and physical treatments (finishing operations) which result in the textile material acquiring a number of specific new properties that give it a higher added value.', 'content': 'Deel 1: Algemene inleiding Deel 2: Voorbehandelingen - Bleken, Voorbehandelingen - Merceriseren, Voorbehandelingen - Reinigen en carboniseren van wol, Voorbehandelingen - Het verviltingsvrij (viltvrij) maken van wol Deel 3: Chemische veredeling : Inleiding, Chemische veredeling : Aanbrengen / aanbrengtechnieken, Chemische veredeling : Antistatisch maken, Chemische veredeling : Verzachters, C

courses\E900353.pkl
{'name': 'Equipment for Anaesthesiology and Resuscitation', 'semester': '1', 'lecturer': 'Karel Roubík', 'contact_hours': '0', 'study_time': '120', 'credits': '4', 'link': 'https://studiegids.ugent.be/2018/NL/studiefiches/E900353.pdf', 'masters': ['EMBIME'], 'language': 'EN', 'keywords': '', 'situating': '', 'content': ''}

courses\E900354.pkl
{'name': 'Laser Applications in Biomedicine', 'semester': '1', 'lecturer': 'Helena Jelínková', 'contact_hours': '0', 'study_time': '60', 'credits': '2', 'link': 'https://studiegids.ugent.be/2018/NL/studiefiches/E900354.pdf', 'masters': ['EMBIME'], 'language': 'EN', 'keywords': '', 'situating': '', 'content': ''}

courses\E900355.pkl
{'name': 'Diploma Thesis Proposal', 'semester': '1', 'lecturer': 'N. N.', 'contact_hours': '0', 'study_time': '150', 'credits': '5', 'link': 'https://studiegids.ugent.be/2018/NL/studiefiches/E900355.pdf', 'masters': ['EMBIME', 'EMBIME', 'EMBIME'], 'language': 'EN', 'keywords': '', 'situating': '', 

courses\G9X29AE.pkl
{'name': 'Data Mining and Neural Networks', 'link': 'https://onderwijsaanbod.kuleuven.be/syllabi/e/G9X29AE.htm', 'masters': ['CQ_50550147', 'CQ_50550147', 'CQ_50550147', 'CQ_50550147'], 'content': 'Preparatory reading to the exercise sessions', 'situating': 'The student must understand basic and more advanced techniques of neural networks for datamining, as well as related methods of nonlinear modeling. The student must be able to apply the methods to real data sets and constructively work towards good solutions.', 'semester': '1', 'lecturer': 'Suykens Johan', 'credits': '4', 'study_time': '120.0', 'contact_hours': '16 ', 'language': 'EN'}

courses\G9X47AE.pkl
{'name': 'Physical Chemistry of Polymers', 'link': 'https://onderwijsaanbod.kuleuven.be/syllabi/e/G9X47AE.htm', 'masters': ['CQ_50269006', 'CQ_50269006', 'CQ_51228258'], 'content': 'Module Introduction: - Positioning of course - The science of polymers - The chain of knowledge of polymer science Module Physica

courses\H04D6AE.pkl
{'name': 'Materials in Electrical Engineering', 'link': 'https://onderwijsaanbod.kuleuven.be/syllabi/e/H04D6AE.htm', 'masters': ['CQ_50657365', 'CQ_50657365', 'CQ_51384404', 'CQ_51384404', 'CQ_51384404'], 'content': 'Approach: The introduction focuses on the physical background of the relevant material properties. Then the course zooms in on the materials and material trends , from power generation and electrical power control to electric energy storage and utilization of electrical energy in some applications. It is clear that this is not exhaustive, but it has the objective to illustrate how certain effects discussed in the introduction can be realized through material choice. Simultaneously the advantages and disadvantages of the different materials are illustrated. Students must also understand the trends of materials research for electrical applications. It is demonstrated that the concept of nanotechnology has impact on the electric power sector. Table of Cont

courses\H07Z7AE.pkl
{'name': 'Fundamentals for Computer Science', 'link': 'https://onderwijsaanbod.kuleuven.be/syllabi/e/H07Z7AE.htm', 'masters': ['CQ_52364384'], 'content': "Automata and Formal Languages ﬁnite state machines (FSM), Non-deterministic FSMs (NDFSM) minimisation of FSM determinisation of NDFSM regular expressions, regular sets, relationship between regular expressions and FSM, Kleene algebra Closure properties; Brzozowski’s derivative Chomsky’s hierarchy of formal language context free grammars & push-down automata Application overview: compiler construction, software modelling and veriﬁcation, security, ... Computability Theory solubility and insolubility (Halting problem, Post’s correspondence problem) non-deterministic Turing machines, other Turing machine variants, and their equivalence Universal Turing Machine, Beyond Turing-computability. Complexity Theory analysis of algorithms and complexity theory the classes P and NP, the class NP-complete, example problems. top

courses\H0E51AE.pkl
{'name': 'Micro- and Nanosensors', 'link': 'https://onderwijsaanbod.kuleuven.be/syllabi/e/H0E51AE.htm', 'masters': ['CQ_51228258'], 'content': 'Fundamentals of sensors Definitions and Concepts Characteristics of a sensor Transduction Principles Fundamentals of Electronic Materials Physical Phenomena Chemical Phenomena Biological Phenomena Transduction Platforms and Measurement Electronics for Sensor Interfaceing Inter-Digital Transducer (IDT) Active Electronic Transducers Electrochemical Transducers Optical Waveguide based Transducers Acoustic Wave Transducers Cantilever Based Transducers Devices State of the art in NANOsensors Motivation for Scale Reduction Mass Sensors Based on Cantilevers Chemical Gas Sensors Photosensors', 'situating': 'General objectives of knowledge: General objectives of knowledge: Knowing and understanding the main features and parameters of a generic sensor. Knowing the main sensor platforms and sensor configurations. Understanding the main

courses\I001755.pkl
{'name': 'Modeling and Control of Waste Water Treatment Plants', 'semester': '1', 'lecturer': 'Ingmar Nopens', 'contact_hours': '30', 'study_time': '75', 'credits': '3', 'link': 'https://studiegids.ugent.be/2018/NL/studiefiches/I001755.pdf', 'masters': ['EMCHEM', 'EMMAEN'], 'language': 'EN', 'keywords': 'Modelling,simulation,model calibration,unit processes,wastewater treatment simulator,benchmarking of control strategies', 'situating': 'In this course the wide application of modelling and simulation during the design and optimisation of wastewater treatment plants is taught in a concrete way. The students will be introduced to the stepwise modelling of the different unit processes in these systems and gain insight and practical experience in the model calibration, i.e. fitting industry-standard models such as the Activated Sludge Model No.1 to the reality of a specific treatment plant. A second part of the course deals with the objective evaluation of the economic 

In [3]:
from whoosh import scoring
from whoosh.index import open_dir
 
ix = open_dir("indexdir")

# (section header, indexed field) pairs, in the order they appear in the dump.
lexicon_sections = [
    ('NAME', 'name'),
    ('SEM', 'semester'),
    ('LECT', 'lecturer'),
    ('LANG', 'language'),
    ('MAST', 'masters'),
    ('KEY', 'keywords'),
    ('SIT', 'situating'),
    ('CONT', 'content'),
]
section_rule = '-----------------------------------------------------'

# Dump every indexed term of each field to lexicons.txt, one term per line,
# so the vocabulary the analyzers produced can be inspected by hand.
with ix.searcher(weighting=scoring.TF_IDF) as searcher:

    # Explicit UTF-8 so terms with non-ASCII characters can always be written.
    with open('lexicons.txt','w', encoding='utf-8') as f:

        for header, fieldname in lexicon_sections:
            f.write('\n%s\n%s\n%s\n' % (section_rule, header, section_rule))
            # lexicon() yields the terms as bytes; decode them and write the
            # text itself. (Formatting the re-encoded bytes with %s wrote the
            # repr "b'term'" on Python 3.)
            for term in searcher.lexicon(fieldname):
                f.write("%s\n" % term.decode("utf-8"))
    

### b) Query index

In [4]:
from whoosh.qparser import QueryParser, MultifieldParser, OrGroup
from whoosh import scoring
from whoosh.index import open_dir
from whoosh.query import Term, Or
from whoosh.searching import Results
from whoosh.analysis import RegexTokenizer
import math
 
ix = open_dir("indexdir")

# --- User input -------------------------------------------------------------
query_str = "Irrigation, drainage, electromagnetic, borehole, rock magnetism, geomagnetic field, Climatology"
query_els = query_str.split(', ')
search_terms = []

# Every comma-separated element is searched as given, and each of its
# individual tokens is searched as well with a low boost (^0.1).
tokenizer = RegexTokenizer()
for query_el in query_els:
    search_terms.append('(%s)' % (query_el))
    for token in tokenizer(query_el):
        plain_term = '(%s)' % (token.text)
        boosted_term = '(%s)^0.1' % (token.text)
        # Skip tokens already present, either as a full element or as a
        # boosted token. (The old check only looked for the plain form, so a
        # token shared by two elements was appended twice.)
        if plain_term not in search_terms and boosted_term not in search_terms:
            search_terms.append(boosted_term)

lecturer_str = "DE MEERENDRE, THIERY"
lecturers = lecturer_str.split(', ')
 
filter_sem = '2'
filter_lang= 'EN'
filter_master = ['VUBLand']

with ix.searcher(weighting=scoring.TF_IDF) as searcher:
    
    # Hard filter: semester AND language AND (any of the selected masters).
    or_group_masters = []
    for master in filter_master:
        or_group_masters.append(Term("masters", master))
    
    filter_query = Term("semester", filter_sem) & Term("language", filter_lang) & Or(or_group_masters)
    
    print("FILTER Query: %s\n" %(filter_query))
    
    # --- Topic search over the textual fields -------------------------------
    # One parser is enough; it is reused for every search term.
    content_parser = MultifieldParser(["name", "keywords", "situating", "content"], schema=ix.schema)
    search_queries = [content_parser.parse(term) for term in search_terms]
        
    full_query = Or(search_queries)
        
    results = searcher.search(full_query, filter=filter_query, limit=None, terms=True)
    
    print('[QUERY RESULTS]')
    for result in results:
        print('%s\n%s\n' % (result['tag'],result.matched_terms()))
    
    # --- Lecturer search ----------------------------------------------------
    lecturer_parser = QueryParser("lecturer", schema=ix.schema)
    lector_queries = [lecturer_parser.parse(lecturer) for lecturer in lecturers]
        
    full_lector_query = Or(lector_queries)
    
    lector_results = searcher.search(full_lector_query, filter=filter_query, limit=None, terms=True)
    
    print('[LECTURER RESULTS]')
    for result in lector_results:
        print('%s\n%s\n' % (result['tag'],result.matched_terms()))
    
    # Merge the lecturer hits into the topic hits.
    results.upgrade_and_extend(lector_results)
    
    # Iterate over a copy because `results` is extended inside the loop.
    deep_cp_results = results.copy()

    # Add, for every hit so far, the 5 courses whose content is most similar.
    for result in deep_cp_results:
        similar_courses = result.more_like_this("content", top=5, filter=filter_query, normalize=False)
        results.extend(similar_courses)
            
    print('[FINAL SEARCH RESULTS]')
    for result in results:
        print('[%s] %s - %s \t (%s)\n' %(result['tag'],result['name'],result['lecturer'],result.score))
        
    # Fetch credits and their values/weights for knapsack (curriculum).
    # `credits` is a nested list with a single row of weights.
    # NOTE(review): presumably one row per capacity dimension of the knapsack
    # solver - confirm against the cell that consumes it.
    credits = [[]]
    scores = []
    
    # Must happen inside the `with`: stored fields are read via the searcher.
    for result in results:
        credits[0].append(int(result['credits']))
        # Scores are rounded up to whole numbers.
        scores.append(int(math.ceil(result.score)))
        
    print("CREDITS: %s" %(credits))
    print("SCORES: %s" %(scores))
            


FILTER Query: (semester:2 AND language:EN AND masters:VUBLand)

[QUERY RESULTS]
4017620DNR
[('content', b'magnet'), ('content', b'rock'), ('content', b'rock'), ('content', b'borehol'), ('content', b'field'), ('content', b'geomagnet'), ('content', b'geomagnet'), ('content', b'electromagnet'), ('content', b'magnet'), ('content', b'field')]

4017626ENR
[('name', b'irrigation'), ('name', b'drainage')]

4017616DNR
[('name', b'climatology')]

4018726FNR
[('name', b'rock'), ('content', b'rock')]

[LECTURER RESULTS]
4017295FNR
[('lecturer', b'de'), ('lecturer', b'meerendre')]

4017296FNR
[('lecturer', b'de'), ('lecturer', b'meerendre')]

9017692ENR
[('lecturer', b'thiery')]

[FINAL SEARCH RESULTS]
[4017620DNR] Applied Geophysics - Kristine WALRAVENS 	 (89.06555434265755)

[4017626ENR] Irrigation and Drainage - Valentijn PAUWELS 	 (23.085542037051482)

[4017616DNR] Meteorology and Climatology - Hans VERBEECK 	 (11.542771018525741)

[4018726FNR] Rock mechanics and underground constructions - Pie

In [5]:
from ortools.algorithms import pywrapknapsack_solver

# Knapsack problem: pick the courses with the highest total score whose
# credits fit in one semester (30 ECTS).
# Uses `scores` (values) and `credits` (weights) computed by the query cell.

# Build the solver (multi-dimensional branch-and-bound variant).
solver_type = pywrapknapsack_solver.KnapsackSolver.KNAPSACK_MULTIDIMENSION_BRANCH_AND_BOUND_SOLVER
solver = pywrapknapsack_solver.KnapsackSolver(solver_type, 'curriculum')

# Knapsack size: 30 ECTS credits (single capacity dimension).
credit_capacity = [30]

# Solve.
solver.Init(scores, credits, credit_capacity)
computed_value = solver.Solve()

# Collect the indices of the selected courses and their credit weights.
packed_items = []
packed_credits = []
for item_index, item_credits in enumerate(credits[0]):
    if solver.BestSolutionContains(item_index):
        packed_items.append(item_index)
        packed_credits.append(item_credits)
total_credits = sum(packed_credits)

print("Packed items: %s" % (packed_items))
print("Packed weights: %s" % (packed_credits))
print("Total value: %s" % (computed_value))
print("Total weight: %s" % (total_credits))

# Only indices and credits can be reported here: the searcher from the query
# cell is closed, so the hits' stored fields can no longer be read.

Packed items: [0, 1, 2, 4, 5, 6, 9]
Packed weights: [5, 5, 5, 3, 3, 3, 5]
Total value: 568
Total weight: 29
