# Course Content Overlap

In [20]:
import os                                                     # TO READ WIKIARTICLES.
import re
import json                                                   # TO READ FROM .json FILE.
import math                                                   # TO CALCULATE nDCG
import pandas as pd                                           # TO READ COURSE DATA.
import numpy as np                                            # TO SAVE DICTIONARY.  
from nltk.tokenize import word_tokenize                       # TO CLEAN DATA.
from nltk.corpus import stopwords                             # TO CLEAN DATA.
from sklearn.feature_extraction.text import TfidfVectorizer   # TO EXTRACT PHRASES.
import scipy.sparse                                           # TO SAVE MODEL.
from sklearn.metrics.pairwise import cosine_similarity        # FOR COSINE SIMILARITY MEASURE
from nltk.stem import PorterStemmer
import nltk
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\shanu\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\shanu\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [21]:
class Preprocessing:
    
    def __init__(self, phrases, abbreviation, with_stemming):
        self.stop_words = list(set(stopwords.words('english')))
        self.common_word = ['introduction', 'overview', 'basic', 'various', 'topics', 'review', 'course', 'student', 'content', 'academic', 'necessary', 'different']
        self.stop_words = self.stop_words + self.common_word
        self.phrases = phrases
        self.abbreviations = abbreviations
        self.with_stemming = with_stemming

        
    def cleanData(self, text):
        """
            Input   : text to be cleaned.
            Returns : cleaned text string.
        """
        text = text.replace("'"," ").replace("\""," ").replace(";"," ").replace(","," ").replace("-"," ").replace(":"," ").replace("["," ").replace("]"," ").replace("."," ").replace("/"," ").lower()
        text = word_tokenize(text)
        if self.with_stemming == 'yes':
            ps = PorterStemmer()
            text = [ps.stem(token) for token in text if (token.replace('_', '').isalnum() and not token.replace('_', '').isnumeric() and token not in self.stop_words and len(token)>1) ]
        else:
            text = [token for token in text if (token.replace('_', '').isalnum() and not token.replace('_', '').isnumeric() and token not in self.stop_words and len(token)>1) ]
        return " ".join(text)
    
    def phrase_replacement(self, intext):
        outtext = intext
        for phrase in self.phrases:
            phrase_ = phrase.replace("_", " ")
            if phrase_ in outtext.lower():
                outtext = outtext.lower().replace(" "+phrase_+" ", " "+phrase+" ")
        return outtext
    
    
    def abbreviation_replacement(self, intext):
        outtext = intext
        for abbreviation in self.abbreviations:
            if abbreviation in outtext:
                outtext = outtext.replace(" "+abbreviation+" ", " "+abbreviation+" "+self.abbreviations[abbreviation]+" ")
            elif self.abbreviations[abbreviation] in outtext:
                outtext = outtext.replace(" "+self.abbreviations[abbreviation]+" ", " "+abbreviation+" "+self.abbreviations[abbreviation]+" ")
        return outtext
    

    def readWikipediaData(self, base_directory):
        """
            Input   : directory path.
            Returns : Wikipedia article titles and contents ordered list.
        """
        files = os.listdir(base_directory)
        content = {}

        for file in files:
            file_path = base_directory + file
            data = json.load(open(file_path,"r",encoding="utf-8"))
            for title in data['pages']:
                details = data['pages'][title]['text'] + ' ' + title
                details = self.cleanData(details)
                content[title] = self.phrase_replacement(self.abbreviation_replacement(details))
        return content


    def readCourseData(self, document_filename):
        """
            Input   : File path.
            Returns : Course titles and Course details ordered list.
        """
        df = pd.read_csv(document_filename)
        content = {}
        for index,row in df.iterrows():
            details = row['description']+' '+row['content'] +' ' +row['coursename']
            details = self.cleanData(details)
            course_detail = self.phrase_replacement(self.abbreviation_replacement(details))
#             course_detail = row['description']+' '+row['content']
            title = row['courseno'] + ' - ' + row['coursename']
            content[title] = course_detail
        return content
    
    
    def readDictionary(self, base_directory):
        """
            Input   : directory path.
            Returns : Wikipedia article titles and contents ordered list.
        """
        content = {}

        data = json.load(open(base_directory,"r",encoding="utf-8"))
        for title in data:
            temp_title = ''
            temp = title.split('-')
            for i in temp:
                if not re.search('\d+', i):
                    temp_title += i
            details = data[title] + ' ' + temp_title
            details = self.cleanData(details)
            content[title] = self.phrase_replacement(self.abbreviation_replacement(details))
        return content

In [23]:
class Evaluation:
    
    def __init__(self, filename, titles, similarity_matrix, dept, scope):
        self.filename = filename
        self.titles = titles
        self.num_courses = len(titles)
        self.similarity_matrix = similarity_matrix
        self.dept1, self.dept2 = dept.split("-")
        self.scope = scope
        
    def lists_to_dict(self, keys, values):
        dictionary = {}
        for key, value in zip(keys, values):
            dictionary[key] = value
        return dictionary
        
    def createEvaluationMatrix(self):
        """
            Input   : Path to evaluation data, number of courses, list of course_titles
            Returns : A binary Relevance feedback matrix.
        """
        with open(self.filename) as file:
            eval_data = file.read().strip()
            eval_data = eval(eval_data)

        self.eval_matrix = np.zeros((self.num_courses, self.num_courses), dtype=int)
        np.fill_diagonal(self.eval_matrix, 1)

        for pair in eval_data:
            index_1  = self.titles.index(pair[0])
            index_2  = self.titles.index(pair[1])
            self.eval_matrix[index_1][index_2] = 1
            
        self.eval_matrix = np.maximum(self.eval_matrix, self.eval_matrix.T)
        return self.eval_matrix

    def getPrunedMatrix(self, matrix, dept):
        """
            Input   : matrix, list of courses, department code.
            Returns : matrix - department x all departments.
        """
        min_index = 0
        max_index = 0
        for i in range(len(self.titles)):
            if(dept in self.titles[i] and min_index==0):
                min_index = i
            elif(dept in self.titles[i] and min_index!=0):
                max_index = i
            elif(min_index!=0 and max_index!=0):
                break
        return matrix[min_index : max_index+1]

    def getNDCG(self, a, b):
        """
            Input   : The obtained rankings, and expected relevance.
            Returns : nDCG value.
        """
        dcg = 0
        for i in range(0,len(a)):
            dcg += b[a[i]]/math.log2(i+2)

        idcg = 0
        num_relevant = int(np.sum(b))
        for i in range(0,num_relevant):
            idcg += 1/math.log2(i+2)

        return dcg/idcg
    
    def evaluation(self):
        """
            Input   : The similarity matrix and evaluation matrix.
            Returns : Course wise nDCG values and average nDCG values.
        """
        ndcg_vals = []
        num_courses = len(self.eval_matrix.T[0])
        
        for i in range(num_courses):
            if sum(self.eval_matrix[i]) > 0:
                ranked_course_indices = np.argsort(-1*self.similarity_matrix[i])
                if self.scope ==  "one-vs-all":
                    ndcg_vals.append(self.getNDCG(ranked_course_indices, self.eval_matrix[i]))
                else:
                    ndcg_vals.append(self.getNDCG(ranked_course_indices, self.eval_matrix[i]))
            else:
                ndcg_vals.append(float("nan"))

        return ndcg_vals

    def printCourseWiseNDCGValues(self, ndcg_vals, titles):
        """
            Input   : Course wise nDCG values, department titles.
            Prints  : Course wise nDCG values, average nDCG vale.
        """
        for i in range(len(titles)):
            print(titles[i]," - ",ndcg_vals[i])
        count=0
        for i in ndcg_vals:
            if not math.isnan(i):
                count+=1
        print("\n Average nDCG = ", np.nansum(ndcg_vals)/count)
        
        
    def evaluate(self):
        self.eval_matrix = self.createEvaluationMatrix()
        dept_titles = self.getPrunedMatrix(self.titles, self.dept1)
        
        np.fill_diagonal(self.eval_matrix, 0)
        np.fill_diagonal(self.similarity_matrix, 0)
        if self.scope == "all-vs-all":
            pass
        elif self.scope == "one-vs-all":
            self.eval_matrix = self.getPrunedMatrix(self.eval_matrix, self.dept1)
            self.similarity_matrix = self.getPrunedMatrix(self.similarity_matrix, self.dept1)
        elif self.scope == "one-vs-one":
            self.eval_matrix = self.getPrunedMatrix(self.eval_matrix.T, self.dept1).T
            self.eval_matrix = self.getPrunedMatrix(self.eval_matrix, self.dept2)
            
            self.similarity_matrix = self.getPrunedMatrix(self.similarity_matrix.T, self.dept1).T
            self.similarity_matrix = self.getPrunedMatrix(self.similarity_matrix, self.dept2)
        
        ndcg_vals = self.evaluation()
        coursewise_ndcg = self.lists_to_dict(dept_titles, ndcg_vals)
#         self.printCourseWiseNDCGValues(ndcg_vals, dept_titles)
        
        return self.similarity_matrix, self.eval_matrix, coursewise_ndcg
    
    def printing(self, folder, method):
        try:
            cropped_similarity_matrix = np.lad(folder + self.dept + '_' + method + '_similarity_matrix.npy')
            cropped_evaluation_matrix = np.load(folder + self.dept + '_evaluation_matrix.npy')
            cropped_coursewise_ndcg = json.load(open(folder + self.dept + '_' + method + '_coursewise_ndcg.npy', 'r'))
        except:
            cropped_similarity_matrix, cropped_evaluation_matrix, cropped_coursewise_ndcg = self.evaluate()

            np.save(folder + dept + '_' + method + '_similarity_matrix.npy', cropped_similarity_matrix)
            np.save(folder + dept + '_evaluation_matrix.npy', cropped_evaluation_matrix)
            json.dump(cropped_coursewise_ndcg, open(folder + dept + '_' + method + '_coursewise_ndcg.npy', 'w'))

        count, sum_ndcg = 0, 0
        for course in cropped_coursewise_ndcg:
            ndcg_score = cropped_coursewise_ndcg[course]
            print(course, " - ", ndcg_score)
            if not math.isnan(ndcg_score):
                sum_ndcg += ndcg_score
                count+=1
        print("\n Average nDCG = ", sum_ndcg/count)

        return cropped_similarity_matrix, cropped_evaluation_matrix, cropped_coursewise_ndcg

# ESA

In [25]:
class ESA:
    def __init__(self, wiki, documents, queries, mode):
        self.wiki = wiki
        self.documents = documents
        self.queries = queries
        self.mode = mode
        
    def dict_to_list(self, dicti):
        titles, contents = [], []
        for key in dicti:
            titles.append(key)
            contents.append(dicti[key])
        return titles, contents
        
    def document_indexing(self, titles):
        content2index = {}
        index2content = {}
        for i in range(len(titles)):
            content2index[titles[i]] = i
            index2content[i] = titles[i]
        return content2index, index2content
    
    def getTFIDFModel(self, documents):
        """
            Input   : list of document contents.
            Returns : TF-IDF Model and TF-IDF Matrix.
        """
        tfidf_vectorizer = TfidfVectorizer(stop_words='english',use_idf=True)
        tfidf_vectorizer.fit(documents)
        document_tfidf_matrix = tfidf_vectorizer.transform(documents)
        vocab = tfidf_vectorizer.vocabulary_
        return document_tfidf_matrix, vocab
    
    def esa_matrix_computation(self, titles, details, vocab, document_matrix):
        num_courses = len(titles)
        esa_matrix = np.zeros((num_courses, self.num_concepts), dtype=float)
        words_not_in_wiki = []
        for i in range(len(titles)):
            i_content = set(details[i].split())
            for word in i_content:
                if word in self.wiki_vocab:
                    word_index_in_documents = vocab[word]
                    word_index_in_wiki    = self.wiki_vocab[word]
                    documents_tf_idf        = document_matrix[word_index_in_documents, i]
                    esa_matrix[i] += documents_tf_idf * self.term_concept_matrix.T[word_index_in_wiki]
                else:
                    words_not_in_wiki.append(word)
        return esa_matrix
    
    def computation(self):
        wiki_titles, wiki_contents = self.dict_to_list(self.wiki)
#         concept2index, index2concept = self.document_indexing(wiki_titles)
        self.term_concept_matrix, self.wiki_vocab = self.getTFIDFModel(wiki_contents)
        self.num_concepts = len(wiki_titles)
        
        document_titles, document_contents = self.dict_to_list(self.documents)
#         document2index, index2document = self.document_indexing(document_titles)
        term_document_matrix, doc_vocab = self.getTFIDFModel(document_contents)
        
        document_esa_matrix = self.esa_matrix_computation(document_titles, document_contents, doc_vocab, term_document_matrix.T)
        
        if self.mode != "same":
            query_titles, query_contents = self.dict_to_list(self.queries)
            query2index, index2query = self.document_indexing(query_titles)
            term_query_matrix, query_vocab = self.getTFIDFModel(query_contents)
            query_esa_matrix = self.esa_matrix_computation(query_titles, query_contents, query_vocab, term_query_matrix.T)
            return document_esa_matrix, query_esa_matrix, cosine_similarity(document_esa_matrix, query_esa_matrix)
        
        return document_esa_matrix, document_esa_matrix, cosine_similarity(document_esa_matrix)

In [26]:
folder = '../codes/dataset/saved/without_phrases/'

phrases = []
abbreviations = {}
with_stemming = 'no'

course_detail_path = "../codes/dataset/CourseData.csv"
preprocessing = Preprocessing(phrases, abbreviations, with_stemming)
document_content = preprocessing.readCourseData(course_detail_path)
try:
    wiki_document_esa_matrix = np.load(folder + 'wiki_document_esa_matrix.npy')
    wiki_query_esa_matrix = np.load(folder + 'wiki_query_esa_matrix.npy')
    wiki_esa_similarity_matrix = np.load(folder + 'wiki_esa_similarity_matrix.npy')
except:
    wikipedia_articles_path = "../codes/dataset/QueriedWikipediaArticles/"

    base_content = preprocessing.readWikipediaData(wikipedia_articles_path)
    document_content = preprocessing.readCourseData(course_detail_path)
    
    esa = ESA(base_content, document_content, document_content, "same")
    wiki_document_esa_matrix, wiki_query_esa_matrix, wiki_esa_similarity_matrix = esa.computation()

    np.save(folder + 'wiki_document_esa_matrix.npy', wiki_document_esa_matrix)
    np.save(folder + 'wiki_query_esa_matrix.npy', wiki_query_esa_matrix)
    np.save(folder + 'wiki_esa_similarity_matrix.npy', wiki_esa_similarity_matrix)

For CS vs CS courses

In [27]:
folder = '../codes/dataset/saved/without_phrases/'
dept = 'CS-CS'
scope = 'one-vs-one'
method = 'wiki_esa'

eval_matrix_path = "../codes/dataset/Revised_EvaluationData.txt"
titles, _ = ESA({}, {}, {}, "same").dict_to_list(document_content)
wiki_esa_similarity_matrix = np.load(folder + 'wiki_esa_similarity_matrix.npy')

evaluation = Evaluation(eval_matrix_path, titles, wiki_esa_similarity_matrix, dept, scope)
cs_cs_wiki_esa_similarity_matrix, cs_cs_wiki_esa_evaluation_matrix, cs_cs_wiki_esa_coursewise_ndcg = evaluation.printing(folder, method)

CS1100 - Introduction to Programming  -  0.973985563224661
CS1111 - Problem Solving using Computers  -  0.975788054854922
CS2300 - Foundations of Computer Systems Design  -  0.9604489504159912
CS2310 - Foundations of Computer Systems Design Lab  -  0.9519709359886506
CS2700 - Programming and Data Structures  -  0.9183387600570154
CS2710 - Programming and Data Structures Lab  -  0.9696643615373284
CS3100 - Paradigms of Programming  -  0.9206307738894893
CS3300 - Compiler Design  -  0.23540891336663824
CS3500 - Operating Systems  -  0.9709286432396583
CS3700 - Introduction to Database Systems  -  nan
CS5020 - Nonlinear optimisation: Theory and algorithms  -  0.8733336307416327
CS5691 - Pattern Recognition and Machine Learning  -  0.9178260148263059
CS5800 - Advanced Data Structures & Algorithms  -  0.9547851288814769
CS6014 - Computability and Complexity  -  0.8596941277243692
CS6015 - Linear Algebra and Random Processes  -  0.86034433104172
CS6024 - Algorithmic Approaches to Computation

For CS vs All courses

In [31]:
folder = '../codes/dataset/saved/without_phrases/'
dept = 'CS-ALL'
scope = 'one-vs-all'
method = 'wiki_esa'

eval_matrix_path = "../codes/dataset/Revised_EvaluationData.txt"
titles, _ = ESA({}, {}, {}, "same").dict_to_list(document_content)
wiki_esa_similarity_matrix = np.load(folder + 'wiki_esa_similarity_matrix.npy')

evaluation = Evaluation(eval_matrix_path, titles, wiki_esa_similarity_matrix, dept, scope)
cs_all_wiki_esa_similarity_matrix, cs_all_wiki_esa_evaluation_matrix, cs_all_wiki_esa_coursewise_ndcg = evaluation.printing(folder, method)

CS1100 - Introduction to Programming  -  0.9448825279832472
CS1111 - Problem Solving using Computers  -  0.9137326376230489
CS2300 - Foundations of Computer Systems Design  -  0.9514618672915927
CS2310 - Foundations of Computer Systems Design Lab  -  0.8430740355450209
CS2700 - Programming and Data Structures  -  0.9236472308614703
CS2710 - Programming and Data Structures Lab  -  0.9263397875421715
CS3100 - Paradigms of Programming  -  0.8342463567298523
CS3300 - Compiler Design  -  0.6312770006570017
CS3500 - Operating Systems  -  0.8473156139468664
CS3700 - Introduction to Database Systems  -  0.6679417426974276
CS5020 - Nonlinear optimisation: Theory and algorithms  -  0.8815466846328809
CS5691 - Pattern Recognition and Machine Learning  -  0.9306327201623718
CS5800 - Advanced Data Structures & Algorithms  -  0.8805457551189749
CS6014 - Computability and Complexity  -  0.8738940453383541
CS6015 - Linear Algebra and Random Processes  -  0.9523218863834864
CS6024 - Algorithmic Approac

For EE vs EE courses

In [33]:
folder = '../codes/dataset/saved/without_phrases/'
dept = 'EE-EE'
scope = 'one-vs-one'
method = 'wiki_esa'

eval_matrix_path = "../codes/dataset/EE_EvaluationData.txt"
titles, _ = ESA({}, {}, {}, "same").dict_to_list(document_content)
wiki_esa_similarity_matrix = np.load(folder + 'wiki_esa_similarity_matrix.npy')

evaluation = Evaluation(eval_matrix_path, titles, wiki_esa_similarity_matrix, dept, scope)
ee_ee_wiki_esa_similarity_matrix, ee_ee_wiki_esa_evaluation_matrix, ee_ee_wiki_esa_coursewise_ndcg = evaluation.printing(folder, method)

EE1100 - Basic Electrical Engineering  -  0.6212101921659198
EE1102 - Introduction to Programming  -  nan
EE1103 - Numerical Methods  -  0.5249810332008933
EE2015 - Electric Circuits & Networks  -  0.7859852846207472
EE2016 - Microprocessor Theory+Lab  -  nan
EE2025 - Engineering Electromagnetics  -  0.8503449055347546
EE3002 - Analog Circuits  -  0.9186560662081473
EE3003 - Power Systems  -  0.5
EE3004 - Control Engineering  -  0.823515575213343
EE3006 - Principles of Measurement  -  nan
EE3110 - Probability Foundations for Electrical Engineers  -  0.6583856728354838
EE3301 - Introduction to Semiconductor Devices  -  0.9372375429818208
EE3313 - Device Modelling  -  0.9350976551237904
EE4140 - Digital Communication Systems  -  0.9043980816456215
EE4371 - Introduction to Data Structures and Algorithms  -  nan
EE4502 - Optics for Engineers  -  1.0
EE4708 - Data Analytics Laboratory  -  0.43067655807339306
EE5011 - Computer Methods in Electrical Engineering  -  0.6309297535714575
EE5110 -

For EE vs All courses

In [34]:
folder = '../codes/dataset/saved/without_phrases/'
dept = 'EE-ALL'
scope = 'one-vs-all'
method = 'wiki_esa'

eval_matrix_path = "../codes/dataset/EE_EvaluationData.txt"
titles, _ = ESA({}, {}, {}, "same").dict_to_list(document_content)
wiki_esa_similarity_matrix = np.load(folder + 'wiki_esa_similarity_matrix.npy')

evaluation = Evaluation(eval_matrix_path, titles, wiki_esa_similarity_matrix, dept, scope)
ee_all_wiki_esa_similarity_matrix, ee_all_wiki_esa_evaluation_matrix, ee_all_wiki_esa_coursewise_ndcg = evaluation.printing(folder, method)

EE1100 - Basic Electrical Engineering  -  0.7168870397122132
EE1102 - Introduction to Programming  -  0.8554425550666684
EE1103 - Numerical Methods  -  0.8490218563706889
EE2015 - Electric Circuits & Networks  -  0.7774223325720087
EE2016 - Microprocessor Theory+Lab  -  0.9407662247971147
EE2025 - Engineering Electromagnetics  -  0.7883652551752491
EE3002 - Analog Circuits  -  0.901550651686529
EE3003 - Power Systems  -  0.5
EE3004 - Control Engineering  -  0.5894709850015082
EE3006 - Principles of Measurement  -  nan
EE3110 - Probability Foundations for Electrical Engineers  -  0.9049290155679591
EE3301 - Introduction to Semiconductor Devices  -  0.9790270241138814
EE3313 - Device Modelling  -  0.9296259785920711
EE4140 - Digital Communication Systems  -  0.8144393422063176
EE4371 - Introduction to Data Structures and Algorithms  -  0.9918291064614978
EE4502 - Optics for Engineers  -  0.900026351878831
EE4708 - Data Analytics Laboratory  -  0.8419451045923748
EE5011 - Computer Methods

# esa with other Universities contents

In [53]:
folder = '../codes/dataset/saved/without_phrases/'

phrases = []
abbreviations = {}
with_stemming = 'no'

course_detail_path = "../codes/dataset/CourseData.csv"
preprocessing = Preprocessing(phrases, abbreviations, with_stemming)
document_content = preprocessing.readCourseData(course_detail_path)
try:
    other_university_document_esa_matrix = np.load(folder + 'other_university_document_esa_matrix.npy')
    other_university_query_esa_matrix = np.load(folder + 'other_university_query_esa_matrix.npy')
    other_university_esa_similarity_matrix = np.load(folder + 'other_university_esa_similarity_matrix.npy')
except:
    base_path = '../codes/dataset/other_unversities_contents.json'
    base_content = preprocessing.readDictionary(base_path)

    esa = ESA(base_content, document_content, document_content, "same")
    other_university_document_esa_matrix, other_university_query_esa_matrix, other_university_esa_similarity_matrix = esa.computation()

    np.save(folder + 'other_university_document_esa_matrix.npy', other_university_document_esa_matrix)
    np.save(folder + 'other_university_query_esa_matrix.npy', other_university_query_esa_matrix)
    np.save(folder + 'other_university_esa_similarity_matrix.npy', other_university_esa_similarity_matrix)

For CS vs CS courses

In [118]:
folder = '../codes/dataset/saved/without_phrases/'
dept = 'CS-CS'
scope = 'one-vs-one'
method = 'other_university_esa'

eval_matrix_path = "../codes/dataset/Revised_EvaluationData.txt"
titles, _ = ESA({}, {}, {}, "same").dict_to_list(document_content)
other_university_esa_similarity_matrix = np.load(folder + 'other_university_esa_similarity_matrix.npy')

evaluation = Evaluation(eval_matrix_path, titles, other_university_esa_similarity_matrix, dept, scope)
cs_cs_other_university_esa_similarity_matrix, cs_cs_other_university_esa_evaluation_matrix, cs_cs_other_university_esa_coursewise_ndcg = evaluation.printing(folder, method)

CS1100 - Introduction to Programming  -  0.983303248698947
CS1111 - Problem Solving using Computers  -  0.9747429528567879
CS2300 - Foundations of Computer Systems Design  -  0.9668342083190152
CS2310 - Foundations of Computer Systems Design Lab  -  0.9485379968639579
CS2700 - Programming and Data Structures  -  0.9407574315943523
CS2710 - Programming and Data Structures Lab  -  0.9491546172962253
CS3100 - Paradigms of Programming  -  0.9242560749182924
CS3300 - Compiler Design  -  0.23137821315975915
CS3500 - Operating Systems  -  1.0
CS3700 - Introduction to Database Systems  -  nan
CS5020 - Nonlinear optimisation: Theory and algorithms  -  0.8756595247647427
CS5691 - Pattern Recognition and Machine Learning  -  0.9731203984025375
CS5800 - Advanced Data Structures & Algorithms  -  0.9572022199008637
CS6014 - Computability and Complexity  -  0.8723047062986427
CS6015 - Linear Algebra and Random Processes  -  0.9469024295259745
CS6024 - Algorithmic Approaches to Computational Biology  

In [56]:
folder = '../codes/dataset/saved/without_phrases/'
dept = 'CS-ALL'
scope = 'one-vs-all'
method = 'other_university_esa'

eval_matrix_path = "../codes/dataset/Revised_EvaluationData.txt"
titles, _ = ESA({}, {}, {}, "same").dict_to_list(document_content)
other_university_esa_similarity_matrix = np.load(folder + 'other_university_esa_similarity_matrix.npy')

evaluation = Evaluation(eval_matrix_path, titles, other_university_esa_similarity_matrix, dept, scope)
cs_all_other_university_esa_similarity_matrix, cs_all_other_university_esa_evaluation_matrix, cs_all_other_university_esa_coursewise_ndcg = evaluation.printing(folder, method)

CS1100 - Introduction to Programming  -  0.9643250515144093
CS1111 - Problem Solving using Computers  -  0.9515218318032107
CS2300 - Foundations of Computer Systems Design  -  0.9049198721403465
CS2310 - Foundations of Computer Systems Design Lab  -  0.846575407481123
CS2700 - Programming and Data Structures  -  0.9298450693204839
CS2710 - Programming and Data Structures Lab  -  0.898359252361489
CS3100 - Paradigms of Programming  -  0.8480003543575312
CS3300 - Compiler Design  -  0.31002180716128847
CS3500 - Operating Systems  -  0.9619991470595832
CS3700 - Introduction to Database Systems  -  0.7030069601745773
CS5020 - Nonlinear optimisation: Theory and algorithms  -  0.9060510245274176
CS5691 - Pattern Recognition and Machine Learning  -  0.9424055584553004
CS5800 - Advanced Data Structures & Algorithms  -  0.9012774908849868
CS6014 - Computability and Complexity  -  0.8291451474534715
CS6015 - Linear Algebra and Random Processes  -  0.9553626677401832
CS6024 - Algorithmic Approach

For EE vs EE courses

In [57]:
folder = '../codes/dataset/saved/without_phrases/'
dept = 'EE-EE'
scope = 'one-vs-one'
method = 'other_university_esa'

eval_matrix_path = "../codes/dataset/EE_EvaluationData.txt"
titles, _ = ESA({}, {}, {}, "same").dict_to_list(document_content)
other_university_esa_similarity_matrix = np.load(folder + 'other_university_esa_similarity_matrix.npy')

evaluation = Evaluation(eval_matrix_path, titles, wiki_esa_similarity_matrix, dept, scope)
ee_ee_other_university_esa_similarity_matrix, ee_ee_other_university_esa_evaluation_matrix, ee_ee_other_university_esa_coursewise_ndcg = evaluation.printing(folder, method)

EE1100 - Basic Electrical Engineering  -  0.6212101921659198
EE1102 - Introduction to Programming  -  nan
EE1103 - Numerical Methods  -  0.5249810332008933
EE2015 - Electric Circuits & Networks  -  0.7859852846207472
EE2016 - Microprocessor Theory+Lab  -  nan
EE2025 - Engineering Electromagnetics  -  0.8503449055347546
EE3002 - Analog Circuits  -  0.9186560662081473
EE3003 - Power Systems  -  0.5
EE3004 - Control Engineering  -  0.823515575213343
EE3006 - Principles of Measurement  -  nan
EE3110 - Probability Foundations for Electrical Engineers  -  0.6583856728354838
EE3301 - Introduction to Semiconductor Devices  -  0.9372375429818208
EE3313 - Device Modelling  -  0.9350976551237904
EE4140 - Digital Communication Systems  -  0.9043980816456215
EE4371 - Introduction to Data Structures and Algorithms  -  nan
EE4502 - Optics for Engineers  -  1.0
EE4708 - Data Analytics Laboratory  -  0.43067655807339306
EE5011 - Computer Methods in Electrical Engineering  -  0.6309297535714575
EE5110 -

For EE vs All courses

In [58]:
folder = '../codes/dataset/saved/without_phrases/'
dept = 'EE-ALL'
scope = 'one-vs-all'
method = 'other_university_esa'

eval_matrix_path = "../codes/dataset/EE_EvaluationData.txt"
titles, _ = ESA({}, {}, {}, "same").dict_to_list(document_content)
other_university_esa_similarity_matrix = np.load(folder + 'other_university_esa_similarity_matrix.npy')

evaluation = Evaluation(eval_matrix_path, titles, wiki_esa_similarity_matrix, dept, scope)
ee_all_other_university_esa_similarity_matrix, ee_all_other_university_esa_evaluation_matrix, ee_all_other_university_esa_coursewise_ndcg = evaluation.printing(folder, method)

EE1100 - Basic Electrical Engineering  -  0.7168870397122132
EE1102 - Introduction to Programming  -  0.8554425550666684
EE1103 - Numerical Methods  -  0.8490218563706889
EE2015 - Electric Circuits & Networks  -  0.7774223325720087
EE2016 - Microprocessor Theory+Lab  -  0.9407662247971147
EE2025 - Engineering Electromagnetics  -  0.7883652551752491
EE3002 - Analog Circuits  -  0.901550651686529
EE3003 - Power Systems  -  0.5
EE3004 - Control Engineering  -  0.5894709850015082
EE3006 - Principles of Measurement  -  nan
EE3110 - Probability Foundations for Electrical Engineers  -  0.9049290155679591
EE3301 - Introduction to Semiconductor Devices  -  0.9790270241138814
EE3313 - Device Modelling  -  0.9296259785920711
EE4140 - Digital Communication Systems  -  0.8144393422063176
EE4371 - Introduction to Data Structures and Algorithms  -  0.9918291064614978
EE4502 - Optics for Engineers  -  0.900026351878831
EE4708 - Data Analytics Laboratory  -  0.8419451045923748
EE5011 - Computer Methods

# with phrases other Universities contents ESA¶

In [63]:
folder = '../codes/dataset/Dictionary_text/'
with open(folder+'Complete_Glossary1.txt', 'r', encoding='utf-8') as file:
    glossary = file.readlines()

ps = PorterStemmer()

phrases = []
for word in glossary:
    phrases.append(ps.stem(word.replace('\n', '')))

In [64]:
folder = '../codes/dataset/saved/with_phrases/'

# phrases = []
abbreviations = {}
with_stemming = 'no'

course_detail_path = "../codes/dataset/CourseData.csv"
preprocessing = Preprocessing(phrases, abbreviations, with_stemming)
try:
    document_content = jsonload(folder + 'Course_Data_with_phrases.json')
except:
    document_content = preprocessing.readCourseData(course_detail_path)
    jsondump(document_content, folder + 'Course_Data_with_phrases.json')
    
try:
    other_university_document_esa_matrix = np.load(folder + 'other_university_document_esa_matrix.npy')
    other_university_query_esa_matrix = np.load(folder + 'other_university_query_esa_matrix.npy')
    other_university_esa_similarity_matrix = np.load(folder + 'other_university_esa_similarity_matrix.npy')
except:
    try:
        base_content = jsonload(folder + 'other_unversities_contents_with_phrases.json')
    except:
        base_path = '../codes/dataset/other_unversities_contents.json'
        base_content = preprocessing.readDictionary(base_path)
        jsondump(base_content, folder + 'other_unversities_contents_with_phrases.json')

    esa = ESA(base_content, document_content, document_content, "same")
    other_university_document_esa_matrix, other_university_query_esa_matrix, other_university_esa_similarity_matrix = esa.computation()

    np.save(folder + 'other_university_document_esa_matrix.npy', other_university_document_esa_matrix)
    np.save(folder + 'other_university_query_esa_matrix.npy', other_university_query_esa_matrix)
    np.save(folder + 'other_university_esa_similarity_matrix.npy', other_university_esa_similarity_matrix)

# For CS vs CS courses

folder = '../codes/dataset/saved/with_phrases/'
dept = 'CS-CS'
scope = 'one-vs-one'
method = 'other_university_esa'

eval_matrix_path = "../codes/dataset/Revised_EvaluationData.txt"
titles, _ = ESA({}, {}, {}, "same").dict_to_list(document_content)
wiki_esa_similarity_matrix = np.load(folder + 'other_university_esa_similarity_matrix.npy')

evaluation = Evaluation(eval_matrix_path, titles, other_university_esa_similarity_matrix, dept, scope)
cs_cs_other_university_esa_similarity_matrix, cs_cs_other_university_esa_evaluation_matrix, cs_cs_other_university_esa_coursewise_ndcg = evaluation.printing(folder, method)

# For CS vs ALL courses

folder = '../codes/dataset/saved/with_phrases/'
dept = 'CS-ALL'
scope = 'one-vs-all'
method = 'other_university_esa'

eval_matrix_path = "../codes/dataset/Revised_EvaluationData.txt"
titles, _ = ESA({}, {}, {}, "same").dict_to_list(document_content)
wiki_esa_similarity_matrix = np.load(folder + 'other_university_esa_similarity_matrix.npy')

evaluation = Evaluation(eval_matrix_path, titles, other_university_esa_similarity_matrix, dept, scope)
cs_all_other_university_esa_similarity_matrix, cs_all_other_university_esa_evaluation_matrix, cs_all_other_university_esa_coursewise_ndcg = evaluation.printing(folder, method)

# For EE vs EE courses

folder = '../codes/dataset/saved/with_phrases/'
dept = 'EE-EE'
scope = 'one-vs-one'
method = 'other_university_esa'

eval_matrix_path = "../codes/dataset/EE_EvaluationData.txt"
titles, _ = ESA({}, {}, {}, "same").dict_to_list(document_content)
wiki_esa_similarity_matrix = np.load(folder + 'other_university_esa_similarity_matrix.npy')

evaluation = Evaluation(eval_matrix_path, titles, wiki_esa_similarity_matrix, dept, scope)
ee_ee_other_university_esa_similarity_matrix, ee_ee_other_university_esa_evaluation_matrix, ee_ee_other_university_esa_coursewise_ndcg = evaluation.printing(folder, method)

# For EE vs All courses

folder = '../codes/dataset/saved/with_phrases/'
dept = 'EE-ALL'
scope = 'one-vs-all'
method = 'other_university_esa'

eval_matrix_path = "../codes/dataset/EE_EvaluationData.txt"
titles, _ = ESA({}, {}, {}, "same").dict_to_list(document_content)
wiki_esa_similarity_matrix = np.load(folder + 'other_university_esa_similarity_matrix.npy')

evaluation = Evaluation(eval_matrix_path, titles, wiki_esa_similarity_matrix, dept, scope)
ee_all_other_university_esa_similarity_matrix, ee_all_other_university_esa_evaluation_matrix, ee_all_other_university_esa_coursewise_ndcg = evaluation.printing(folder, method)

CS1100 - Introduction to Programming  -  0.983303248698947
CS1111 - Problem Solving using Computers  -  0.9669850519151657
CS2300 - Foundations of Computer Systems Design  -  0.9668342083190152
CS2310 - Foundations of Computer Systems Design Lab  -  0.9455856780856265
CS2700 - Programming and Data Structures  -  0.9210504394261853
CS2710 - Programming and Data Structures Lab  -  0.9491546172962253
CS3100 - Paradigms of Programming  -  0.9391664772645913
CS3300 - Compiler Design  -  0.23137821315975915
CS3500 - Operating Systems  -  1.0
CS3700 - Introduction to Database Systems  -  nan
CS5020 - Nonlinear optimisation: Theory and algorithms  -  0.890235195578859
CS5691 - Pattern Recognition and Machine Learning  -  0.9591390361068151
CS5800 - Advanced Data Structures & Algorithms  -  0.9559667565543964
CS6014 - Computability and Complexity  -  0.8843252268199594
CS6015 - Linear Algebra and Random Processes  -  0.9469024295259745
CS6024 - Algorithmic Approaches to Computational Biology  -

# with phrases wikipedia ESA

In [65]:
folder = '../codes/dataset/Dictionary_text/'
with open(folder+'Complete_Glossary1.txt', 'r', encoding='utf-8') as file:
    glossary = file.readlines()

ps = PorterStemmer()
    
phrases = []
for word in glossary:
    phrases.append(ps.stem(word.replace('\n', '')))

In [66]:
folder = '../codes/dataset/saved/with_phrases/'

# phrases = []
abbreviations = {}
with_stemming = 'no'

course_detail_path = "../codes/dataset/CourseData.csv"
preprocessing = Preprocessing(phrases, abbreviations, with_stemming)
try:
    document_content = jsonload(folder + 'Course_Data_with_phrases.json')
except:
    document_content = preprocessing.readCourseData(course_detail_path)
    jsondump(document_content, folder + 'Course_Data_with_phrases.json')
    
try:
    wiki_document_esa_matrix = np.load(folder + 'wiki_document_esa_matrix.npy')
    wiki_query_esa_matrix = np.load(folder + 'wiki_query_esa_matrix.npy')
    wiki_esa_similarity_matrix = np.load(folder + 'wiki_esa_similarity_matrix.npy')
except:
    try:
        base_content = jsonload(folder + 'Wikipedia_Articles_with_phrases.json')
    except:
        wikipedia_articles_path = "../codes/dataset/QueriedWikipediaArticles/"
        base_content = preprocessing.readWikipediaData(wikipedia_articles_path)
        jsondump(base_content, folder + 'Wikipedia_Articles_with_phrases.json')

    
    esa = ESA(base_content, document_content, document_content, "same")
    wiki_document_esa_matrix, wiki_query_esa_matrix, wiki_esa_similarity_matrix = esa.computation()

    np.save(folder + 'wiki_document_esa_matrix.npy', wiki_document_esa_matrix)
    np.save(folder + 'wiki_query_esa_matrix.npy', wiki_query_esa_matrix)
    np.save(folder + 'wiki_esa_similarity_matrix.npy', wiki_esa_similarity_matrix)

# For CS vs CS courses

folder = '../codes/dataset/saved/with_phrases/'
dept = 'CS-CS'
scope = 'one-vs-one'
method = 'wiki_esa'

eval_matrix_path = "../codes/dataset/Revised_EvaluationData.txt"
titles, _ = ESA({}, {}, {}, "same").dict_to_list(document_content)
wiki_esa_similarity_matrix = np.load(folder + 'wiki_esa_similarity_matrix.npy')

evaluation = Evaluation(eval_matrix_path, titles, wiki_esa_similarity_matrix, dept, scope)
cs_cs_wiki_esa_similarity_matrix, cs_cs_wiki_esa_evaluation_matrix, cs_cs_wiki_esa_coursewise_ndcg = evaluation.printing(folder, method)

# For CS vs All courses

folder = '../codes/dataset/saved/with_phrases/'
dept = 'CS-ALL'
scope = 'one-vs-all'
method = 'wiki_esa'

eval_matrix_path = "../codes/dataset/Revised_EvaluationData.txt"
titles, _ = ESA({}, {}, {}, "same").dict_to_list(document_content)
wiki_esa_similarity_matrix = np.load(folder + 'wiki_esa_similarity_matrix.npy')

evaluation = Evaluation(eval_matrix_path, titles, wiki_esa_similarity_matrix, dept, scope)
cs_all_wiki_esa_similarity_matrix, cs_all_wiki_esa_evaluation_matrix, cs_all_wiki_esa_coursewise_ndcg = evaluation.printing(folder, method)

# For EE vs EE courses

folder = '../codes/dataset/saved/with_phrases/'
dept = 'EE-EE'
scope = 'one-vs-one'
method = 'wiki_esa'

eval_matrix_path = "../codes/dataset/EE_EvaluationData.txt"
titles, _ = ESA({}, {}, {}, "same").dict_to_list(document_content)
wiki_esa_similarity_matrix = np.load(folder + 'wiki_esa_similarity_matrix.npy')

evaluation = Evaluation(eval_matrix_path, titles, wiki_esa_similarity_matrix, dept, scope)
ee_ee_wiki_esa_similarity_matrix, ee_ee_wiki_esa_evaluation_matrix, ee_ee_wiki_esa_coursewise_ndcg = evaluation.printing(folder, method)

# For EE vs All courses

folder = '../codes/dataset/saved/with_phrases/'
dept = 'EE-ALL'
scope = 'one-vs-all'
method = 'wiki_esa'

eval_matrix_path = "../codes/dataset/EE_EvaluationData.txt"
titles, _ = ESA({}, {}, {}, "same").dict_to_list(document_content)
wiki_esa_similarity_matrix = np.load(folder + 'wiki_esa_similarity_matrix.npy')

evaluation = Evaluation(eval_matrix_path, titles, wiki_esa_similarity_matrix, dept, scope)
ee_all_wiki_esa_similarity_matrix, ee_all_wiki_esa_evaluation_matrix, ee_all_wiki_esa_coursewise_ndcg = evaluation.printing(folder, method)

CS1100 - Introduction to Programming  -  0.9766196540579892
CS1111 - Problem Solving using Computers  -  0.975788054854922
CS2300 - Foundations of Computer Systems Design  -  0.9634012691943225
CS2310 - Foundations of Computer Systems Design Lab  -  0.9284591831945489
CS2700 - Programming and Data Structures  -  0.9152674746510859
CS2710 - Programming and Data Structures Lab  -  0.9603909229238758
CS3100 - Paradigms of Programming  -  0.9391222304037579
CS3300 - Compiler Design  -  0.23981246656813146
CS3500 - Operating Systems  -  0.9619991470595832
CS3700 - Introduction to Database Systems  -  nan
CS5020 - Nonlinear optimisation: Theory and algorithms  -  0.9024049875019746
CS5691 - Pattern Recognition and Machine Learning  -  0.9178260148263059
CS5800 - Advanced Data Structures & Algorithms  -  0.9547851288814769
CS6014 - Computability and Complexity  -  0.8723047062986427
CS6015 - Linear Algebra and Random Processes  -  0.9134015924715543
CS6024 - Algorithmic Approaches to Computat