## Load data

In [7]:
import os

# read the file containing all articles saved in this folder
file = open(os.getcwd()+"/AA.txt",  'r', encoding="utf-8")
text = file.read().strip()
file.close()
#print(text)

import re

dataset = []
# iterate through all the articles and get the id, url, name and text from all the articles

# use simple regular expressions to retrieve the article id, url, title and body.
article_id = re.findall('<doc id="(.*)" url', text)
article_url = re.findall(' url="(.*)" title', text)
article_title = re.findall(' title="(.*)">', text)

article_body =[]
text = re.sub("\n"," ",text)
text = re.sub(" +"," ",text)
text = re.sub("\'","'",text) # issue created by previous substitutions
for title in article_title[:250]: # we can only use 250 articles else we run out of memory
    regex = title+'"> '+title+'(.*) </doc>'
    body = re.findall(regex, text)
    article_body.append(body)
for a in range(len(article_body)):
    try:
        article_body[a] = article_body[a[0]].split('</doc>', 1)[0] # cut off at right ending
    except: 
        article_body[a] ='' # empty article
    
print(len(article_id),len(article_url),len(article_title),len(article_body))
print(article_id[:10])
print(article_url[:3])
print(article_title)
print(article_body[:3])

# create 4-tuple with meta-info per article
for j in range(len(article_body)):
    dataset.append((article_id[j],article_url[j],article_title[j],article_body[j]))     
        
N = len(dataset)

1479 1479 1479 250
['1701', '1702', '1710', '1711', '1714', '1715', '1716', '1717', '1718', '1719']
['https://en.wikipedia.org/wiki?curid=1701', 'https://en.wikipedia.org/wiki?curid=1702', 'https://en.wikipedia.org/wiki?curid=1710']
['Amazon River', 'Alfred of Beverley', 'April 22', 'August 31', 'Autpert Ambrose', 'Abu Bakr', 'Ambrose Traversari', 'Ambrosians', 'Ambrosiaster', 'Ambrosius Aurelianus', 'Ammon', 'Ammonius Hermiae', 'Ammonius Saccas', 'Book of Amos', 'Amphipolis', 'Amram', 'Amyntas I of Macedon', 'Amyntas III of Macedon', 'Anacharsis', 'Anah', 'Ānanda', 'Anaxagoras', 'Anaxarchus', 'Ancyra (genus)', 'Anastasius I', 'Anastasius II', 'Anaximenes of Lampsacus', 'Anastasius', 'Auguste and Louis Lumière', 'Acts of the Apostles', 'Assyria', 'Abijah', 'Ark', 'Aphasia', 'Aorta', 'Abimelech', 'Andrew Tridgell', 'Applesoft BASIC', 'Asterix', 'Arizona Cardinals', 'Atlanta Falcons', 'Heathenry in the United States', 'Ansible', 'Amiga', 'Absorption', 'Actinophryid', 'Abel Tasman', 'Augu

## Clean data

In [8]:
import numpy as np

def convert_lower_case(data):
    return np.char.lower(data)

from nltk.corpus import stopwords
from nltk import word_tokenize
#nltk.download('stopwords')

def remove_stop_words(data):
    stop_words = stopwords.words('english')
    words = word_tokenize(str(data))
    # iterate over all the stop words and not append to the list if it’s a stop word
    new_text = ""
    for word in words:
        if word not in stop_words and len(word) > 1: # remove stop words and single characters
            new_text = new_text + " " + word
    return new_text

def remove_punctuation(data):
    symbols = "!\"#$%&()*+-./:;<=>?@[\]^_`{|}~\n"
    for i in range(len(symbols)):
        data = np.char.replace(data, symbols[i], ' ') # remove every occurence of this symbol
        data = np.char.replace(data, "  ", " ") # remove extra spaces
    data = np.char.replace(data, ',', '') #remove comma seperately?
    return data

def remove_apostrophe(data):
    return np.char.replace(data, "'", "") #seperate?

from nltk.stem.porter import *
from nltk.stem import PorterStemmer

def stemming(data): # reduce words to its stem
    stemmer= PorterStemmer() # rule-based stemmer, identifies and removes the suffix or affix of a word
    
    tokens = word_tokenize(str(data))
    new_text = ""
    for w in tokens:
        new_text = new_text + " " + stemmer.stem(w)
    return new_text
# A better efficient way is to first lemmatise and then stem

import num2words

def convert_numbers(data):
    tokens = word_tokenize(str(data))
    new_text = ""
    for w in tokens:
        try:
            w = num2words(int(w),lang='en')
        except:
            a = 0
        new_text = new_text + " " + w
    new_text = np.char.replace(new_text, "-", " ")
    return new_text

In [9]:
def preprocess(data):
    data = convert_lower_case(data)
    data = remove_punctuation(data) #remove comma seperately
    data = remove_apostrophe(data)
    data = remove_stop_words(data)
    data = convert_numbers(data)
    #data = stemming(data)
    data = remove_punctuation(data)
    data = convert_numbers(data)
    #data = stemming(data) #needed again as we need to stem the words
    data = remove_punctuation(data) #needed again as num2word is giving few hypens and commas fourty-one
    data = remove_stop_words(data) #needed again as num2word is giving stop words 101 - one hundred and one
    return data

In [10]:
processed_text = []
processed_title = []
for i in dataset:
    processed_text.append(word_tokenize(str(preprocess(i[3]))))
    processed_title.append(word_tokenize(str(preprocess(i[2]))))

## Build model

In [11]:
DF = {}
# iterate through all the words in all the documents and store the document id’s for each word.
for i in range(N):
    tokens = processed_text[i] # body of the document
    for w in tokens:
        try:
            DF[w].add(i)
        except:
            DF[w] = {i}

    tokens = processed_title[i]
    for w in tokens:
        try:
            DF[w].add(i)
        except:
            DF[w] = {i}
            
for i in DF:
    DF[i] = len(DF[i]) # unique words, we don’t actually need the list of docs, we just need the count

total_vocab_size = len(DF)
total_vocab = [x for x in DF]

#getter
def doc_freq(word):
    try:
        return DF[word]
    except:
        return 0

In [12]:
# let’s use dictionary with (document, token) pair as key and any TF-IDF score as the value
# tf_idf dictionary is for body, we will use the same logic for to build a dictionary tf_idf_title for the words in title.
from collections import Counter

# Calculate TF-IDF for Body for all docs
doc = 0
tf_idf = {}
#iterate over all documents
for i in range(N):  
    tokens = processed_text[i]
    counter = Counter(tokens + processed_title[i])  
    words_count = len(tokens + processed_title[i])
    # Counter can give us the frequency of the tokens, calculate tf and idf and finally store as a (doc, token) pair in tf_idf.
    for token in np.unique(tokens):      
        tf = counter[token]/words_count
        df = doc_freq(token)
        idf = np.log((N+1)/(df+1)) #numerator is added 1 to avoid negative values
        tf_idf[doc, token] = tf*idf
    doc += 1

# Calculate TF-IDF for title for all docs   
doc = 0
tf_idf_title = {}
for i in range(N):
    tokens = processed_title[i]
    counter = Counter(tokens + processed_text[i])
    words_count = len(tokens + processed_text[i])
    for token in np.unique(tokens):
        tf = counter[token]/words_count
        df = doc_freq(token)
        idf = np.log((N+1)/(df+1)) #numerator is added 1 to avoid negative values
        tf_idf_title[doc, token] = tf*idf
    doc += 1

In [13]:
alpha = 0.3
# Merging the TF-IDF according to weights
# multiply the Body TF-IDF with alpha
for i in tf_idf:
    tf_idf[i] *= alpha
# Iterate Title IF-IDF for every (doc, token)
# if token is in body, replace the Body(doc, token) value with the value in Title(doc, token)
for i in tf_idf_title:
    tf_idf[i] = tf_idf_title[i]

## Ranking

In [14]:
# (pre)process question and print
def query_tokenize(query):
    preprocessed_query = preprocess(query)
    tokens = word_tokenize(str(preprocessed_query))
    
    print("\nQuery:", query)
    print(tokens)
    print("")
    
    return tokens

#### option 1:

In [15]:
# theoretical concept: add tf_idf values of the tokens that are in query for every document.
# Iterate over all values in the dictionary and check if the value is present in the token.
# As our dictionary is a (document, token) key, when we find a token which is in query we will
# add the document id to another dictionary along with the tf-idf value
def matching_score(k, query):
    print(">>Matching score")
    
    tokens = query_tokenize(query)
    
    query_weights = {}
    for key in tf_idf:  
        if key[1] in tokens:
            try:
                query_weights[key[0]] += tf_idf[key]
            except:
                query_weights[key[0]] = tf_idf[key]
    
    # return the top k articles
    query_weights = sorted(query_weights.items(), key=lambda x: x[1], reverse=True) # sort by score/relevance
    ranking =[]
    print("Most related articles:")
    for i in query_weights[:k]: # i[0] is index of article in dataset
        print(dataset[i[0]][0], dataset[i[0]][2], i[1]) # print article_id, article_title and score
        ranking.append(dataset[i[0]][2]) # article_title
    return ranking

#### option 2:

In [16]:
import math

# vectorize documents
D = np.zeros((N, total_vocab_size))
for i in tf_idf:
    try:
        ind = total_vocab.index(i[1]) # generate a index for each token
        D[i[0]][ind] = tf_idf[i] # document vectors
    except:
        pass

# vectorize query
def query_vectorize(tokens):
    Q = np.zeros((len(total_vocab)))
    counter = Counter(tokens)
    words_count = len(tokens)
    query_weights = {}
    for token in np.unique(tokens):
        tf = counter[token]/words_count
        df = doc_freq(token)
        idf = math.log((N+1)/(df+1))
        try:
            ind = total_vocab.index(token)
            Q[ind] = tf*idf
        except:
            pass
    return Q

In [22]:
def cosine_similarity(k, query):
    print(">>Cosine Similarity")
    
    tokens = query_tokenize(query)
    
    query_vector = query_vectorize(tokens)
    
    # measure similarity
    d_cosines = []
    for d in D:
        similarity = np.dot(query_vector, d)/(np.linalg.norm(query_vector)*np.linalg.norm(d))
        if math.isnan(similarity):
            d_cosines.append(0.0)
        else:
            d_cosines.append(similarity)

    # return the top k articles
    ranking = np.array(d_cosines).argsort()[-k:][::-1].tolist() 
    
    print("Most related articles:")
    for i in ranking:
        print(dataset[i][0], dataset[i][2], d_cosines[i]) # print article_id, article_title and score
    return [dataset[i][2] for i in ranking]

## Test

In [45]:
# create tuples of test cases (question, correct article)
questions = [("What is anarchism?","Anarchism"),("What does ASD stand for?","Autism"),("From what age is Autism diagnosable?","Autism"),("Is there a cure for Autism?","Autism"),("Can Autism be treated?","Autism"),("Where is Achilles famous for?","Achilles"),("What is the difference between antisymmetry and asymmetry?","Antisymmetric relation"),("What are the green stones from the Amazon River called?","Amazonite"),("What is the largest river in the world?","Amazon River"),("What is the translation of 'acropolis'","Acropolis")]
print(len(questions))
X = [pair[0] for pair in questions]
print(X)
y = [pair[1] for pair in questions]
print(y)

10
['What is anarchism?', 'What does ASD stand for?', 'From what age is Autism diagnosable?', 'Is there a cure for Autism?', 'Can Autism be treated?', 'Where is Achilles famous for?', 'What is the difference between antisymmetry and asymmetry?', 'What are the green stones from the Amazon River called?', 'What is the largest river in the world?', "What is the translation of 'acropolis'"]
['Anarchism', 'Autism', 'Autism', 'Autism', 'Autism', 'Achilles', 'Antisymmetric relation', 'Amazonite', 'Amazon River', 'Acropolis']


In [46]:
rankings_ms =[]
rankings_cs =[]
for question in X: # retrieve 10 most related articles
    ranking_ms = matching_score(10,question)
    rankings_ms.append(ranking_ms)
    ranking_cs = cosine_similarity(10,question)
    rankings_cs.append(ranking_cs)

>>Matching score

Query: What is anarchism?
['anarchism']

Most related articles:
12 Anarchism 4.832305758571839
>>Cosine Similarity

Query: What is anarchism?
['anarchism']

Most related articles:
12 Anarchism 1.0
2487 Amazonite 0.0
2607 Antoine Thomson d'Abbadie 0.0
2624 Aegean civilization 0.0
2623 Aegadian Islands 0.0
2622 Aedui 0.0
2621 Aedicula 0.0
2620 Aedesius 0.0
2619 Aeclanum 0.0
2618 Aeacus 0.0
>>Matching score

Query: What does ASD stand for?
['asd', 'stand']

Most related articles:
>>Cosine Similarity

Query: What does ASD stand for?
['asd', 'stand']

Most related articles:
2487 Amazonite 0.0
2607 Antoine Thomson d'Abbadie 0.0
2626 Aegeus 0.0
2624 Aegean civilization 0.0
2623 Aegadian Islands 0.0
2622 Aedui 0.0
2621 Aedicula 0.0
2620 Aedesius 0.0
2619 Aeclanum 0.0
2618 Aeacus 0.0
>>Matching score

Query: From what age is Autism diagnosable?
['age', 'autism', 'diagnosable']

Most related articles:
25 Autism 4.832305758571839
>>Cosine Similarity

Query: From what age is Auti

  # This is added back by InteractiveShellApp.init_path()


The cosine similarity always spits out 10 articles. When all similarity scores are 0.0, the list seems partially random, apart from the fact that it is every time the same articles in the same order. The "most related" article, Amazonite, is namely neither the longest nor the shortest article; the same holds for the other articles in the list. Furthermore Amazonite is somewhere in the middle of the corpus, and not followed by any of the other articles from the ranking. However, the remaining 9 titles are subsequent in the dataset, as expected from the id's.

The matching score only outputs one or more articles if it is sufficiently confident that it is relevant.

In [40]:
def score(y_correct,y_predicted):
    correct = 0
    score = 0
    for i in range(len(y_correct)): # test case i
        if y_correct[i] in y_predicted[i]: # in top 10
            correct+=1
            score+=y_predicted[i].index(y_correct[i]) # the lower the index the higher ranked
        else:
            score +=11 # at least not in top 10
    print("Correctly retrieved:", correct, "out of", len(y_correct), ". Minimized score:", score)       

In [48]:
print("Matching score performance:")
score(y,rankings_ms)
print("Cosine similarity performance:")
score(y,rankings_cs)

Matching score performance:
Correctly retrieved: 6 out of 10 . Minimized score: 44
Cosine similarity performance:
Correctly retrieved: 6 out of 10 . Minimized score: 44


i.e. if it retrieves the correct document, it is always retrieved as the first, best article. If it is not the first document, it will also not be lower within the top-10. This is also demonstrated by one of the cases in the next section.

### Interesting cases

#### influence of formulation

In [19]:
query = "Is there a cure for Autism?"
ranking_ms = matching_score(10,query)
print("")
ranking_cs = cosine_similarity(10,query)

>>Matching score

Query: Is there a cure for Autism?
['cure', 'autism']

Most related articles:
25 Autism 0.05116389300761864

>>Cosine Similarity

Query: Is there a cure for Autism?
['cure', 'autism']

Most related articles:
25 Autism 0.696424239695315
305 Achilles 0.0
303 Alabama 0.0
290 A 0.0
39 Albedo 0.0
12 Anarchism 0.0


In [37]:
query = "Can Autism be treated?" # same question as previous one, different formulation
ranking_ms = matching_score(10,query)
print("")
ranking_cs = cosine_similarity(10,query)

>>Matching score

Query: Can Autism be treated?
['autism', 'treated']

Most related articles:
25 Autism 4.832305758571839

>>Cosine Similarity

Query: Can Autism be treated?
['autism', 'treated']

Most related articles:
25 Autism 1.0
2487 Amazonite 0.0
2624 Aegean civilization 0.0
2623 Aegadian Islands 0.0
2622 Aedui 0.0
2621 Aedicula 0.0
2620 Aedesius 0.0
2619 Aeclanum 0.0
2618 Aeacus 0.0
2616 Adware 0.0


  # This is added back by InteractiveShellApp.init_path()


#### title (not) mentioned

In [41]:
query = "What are the green stones from the Amazon River called?"
# mentiones an article title, but not the one from the article in which the answer is
ranking_ms = matching_score(10,query)
print("")
ranking_cs = cosine_similarity(10,query)

>>Matching score

Query: What are the green stones from the Amazon River called?
['green', 'stones', 'amazon', 'river', 'called']

Most related articles:
1701 Amazon River 4.629573204517756
1362 Anadyr River 2.213420325231837

>>Cosine Similarity

Query: What are the green stones from the Amazon River called?
['green', 'stones', 'amazon', 'river', 'called']

Most related articles:
1701 Amazon River 1.0
1362 Anadyr River 0.45629304842126417
2606 Abatis 0.0
2624 Aegean civilization 0.0
2623 Aegadian Islands 0.0
2622 Aedui 0.0
2621 Aedicula 0.0
2620 Aedesius 0.0
2619 Aeclanum 0.0
2618 Aeacus 0.0


  # This is added back by InteractiveShellApp.init_path()


In [42]:
query = "What is the largest river in the world?"
# question does not have article title in it
ranking_ms = matching_score(10,query)
print("")
ranking_cs = cosine_similarity(10,query)

>>Matching score

Query: What is the largest river in the world?
['largest', 'river', 'world']

Most related articles:
1701 Amazon River 2.213420325231837
1362 Anadyr River 2.213420325231837

>>Cosine Similarity

Query: What is the largest river in the world?
['largest', 'river', 'world']

Most related articles:
1701 Amazon River 0.6754946694247588
1362 Anadyr River 0.6754946694247588
2606 Abatis 0.0
2624 Aegean civilization 0.0
2623 Aegadian Islands 0.0
2622 Aedui 0.0
2621 Aedicula 0.0
2620 Aedesius 0.0
2619 Aeclanum 0.0
2618 Aeacus 0.0


  # This is added back by InteractiveShellApp.init_path()


In [47]:
query = "What is the translation of 'acropolis'?"
# question does have article title in it
ranking_ms = matching_score(10,query)
print("")
ranking_cs = cosine_similarity(10,query)

>>Matching score

Query: What is the translation of 'acropolis'
['translation', 'acropolis']

Most related articles:

>>Cosine Similarity

Query: What is the translation of 'acropolis'
['translation', 'acropolis']

Most related articles:
2487 Amazonite 0.0
2607 Antoine Thomson d'Abbadie 0.0
2626 Aegeus 0.0
2624 Aegean civilization 0.0
2623 Aegadian Islands 0.0
2622 Aedui 0.0
2621 Aedicula 0.0
2620 Aedesius 0.0
2619 Aeclanum 0.0
2618 Aeacus 0.0


  # This is added back by InteractiveShellApp.init_path()
