## Data

In [12]:
import os
import re
import bz2

dataset = []
# iterate to all the folders and get the title and file name from all the index.html files

# read the file from index files
file = open(os.getcwd()+"/wiki_00.txt", 'r')
file = bz2.open("wiki_00.bz2", "rb")
text = file.read().strip().decode("utf-8") 
#print(text)
file.close()

# use simple regular expression to retrieve the article id, url, title and body.
article_id = re.findall('<doc id="(.*)" ', text)
article_url = re.findall(' url="(.*)" ', text)
article_title = re.findall(' title="(.*)">', text)
print(article_title)
regex = '>'+article_title+'(.*)</doc>'
article_body = re.findall(regex, text)

print(len(article_id),len(article_url),len(article_title),len(article_body))

for j in range(len(article_id)):
    dataset.append(article_id,article_url,article_title,article_body)     
        
N = len(dataset)
print(dataset)

['Anarchism', 'Autism', 'Albedo', 'A', 'Alabama', 'Achilles']


TypeError: must be str, not list

## Clean

In [None]:
import numpy as np

def convert_lower_case(data):
    return np.char.lower(data)

from nltk.corpus import stopwords
from nltk import word_tokenize
#nltk.download('stopwords')

def remove_stop_words(data):
    stop_words = stopwords.words('english')
    words = word_tokenize(str(data))
    # iterate over all the stop words and not append to the list if it’s a stop word
    new_text = ""
    for word in words:
        if word not in stop_words and len(word) > 1: # remove stop words and single characters
            new_text = new_text + " " + word
    return new_text

def remove_punctuation(data):
    symbols = "!\"#$%&()*+-./:;<=>?@[\]^_`{|}~\n"
    for i in range(len(symbols)):
        data = np.char.replace(data, symbols[i], ' ') # remove every occurence of this symbol
        data = np.char.replace(data, "  ", " ") # remove extra spaces
    data = np.char.replace(data, ',', '') #remove comma seperately at last?
    return data

def remove_apostrophe(data):
    return np.char.replace(data, "'", "") #seperate?

from nltk.stem.porter import *
from nltk.stem import PorterStemmer

def stemming(data): # reduce words to its stem
    stemmer= PorterStemmer() # rule-based stemmer, identifies and removes the suffix or affix of a word
    
    tokens = word_tokenize(str(data))
    new_text = ""
    for w in tokens:
        new_text = new_text + " " + stemmer.stem(w)
    return new_text
# A better efficient way to proceed is to first lemmatise and then stem

from num2words import num2words

def convert_numbers(data):
    tokens = word_tokenize(str(data))
    new_text = ""
    for w in tokens:
        try:
            w = num2words(int(w),lang='en')
        except:
            a = 0
        new_text = new_text + " " + w
    new_text = np.char.replace(new_text, "-", " ")
    return new_text

def preprocess(data):
    data = convert_lower_case(data)
    data = remove_punctuation(data) #remove comma seperately
    data = remove_apostrophe(data)
    data = remove_stop_words(data)
    data = convert_numbers(data)
    data = stemming(data)
    data = remove_punctuation(data)
    data = convert_numbers(data)
    data = stemming(data) #needed again as we need to stem the words
    data = remove_punctuation(data) #needed again as num2word is giving few hypens and commas fourty-one
    data = remove_stop_words(data) #needed again as num2word is giving stop words 101 - one hundred and one
    return data

In [None]:
processed_text = []
processed_title = []
for i in dataset:
    processed_text.append(word_tokenize(str(preprocess(i[3]))))
    processed_title.append(word_tokenize(str(preprocess(i[2]))))

## Model

In [None]:
DF = {}
# iterate through all the words in all the documents and store the document id’s for each word.
for i in range(N):
    tokens = processed_text[i] # body of the document
    for w in tokens:
        try:
            DF[w].add(i)
        except:
            DF[w] = {i}

    tokens = processed_title[i]
    for w in tokens:
        try:
            DF[w].add(i)
        except:
            DF[w] = {i}
            
for i in DF:
    DF[i] = len(DF[i]) # unique words, we don’t actually need the list of docs, we just need the count

total_vocab_size = len(DF)
total_vocab = [x for x in DF]

#getter
def doc_freq(word):
    try:
        return DF[word]
    except:
        return 0

In [None]:
# let’s use dictionary with (document, token) pair as key and any TF-IDF score as the value
# tf_idf dictionary is for body, we will use the same logic for to build a dictionary tf_idf_title for the words in title.
from collections import Counter

# Calculate TF-IDF for Body for all docs
doc = 0
tf_idf = {}
#iterate over all documents
for i in range(N):  
    tokens = processed_text[i]
    counter = Counter(tokens + processed_title[i])  
    words_count = len(tokens + processed_title[i])
    # Counter can give us the frequency of the tokens, calculate tf and idf and finally store as a (doc, token) pair in tf_idf.
    for token in np.unique(tokens):      
        tf = counter[token]/words_count
        df = doc_freq(token)
        idf = np.log((N+1)/(df+1)) #numerator is added 1 to avoid negative values
        tf_idf[doc, token] = tf*idf
    doc += 1

# Calculate TF-IDF for title for all docs   
doc = 0
tf_idf_title = {}
for i in range(N):
    tokens = processed_title[i]
    counter = Counter(tokens + processed_text[i])
    words_count = len(tokens + processed_text[i])
    for token in np.unique(tokens):
        tf = counter[token]/words_count
        df = doc_freq(token)
        idf = np.log((N+1)/(df+1)) #numerator is added 1 to avoid negative values
        tf_idf_title[doc, token] = tf*idf
    doc += 1

In [None]:
alpha = 0.3
# Merging the TF-IDF according to weights
# multiply the Body TF-IDF with alpha
for i in tf_idf:
    tf_idf[i] *= alpha
# Iterate Title IF-IDF for every (doc, token)
# if token is in body, replace the Body(doc, token) value with the value in Title(doc, token)
for i in tf_idf_title:
    tf_idf[i] = tf_idf_title[i]

In [None]:
# vectorize documents
D = np.zeros((N, total_vocab_size))
for i in tf_idf:
    try:
        ind = total_vocab.index(i[1]) # generate a index for each token
        D[i[0]][ind] = tf_idf[i] # document vectors
    except:
        pass

## Ranking

In [None]:
# theoretical concept: add tf_idf values of the tokens that are in query for every document.
# Iterate over all values in the dictionary and check if the value is present in the token.
# As our dictionary is a (document, token) key, when we find a token which is in query we will
# add the document id to another dictionary along with the tf-idf value
def matching_score(k, query):
    preprocessed_query = preprocess(query)
    tokens = word_tokenize(str(preprocessed_query))
    print("\nQuery:", query)
    print("")
    print(tokens)
    
    query_weights = {}
    for key in tf_idf:  
        if key[1] in tokens:
            try:
                query_weights[key[0]] += tf_idf[key]
            except:
                query_weights[key[0]] = tf_idf[key]
    print(query_weights)
    # take the top k documents
    query_weights = sorted(query_weights.items(), key=lambda x: x[1], reverse=True)
    print("")
    ranking = []
    for i in query_weights[:k]:
        print(i)
        ranking.append(i[0])
    print(ranking)
    for i in ranking:
        print(i, dataset[i][0])

In [None]:
import math
# vectorize query
def gen_vector(tokens):
    Q = np.zeros((len(total_vocab)))
    counter = Counter(tokens)
    words_count = len(tokens)
    query_weights = {}
    for token in np.unique(tokens):
        tf = counter[token]/words_count
        df = doc_freq(token)
        idf = math.log((N+1)/(df+1))
        try:
            ind = total_vocab.index(token)
            Q[ind] = tf*idf
        except:
            pass
    return Q

def cosine_similarity(k, query):
    print(">>Cosine Similarity")
    preprocessed_query = preprocess(query)
    tokens = word_tokenize(str(preprocessed_query))
    print("\nQuery:", query)
    print(tokens)   
    query_vector = gen_vector(tokens)
    d_cosines = []
    for d in D:
        s = np.dot(query_vector, d)/(np.linalg.norm(query_vector)*np.linalg.norm(d))
        d_cosines.append(s)
    # take the top k documents
    ranking = np.array(d_cosines).argsort()[-k:][::-1].tolist()  
    print("") 
    print(ranking)
    for i in ranking:
        print(i, corpus[i])

## Test

In [None]:
query = "What is unsupervised learning?"
matching_score(10,query)
print("")
cosine_similarity(10,query)