In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math
import os
import time
from nltk.tokenize import word_tokenize

In [2]:
# Raw text of every document in the collection; a document's position in this
# list is the document index used by all later cells.
raw_doc_collection = []

path = '../dataset-retrieval/talk.politics.misc.450/'
# os.listdir returns entries in arbitrary order, so sort them to keep the
# document indices stable between runs and machines.
files = sorted(os.listdir(path))
for file in files:
    # The context manager closes each handle as soon as the file is read,
    # instead of leaving one open descriptor per document.
    with open(os.path.join(path, file)) as f:
        raw_doc_collection.append(f.read())

In [3]:
import nltk.data
from nltk.lm.preprocessing import pad_both_ends

# Pre-trained Punkt model for English, used to split each document into sentences.
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

# padded_sentences_per_doc[d] is the list of sentences of document d, each one
# wrapped as "<s> sentence </s>" so sentence boundaries survive tokenization.
padded_sentences_per_doc = []
for raw_doc in raw_doc_collection:
    padded_sentences = []
    for sentence in tokenizer.tokenize(raw_doc):
        # pad_both_ends iterates the sentence character by character; joining
        # the pieces back together yields the padded sentence as one string.
        padded_pieces = pad_both_ends(sentence, n=2, left_pad_symbol="<s> ", right_pad_symbol=" </s>")
        padded_sentences.append(''.join(padded_pieces))
    padded_sentences_per_doc.append(padded_sentences)

In [4]:
from nltk.tokenize import MWETokenizer

# word_tokenize breaks the sentence markers into '<', 's', '>' and
# '<', '/s', '>'; this tokenizer glues those pieces back together (with an
# empty separator) into the single tokens '<s>' and '</s>'.
mwtokenizer = MWETokenizer(
    [('<', 's', '>'), ('<', '/s', '>')],
    separator='',
)

def get_vocabulary(padded_sentences_per_doc):
    """Turn padded sentences into one flat token list per document.

    Each sentence is tokenized with ``word_tokenize``, lower-cased, and passed
    through the module-level ``mwtokenizer``. Only the sentence markers
    ``<s>`` / ``</s>`` and purely alphabetic tokens are kept; everything else
    (numbers, punctuation, mixed tokens) is dropped.

    Args:
        padded_sentences_per_doc: list of documents, each a list of padded
            sentence strings.

    Returns:
        A list with one entry per document: the kept tokens of all of its
        sentences, in reading order.
    """
    sentence_markers = ('<s>', '</s>')
    tokens_per_doc = []
    for sentences in padded_sentences_per_doc:
        doc_tokens = []
        for sentence in sentences:
            lowered = [token.lower() for token in word_tokenize(sentence)]
            doc_tokens.extend(
                token
                for token in mwtokenizer.tokenize(lowered)
                if token in sentence_markers or token.isalpha()
            )
        tokens_per_doc.append(doc_tokens)

    return tokens_per_doc

In [39]:
# Token list for every document, index-aligned with padded_sentences_per_doc.
vocab_per_doc = get_vocabulary(padded_sentences_per_doc)
# Spot-check: show the tokens produced for the document at index 1.
print(vocab_per_doc[1])

['<s>', 'today', 'marks', 'the', 'anniversary', 'of', 'the', 'armenian', 'genocide', 'of', 'million', 'turks', 'and', 'kurds', 'in', 'eastern', 'anatolia', 'and', 'armenia', '</s>', '<s>', 'the', 'following', 'letter', 'which', 'represents', 'a', 'small', 'portion', 'of', 'the', 'full', 'text', 'along', 'with', 'more', 'than', 'pages', 'of', 'historical', 'documents', 'scholarly', 'sources', 'eyewitness', 'accounts', 'and', 'photographs', 'was', 'sent', 'to', 'president', 'bill', 'clinton', 'members', 'of', 'congress', 'editors', 'program', 'directors', 'and', 'columnists', 'of', 'major', 'newspapers', 'journals', 'and', 'stations', 'for', 'the', 'anniversary', 'of', 'the', 'armenian', 'genocide', 'of', 'million', 'muslim', 'people', '</s>', '<s>', 'on', 'april', 'of', 'every', 'year', 'the', 'people', 'of', 'turkiye', 'remember', 'their', 'dead', '</s>', '<s>', 'they', 'grieve', 'for', 'lost', 'family', 'and', 'the', 'lost', 'homes', 'of', 'their', 'grandfathers', '</s>', '<s>', 'this

In [6]:
def CountFrequency(my_list):
    """Tally how many times each item occurs in ``my_list``.

    Args:
        my_list: iterable of hashable items.

    Returns:
        A dict mapping each distinct item to its number of occurrences, with
        keys in order of first appearance.
    """
    counts = {}
    for element in my_list:
        # Missing keys start from 0, so the first sighting stores 1.
        counts[element] = counts.get(element, 0) + 1
    return counts

In [40]:
# One token-frequency table per document, index-aligned with vocab_per_doc.
freq_per_doc = [CountFrequency(doc_tokens) for doc_tokens in vocab_per_doc]

# Spot-check: show the frequency table of the document at index 1.
print(freq_per_doc[1])

{'<s>': 21, 'today': 2, 'marks': 1, 'the': 89, 'anniversary': 3, 'of': 48, 'armenian': 10, 'genocide': 14, 'million': 5, 'turks': 2, 'and': 35, 'kurds': 1, 'in': 14, 'eastern': 1, 'anatolia': 1, 'armenia': 10, '</s>': 21, 'following': 2, 'letter': 2, 'which': 3, 'represents': 1, 'a': 7, 'small': 1, 'portion': 1, 'full': 2, 'text': 1, 'along': 1, 'with': 4, 'more': 1, 'than': 1, 'pages': 1, 'historical': 4, 'documents': 1, 'scholarly': 1, 'sources': 1, 'eyewitness': 1, 'accounts': 1, 'photographs': 1, 'was': 2, 'sent': 1, 'to': 18, 'president': 1, 'bill': 1, 'clinton': 1, 'members': 1, 'congress': 1, 'editors': 1, 'program': 1, 'directors': 1, 'columnists': 1, 'major': 1, 'newspapers': 1, 'journals': 1, 'stations': 1, 'for': 6, 'muslim': 5, 'people': 10, 'on': 3, 'april': 1, 'every': 1, 'year': 2, 'turkiye': 1, 'remember': 1, 'their': 6, 'dead': 1, 'they': 2, 'grieve': 1, 'lost': 2, 'family': 1, 'homes': 1, 'grandfathers': 1, 'this': 4, 'turkish': 28, 'nation': 1, 'is': 4, 'mourning': 1

In [61]:
def find_word_in_vocab_per_doc(vocab, term, prev_term):
    """Count how often ``term`` directly follows ``prev_term`` in ``vocab``.

    Args:
        vocab: token list of one document, in reading order.
        term: the second token of the bigram.
        prev_term: the first token of the bigram.

    Returns:
        The number of positions where ``prev_term`` is immediately followed
        by ``term``, but never less than 1: an unseen bigram is reported as 1
        so that the caller never multiplies by zero. Note that this is a
        floor, not textbook add-one smoothing (seen counts are not increased).
    """
    count = 0
    # Start at index 1: the first token has no predecessor, and vocab[0 - 1]
    # would wrap around to the LAST token and could count a bigram that does
    # not exist in the document.
    for i in range(1, len(vocab)):
        if vocab[i] == term and vocab[i - 1] == prev_term:
            count += 1

    # Floor unseen bigrams at 1 (see Returns).
    return max(count, 1)

def find_term_frequency_in_doc(vocab_freq, term, vocab):
    """Look up the count of ``term`` in a document, with a fallback.

    Args:
        vocab_freq: dict mapping each token of the document to its count.
        term: token to look up.
        vocab: token list of the same document.

    Returns:
        The stored count of ``term``; if the term is absent from
        ``vocab_freq``, the total number of tokens in ``vocab`` instead.
    """
    stored_count = vocab_freq.get(term)
    # Unseen term: fall back to the document length.
    return len(vocab) if stored_count is None else stored_count
            

In [97]:
def get_top_three_docs(v_input):
    """Print the (up to) three documents that best match a query.

    The query is padded with ``<s>`` / ``</s>``, tokenized and filtered the
    same way the documents were, and every document is scored with a bigram
    model: the product over consecutive query tokens of
    ``count(prev, term) / count(prev)`` taken from that document.

    Reads the module-level ``vocab_per_doc``, ``freq_per_doc`` and
    ``raw_doc_collection`` (all index-aligned by document).

    Args:
        v_input: the query as a plain string.

    Returns:
        None. The best-matching documents are printed, best first.
    """
    # Pad the query with the sentence markers used for the documents.
    pad_input = ''.join(pad_both_ends(v_input, n=2, left_pad_symbol="<s> ", right_pad_symbol=" </s>"))

    # word_tokenize splits the markers into pieces; merge them back.
    mwtokenizer = MWETokenizer(separator='')
    mwtokenizer.add_mwe(('<', 's', '>'))
    mwtokenizer.add_mwe(('<', '/s', '>'))

    lowered = [token.lower() for token in word_tokenize(pad_input)]
    # Keep only what the document vocabularies can contain (markers and
    # alphabetic tokens). Punctuation or numbers in the query would otherwise
    # act as always-unseen terms and distort every document's score.
    mw_input = [
        token
        for token in mwtokenizer.tokenize(lowered)
        if token in ('<s>', '</s>') or token.isalpha()
    ]

    # Score in log space: summing logs ranks documents exactly like the
    # product of probabilities, but cannot underflow to 0.0 on long queries.
    log_prob_per_doc = {}
    for doc_index, doc_tokens in enumerate(vocab_per_doc):
        doc_freq = freq_per_doc[doc_index]
        log_prob = 0.0
        for prev_term, term in zip(mw_input, mw_input[1:]):
            bigram_count = find_word_in_vocab_per_doc(doc_tokens, term, prev_term)
            prev_count = find_term_frequency_in_doc(doc_freq, prev_term, doc_tokens)
            if prev_count == 0:
                # Empty document: nothing to divide by, so rank it last
                # instead of raising ZeroDivisionError.
                log_prob = float('-inf')
                break
            log_prob += math.log(bigram_count / prev_count)
        log_prob_per_doc[doc_index] = log_prob

    # Highest score first; sorted() is stable, so ties keep document order.
    ranked_doc_indices = sorted(log_prob_per_doc, key=log_prob_per_doc.get, reverse=True)

    # Slicing copes with collections that hold fewer than three documents.
    for rank, doc_index in enumerate(ranked_doc_indices[:3], start=1):
        print('doc ', rank, ' :')
        print(raw_doc_collection[doc_index])

In [98]:
# Read the query interactively; execution blocks here until a line is entered.
input_query = str(input())
# Visual separator between the query and the retrieved documents.
print('==========================================================')
# Score every document against the query and print the best matches.
get_top_three_docs(input_query)

today marks the anniversary of the armenian genocide of million turks and kurds in eastern anatolia and armenia
doc  1  :
Today marks the 78th anniversary of the Armenian genocide of
2.5 million Turks and Kurds in Eastern Anatolia and x-Soviet
Armenia. The following letter, which represents a small portion 
of the full text, along with more than 200 pages of historical 
documents, scholarly sources, eyewitness accounts and photographs, 
was sent to President Bill Clinton, members of Congress, editors, 
program directors and columnists of major newspapers, journals and 
radio/TV stations for the 78th anniversary of the Armenian genocide 
of 2.5 million Muslim people. On April 23 of every year, the people 
of Turkiye remember their dead. They grieve for lost family and the 
lost homes of their grandfathers. This year the Turkish Nation is 
mourning and praying again for her fallen heroes who gave their 
lives generously and with altruism, so that the future generations 
may live on that 