In [1]:
import sys
from essential_generators import DocumentGenerator
import io
import re
import numpy as np
import nltk
nltk.download('punkt')
import os
import math


def gen_data(n_docs=1100, out_dir="sample_text"):
    """
    Generate the sample corpus used by the rest of the notebook.

    Writes one randomly generated paragraph per file to
    ``<out_dir>/text_<i>.txt`` for ``i`` in ``1..n_docs`` (UTF-8 encoded).
    The defaults reproduce the layout the other cells read
    (``sample_text/text_1.txt`` .. ``sample_text/text_1100.txt``).

    Parameters
    ----------
    n_docs : int
        Number of text files to create.
    out_dir : str
        Directory that receives the files; it is created when missing.
    """
    # Without this the first open() fails with FileNotFoundError on a fresh
    # checkout where the output directory does not exist yet.
    os.makedirs(out_dir, exist_ok=True)
    gen = DocumentGenerator()
    for i in range(1, n_docs + 1):
        # The context manager closes the file; no explicit close() is needed.
        with io.open(f"{out_dir}/text_{i}.txt", "w", encoding="utf-8") as f:
            f.write(gen.paragraph())

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\surja\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
def create_dict():
    """
    Build the corpus vocabulary together with total term counts.

    Reads sample_text/text_1.txt .. sample_text/text_1100.txt, lower-cases
    each text, strips punctuation and digits, tokenizes it with NLTK and
    tallies how often every token occurs across all files.

    Returns
    -------
    dict
        Maps each token to its total number of occurrences in the corpus.
    """
    counts = {}
    for doc_id in range(1, 1101):
        with io.open(f"sample_text/text_{doc_id}.txt", "r", encoding="utf-8") as handle:
            text = handle.read()
        # Same cleaning pipeline as the other vectorizers: lower-case,
        # drop punctuation, then drop digits.
        cleaned = re.sub('[0-9]', '', re.sub(r'[^\w\s]', '', text.lower()))
        for token in nltk.word_tokenize(cleaned):
            counts[token] = counts.get(token, 0) + 1
    return counts

In [3]:
# Build the corpus vocabulary (token -> total count) once; the vectorizing
# functions in later cells read it as a notebook-level global.
dictionary = create_dict()
# Number of distinct tokens in the vocabulary.
len(dictionary)

14319

In [4]:
def bag_of_words_vector(file):
    """
    Count how often each vocabulary term occurs in one text file.

    The text is lower-cased, stripped of punctuation and digits, and
    tokenized with NLTK before counting.

    Parameters
    ----------
    file : str
        Path of a UTF-8 encoded text file.

    Returns
    -------
    dict
        Maps every key of the notebook-level ``dictionary`` (in its iteration
        order) to the number of times it occurs in the file; absent terms map
        to 0. Tokens that are not in the vocabulary are ignored, matching
        create_vector, instead of raising KeyError.
    """
    vector_dict = dict.fromkeys(dictionary, 0)
    # Context manager guarantees the handle is closed even if read() fails.
    with io.open(file, encoding="utf8") as f:
        data = f.read()
    data = data.lower()
    data = re.sub(r'[^\w\s]', '', data)
    data = re.sub('[0-9]', '', data)
    for w in nltk.word_tokenize(data):
        if w in vector_dict:
            vector_dict[w] += 1
    return vector_dict

In [5]:
def term_by_document_matrix():
    """
    Build the bag-of-words vector of every corpus document.

    Returns
    -------
    list of dict
        One dict per file sample_text/text_1.txt .. sample_text/text_1100.txt,
        in that order, each produced by bag_of_words_vector. Entry ``i`` of
        the list therefore belongs to file number ``i + 1``.
    """
    # Files are addressed by index, so the directory listing the original
    # code fetched (and never used) is not needed.
    return [
        bag_of_words_vector(f'sample_text/text_{i}.txt')
        for i in range(1, 1101)
    ]

In [6]:
# One bag-of-words dict per document; later cells read this notebook-level
# global and multiply_by_IDF() rewrites its values in place.
term_by_document_mtx = term_by_document_matrix()


In [7]:
def get_count_of_word(word):
    """
    Return the document frequency of ``word``.

    Counts the entries of the notebook-level ``term_by_document_mtx`` whose
    value for ``word`` is greater than zero. A document that has no entry for
    ``word`` counts as not containing it (previously this raised TypeError
    because ``None > 0`` is not a valid comparison).
    """
    return sum(1 for doc in term_by_document_mtx if doc.get(word, 0) > 0)

In [8]:
def multiply_by_IDF():
    """
    Scale every term count in ``term_by_document_mtx`` by its IDF weight.

    For each vocabulary term the weight is ``log(N / n_w)``, where ``N`` is
    the number of documents and ``n_w`` the number of documents with a
    positive value for the term (see get_count_of_word). The matrix is
    modified in place; nothing is returned.

    Notes
    -----
    * The function is not idempotent: calling it again multiplies the already
      weighted values by a freshly computed weight.
    * A term with ``n_w == 0`` gets weight 0 instead of raising
      ZeroDivisionError. This occurs on a second call, because a term present
      in every document receives weight ``log(1) == 0`` on the first call and
      then no longer has a positive value anywhere.
    """
    n_docs = len(term_by_document_mtx)  # loop-invariant
    for word in dictionary.keys():
        n_w = get_count_of_word(word)
        m = math.log(n_docs / n_w) if n_w > 0 else 0.0
        for dic in term_by_document_mtx:
            dic[word] = dic.get(word, 0) * m

In [9]:
# Rescales term_by_document_mtx in place. Not idempotent: running this cell
# again applies a newly computed IDF weight on top of the weighted values.
multiply_by_IDF()


In [10]:
def create_vector(data):
    """
    Turn a raw text string into a term-count vector over the vocabulary.

    The text is lower-cased, stripped of punctuation and digits, and
    tokenized with NLTK. Tokens that are not keys of the notebook-level
    ``dictionary`` are skipped.

    Returns
    -------
    list of int
        One count per vocabulary term, in the iteration order of
        ``dictionary``.
    """
    cleaned = re.sub(r'[^\w\s]', '', data.lower())
    cleaned = re.sub('[0-9]', '', cleaned)
    counts = dict.fromkeys(dictionary.keys(), 0)
    for token in nltk.word_tokenize(cleaned):
        if token not in counts:
            continue
        counts[token] += 1
    return [counts[term] for term in counts]

def get_closest_documents(data,k):
    """
    Return the ``k`` documents most similar to ``data`` by cosine similarity.

    Parameters
    ----------
    data : str
        Query text; vectorized with create_vector.
    k : int
        Maximum number of documents to return.

    Returns
    -------
    list of (int, float)
        ``(index, cosine)`` pairs sorted by ascending cosine, so the best
        match is last. ``index`` is the position in ``term_by_document_mtx``.
        Holds ``min(k, number of documents)`` entries; empty when ``k <= 0``.

    Notes
    -----
    When the query or a document vector has zero norm the similarity is
    defined as 0.0. The plain formula would yield NaN there, and NaN values
    make the ordering of the result list unreliable.
    """
    if k <= 0:
        return []
    # The query vector and its norm do not change per document; compute once.
    query = np.asarray(create_vector(data), dtype=float)
    query_norm = np.linalg.norm(query)
    to_return = []
    for idx, doc in enumerate(term_by_document_mtx):
        doc_vec = np.array(list(doc.values()), dtype=float)
        denom = query_norm * np.linalg.norm(doc_vec)
        cos = float(np.dot(query, doc_vec) / denom) if denom > 0 else 0.0
        if len(to_return) < k:
            to_return.append((idx, cos))
            to_return.sort(key=lambda pair: pair[1])
        elif cos > to_return[0][1]:
            # Evict the current weakest match (kept at position 0).
            to_return[0] = (idx, cos)
            to_return.sort(key=lambda pair: pair[1])
    return to_return


        


In [11]:
# Query the corpus: request the 3 documents with the highest cosine
# similarity to this text. The result is a list of (index, cosine) pairs.
t = get_closest_documents("And extends industry awards, the juno awards, which were as important as the quilombo of. Was down and hemp plantations. these men.. Spaces is terrain. unlike most mammals, when cats bring. History, 1815-1970. australia (6.4 percent), saudi. Verification of savannah from the swampland, were widespread during the nonbreeding. Monarch of classroom. in 2013, the beach handball world championships. News media regional airport, bert mooney airport and the plant and animal species. Kamerun. later, basin, red lodge, and whitefish mountain resort near libby whitefish. Fever were extreme emotional."
                          ,3)
def read_file(index):
    """
    Print one corpus document between two separator lines.

    Parameters
    ----------
    index : int
        Zero-based document index as used in the search results; the file
        read is ``sample_text/text_<index + 1>.txt``.
    """
    # gen_data writes the corpus as UTF-8. Relying on the platform default
    # encoding garbles non-ASCII characters on systems where it is not UTF-8.
    with io.open(f"sample_text/text_{index+1}.txt", "r", encoding="utf-8") as f:
        print("______________________________________")
        print(f.read())
        print("______________________________________")

# Show the (index, cosine) pairs, then print the text of each match.
print(t)
for (idx,cos) in t:
    read_file(idx)

[(318, 0.08172827634973914), (702, 0.1519243455746266), (8, 0.6824081772328199)]
______________________________________
An old populism, that were. To language. golbery. with the implementation of classical music.. Portal climate oil deposits were formed in the. And are this definition. Or kingdoms, broadly to include certain types of. Are Ă©dith on seattle in 1941. this left an extensive. Kalahari desert river, then the world's. Though whether germans live abroad. jews are. Asia. colonialism plantations. these men, women and could not obtain identification and leave. Which winds morelos, who occupied. Most ancient including rainier beach, van asselt, rainier, and jefferson south of. Through e-mail. and 1940s..
______________________________________
______________________________________
Precipitating deck and whitefish mountain resort near red lodge. Festivals, colonial lakes account for news. By aristippus other types. for example, the. Hadron and 0.14%. 
 
 the world of coca-cola, f

In [12]:
from sklearn.preprocessing import normalize

In [13]:
def normalize_matrix():
    """
    L1-normalize every document vector of ``term_by_document_mtx``.

    Returns
    -------
    list
        One entry per document, in matrix order: the result of calling
        ``normalize`` with ``norm="l1"`` on a single-row matrix holding that
        document's values.
    """
    return [
        normalize([list(doc.values())], norm="l1")
        for doc in term_by_document_mtx
    ]
    

In [14]:
# L1-normalized copy of every document vector; read by get_cos_vector.
normalized_A = normalize_matrix()

In [15]:
def get_cos_vector(data):
    """
    Cosine similarity between a query text and every normalized document.

    Parameters
    ----------
    data : str
        Query text; vectorized with create_vector and L1-normalized.

    Returns
    -------
    list of float
        One similarity per entry of the notebook-level ``normalized_A``, in
        the same order.

    Notes
    -----
    If the query or a document vector has zero norm the similarity is
    reported as 0.0 rather than NaN (the plain formula divides by zero).
    """
    # Query vector and norm are the same for every document; compute once.
    query = np.asarray(normalize([create_vector(data)], norm="l1")[0], dtype=float)
    query_norm = np.linalg.norm(query)
    to_return = []
    for row in normalized_A:
        # Each entry is a single-row matrix; take the row as a 1-D vector.
        doc_vec = np.asarray(row[0], dtype=float)
        denom = query_norm * np.linalg.norm(doc_vec)
        cos = float(np.dot(query, doc_vec) / denom) if denom > 0 else 0.0
        to_return.append(cos)
    return to_return

    

In [16]:
# Similarity of this query text to every document; the list has one score
# per entry of normalized_A, and print() below emits all of them.
cos_vector = get_cos_vector("Coalition force transportation, however, relies on these subfields is given. Came the reforms. this. 57.71% of styles, with e.g. fritz höger, erich mendelsohn, dominikus böhm and. Psychology, examines adidas, porsche, and dhl.. First letter inuit art have been. Accountability — clouds. by contrast, howard used universally accepted latin, which caught on in the. Generated per justine henin both were ranked in the income inequality is very large.. Collaborators from have excellent night vision and a priori – which. Cargo and napoleon iii. he multiplied french interventions abroad, especially in crimea!. ")
print(cos_vector)

[0.7102965217114164, 0.017652372368133582, 0.0574140079630231, 0.033000801031290376, 0.012654538525532086, 0.7102965217114164, 0.016148714464710813, 0.03335132439477266, 0.03921611125069165, 0.02703687592551752, 0.02509725066431335, 0.04214546698312822, 0.03090044624316164, 0.036620894621750576, 0.014883336657672178, 0.020823336292569883, 0.025615000488097072, 0.041401943680191185, 0.0, 0.05040225734366928, 0.018121541956079353, 0.028929343982031775, 0.012347912687646354, 0.03536589691636184, 0.043781621509063735, 0.0328124882955614, 0.027442057225124047, 0.038608145305993984, 0.04231990133508279, 0.009374877354061074, 0.024807157449735585, 0.029526170433778132, 0.019317380541369916, 0.03859399060371769, 0.023990663708946612, 0.043748934226019616, 0.04001386568507232, 0.020267026408890592, 0.04283924923292809, 0.03225986535384419, 0.012247375046546915, 0.03937402047148346, 0.033191209438616774, 0.026952506782663435, 0.039693378565827345, 0.03867820738641992, 0.027585805388492562, 0.028