In [1]:
import nltk
from nltk.tokenize import RegexpTokenizer
from collections import Counter, defaultdict, OrderedDict
import re
import sys
import os
import math
import string
import time
import operator
from itertools import islice

nltk.download('stopwords')
import pickle

import warnings
warnings.filterwarnings("ignore")

from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\dongj\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\dongj\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
# Wall-clock start of the run; the last cell subtracts this from time.time()
# to report the total elapsed seconds.
start_time = time.time()

In [3]:
# Read the KJV text into memory, one list entry per line. Each entry keeps its
# leading verse reference and trailing newline; the first line of the file is
# skipped.
book_content_w_ch = []
with open('kjv.txt') as kjv_file:
    next(kjv_file)  # discard the first line
    book_content_w_ch.extend(kjv_file)

In [4]:
# Drop the leading verse reference (everything before the first space) from
# every line. The prefix is sliced off rather than removed with str.replace(),
# because replace() would also delete any later occurrence of the same token
# inside the verse text. The separating space is kept, exactly as before.
book_content = [
    line[len(line.split(' ', 1)[0]):]
    for line in book_content_w_ch
]

In [5]:
def normalization(word):
    """Return ``word`` lower-cased with newlines, punctuation and digits removed.

    "Punctuation" here means every character that is neither a word character
    (letters, digits, underscore) nor whitespace. Whitespace other than
    newlines is left untouched, so the argument may be a whole line of text.
    """
    lowered = word.lower().replace('\n', '')
    without_punctuation = re.sub(r'[^\w\s]', '', lowered)
    return re.sub('[0-9]', '', without_punctuation)

In [6]:
def preprocess(data):
    """Normalize, stop-word filter and lemmatize every line in ``data``.

    Relies on the module-level ``normalization`` function and ``lemmatizer``
    object.

    Args:
        data: iterable of text strings (one document per element).

    Returns:
        list[str]: one entry per input element, holding the surviving
        lemmatized tokens joined by single spaces.
    """
    # Load the English stop-word list once instead of once per line, and keep
    # it in a set so each membership test is a hash lookup, not a list scan.
    stop_words = set(nltk.corpus.stopwords.words("english"))

    result = []
    for line in data:
        # normalization() already lower-cases, and split() with no argument
        # ignores leading/trailing whitespace, so no extra lower()/strip().
        tokens = normalization(line).split()
        kept = [lemmatizer.lemmatize(token) for token in tokens if token not in stop_words]
        result.append(" ".join(kept))
    return result

In [7]:
def calculate_terms(listed):
    """Build term statistics for a collection of raw text documents.

    Each element of ``listed`` is one document, identified by its position in
    the preprocessed list.

    Args:
        listed: iterable of raw text strings, passed to ``preprocess``.

    Returns:
        tuple: ``(normalized_text, collection_frequency, document_frequency,
        output_wordlist_dict, terms_frequency)`` where

        * ``normalized_text`` is the list returned by ``preprocess``;
        * ``collection_frequency`` is a Counter of term -> total occurrences
          over all documents;
        * ``document_frequency`` is a Counter of term -> number of documents
          containing the term;
        * ``output_wordlist_dict`` maps document index -> Counter of that
          document's term counts;
        * ``terms_frequency`` maps term -> Counter of document index -> count.
    """
    normalized_text = preprocess(listed)

    collection_frequency = Counter()
    document_frequency = Counter()
    output_wordlist_dict = {}
    # Counter itself is the default factory (instead of a lambda) so the
    # returned mapping can be pickled; lambdas cannot be.
    terms_frequency = defaultdict(Counter)

    for doc_index, text in enumerate(normalized_text):
        tokens = text.split()
        term_counts = Counter(tokens)
        output_wordlist_dict[doc_index] = term_counts

        collection_frequency.update(tokens)
        # Each distinct term counts once per document.
        document_frequency.update(term_counts.keys())

        for term, term_cnt in term_counts.items():
            terms_frequency[term][doc_index] += term_cnt

    return normalized_text, collection_frequency, document_frequency, output_wordlist_dict, terms_frequency

In [8]:
def dictionary_list(listed):
    """Map each term to ``(posting_count, offset)``, visiting terms in sorted order.

    ``listed`` maps term -> mapping of postings. ``posting_count`` is the
    number of entries in that term's postings, and ``offset`` is the running
    total of ``2 * posting_count`` over all terms that sort before it.

    Returns:
        dict: term -> ``(posting_count, offset)``, inserted in sorted term order.
    """
    dictionary = {}
    running_offset = 0
    for term in sorted(listed):
        postings = listed[term]
        dictionary[term] = (len(postings.values()), running_offset)
        running_offset += 2 * len(postings)
    return dictionary

In [9]:
def idf_corpus(dict_corpus, N_corpus):
    """Compute ``log2(N_corpus / count)`` for every term in ``dict_corpus``.

    Args:
        dict_corpus: mapping term -> sequence whose first element is the count
            used as the divisor (``dictionary_list`` stores the number of
            postings there).
        N_corpus: numerator of the ratio; the callers in this notebook pass
            the number of documents.

    Returns:
        dict: term -> idf value.
    """
    return {
        term: math.log2(N_corpus / entry[0])
        for term, entry in dict_corpus.items()
    }

In [10]:
def tf_idf(post_list, idf_matrix):
    """Weight every document's term counts by the supplied idf values.

    Args:
        post_list: mapping document key -> mapping term -> term frequency.
        idf_matrix: mapping term -> idf value. It is only read; unknown terms
            are no longer inserted into it, so a corpus idf table can safely
            be reused for query vectors.

    Returns:
        list[dict]: one ``term -> idf * tf`` dict per entry of ``post_list``,
        in iteration order. Terms missing from ``idf_matrix`` are left out of
        every document (they would carry weight 0 anyway). Previously such a
        term was omitted from the first document containing it but appeared
        with weight 0 in later ones.
    """
    weight_matrix = []
    for term_counts in post_list.values():
        weight_matrix.append({
            term: idf_matrix[term] * frequency
            for term, frequency in term_counts.items()
            if term in idf_matrix
        })
    return weight_matrix

In [11]:
def vector_length(weight):
    """Return the Euclidean length of each weight vector, keyed by position.

    Args:
        weight: sequence of dicts (term -> weight), indexed from 0.

    Returns:
        dict: position -> square root of the sum of squared weights.
    """
    norms = {}
    for position in range(len(weight)):
        squared_total = sum(value * value for value in weight[position].values())
        norms[position] = math.sqrt(squared_total)
    return norms

In [12]:
def cosine_similarities(doc_weight, query_weight, doc_length, query_length, query_term_freq):
    """Score every document against every query with cosine similarity.

    Args:
        doc_weight: sequence of dicts, term -> tf-idf weight, one per document.
        query_weight: sequence of dicts, term -> tf-idf weight, one per query.
        doc_length: vector length of each document, indexed by document position.
        query_length: vector length of each query, indexed by query position.
        query_term_freq: per-query term mapping, indexed by query position;
            only its keys (the query's terms) are used.

    Returns:
        list[list]: ``result[q][d]`` is the accumulated similarity of query
        ``q`` and document ``d``; entries that were never updated stay ``0``.
    """
    doc_count = len(doc_weight)
    cos_score = []
    for query_index in range(len(query_term_freq)):
        scores = [0] * doc_count
        cos_score.append(scores)
        for term in query_term_freq[query_index].keys():
            # A missing (or zero) query weight contributes 0 to the product.
            query_tfidf = query_weight[query_index].get(term) or 0
            for doc_index in range(doc_count):
                query_norm = query_length[query_index]
                doc_norm = doc_length[doc_index]
                # A zero-length vector has no direction: skip to avoid dividing by 0.
                if query_norm == 0 or doc_norm == 0:
                    continue
                doc_tfidf = doc_weight[doc_index].get(term)
                if not doc_tfidf:
                    continue
                # (weight in document * weight in query) / (|document| * |query|)
                scores[doc_index] += (doc_tfidf * query_tfidf) / (doc_norm * query_norm)
    return cos_score

In [13]:
def _load_pickle(path):
    """Open ``path`` in binary mode and return the unpickled object."""
    # pickle can run arbitrary code while loading: only read files you created.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Precomputed corpus statistics read from the pickle_set directory.
book_normalized_text = _load_pickle('pickle_set/book_norm_text.pickle')
book_collec_freq = _load_pickle('pickle_set/book_collec_freq.pickle')
book_doc_freq = _load_pickle('pickle_set/book_doc_freq.pickle')
book_term_freq = _load_pickle('pickle_set/book_term_freq.pickle')
book_dict_pos_output = _load_pickle('pickle_set/book_dict_pos_output.pickle')

In [14]:
# Corpus-side vectors: idf per term from the loaded dictionary (with N set to
# the number of normalized lines), tf-idf weights per document, and the
# length of each document's weight vector.
book_idf_matrix = idf_corpus(book_dict_pos_output,len(book_normalized_text))
book_weight = tf_idf(book_term_freq, book_idf_matrix)
book_length = vector_length(book_weight)

In [15]:
# Free-text queries to rank the corpus against; each list element is
# processed as its own query.
query_content= ['Jesus saves us']

In [16]:
# NOTE: the last two values returned by calculate_terms are
# (output_wordlist_dict, terms_frequency), so q_term_freq holds the per-query
# term counts and q_posting_list_output holds term -> {query index: count}.
q_normalized_text, q_collec_freq, q_doc_freq, q_term_freq, q_posting_list_output = calculate_terms(query_content)

In [17]:
# Query-side vectors. The query weights are computed with the corpus idf
# (book_idf_matrix), not q_idf_matrix; q_idf_matrix is not read by any later
# cell shown here.
q_dict_pos_output = dictionary_list(q_posting_list_output)
q_idf_matrix = idf_corpus(q_dict_pos_output,len(q_normalized_text))
q_weight = tf_idf(q_term_freq, book_idf_matrix)
q_length = vector_length(q_weight)

In [18]:
# One row per query, holding one similarity score per corpus document.
cos_score_q = cosine_similarities(book_weight, q_weight, book_length, q_length, q_term_freq)

In [19]:
# Pair every score with its document index so the index survives sorting:
# one list of [doc_index, score] pairs per query.
ranking_list = [
    [[doc_index, score] for doc_index, score in enumerate(scores)]
    for scores in cos_score_q
]

In [20]:
# Order each query's [doc_index, score] pairs from highest to lowest score, in place.
score_of = operator.itemgetter(1)
for ranked_pairs in ranking_list:
    ranked_pairs.sort(key=score_of, reverse=True)

In [21]:
# For each query, collect the original text (verse reference included) of the
# ten best-scoring lines.
every_result = [
    [book_content_w_ch[pair[0]] for pair in ranked_pairs[:10]]
    for ranked_pairs in ranking_list
]

In [22]:
# Echo the query list so the ranked lines printed next can be read in context.
print(query_content)

['Jesus saves us']


In [23]:
# Show the top-ranked lines for the first query. Each line still ends with
# its own newline from the file, so print() leaves a blank line between entries.
for i in every_result[0]:
    print(i)

2Sm22:32 For who is God, save the LORD? and who is a rock, save our God?

Psa18:31 For who is God save the LORD? or who is a rock save our God?

Mat17:8 And when they had lifted up their eyes, they saw no man, save Jesus only.

Psa55:16 As for me, I will call upon God; and the LORD shall save me.

Mark9:8 And suddenly, when they had looked round about, they saw no man any more, save Jesus only with themselves.

Mat1:21 And she shall bring forth a son, and thou shalt call his name JESUS: for he shall save his people from their sins.

Hos1:7 But I will have mercy upon the house of Judah, and will save them by the LORD their God, and will not save them by bow, nor by sword, nor by battle, by horses, nor by horsemen.

Psa69:1 Save me, O God; for the waters are come in unto my soul.

1Cor2:2 For I determined not to know any thing among you, save Jesus Christ, and him crucified.

Mat13:57 And they were offended in him. But Jesus said unto them, A prophet is not without honour, save in his ow

In [24]:
# Report the wall-clock seconds elapsed since start_time was recorded.
print("--- %s seconds ---" % (time.time() - start_time))

--- 2.50607967376709 seconds ---
