In [1]:
import nltk
from nltk.tokenize import RegexpTokenizer
from collections import Counter, defaultdict, OrderedDict
import re
import sys
import os
import math
import string
import time
import operator
from itertools import islice

# Fetch the NLTK stopword corpus; preprocess() reads it via nltk.corpus.stopwords.
nltk.download('stopwords')

import pickle

import warnings
# Suppress all warnings for the remainder of the session.
warnings.filterwarnings("ignore")

from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
# Fetch the WordNet data used by the WordNetLemmatizer instance created below.
nltk.download('wordnet')
# Module-level lemmatizer shared by preprocess().
lemmatizer = WordNetLemmatizer()
# NOTE(review): `ps` is not referenced by any cell visible in this notebook.
ps =PorterStemmer()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\dongj\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\dongj\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
# Load the corpus file line by line. The first line of the file is consumed
# by next() and is therefore not stored; every remaining line is kept
# verbatim (including its trailing newline).
book_content_w_ch = []
with open('kjv.txt') as f:
    next(f)
    book_content_w_ch.extend(f)

In [3]:
# Drop the leading reference token (the text before the first space, e.g.
# "John11:35") from every line, keeping the rest of the line untouched.
# The prefix is sliced off rather than removed with str.replace, so a repeat
# of the same text later in the line is left intact.
book_content = []
for verse_line in book_content_w_ch:
    reference = verse_line.split(' ')[0]
    book_content.append(verse_line[len(reference):])

In [4]:
def normalization(word):
    """Return a lower-cased copy of ``word`` with noise characters removed.

    Removed characters are: newlines, ASCII digits, and anything that is
    neither a word character nor whitespace (i.e. punctuation). Other
    whitespace and underscores are kept as they are.
    """
    # One pass deleting the union of the three character classes.
    return re.sub(r'[^\w\s]|[0-9]|\n', '', word.lower())

In [5]:
def preprocess(data):
    """Normalize, stopword-filter and lemmatize every line of ``data``.

    Each line is passed through ``normalization`` (which lower-cases it),
    split on whitespace, stripped of English stopwords, and each surviving
    token is lemmatized first as a verb and then as an adjective.

    Parameters
    ----------
    data : iterable of str
        Raw text lines.

    Returns
    -------
    list of str
        One space-joined string of lemmas per input line, in input order.
        A line with no surviving tokens yields an empty string.
    """
    # Load the stopword list once and keep it as a set: it does not change
    # between lines, and set membership avoids a linear scan per token.
    stopwords = set(nltk.corpus.stopwords.words("english"))

    result = []
    for line in data:
        # split() with no argument already discards leading/trailing whitespace.
        tokens = normalization(line).split()
        lemmas = [
            lemmatizer.lemmatize(lemmatizer.lemmatize(token, 'v'), 'a')
            for token in tokens
            if token not in stopwords
        ]
        result.append(" ".join(lemmas))
    return result

In [6]:
def calculate_terms(listed):
    """Preprocess ``listed`` and compute term statistics over it.

    Parameters
    ----------
    listed : iterable of str
        Raw text lines; each line is treated as one document.

    Returns
    -------
    tuple
        ``(normalized_text, collection_frequency, document_frequency,
        output_wordlist_dict, terms_frequency)`` where

        * ``normalized_text`` is the list returned by ``preprocess``;
        * ``collection_frequency`` is a Counter of total occurrences per term;
        * ``document_frequency`` is a Counter of how many documents contain
          each term;
        * ``output_wordlist_dict`` maps document index -> Counter(term -> count);
        * ``terms_frequency`` maps term -> Counter(document index -> count).
    """
    normalized_text = preprocess(listed)

    collection_frequency = Counter()
    document_frequency = Counter()
    output_wordlist_dict = {}
    for doc_index, text in enumerate(normalized_text):
        tokens = text.split()
        output_wordlist_dict[doc_index] = Counter(tokens)
        collection_frequency.update(tokens)
        # set() so that a term counts once per document.
        document_frequency.update(set(tokens))

    # Use the Counter class itself as the default factory: unlike a lambda,
    # it keeps the returned mapping picklable.
    terms_frequency = defaultdict(Counter)
    for doc_index, term_counts in output_wordlist_dict.items():
        for term, term_cnt in term_counts.items():
            terms_frequency[term][doc_index] += term_cnt

    return normalized_text, collection_frequency, document_frequency, output_wordlist_dict, terms_frequency

In [7]:
def dictionary_list(listed):
    """Build a sorted term dictionary with posting counts and offsets.

    ``listed`` maps each term to a mapping of its postings. Terms are visited
    in sorted order; each one is mapped to a ``(posting_count, offset)``
    tuple, where ``offset`` is the running total of ``2 * posting_count``
    over all terms that sort before it.
    """
    result_sort_dict = {}
    running_offset = 0
    for term in sorted(listed):
        postings = listed[term]
        result_sort_dict[term] = (len(postings.values()), running_offset)
        # Each posting takes two slots in the offset space.
        running_offset += len(postings) * 2
    return result_sort_dict

In [8]:
def idf_corpus(dict_corpus, N_corpus):
    """Compute ``log2(N_corpus / count)`` for every term in ``dict_corpus``.

    ``dict_corpus`` maps each term to a sequence whose first element is the
    count used as the divisor (the posting count when the input comes from
    ``dictionary_list``). Returns a dict mapping term -> IDF value.
    """
    return {
        term: math.log2(N_corpus / entry[0])
        for term, entry in dict_corpus.items()
    }

In [9]:
def tf_idf(post_list, idf_matrix):
    """Compute TF-IDF weights for every document in ``post_list``.

    Parameters
    ----------
    post_list : dict
        Maps a document key to a mapping of term -> term count.
    idf_matrix : dict
        Maps term -> IDF value. It is only read, never modified.

    Returns
    -------
    list of dict
        One ``{term: idf * count}`` dict per document, in the iteration order
        of ``post_list``. Terms that have no entry in ``idf_matrix`` are left
        out of the document's dict.
    """
    weight_matrix = []
    for term_counts in post_list.values():
        # Skip unknown terms instead of inserting a 0 into idf_matrix: the
        # caller's IDF table stays untouched and every document is treated
        # the same way regardless of processing order.
        weights = {
            term: idf_matrix[term] * term_counts[term]
            for term in term_counts
            if term in idf_matrix
        }
        weight_matrix.append(weights)
    return weight_matrix

In [10]:
def vector_length(weight):
    """Return the Euclidean length of each weight vector in ``weight``.

    ``weight`` is indexed by position ``0 .. len(weight) - 1``; each entry is
    a mapping whose values are the vector components. The result maps each
    position to the square root of the sum of its squared components
    (``0.0`` for an empty mapping).
    """
    return {
        doc_i: math.sqrt(sum(component * component for component in weight[doc_i].values()))
        for doc_i in range(len(weight))
    }

In [11]:
def cosine_similarities(doc_weight, query_weight, doc_length, query_length, query_term_freq):
    """Score every document against every query by cosine similarity.

    Parameters
    ----------
    doc_weight : sequence of dict
        Per-document ``{term: weight}`` mappings, indexed by document position.
    query_weight : sequence of dict
        Per-query ``{term: weight}`` mappings, indexed by query position.
    doc_length : mapping
        Document position -> vector length of that document.
    query_length : mapping
        Query position -> vector length of that query.
    query_term_freq : mapping
        Query position -> mapping whose keys are the query's terms.

    Returns
    -------
    list of list
        ``result[q][d]`` is the accumulated score of document ``d`` for query
        ``q``. Entries that receive no contribution stay at ``0``; this covers
        zero-length documents or queries and documents sharing no weighted
        term with the query.
    """
    n_docs = len(doc_weight)
    cos_score = []
    for i in range(len(query_term_freq)):
        scores = [0] * n_docs
        cos_score.append(scores)

        terms = query_term_freq[i]
        # A zero-length query cannot be normalised, so nothing can score.
        if not terms or query_length[i] == 0:
            continue
        q_len = query_length[i]

        for term in terms:
            query_tfidf = query_weight[i].get(term)
            # A missing or zero query weight contributes exactly 0 to every
            # document, so the scan over all documents is skipped.
            if not query_tfidf:
                continue
            for k in range(n_docs):
                d_len = doc_length[k]
                if d_len == 0:
                    continue
                doc_tfidf = doc_weight[k].get(term)
                if doc_tfidf:
                    # (doc weight * query weight) / (doc length * query length)
                    scores[k] += (doc_tfidf * query_tfidf) / (d_len * q_len)
    return cos_score

In [12]:
# calculate_terms returns, in order: normalized text, collection frequency,
# document frequency, per-document term counts, per-term posting counts.
# Mind the names bound here: `book_term_freq` is the per-document mapping
# {doc index: Counter(term -> count)}, and `book_posting_list_output` is the
# per-term mapping {term: Counter(doc index -> count)}.
book_normalized_text, book_collec_freq, book_doc_freq, book_term_freq, book_posting_list_output = calculate_terms(book_content)

In [13]:
# Sorted term dictionary for the corpus: term -> (posting count, offset).
book_dict_pos_output = dictionary_list(book_posting_list_output)

In [14]:
# Persist the corpus statistics, one pickle file per object.
# Create the output directory first so a fresh checkout does not fail with
# FileNotFoundError on the first open().
os.makedirs('pickle_set', exist_ok=True)

for pickle_path, payload in (
    ('pickle_set/book_norm_text.pickle', book_normalized_text),
    ('pickle_set/book_collec_freq.pickle', book_collec_freq),
    ('pickle_set/book_doc_freq.pickle', book_doc_freq),
    ('pickle_set/book_term_freq.pickle', book_term_freq),
    ('pickle_set/book_dict_pos_output.pickle', book_dict_pos_output),
):
    with open(pickle_path, 'wb') as f:
        pickle.dump(payload, f)

In [15]:
# IDF per corpus term, with the number of corpus lines as N.
book_idf_matrix = idf_corpus(book_dict_pos_output,len(book_normalized_text))
# TF-IDF weights per document (book_term_freq holds the per-document term counts).
book_weight = tf_idf(book_term_freq, book_idf_matrix)
# Euclidean length of each document's weight vector, keyed by document index.
book_length = vector_length(book_weight)

In [16]:
# List of query strings; each entry is processed as its own query document.
query_content = ['jesus']

In [17]:
# Same pipeline as for the corpus. As there, `q_term_freq` is the per-query
# mapping {query index: Counter(term -> count)} and `q_posting_list_output`
# is the per-term mapping {term: Counter(query index -> count)}.
q_normalized_text, q_collec_freq, q_doc_freq, q_term_freq, q_posting_list_output = calculate_terms(query_content)

In [18]:
# Sorted term dictionary for the query terms.
q_dict_pos_output = dictionary_list(q_posting_list_output)
# NOTE(review): q_idf_matrix is not read by any cell visible here; the query
# weights on the next line are computed from book_idf_matrix instead.
q_idf_matrix = idf_corpus(q_dict_pos_output,len(q_normalized_text))
# Query TF-IDF weights, using the corpus IDF values.
q_weight = tf_idf(q_term_freq, book_idf_matrix)
# Euclidean length of each query's weight vector, keyed by query index.
q_length = vector_length(q_weight)

In [19]:
# One row per query, one column per corpus document: cos_score_q[query][doc].
cos_score_q = cosine_similarities(book_weight, q_weight, book_length, q_length, q_term_freq)

In [20]:
# Sanity check: number of score rows, i.e. one per query.
len(cos_score_q)

1

In [21]:
# Pair every score with its document index so the index survives sorting:
# ranking_list[query] is a list of [document index, score] pairs.
ranking_list = [
    [[doc_index, score] for doc_index, score in enumerate(query_scores)]
    for query_scores in cos_score_q
]

In [22]:
# Order each query's [document index, score] pairs in place, best score first.
for ranked_pairs in ranking_list:
    ranked_pairs.sort(key=operator.itemgetter(1), reverse=True)

In [23]:
# Collect the ten best hits of every query.
#   every_result   - one list per query holding the original corpus lines
#   chapter_result - document indices of the hits, flat across all queries
#   score_result   - scores of the hits, flat across all queries
every_result, chapter_result, score_result = [], [], []
for ranked_pairs in ranking_list:
    top_hits = ranked_pairs[:10]
    chapter_result.extend(doc_index for doc_index, _ in top_hits)
    score_result.extend(score for _, score in top_hits)
    result = [book_content_w_ch[doc_index] for doc_index, _ in top_hits]
    every_result.append(result)

In [24]:
# Display the scores of the collected top hits (highest first).
score_result

[0.5800317141770351,
 0.5660812308164758,
 0.5453317562279316,
 0.5339655815058644,
 0.51452203158029,
 0.5083034417835299,
 0.4947734725020178,
 0.49376078527543704,
 0.4928976184694945,
 0.4733574037957402]

In [25]:
# Number of hits kept for the first query (at most ten).
len(every_result[0])

10

In [26]:
# Echo the query list the results below belong to.
print(query_content)

['jesus']


In [27]:
# Display the raw corpus lines of the top hits, grouped per query.
every_result

[["2Cor4:5 For we preach not ourselves, but Christ Jesus the Lord; and ourselves your servants for Jesus' sake.\n",
  'John20:14 And when she had thus said, she turned herself back, and saw Jesus standing, and knew not that it was Jesus.\n',
  'John11:35 Jesus wept.\n',
  '1Th4:14 For if we believe that Jesus died and rose again, even so them also which sleep in Jesus will God bring with him.\n',
  'John21:4 But when the morning was now come, Jesus stood on the shore: but the disciples knew not that it was Jesus.\n',
  'Mat26:50 And Jesus said unto him, Friend, wherefore art thou come? Then came they, and laid hands on Jesus, and took him.\n',
  'Mark10:47 And when he heard that it was Jesus of Nazareth, he began to cry out, and say, Jesus, thou son of David, have mercy on me.\n',
  "John13:23 Now there was leaning on Jesus' bosom one of his disciples, whom Jesus loved.\n",
  'John4:26 Jesus saith unto her, I that speak unto thee am he.\n',
  '1Cor5:4 In the name of our Lord Jesus Chri

In [28]:
# NOTE(review): this pickle is not written by any cell visible in this
# notebook, so it must already exist on disk before this cell runs.
# pickle.load can execute arbitrary code while deserializing; only load
# files that come from a trusted source.
# The loaded object is indexed by document index in the printing cell below.
with open('pickle_set/bible_chapter_list.pickle', 'rb') as f:
    bible_chapter_list = pickle.load(f)

In [29]:
	# Print each top hit of the first query, prefixed with the entry that
	# bible_chapter_list holds for the hit's document index.
	for position, verse_line in enumerate(every_result[0]):
		chapter_label = bible_chapter_list[chapter_result[position]]
		print(str(chapter_label) + ' ' + str(verse_line))

2 Corinthians 4:5 2Cor4:5 For we preach not ourselves, but Christ Jesus the Lord; and ourselves your servants for Jesus' sake.

John 20:14 John20:14 And when she had thus said, she turned herself back, and saw Jesus standing, and knew not that it was Jesus.

John 11:35 John11:35 Jesus wept.

1 Thessalonians 4:14 1Th4:14 For if we believe that Jesus died and rose again, even so them also which sleep in Jesus will God bring with him.

John 21:4 John21:4 But when the morning was now come, Jesus stood on the shore: but the disciples knew not that it was Jesus.

Matthew 26:50 Mat26:50 And Jesus said unto him, Friend, wherefore art thou come? Then came they, and laid hands on Jesus, and took him.

Mark 10:47 Mark10:47 And when he heard that it was Jesus of Nazareth, he began to cry out, and say, Jesus, thou son of David, have mercy on me.

John 13:23 John13:23 Now there was leaning on Jesus' bosom one of his disciples, whom Jesus loved.

John 4:26 John4:26 Jesus saith unto her, I that speak 