In [1]:
import gensim
from gensim import corpora
from gensim import similarities
from gensim import models
from gensim.models import CoherenceModel

import preprocess

# Load LDA Model

In [2]:
lda_disk=gensim.models.ldamodel.LdaModel.load("Model/finalmodel_5Topics")

In [3]:
id2word = corpora.Dictionary.load('Model/finalmodel_Dictionary')

# Extract & Preprocess Test Data

In [4]:
test_corpus_1974 = preprocess.load_corpus('Data/Test/Chapters/1974')

test_ids = test_corpus_1974.fileids()
chapters_name = [id.replace('.txt','') for id in test_ids]

test_docs_1974 = preprocess.corpus2docs(test_corpus_1974)

In [5]:
def make_bigrams(bigram_mod, texts):
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(bigram_mod, trigram_mod, texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

In [6]:
bigram = gensim.models.Phrases(test_docs_1974, min_count=5, threshold=50) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[test_docs_1974], threshold=50)  

bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

docs_bigrams = make_bigrams(bigram_mod, test_docs_1974)
data_bigrams_trigrams = make_trigrams(bigram_mod, trigram_mod, docs_bigrams)

In [7]:
test_vecs_1974 = preprocess.docs2vecs(data_bigrams_trigrams, id2word)
similarity = similarities.MatrixSimilarity(lda_disk[test_vecs_1974])

In [8]:
for i in range(0,len(test_vecs_1974)):
    vector = lda_disk[test_vecs_1974[i]]
    sim_topic = max(vector,key=lambda item:item[1])
    print("test doc" + str(i) + ": " + str(vector))
    print("Closest Topic: Topic " + str(sim_topic[0]))

test doc0: [(0, 0.01218065), (1, 0.35396844), (2, 0.11819526), (3, 0.48850587), (4, 0.027149793)]
Closest Topic: Topic 3
test doc1: [(0, 0.06205687), (1, 0.1359066), (2, 0.06236717), (3, 0.69744503), (4, 0.04222432)]
Closest Topic: Topic 3
test doc2: [(0, 0.03600633), (1, 0.34787714), (2, 0.0643663), (3, 0.4853912), (4, 0.066358976)]
Closest Topic: Topic 3
test doc3: [(0, 0.052230693), (1, 0.28845716), (2, 0.033839863), (3, 0.5125502), (4, 0.112922125)]
Closest Topic: Topic 3
test doc4: [(0, 0.025988597), (1, 0.042162128), (2, 0.12814552), (3, 0.7267955), (4, 0.07690824)]
Closest Topic: Topic 3
test doc5: [(0, 0.041118626), (1, 0.114718914), (2, 0.082272336), (3, 0.6925426), (4, 0.0693475)]
Closest Topic: Topic 3
test doc6: [(0, 0.09242194), (1, 0.31327662), (2, 0.1459558), (3, 0.36468288), (4, 0.083662726)]
Closest Topic: Topic 3
test doc7: [(0, 0.035176884), (2, 0.110975124), (3, 0.74904734), (4, 0.10462521)]
Closest Topic: Topic 3
test doc8: [(0, 0.035921), (1, 0.13626826), (2, 0.09

In [9]:
topic_word = lda_disk.show_topic(0, topn=len(id2word))
topic_word[0:20]

[('conception', 0.03775606),
 ('zarathustra', 0.028077127),
 ('thou', 0.024449555),
 ('hath', 0.018459603),
 ('pure', 0.016960025),
 ('phenomenon', 0.015507126),
 ('thee', 0.0153419515),
 ('unto', 0.01463752),
 ('cognition', 0.013255156),
 ('intuition', 0.013230192),
 ('empirical', 0.010757555),
 ('social', 0.010513507),
 ('activity', 0.010492589),
 ('education', 0.0104921805),
 ('representation', 0.009958939),
 ('priori', 0.009342978),
 ('verily', 0.009038341),
 ('doth', 0.008945876),
 ('space', 0.008808358),
 ('transcendental', 0.008680221)]

## Model 1 - Using Similarity Score

### Top Keywords

In [10]:
chap_num = 4
vector = lda_disk[test_vecs_1974[chap_num]]
sim_topic = max(vector,key=lambda item:item[1])
top_topic = sim_topic[0]
top_topic
topic_word = lda_disk.show_topic(top_topic, topn=len(id2word))
# topic_word

In [11]:
selected_words = [id2word[i[0]] for i in test_vecs_1974[chap_num]]
selected_words[0:10]

['action',
 'admits',
 'admitted',
 'already',
 'also',
 'author',
 'change',
 'character',
 'common',
 'confine']

In [22]:
key_words = []

keyword_type = 'bigrams'
for word in topic_word:
    if(keyword_type == 'unigrams'):
        if (len(key_words) < 5) & (word [0] in selected_words):
            key_words.append(word)
    else:
        if ('_' in word[0]) & (len(key_words) < 5) & (word [0] in selected_words):
            key_words.append(word)

key_words

[('epic_poetry', 0.00058412395)]

### Recommender

In [26]:
chap_num = 4

chosen_chapter = chapters_name[chap_num]
recommendation_scores = []

for i in range(0,len(test_vecs_1974)):
    vector = lda_disk[test_vecs_1974[i]]
    sim_topic = max(vector,key=lambda item:item[1])
    
    if(i == chap_num):
        sims = similarity[vector]
        sims = list(enumerate(sims))
        for sim in sims:
            chapter_num = sim[0]
            recommendation_score = [chapters_name[chapter_num], sim[1]]
            recommendation_scores.append(recommendation_score)
        
recommendation_scores = sorted(recommendation_scores, key=lambda x: x[1], reverse=True)     
recommendation = []


print('Top 3 Recommended Chapters:')
for i in range(1,4):
    recommendation.append((recommendation_scores[i][0], recommendation_scores[i][1]))
    print(f'{i}. {str(recommendation_scores[i][0])}({str(recommendation_scores[i][1])})')

Top 3 Recommended Chapters:
1. XVIII(0.9980166)
2. XI(0.9974043)
3. XXIV(0.9947127)


## Model 2 - Using Distribution of Topics

In [25]:
# if first digit in tuple matches, store doc
# reco_docs -- dont remove user input
# - get index of user input within reco_docs
# - compare difference between first topic probability and store as list; e.g. {4: 0.002,...}
# - take top 5 minimum difference

j = 4
vector_selected = lda_disk[test_vecs_1974[j]]
vector_selected.sort(key=lambda x: x[1], reverse=True)
vector_selected

reco_docs = {}

for i in range(0,len(test_vecs_1974)):
    vector = lda_disk[test_vecs_1974[i]]
    vector.sort(key=lambda x: x[1], reverse=True)
    
    if (vector[0][0]==vector_selected[0][0]) & (vector[1][0] == vector_selected[1][0]):
        reco_docs[i] = vector
#         print(vector[0][1])

# print(reco_docs)

input_topic = reco_docs[j]

# Remove user input
if j in reco_docs.keys():
    reco_docs.pop(j)

diff = ""
diff_dict = {}
for x, y in reco_docs.items():
    diff = abs(input_topic[0][1] - reco_docs[x][0][1])
    diff_dict[x] = diff

diff_dict = sorted(diff_dict.items(), key=lambda x:x[1], reverse=False)
diff_dict = dict(diff_dict)
# print(diff_dict)

lst = list(diff_dict.keys())
lst[0:5]

print('Top 3 Recommended Chapters:')
for i in range(0,3):
    print(f'{i+1}. {chapters_name[i]}')

Top 3 Recommended Chapters:
1. I
2. III
3. IV
