In [1]:
import gensim
from gensim import corpora
from gensim import similarities
from gensim import models
from gensim.models import CoherenceModel

import preprocess

# Load LDA Model

In [2]:
# Load the persisted LDA model from disk.
lda_model_path = "Model/finalmodel_5Topics"
lda_disk = models.LdaModel.load(lda_model_path)

In [3]:
# Load the gensim Dictionary saved in the same Model/ directory as the LDA model.
dictionary_path = 'Model/finalmodel_Dictionary'
id2word = corpora.Dictionary.load(dictionary_path)

# Extract & Preprocess Test Data

In [4]:
# Read the test chapter files through the project's preprocess helper.
test_corpus = preprocess.load_corpus('Data/Test/Chapters/5827')
test_ids = test_corpus.fileids()

# Chapter labels are the file ids with the '.txt' text removed.
chapters_name = [file_id.replace('.txt', '') for file_id in test_ids]

# Turn the corpus into per-chapter documents for the steps below.
test_docs = preprocess.corpus2docs(test_corpus)

In [5]:
# Fit phrase detectors on the test documents (a higher threshold yields fewer phrases).
# NOTE(review): these phrase models are fitted on the test documents themselves;
# confirm this produces the same phrase tokens the loaded dictionary was built with.
bigram = models.Phrases(test_docs, min_count=5, threshold=50)
trigram = models.Phrases(bigram[test_docs], threshold=50)

# Wrap the fitted detectors in Phraser objects before applying them.
bigram_mod = models.phrases.Phraser(bigram)
trigram_mod = models.phrases.Phraser(trigram)

# Apply the bigram model first, then the trigram model on top of its output.
docs_bigrams = preprocess.make_bigrams(bigram_mod, test_docs)
data_bigrams_trigrams = preprocess.make_trigrams(bigram_mod, trigram_mod, docs_bigrams)

In [6]:
# Convert the phrase-merged test documents into vectors using the loaded dictionary;
# these vectors are what gets passed to the LDA model in the following cells.
# NOTE(review): presumably bag-of-words (token_id, count) pairs - confirm against preprocess.docs2vecs.
test_vecs = preprocess.docs2vecs(data_bigrams_trigrams, id2word)


In [7]:
# Print each test document's topic mixture followed by its most probable topic.
for i, doc_vec in enumerate(test_vecs):
    vector = lda_disk[doc_vec]
    sim_topic = max(vector, key=lambda pair: pair[1])
    print(f"test doc{i}: {vector!s}")
    print(f"Closest Topic: Topic {sim_topic[0]!s}")

test doc0: [(0, 0.8512395), (1, 0.031325232), (2, 0.042352553), (3, 0.02898035), (4, 0.04610237)]
Closest Topic: Topic 0
test doc1: [(0, 0.71457255), (1, 0.0607099), (2, 0.050569046), (3, 0.03013943), (4, 0.14400911)]
Closest Topic: Topic 0
test doc2: [(0, 0.79104424), (2, 0.1284489), (3, 0.017346406), (4, 0.05698666)]
Closest Topic: Topic 0
test doc3: [(0, 0.82206583), (1, 0.019720402), (2, 0.023332983), (3, 0.10450603), (4, 0.030374799)]
Closest Topic: Topic 0
test doc4: [(0, 0.78624094), (1, 0.023834568), (2, 0.07907826), (3, 0.061529584), (4, 0.04931661)]
Closest Topic: Topic 0
test doc5: [(0, 0.8121431), (1, 0.013335185), (2, 0.093561254), (3, 0.04968349), (4, 0.031277)]
Closest Topic: Topic 0
test doc6: [(0, 0.7700966), (1, 0.017697612), (2, 0.07986363), (3, 0.08581666), (4, 0.046525497)]
Closest Topic: Topic 0
test doc7: [(0, 0.7267715), (1, 0.035118613), (2, 0.07545919), (3, 0.1138732), (4, 0.04877754)]
Closest Topic: Topic 0
test doc8: [(0, 0.92338794), (1, 0.012864977), (2, 0

In [8]:
# Fetch the (word, weight) pairs of topic 0 across the whole vocabulary,
# then preview the first 20 entries.
vocab_size = len(id2word)
topic_word = lda_disk.show_topic(0, topn=vocab_size)
topic_word[:20]

[('conception', 0.027145082),
 ('government', 0.022720162),
 ('sensation', 0.018105727),
 ('image', 0.01726499),
 ('phenomenon', 0.015763616),
 ('pure', 0.01214446),
 ('belief', 0.011896276),
 ('proposition', 0.011019741),
 ('cognition', 0.010935685),
 ('city', 0.009880032),
 ('property', 0.009632613),
 ('intuition', 0.009512458),
 ('perception', 0.009453786),
 ('consciousness', 0.009450355),
 ('physical', 0.009419074),
 ('object', 0.008926799),
 ('empirical', 0.008017107),
 ('emotion', 0.007886184),
 ('representation', 0.007865984),
 ('space', 0.007719732)]

## Model 1 - Using Similarity Score

### Top Keywords

In [9]:
# Choose a chapter by its index in test_vecs and find its most probable topic.
chap_num = 4
vector = lda_disk[test_vecs[chap_num]]
sim_topic = max(vector, key=lambda pair: pair[1])
top_topic = sim_topic[0]

# (word, weight) pairs of that topic across the whole vocabulary.
topic_word = lda_disk.show_topic(top_topic, topn=len(id2word))

In [10]:
# Words of the chosen chapter: look up the first element of every vector entry in the dictionary.
chapter_vec = test_vecs[chap_num]
selected_words = [id2word[entry[0]] for entry in chapter_vec]
selected_words[:10]

['absurdity',
 'accept',
 'actually',
 'acute',
 'admit',
 'advance',
 'agree',
 'almost',
 'already',
 'also']

In [11]:
# Collect up to `max_keywords` (word, weight) pairs of the top topic whose word
# also occurs in the chosen chapter, in the order given by `topic_word`.
key_words = []

# 'unigrams' accepts any word; any other value keeps only words containing '_'.
keyword_type = 'bigrams'
max_keywords = 5

# Set membership avoids scanning the chapter's word list once per vocabulary term.
chapter_vocab = set(selected_words)

for word in topic_word:
    # Stop as soon as enough keywords are found instead of walking the whole vocabulary.
    if len(key_words) >= max_keywords:
        break
    term = word[0]
    if keyword_type != 'unigrams' and '_' not in term:
        continue
    if term in chapter_vocab:
        key_words.append(word)

key_words

[]

### Recommender

In [12]:
# Display the chapter names; a name's position in this list is the index used for chap_num.
chapters_name

['CHAPTER_III__THE_NATURE_OF_MATTER',
 'CHAPTER_II__THE_EXISTENCE_OF_MATTER',
 'CHAPTER_IV__IDEALISM',
 'CHAPTER_IX__THE_WORLD_OF_UNIVERSALS',
 'CHAPTER_I__APPEARANCE_AND_REALITY',
 'CHAPTER_VIII__HOW__A_PRIORI__KNOWLEDGE_IS_POSSIBLE',
 'CHAPTER_VII__ON_OUR_KNOWLEDGE_OF_GENERAL_PRINCIPLES',
 'CHAPTER_VI__ON_INDUCTION',
 'CHAPTER_V__KNOWLEDGE_BY_ACQUAINTANCE_AND_KNOWLEDGE_BY_DESCRIPTION',
 'CHAPTER_XIII__KNOWLEDGE,_ERROR,_AND_PROBABLE_OPINION',
 'CHAPTER_XII__TRUTH_AND_FALSEHOOD',
 'CHAPTER_XIV__THE_LIMITS_OF_PHILOSOPHICAL_KNOWLEDGE',
 'CHAPTER_XI__ON_INTUITIVE_KNOWLEDGE',
 'CHAPTER_X__ON_OUR_KNOWLEDGE_OF_UNIVERSALS']

In [13]:
# Model 1: recommend the chapters whose LDA topic vectors are most similar
# to the topic vector of the chosen chapter.
chap_num = 4
top_n = 3

chosen_chapter = chapters_name[chap_num]

# Index the topic vectors of all test chapters. num_features is set explicitly so the
# index width equals the model's topic count rather than being inferred from whichever
# topic ids happen to appear in the vectors.
similarity = similarities.MatrixSimilarity(
    lda_disk[test_vecs], num_features=lda_disk.num_topics
)

# Only the chosen chapter has to be queried against the index.
vector = lda_disk[test_vecs[chap_num]]
sims = list(enumerate(similarity[vector]))

# Rank every chapter (the chosen one included) by similarity, highest first.
ranked = sorted(sims, key=lambda pair: pair[1], reverse=True)
recommendation_scores = [[chapters_name[idx], score] for idx, score in ranked]

# Drop the chosen chapter by index rather than assuming it sorts to position 0,
# and slice so that a small corpus cannot raise IndexError.
recommendation = [
    (chapters_name[idx], score) for idx, score in ranked if idx != chap_num
][:top_n]

print(f'Top {top_n} Recommended Chapters:')
for rank, (name, score) in enumerate(recommendation, start=1):
    # str() keeps the short repr of numpy floats; plain f-string formatting would widen them.
    print(f'{rank}. {str(name)}({str(score)})')

Top 3 Recommended Chapters:
1. CHAPTER_VII__ON_OUR_KNOWLEDGE_OF_GENERAL_PRINCIPLES(0.9994335)
2. CHAPTER_VIII__HOW__A_PRIORI__KNOWLEDGE_IS_POSSIBLE(0.9993707)
3. CHAPTER_XIV__THE_LIMITS_OF_PHILOSOPHICAL_KNOWLEDGE(0.99810404)


## Model 2 - Using Distribution of Topics

In [14]:
# Show the chapter names again for reference; chap_num below indexes into this list.
chapters_name

['CHAPTER_III__THE_NATURE_OF_MATTER',
 'CHAPTER_II__THE_EXISTENCE_OF_MATTER',
 'CHAPTER_IV__IDEALISM',
 'CHAPTER_IX__THE_WORLD_OF_UNIVERSALS',
 'CHAPTER_I__APPEARANCE_AND_REALITY',
 'CHAPTER_VIII__HOW__A_PRIORI__KNOWLEDGE_IS_POSSIBLE',
 'CHAPTER_VII__ON_OUR_KNOWLEDGE_OF_GENERAL_PRINCIPLES',
 'CHAPTER_VI__ON_INDUCTION',
 'CHAPTER_V__KNOWLEDGE_BY_ACQUAINTANCE_AND_KNOWLEDGE_BY_DESCRIPTION',
 'CHAPTER_XIII__KNOWLEDGE,_ERROR,_AND_PROBABLE_OPINION',
 'CHAPTER_XII__TRUTH_AND_FALSEHOOD',
 'CHAPTER_XIV__THE_LIMITS_OF_PHILOSOPHICAL_KNOWLEDGE',
 'CHAPTER_XI__ON_INTUITIVE_KNOWLEDGE',
 'CHAPTER_X__ON_OUR_KNOWLEDGE_OF_UNIVERSALS']

In [15]:
# Model 2: recommend chapters whose leading topics match those of the chosen chapter.
# - keep the other documents whose two most probable topics match the chosen chapter's, in order
# - if that leaves fewer than `top_n` candidates, also accept documents sharing only the top topic
# - rank candidates by the absolute difference between their top-topic probability and the
#   chosen chapter's, smallest difference first, and report the closest ones
#
# `chap_num` is reused from the earlier cell that set it.
# chap_num = 0
top_n = 3


def _by_probability(bow):
    """Return lda_disk's (topic_id, probability) pairs for `bow`, most probable first."""
    return sorted(lda_disk[bow], key=lambda pair: pair[1], reverse=True)


def _leading_topics(topics, depth):
    """Return the ids of the first `depth` topics (fewer if the list is shorter)."""
    return [pair[0] for pair in topics[:depth]]


# Run the model once per document and reuse the result for every comparison below.
doc_topics = [_by_probability(bow) for bow in test_vecs]

vector_selected = doc_topics[chap_num]
input_topic = vector_selected


def _matching_docs(depth):
    """Map doc index -> topics for the other documents sharing the chosen chapter's first `depth` topic ids."""
    wanted = _leading_topics(vector_selected, depth)
    return {
        i: topics
        for i, topics in enumerate(doc_topics)
        if i != chap_num and _leading_topics(topics, depth) == wanted
    }


# Slicing inside _leading_topics means a document with a single reported topic
# is compared safely instead of raising IndexError.
reco_docs = _matching_docs(2)

# Too few documents share both leading topics: also accept a match on the top topic only.
# update() keeps the two-topic matches first and appends the additional ones.
if len(reco_docs) < top_n:
    reco_docs.update(_matching_docs(1))

# Distance of each candidate's top-topic probability from the chosen chapter's.
diff_dict = {
    i: abs(input_topic[0][1] - topics[0][1]) for i, topics in reco_docs.items()
}
diff_dict = dict(sorted(diff_dict.items(), key=lambda item: item[1]))

lst = list(diff_dict.keys())

print(f'Top {top_n} Recommended Chapters:')
for rank, doc_index in enumerate(lst[:top_n], start=1):
    print(f'{rank}. {chapters_name[doc_index]}')

Top 3 Recommended Chapters:
1. CHAPTER_XIV__THE_LIMITS_OF_PHILOSOPHICAL_KNOWLEDGE
2. CHAPTER_IV__IDEALISM
3. CHAPTER_XIII__KNOWLEDGE,_ERROR,_AND_PROBABLE_OPINION
