<h1>Train a Doc2Vec Model on the JFK Assassination Files and Deploy it as an Azure Function App</h1>

In [34]:
import gensim, logging, os, pickle
from utils.dataclean import dataclean
from utils.pickle_it import pickle_it
from utils.whoopsie import whoopsie

In [8]:
# Read a document from a filename, tagging it with a supplied tag list
def read_a_doc(fname, tags=None, tokens_only=False):
    """Read one text file, clean and tokenize it, and optionally tag it.

    Args:
        fname: Path of the UTF-8 text file to read.
        tags: List of tags to attach to the document. When omitted, a fresh
            ``[None]`` list is used for each call.
        tokens_only: If True, return only the token list (no tagging).

    Returns:
        The list of tokens when ``tokens_only`` is True, otherwise a
        ``gensim.models.doc2vec.TaggedDocument`` built from the tokens and
        ``tags``. Returns None when the file cannot be read; the error is
        reported through ``whoopsie``.
    """
    if tags is None:
        # Build the default per call: a list literal in the signature would be
        # a single object shared by every call and by every TaggedDocument
        # that stores it.
        tags = [None]
    try:
        with open(fname, encoding='utf-8') as f:
            doc = f.read()
    except Exception as exc:
        whoopsie(str(exc))
        return None
    # Send the raw text through the data cleaner, then tokenize it.
    tokens = gensim.utils.simple_preprocess(dataclean(doc))
    if tokens_only:
        return tokens
    # For training data, attach the tag list.
    # NOTE(review): the gensim docs describe tags as a list of str or int
    # tokens, so the string tags passed by read_corpus should be fine - confirm.
    return gensim.models.doc2vec.TaggedDocument(tokens, tags)

In [16]:
def read_corpus(corp_path):
    """Read every file in a directory into a list of tagged documents.

    Each file is passed to ``read_a_doc`` and tagged with its file name minus
    the extension. Files that cannot be read or that raise while being
    processed are reported through ``whoopsie`` and skipped. The resulting
    list is handed to ``pickle_it`` under the name 'jfk_corpus.pkl'.

    Args:
        corp_path: Directory containing the corpus files.

    Returns:
        The list of ``TaggedDocument`` objects that were read successfully.
    """
    corpus = []
    files = os.listdir(corp_path)
    for i, fn in enumerate(files):
        # Reset for every file: otherwise a failure would either leave tdoc
        # undefined (first file) or re-append the previous file's document.
        tdoc = None
        try:
            # splitext drops the real extension instead of assuming it is
            # exactly four characters long; identical for '.txt' files.
            tag = os.path.splitext(fn)[0]
            tdoc = read_a_doc(os.path.join(corp_path, fn), tags=[tag])
        except Exception as exc:
            whoopsie(str(exc))
        if isinstance(tdoc, gensim.models.doc2vec.TaggedDocument):
            corpus.append(tdoc)
        if i % 1000 == 0:
            print(f"Reading Corpus: {i} files read.")
    # Report the number of files visited; the loop index would be one short
    # and would not exist at all for an empty directory.
    print(f"Done reading {len(files)} files into the corpus.")
    pickle_it(corpus, 'jfk_corpus.pkl')
    print(f"Done saving corpus.")
    return corpus

In [None]:
# Build the (trimmed) corpus from the files in the "Corpus" directory.
# Any failure is reported through whoopsie rather than stopping the notebook.
try:
    corpus = read_corpus("Corpus")
except Exception as build_error:
    whoopsie(f"Corpus build failure: {build_error}")

In [31]:
# Reload the corpus from 'jfk_corpus.pkl' (the name read_corpus passes to
# pickle_it), so this cell can stand in for re-reading every file.
# NOTE(review): pickle.load can execute arbitrary code while unpickling; only
# load a jfk_corpus.pkl that this notebook produced itself.
with open("jfk_corpus.pkl", "rb") as f:
    corpus = pickle.load(f)

In [33]:
# Train the Doc2Vec model on the corpus and persist it.
try:
    print("Begin training model.")
    model = gensim.models.doc2vec.Doc2Vec(documents=corpus, vector_size=400, min_count=10, epochs=40)
    print(f"Done training model {model}.")
except Exception as exc:
    whoopsie("Model build failure: " + str(exc))
else:
    # Save only after a successful training run. Outside the try/else this
    # would raise NameError on a fresh kernel, or silently pickle a stale
    # model left over from an earlier run, after a training failure.
    pickle_it(model, "jfk_model.pkl")
    print("Done saving model.")

Begin training model.
Done training model Doc2Vec(dm/m,d400,n5,w5,mc10,s0.001,t3).
Done saving model.


<h2>Query the Model with a Photo Doc</h2>

In [35]:
# Free-text description used as the query document for the model.
# NOTE(review): presumably a caption plus tags generated for a photo - confirm the source.
photo_doc = "a red car parked on grass near a road and trees others outdoor grass vehicle outdoor land vehicle sky wheel transport car red convertible field sunset"

In [36]:
# Clean and tokenize the photo description, infer a vector for it, then look
# up the five closest document vectors in the trained model.
photo_tokens = gensim.utils.simple_preprocess(dataclean(photo_doc))
photo_vector = model.infer_vector(photo_tokens)
sims = model.dv.most_similar(positive=[photo_vector], topn=5)

In [37]:
# Display the (tag, similarity score) pairs returned for the query.
sims

[('docid-32273016', 0.4066036343574524),
 ('docid-32353931', 0.40359169244766235),
 ('docid-32351674', 0.39982426166534424),
 ('104-10160-10192', 0.3992921710014343),
 ('docid-32351663', 0.39185264706611633)]

In [41]:
# Print each matched document's tag followed by the full text of its file.
for doc_tag, _score in sims:
    doc_path = "Corpus" + "/" + doc_tag + ".txt"
    with open(doc_path, encoding="utf-8") as f:
        doc_text = f.read()
    print(f"{doc_tag}\n{doc_text}\n\n")

docid-32355845
 

HW 53199 Docld: 32355845 Page 2 : “ .
 

AGENCY

RECORD NUMBER
RECORD SERIES
AGENCY FILE NUMBER

AGENCY ORIGINATOR

FROM

“a TO
TITLE

DATE

PAGES

SUBJECTS

DOCUMENT TYPE
CLASSIFICATION
RESTRICTIONS
CURRENT STATUS

DATE OF LAST REVIEW
COMMENTS

" 80T01357A

JFK ASSASSINATION SYSTEM
IDENTIFICATION FORM

AGENCY INFORMATION Released under the John
’ Kennedy
CIA ; Kssassination Records
104-10098-10176 . ollection Act of 1992
JFK : (44 USC 2107 Note).

ase#:NU 53199 Date:
6-13-2017

 

DOCUMENT INFORMATION

CIA

COS, MEXICO

HEADQUARTERS (CHIEF, ? DIVISION)
PROCESSING OF TRAVEL INFORMATION
10/18/1963

1

TRAVEL DATA

PAPER
SECRET
1B
RELEASED IN PART PUBLIC - RELEASED WITH DELETIONS
06/21/96
JFK32 : Fll : 1996.06.21.18:50:58:440100

[R] - ITEM IS RESTRICTED 104-10098-10176

HW 53199 DocId:32355845 Page 1



104-10172-10040
 

 

104-10172-10040

 

 

Image Temporarily Not Available

 

 



docid-32199189
 
 

 

 

 

 

 

AGENCY:
RECORD NUMBER:
RECORD SERIES:

AGENCY F

In [39]:
# Self-similarity sanity check: re-infer a vector for every training document
# and look up its single nearest neighbour (topn=1) among the trained document
# vectors. docSims[i] is the one-element list [(tag, similarity)] returned for
# corpus[i].
# Careful! Document lookups go through model.dv.most_similar.
# NOTE(review): older gensim releases exposed this as model.docvecs, and a
# most_similar called on the word vectors matches words instead - confirm
# against the installed gensim version.
docSims = [
    model.dv.most_similar([model.infer_vector(doc.words)], topn=1)
    for doc in corpus
]

pickle_it(docSims, 'docSims.pkl')

# How often is the most similar document the document itself?
firstSims = [
    nearest[0][0] == doc.tags[0]
    for nearest, doc in zip(docSims, corpus)
]
# Guard the division so an empty corpus reports NaN instead of raising
# ZeroDivisionError.
accuracy = round(sum(firstSims) / len(corpus), 3) if corpus else float("nan")
print(f"Model: {model}, Accuracy: {accuracy}\n")


  docSims.append(model.docvecs.most_similar([inferred_vector], topn=1))


Model: Doc2Vec(dm/m,d400,n5,w5,mc10,s0.001,t3), Accuracy: 0.995



In [43]:
# Query the model with a free-text passage: clean and tokenize it, infer its
# vector, and display the five closest corpus documents.
test_doc = "Please present a demand for the extradition of Rolando Masferrer. Documents in this case were sent to your embassy some time agao. At the same time, you will ask for the provisional detention of Masferrer, which will not be difficult to obtain since he is referred to as a fugitive on parole who has to report periodically to the proper offices in Washington, where you should be able to get information. Acknowledge receipt by cable and let us know the result promptly."
test_tokens = gensim.utils.simple_preprocess(dataclean(test_doc))
test_vector = model.infer_vector(test_tokens)
sims = model.dv.most_similar(positive=[test_vector], topn=5)
sims

[('docid-33060064', 0.5931869149208069),
 ('104-10071-10255', 0.3542172610759735),
 ('docid-32202071', 0.3445645868778229),
 ('104-10185-10094', 0.34139484167099),
 ('docid-32308878', 0.3325188457965851)]