In [11]:
import os
import logging
import json
import re
import pandas as pd
import numpy as np
from collections import Counter
from functools import reduce
from itertools import chain, combinations

from nltk import word_tokenize, pos_tag
from nltk.corpus import stopwords 
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem.snowball import SnowballStemmer
from string import punctuation

import gensim
from gensim.parsing.preprocessing import STOPWORDS
from gensim.corpora import Dictionary
from gensim.utils import smart_open, simple_preprocess
from gensim.models.word2vec import Word2Vec
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
LDA = gensim.models.ldamodel.LdaModel


from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn import preprocessing
from matplotlib import pyplot as plt

import logging
logging.basicConfig(format='%(levelname)s : %(message)s', level=logging.INFO)
logging.root.level = logging.INFO  # ipython sometimes messes up the logging setup; restore

## Data
Load the verse-level CSV files for the three texts: the Old Testament, the New Testament and the Quran.

In [12]:
# Load one DataFrame of verses per text; the paths are relative to the notebook's directory.
ot_df = pd.read_csv("../data/old-testament-verses.csv")
nt_df = pd.read_csv("../data/new-testament-verses.csv")

# Rename the Quran columns positionally so that its text column is called
# "VerseText" (it is "AyahText" in the CSV), the column name read by
# get_lines() and get_word2vec_docs().
# NOTE(review): this assumes the CSV has exactly these four columns in this
# order -- confirm against the file.
q_df = pd.read_csv("../data/quran-verses.csv")
q_df.columns = ['DatabaseID', 'SuraID', 'VerseID', 'VerseText']

In [13]:
def get_lines(df):
    """Return the values of the "VerseText" column of `df` as a plain list."""
    return [verse for verse in df["VerseText"]]

In [14]:
# Mutable copy of gensim's built-in stop-word collection; extended further below.
all_stops = list(STOPWORDS)

In [15]:
# Corpus-specific stop words added on top of gensim's list: very frequent
# words of these translations ("thou", "hath", "saith", ...) plus what look
# like tokenizer / markup leftovers ("'s", "quot" -- presumably from &quot;).
new_stops = [
    'hath', "'s", 'an', 'let', 'behold', 'went', 'o', 'hast', 'thine', 'like',
    'thing', 'things', 'quot', 'and', 'in', 'thou', 'thee', 'thy', 'unto', 'ye',
    'said', 'saith', 'shall', 'shalt', 'yea', 'thereof',
]
# Only append words that are not already present, so re-running this cell
# does not keep growing all_stops with duplicates.
all_stops += [w for w in new_stops if w not in all_stops]

In [16]:
# Helper functions
# Character class that matches any single character of string.punctuation.
regex = re.compile('[' + re.escape(punctuation) + ']')

def strip_punc(s):
    """Return `s` with every character of string.punctuation removed."""
    return re.sub(regex, '', s)

def clean_text(text):
    """Turn one verse into a list of lower-cased, lemmatized tokens.

    Pipeline:
      1. split `text` with nltk's word_tokenize;
      2. drop tokens that occur inside string.punctuation (i.e. bare
         punctuation marks) or whose lower-cased form is in `all_stops`;
      3. strip the remaining punctuation characters and lower-case;
      4. lemmatize with WordNetLemmatizer (default part of speech);
      5. drop tokens that are empty or a single character long.

    Parameters
    ----------
    text : str
        Raw verse text.

    Returns
    -------
    list of str
        The cleaned tokens, in their original order.
    """
    lemmatizer = WordNetLemmatizer()
    # Set lookup instead of scanning the all_stops list once per token.
    stop_words = set(all_stops)

    base = [strip_punc(token).lower() for token in word_tokenize(text)
            if token not in punctuation
            and token.lower() not in stop_words]

    lemmatized = [lemmatizer.lemmatize(token) for token in base]

    # Stripping punctuation can leave empty or one-letter fragments; drop them.
    return [token for token in lemmatized if len(token) > 1]

## Vocab preparation

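The goal is a single, holistic Doc2Vec model trained on all of the texts put together. Conceptually this is Word2Vec with an extra tag that records which religion's text each word belongs to (assigned according to a probability). The helpers below build the per-text vocabularies and the word-to-tag mappings used for that tagging.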

In [17]:
def get_vocab(df):
    """Return the set of distinct cleaned tokens found in the verses of `df`.

    Each verse is tokenized with clean_text(), the notebook's tokenizer. The
    previous version called `tokenize`, a name that is neither defined nor
    imported in this notebook.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a "VerseText" column.

    Returns
    -------
    set of str
    """
    vocab = set()
    for line in get_lines(df):
        # update() adds in place; avoids rebuilding a growing list per verse.
        vocab.update(clean_text(line))
    return vocab

In [18]:
def get_shared(sets):
    """Return the elements present in every set of `sets` (their intersection)."""
    def _both(left, right):
        return left & right

    return reduce(_both, sets)

In [19]:
def get_sym_diffs(sets):
    """Fold the symmetric difference over `sets`, left to right.

    The result holds the elements that occur in an odd number of the sets.
    """
    def _xor(accumulated, following):
        return accumulated.symmetric_difference(following)

    return reduce(_xor, sets)

In [198]:
def get_all_uniques(sets):
    """Return the elements that occur in exactly one of the input sets.

    The earlier implementation (symmetric difference minus intersection) only
    gives this answer for two or three sets: with four or more, an element
    shared by three sets survives the symmetric difference without being in
    the overall intersection. Counting memberships is correct for any number
    of sets and gives the same result as before for two or three.

    Parameters
    ----------
    sets : iterable of iterables
        The collections to compare; each is de-duplicated before counting.

    Returns
    -------
    set
        Elements belonging to exactly one input. An empty input yields an
        empty set; a single input yields all of its elements.
    """
    membership_counts = Counter(chain.from_iterable(set(s) for s in sets))
    return {item for item, count in membership_counts.items() if count == 1}

In [199]:
def get_unique_vocabs(vocabs_dict):
    """For each tag, list the words of its vocabulary that get_all_uniques()
    reports for the whole collection of vocabularies.

    `vocabs_dict` maps a tag to a vocabulary; the result maps the same tags to
    lists, keeping each vocabulary's iteration order.
    """
    uniques = get_all_uniques(vocabs_dict.values())
    return {
        tag: [word for word in vocab if word in uniques]
        for tag, vocab in vocabs_dict.items()
    }

In [200]:
def build_dict(vocab, tag):
    """Return a dict that maps every word in `vocab` to the same `tag`."""
    return dict.fromkeys(vocab, tag)

In [201]:
def merge(dicts):
    """Combine several dicts into one new dict.

    When a key appears in more than one input, the value from the later dict
    wins.
    """
    combined = {}
    for mapping in dicts:
        combined.update(mapping.items())
    return combined

In [202]:
def have_common(l1, l2):
    """Report whether `l1` and `l2` share at least one equal element.

    The first matching pair is printed as "Got same: x y". Elements are
    compared with ==, so they do not need to be hashable.

    Returns
    -------
    bool
        True as soon as a match is found, False when there is none (the
        earlier version fell off the end and returned None in that case).
    """
    for x in l1:
        for y in l2:
            if x == y:
                print("Got same:", x, y)
                return True
    return False

## Another Word2Vec attempt.

In [203]:
def normalize(l):
    """Min-max scale the values of `l` into the range [0, 1].

    Parameters
    ----------
    l : iterable of numbers
        Materialized once, so a generator is accepted as well.

    Returns
    -------
    list of float
        (v - min) / (max - min) for every value, in input order. An empty
        input gives []. If all values are equal there is no range to scale
        by, so every value maps to 0.0 instead of dividing by zero.
    """
    values = list(l)
    if not values:
        return []

    lmin = min(values)
    span = max(values) - lmin
    if span == 0:
        return [0.0 for _ in values]

    return [(v - lmin) / span for v in values]

In [26]:
def sort_dict(d, rev):
    """Return the (key, value) pairs of `d` as a list ordered by value.

    `rev` is passed on as the sort's `reverse` flag (truthy = descending).
    """
    def _value_of(pair):
        return pair[1]

    pairs = list(d.items())
    pairs.sort(key=_value_of, reverse=rev)
    return pairs

In [27]:
def get_word2vec_docs(df):
    """Return one cleaned token list per verse in the "VerseText" column of `df`.

    Each verse is passed through clean_text(); the verse order is kept.
    """
    return [clean_text(verse) for verse in df["VerseText"]]

In [28]:
def normalize_origin(model, origin_word):
    """Translate every word vector so that `origin_word` sits at the origin.

    The model is only read, never modified.

    Parameters
    ----------
    model : object exposing `wv.vocab` and `wv.get_vector(word)`
    origin_word : str
        Word whose vector becomes the zero vector.

    Returns
    -------
    dict
        word -> shifted vector, for every word in `model.wv.vocab`.
    """
    anchor = model.wv.get_vector(origin_word)

    # Offset that carries the anchor onto the zero vector (0 - anchor).
    offset = np.zeros(anchor.shape) - anchor

    return {
        token: model.wv.get_vector(token) + offset
        for token in model.wv.vocab
    }

In [29]:
# Clean and tokenize every verse of each text: one token list per verse.
ot_docs, nt_docs, q_docs = (
    get_word2vec_docs(frame) for frame in (ot_df, nt_df, q_df)
)

In [30]:
# Global hyper parameters
# Read by get_model() and passed to gensim's Word2Vec under the same names.
hp = {
    "size": 150, # dimensionality of each (dense) word vector
    "window": 20, # context size
    "min_count": 2, # words seen fewer times than this are dropped from the vocabulary
    "workers": 4, # number of training worker threads
    "iter": 10 # NOTE(review): never read by get_model(), which trains with its own epoch count
}

In [46]:
def get_model(docs, epochs=50):
    """Build and train a Word2Vec model on `docs` using the global `hp` settings.

    Parameters
    ----------
    docs : list of list of str
        One token list per verse. Must support len() and repeated iteration.
    epochs : int, optional
        Number of passes made by the explicit train() call. Defaults to 50,
        the value that used to be hard-coded here.

    Returns
    -------
    gensim.models.Word2Vec
        The trained model.
    """
    model = gensim.models.Word2Vec(
        docs,
        size=hp["size"],
        window=hp["window"],
        min_count=hp["min_count"],
        workers=hp["workers"])

    # NOTE(review): handing `docs` to the constructor already builds the
    # vocabulary and runs a first round of training (the training log shows a
    # completed "training on ... raw words" summary before EPOCH 1 of the call
    # below). The call below therefore continues training on top of that; it
    # does not "initialize similarities". Decide whether both rounds are wanted.
    model.train(docs, total_examples=len(docs), epochs=epochs)
    return model

## Aligning models to compare similar words across religions

Note that for the new Gensim versions, calls for .index2word, .vocab, .syn0 and .syn0norm should be replaced with .wv.index2word, .wv.vocab, .wv.syn0 and .wv.syn0norm respectively.

In [171]:
def intersection_align_gensim(models, words=None):
    """
    Restrict any number of gensim word2vec models to their shared vocabulary.
    Generalized from the original two-way intersection.

    Only the vocabulary common to all models is kept.
    If 'words' is set (as list or set), the vocabulary is intersected with it as well.
    Indices are re-organized from 0..N in order of descending frequency
    (= sum of the word's counts over all models), so after the call:
        -- Row i of every model's .wv.syn0 / .wv.vectors_norm belongs to the same word
        -- that word is .wv.index2word[i] in every model
    The .wv.vocab dictionary of each model is rebuilt with the new index,
    preserving the count.

    The models are modified IN PLACE, and both .wv.syn0 and .wv.vectors_norm
    are set to the re-ordered *normalized* vectors (as in the original code).

    Arguments:
    - `models`: list of gensim word2vec models.
    - `words`: optional list/set of words to additionally restrict to.

    Returns the `models` list that was passed in. If every model's vocabulary
    already equals the common vocabulary, nothing is changed.
    """

    # Get the vocab for each model
    vocabs = [set(m.wv.vocab.keys()) for m in models]

    # Find the common vocabulary
    common_vocab = reduce((lambda vocab1, vocab2: vocab1 & vocab2), vocabs)
    if words:
        common_vocab &= set(words)

    # If no alignment necessary because vocab is identical...
    if all(not vocab - common_vocab for vocab in vocabs):
        print("All identical!")
        return models

    # Otherwise sort by frequency (summed over all models)
    common_vocab = sorted(
        common_vocab,
        key=lambda w: sum(m.wv.vocab[w].count for m in models),
        reverse=True)

    # Then for each model...
    for m in models:

        # The normalized vectors only exist after init_sims(); compute them
        # here if the caller has not, instead of failing on a None array.
        if getattr(m.wv, "vectors_norm", None) is None:
            m.init_sims()

        # Replace old vectors_norm array with new one (rows of the common vocab,
        # in the new order)
        indices = [m.wv.vocab[w].index for w in common_vocab]
        new_arr = m.wv.vectors_norm[indices]
        m.wv.vectors_norm = m.wv.syn0 = new_arr

        # Replace old index2word with new one. Each model gets its own copy so
        # that a later change to one model's list cannot affect the others.
        m.wv.index2word = list(common_vocab)

        # Replace old vocab dictionary with new one (with common vocab)
        old_vocab = m.wv.vocab
        new_vocab = {}
        for new_index, word in enumerate(common_vocab):
            new_vocab[word] = gensim.models.word2vec.Vocab(
                index=new_index, count=old_vocab[word].count)
        m.wv.vocab = new_vocab

    return models

In [147]:
def measure_semantic_shift_by_neighborhood(model1,model2,word,k=25,verbose=False):
    """
    Basic implementation of William Hamilton (@williamleif) et al's measure of semantic change
    proposed in their paper "Cultural Shift or Linguistic Drift?" (https://arxiv.org/abs/1606.02821),
    which they call the "local neighborhood measure." They find this measure better suited to understand
    the semantic change of nouns owing to "cultural shift," or changes in meaning "local" to that word,
    rather than global changes in language ("linguistic drift") use that are better suited to a
    Procrustes-alignment method (also described in the same paper.)

    Arguments are:
    - `model1`, `model2`: Are gensim word2vec models.
    - `word` is a string representation of a given word.
    - `k` is the size of the word's neighborhood (# of its closest words in its vector space).
    - `verbose`: if True, print both neighborhoods.

    Returns the cosine distance between the two similarity vectors (0 means the
    word relates to the combined neighborhood identically in both models), or
    None (after printing a message) if `word` is missing from either model.
    """
    # Import function for cosine distance
    from scipy.spatial.distance import cosine

    # Check that this word is present in both models
    if word not in model1.wv.vocab or word not in model2.wv.vocab:
        print("!! Word %s not present in both models." % word)
        return None

    # Get the two neighborhoods. Queries go through `.wv`, like the vocabulary
    # checks in this function; the model-level shortcuts are deprecated.
    neighborhood1 = [w for w,c in model1.wv.most_similar(word,topn=k)]
    neighborhood2 = [w for w,c in model2.wv.most_similar(word,topn=k)]

    # Print?
    if verbose:
        print('>> Neighborhood of associations of the word "%s" in model1:' % word)
        print(', '.join(neighborhood1))
        print('>> Neighborhood of associations of the word "%s" in model2:' % word)
        print(', '.join(neighborhood2))

    # Get the 'meta' neighborhood (both combined)
    meta_neighborhood = list(set(neighborhood1)|set(neighborhood2))

    # Filter the meta neighborhood so that it contains only words present in both models
    meta_neighborhood = [w for w in meta_neighborhood if w in model1.wv.vocab and w in model2.wv.vocab]

    # For both models, get a similarity vector between the focus word and all of the words in the meta neighborhood
    vector1 = [model1.wv.similarity(word,w) for w in meta_neighborhood]
    vector2 = [model2.wv.similarity(word,w) for w in meta_neighborhood]

    # Compute the cosine distance *between* those similarity vectors
    dist=cosine(vector1,vector2)

    # Return this cosine distance -- a measure of the relative semantic shift for this word between these two models
    return dist
    
"""
Example usage:
model1 = [a gensim model I have for text published in the 1750s]
model2 = [a gensim model I have for text published in the 1850s]
# The word 'god' does not change much in meaning:
    In [61]: measure_semantic_shift_by_neighborhood(model1,model2,'god',k=10,verbose=True)
    
    >> Neighborhood of associations of the word "god" in model1:
    almighty, jehovah, creator, uncreated, omniscient, logos, righteousness, christ, redeemer, salvation
    >> Neighborhood of associations of the word "god" in model2:
    almighty, heaven, jehovah, creator, redeemer, christ, divine, righteousness, providence, saviour
    
    Out[61]: 0.011609088245951749
# The word 'matter' does, moving from meaning mainly the "matter" of the universe to "what is the matter":
    In [62]: measure_semantic_shift_by_neighborhood(model1,model2,'matter',k=10,verbose=True)
    
    >> Neighborhood of associations of the word "matter" in model1:
    cohesion, sediment, menstruum, purulent, conceivable, gelatinous, morbific, compression, cerebellum, divisible
    >> Neighborhood of associations of the word "matter" in model2:
    matters, question, subject, affair, substance, concernment, concerns, questions, controversy, discussion
    
    Out[62]: 0.0847526073498025
# The word 'station' changes even more, moving from meaning one's social rank or "station", to a train station:
    In [63]: measure_semantic_shift_by_neighborhood(model1,model2,'station',k=10,verbose=True)
    
    >> Neighborhood of associations of the word "station" in model1:
    stations, dation, sphere, employments, deg, vocation, personate, lowest, district, apprenticeship
    >> Neighborhood of associations of the word "station" in model2:
    stations, train, posts, position, situation, town, carriage, stationed, rank, cab
    
    Out[63]: 0.14173381265358098
"""

'\nExample usage:\nmodel1 = [a gensim model I have for text published in the 1750s]\nmodel2 = [a gensim model I have for text published in the 1850s]\n# The word \'god\' does not change much in meaning:\n    In [61]: measure_semantic_shift_by_neighborhood(model1,model2,\'god\',k=10,verbose=True)\n    \n    >> Neighborhood of associations of the word "god" in model1:\n    almighty, jehovah, creator, uncreated, omniscient, logos, righteousness, christ, redeemer, salvation\n    >> Neighborhood of associations of the word "god" in model2:\n    almighty, heaven, jehovah, creator, redeemer, christ, divine, righteousness, providence, saviour\n    \n    Out[61]: 0.011609088245951749\n# The word \'matter\' does, moving from meaning mainly the "matter" of the universe to "what is the matter":\n    In [62]: measure_semantic_shift_by_neighborhood(model1,model2,\'matter\',k=10,verbose=True)\n    \n    >> Neighborhood of associations of the word "matter" in model1:\n    cohesion, sediment, menstruum

# Create and train the models on each text

In [174]:
# Train one model per text; init_sims() is called on each model right after
# it has been trained, before the next text is processed.
_trained = []
for _docs in (ot_docs, nt_docs, q_docs):
    _model = get_model(_docs)
    _model.init_sims()
    _trained.append(_model)

ot_model, nt_model, q_model = _trained

INFO : collecting all words and their counts
INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO : PROGRESS: at sentence #10000, processed 97904 words, keeping 5861 word types
INFO : PROGRESS: at sentence #20000, processed 182562 words, keeping 8873 word types
INFO : collected 9377 word types from a corpus of 214613 raw words and 23145 sentences
INFO : Loading a fresh vocabulary
INFO : min_count=2 retains 6304 unique words (67% of original 9377, drops 3073)
INFO : min_count=2 leaves 211540 word corpus (98% of original 214613, drops 3073)
INFO : deleting the raw counts dictionary of 9377 items
INFO : sample=0.001 downsamples 44 most-common words
INFO : downsampling leaves estimated 184285 word corpus (87.1% of prior 211540)
INFO : estimated required memory for 6304 words and 150 dimensions: 10716800 bytes
INFO : resetting layer weights
INFO : training model with 4 workers on 6304 vocabulary and 150 features, using sg=0 hs=0 sample=0.001 negative=5 window=20
INF

INFO : worker thread finished; awaiting finish of 1 more threads
INFO : worker thread finished; awaiting finish of 0 more threads
INFO : EPOCH - 14 : training on 214613 raw words (184343 effective words) took 0.1s, 1431091 effective words/s
INFO : worker thread finished; awaiting finish of 3 more threads
INFO : worker thread finished; awaiting finish of 2 more threads
INFO : worker thread finished; awaiting finish of 1 more threads
INFO : worker thread finished; awaiting finish of 0 more threads
INFO : EPOCH - 15 : training on 214613 raw words (184191 effective words) took 0.1s, 1495515 effective words/s
INFO : worker thread finished; awaiting finish of 3 more threads
INFO : worker thread finished; awaiting finish of 2 more threads
INFO : worker thread finished; awaiting finish of 1 more threads
INFO : worker thread finished; awaiting finish of 0 more threads
INFO : EPOCH - 16 : training on 214613 raw words (184247 effective words) took 0.1s, 1623910 effective words/s
INFO : worker thr

INFO : worker thread finished; awaiting finish of 0 more threads
INFO : EPOCH - 36 : training on 214613 raw words (184246 effective words) took 0.1s, 1854841 effective words/s
INFO : worker thread finished; awaiting finish of 3 more threads
INFO : worker thread finished; awaiting finish of 2 more threads
INFO : worker thread finished; awaiting finish of 1 more threads
INFO : worker thread finished; awaiting finish of 0 more threads
INFO : EPOCH - 37 : training on 214613 raw words (184365 effective words) took 0.1s, 1866688 effective words/s
INFO : worker thread finished; awaiting finish of 3 more threads
INFO : worker thread finished; awaiting finish of 2 more threads
INFO : worker thread finished; awaiting finish of 1 more threads
INFO : worker thread finished; awaiting finish of 0 more threads
INFO : EPOCH - 38 : training on 214613 raw words (184508 effective words) took 0.1s, 1727642 effective words/s
INFO : worker thread finished; awaiting finish of 3 more threads
INFO : worker thr

INFO : training on a 302755 raw words (260365 effective words) took 0.2s, 1256647 effective words/s
INFO : precomputing L2-norms of word weight vectors
INFO : training model with 4 workers on 3417 vocabulary and 150 features, using sg=0 hs=0 sample=0.001 negative=5 window=20
INFO : worker thread finished; awaiting finish of 3 more threads
INFO : worker thread finished; awaiting finish of 2 more threads
INFO : worker thread finished; awaiting finish of 1 more threads
INFO : worker thread finished; awaiting finish of 0 more threads
INFO : EPOCH - 1 : training on 60551 raw words (52043 effective words) took 0.0s, 1660990 effective words/s
INFO : worker thread finished; awaiting finish of 3 more threads
INFO : worker thread finished; awaiting finish of 2 more threads
INFO : worker thread finished; awaiting finish of 1 more threads
INFO : worker thread finished; awaiting finish of 0 more threads
INFO : EPOCH - 2 : training on 60551 raw words (51992 effective words) took 0.0s, 1676564 effect

INFO : worker thread finished; awaiting finish of 2 more threads
INFO : worker thread finished; awaiting finish of 1 more threads
INFO : worker thread finished; awaiting finish of 0 more threads
INFO : EPOCH - 22 : training on 60551 raw words (52041 effective words) took 0.0s, 1703794 effective words/s
INFO : worker thread finished; awaiting finish of 3 more threads
INFO : worker thread finished; awaiting finish of 2 more threads
INFO : worker thread finished; awaiting finish of 1 more threads
INFO : worker thread finished; awaiting finish of 0 more threads
INFO : EPOCH - 23 : training on 60551 raw words (52033 effective words) took 0.0s, 1810132 effective words/s
INFO : worker thread finished; awaiting finish of 3 more threads
INFO : worker thread finished; awaiting finish of 2 more threads
INFO : worker thread finished; awaiting finish of 1 more threads
INFO : worker thread finished; awaiting finish of 0 more threads
INFO : EPOCH - 24 : training on 60551 raw words (52079 effective wo

INFO : worker thread finished; awaiting finish of 0 more threads
INFO : EPOCH - 44 : training on 60551 raw words (52092 effective words) took 0.0s, 1779732 effective words/s
INFO : worker thread finished; awaiting finish of 3 more threads
INFO : worker thread finished; awaiting finish of 2 more threads
INFO : worker thread finished; awaiting finish of 1 more threads
INFO : worker thread finished; awaiting finish of 0 more threads
INFO : EPOCH - 45 : training on 60551 raw words (52039 effective words) took 0.0s, 1600738 effective words/s
INFO : worker thread finished; awaiting finish of 3 more threads
INFO : worker thread finished; awaiting finish of 2 more threads
INFO : worker thread finished; awaiting finish of 1 more threads
INFO : worker thread finished; awaiting finish of 0 more threads
INFO : EPOCH - 46 : training on 60551 raw words (52026 effective words) took 0.0s, 1501947 effective words/s
INFO : worker thread finished; awaiting finish of 3 more threads
INFO : worker thread fi

INFO : EPOCH - 7 : training on 56975 raw words (48198 effective words) took 0.0s, 1706383 effective words/s
INFO : worker thread finished; awaiting finish of 3 more threads
INFO : worker thread finished; awaiting finish of 2 more threads
INFO : worker thread finished; awaiting finish of 1 more threads
INFO : worker thread finished; awaiting finish of 0 more threads
INFO : EPOCH - 8 : training on 56975 raw words (48097 effective words) took 0.0s, 1631510 effective words/s
INFO : worker thread finished; awaiting finish of 3 more threads
INFO : worker thread finished; awaiting finish of 2 more threads
INFO : worker thread finished; awaiting finish of 1 more threads
INFO : worker thread finished; awaiting finish of 0 more threads
INFO : EPOCH - 9 : training on 56975 raw words (48063 effective words) took 0.0s, 1722471 effective words/s
INFO : worker thread finished; awaiting finish of 3 more threads
INFO : worker thread finished; awaiting finish of 2 more threads
INFO : worker thread finis

INFO : worker thread finished; awaiting finish of 3 more threads
INFO : worker thread finished; awaiting finish of 2 more threads
INFO : worker thread finished; awaiting finish of 1 more threads
INFO : worker thread finished; awaiting finish of 0 more threads
INFO : EPOCH - 30 : training on 56975 raw words (48021 effective words) took 0.0s, 1739336 effective words/s
INFO : worker thread finished; awaiting finish of 3 more threads
INFO : worker thread finished; awaiting finish of 2 more threads
INFO : worker thread finished; awaiting finish of 1 more threads
INFO : worker thread finished; awaiting finish of 0 more threads
INFO : EPOCH - 31 : training on 56975 raw words (48152 effective words) took 0.0s, 1734268 effective words/s
INFO : worker thread finished; awaiting finish of 3 more threads
INFO : worker thread finished; awaiting finish of 2 more threads
INFO : worker thread finished; awaiting finish of 1 more threads
INFO : worker thread finished; awaiting finish of 0 more threads
IN

In [175]:
# INTERSECT THE MODELS ON THE COMMON VOCABULARY AND ALIGN.
# When the vocabularies differ, intersection_align_gensim rewrites each model's
# vocab, index2word and vector arrays in place; either way it returns the list
# it was given.
intersected_model = intersection_align_gensim([ot_model,nt_model,q_model])



In [176]:
# REASSIGN NEW INTERSECTED MODELS
# Same order as the list handed to intersection_align_gensim: OT, NT, Quran.
ot_model, nt_model, q_model = (
    intersected_model[0],
    intersected_model[1],
    intersected_model[2],
)

In [178]:
# comparing old testament to new testament, then new testament to quran because of the chronology of the religions
# Each call returns the cosine distance between the word's similarity vectors in
# the two models (None if the word is missing from either model); with
# verbose=True both 25-word neighbourhoods are printed.
word = 'light'
ot_to_nt = measure_semantic_shift_by_neighborhood(ot_model,nt_model,word,k=25,verbose=True)
nt_to_q = measure_semantic_shift_by_neighborhood(nt_model,q_model,word,k=25,verbose=True)

>> Neighborhood of associations of the word "light" in model1:
darkness, lamp, shining, dark, preserved, clear, blind, path, cloud, feareth, step, pure, night, morning, unjust, shine, prayer, shadow, perfect, guide, righteousness, rising, walk, living, tarry
>> Neighborhood of associations of the word "light" in model2:
shine, darkness, lighten, shadow, sun, moon, dark, arise, shining, crooked, pain, chamber, marvel, folly, glorious, weight, suddenly, graf, heat, burning, star, burned, walk, lack, end
>> Neighborhood of associations of the word "light" in model1:
shine, darkness, lighten, shadow, sun, moon, dark, arise, shining, crooked, pain, chamber, marvel, folly, glorious, weight, suddenly, graf, heat, burning, star, burned, walk, lack, end
>> Neighborhood of associations of the word "light" in model2:
darkness, depth, lamp, glass, glorious, lead, star, wonder, walk, lowest, similitude, run, oil, olive, seed, knewest, sun, abide, garden, guide, parable, blessed, double, draweth, mo



In [179]:
ot_to_nt, nt_to_q

(0.33415612515597537, 0.4437865219092505)

# Dimensionality reduction

In [180]:
# Project each model's word vectors onto two dimensions.
# The same PCA object is re-fitted for every model, so each projection has its
# own axes. Rows of each *_reduced array follow the iteration order of that
# model's wv.vocab.
pca = PCA(n_components=2)

# Vectors are read through `.wv`; indexing the model object itself is a
# deprecated gensim shortcut (presumably the source of the warning this cell
# used to print).
ot_X = ot_model.wv[list(ot_model.wv.vocab)]
ot_reduced = pca.fit_transform(ot_X)

nt_X = nt_model.wv[list(nt_model.wv.vocab)]
nt_reduced = pca.fit_transform(nt_X)

q_X = q_model.wv[list(q_model.wv.vocab)]
q_reduced = pca.fit_transform(q_X)

  This is separate from the ipykernel package so we can avoid doing imports until
  
  if __name__ == '__main__':


In [181]:
# wv_dict[text][word] -> [x, y]: the word's 2-D PCA coordinates in that text's
# model, stored as plain Python floats. Row i of each reduced array belongs to
# the i-th word of the model's wv.vocab.
wv_dict = {
    rel: {
        token: [float(reduced[row][0]), float(reduced[row][1])]
        for row, token in enumerate(list(rel_model.wv.vocab))
    }
    for rel, rel_model, reduced in (
        ("ot", ot_model, ot_reduced),
        ("nt", nt_model, nt_reduced),
        ("q", q_model, q_reduced),
    )
}

## Polygon construction from Vectors

In [182]:
from shapely import geometry as g
from dxfwrite import DXFEngine as dxf
from dxfwrite.const import CENTER

In [183]:
# Words whose neighbourhoods are compared across the three texts. Each one is
# looked up directly in every text's 2-D vector table, so a word missing from
# the shared vocabulary raises a KeyError there.
topics = ['god', 'heaven', 'hell', 'love', 'hate', 'free', 'light', 'darkness', 'peace', 'war', 'life', 'death', 'man', 'woman', 'child', 'eat']

In [193]:
# For every topic word and every text, record the topic's 2-D position ("vec")
# and the 2-D positions of its five most similar words in that text's model
# ("sim", keyed by word).
models_by_rel = {"ot": ot_model, "nt": nt_model, "q": q_model}
topic_vecs = {}

for t in topics:
    topic_vecs[t] = {}

    # go through each religion
    for rel in wv_dict:
        model = models_by_rel[rel]

        # Query through `.wv`; the model-level most_similar is deprecated.
        most_sim = model.wv.most_similar(positive=[t], topn=5)

        topic_vecs[t][rel] = {
            "vec": wv_dict[rel][t],
            "sim": {w: wv_dict[rel][w] for w, _score in most_sim}
        }



In [196]:
# Normalize the vectors around each chosen word so every chosen word becomes (0,0)
for rel_data in topic_vecs.values():
    for vecs in rel_data.values():

        # Translation that carries the topic's own point onto the origin.
        origin = np.zeros(len(vecs['vec']))
        shift = origin - vecs['vec']

        # Apply the same translation to each of the topic's nearest words and
        # store the result as a new list under the same key.
        neighbours = vecs['sim']
        for name in neighbours:
            neighbours[name] = list(neighbours[name] + shift)

        # The topic itself now sits at the origin.
        vecs['vec'] = list(origin)

In [197]:
topic_vecs

{'god': {'ot': {'vec': [0.0, 0.0],
   'sim': {'lord': [-0.07347187446430326, 0.38287484645843506],
    'bless': [0.09206427587196231, 0.6182343810796738],
    'worshipped': [-0.26532349782064557, 0.5583777129650116],
    'quickly': [-0.17355159716680646, 0.6220635175704956],
    'worship': [-0.027384320739656687, 0.6505152434110641]}},
  'nt': {'vec': [0.0, 0.0],
   'sim': {'fellowship': [0.10126769542694092, -0.3102370798587799],
    'salvation': [0.039955079555511475, -0.28650394082069397],
    'true': [0.2374594658613205, -0.5676029026508331],
    'glorify': [0.27052345871925354, -0.37814679741859436],
    'dominion': [0.08419904112815857, -0.5259913802146912]}},
  'q': {'vec': [0.0, 0.0],
   'sim': {'worship': [-0.08062776923179626, 0.1715126633644104],
    'idol': [0.2813589090947062, 0.2282082810997963],
    'forbear': [0.20230995118618011, -0.037356674671173096],
    'worshipper': [0.09774824976921082, -0.11156845092773438],
    'taketh': [0.40792398154735565, 0.2301667630672454

In [79]:
"""
word {
    "ot": [],
    "nt"
}
"""
# polygon_data[topic][text] = {"poly": shapely Polygon, "words": [word, ...]}
# The polygon's vertices are the (centred) 2-D points of the topic's nearest
# words, taken in the order they are stored in topic_vecs; "words" lists the
# words in that same order.
# NOTE(review): nothing orders the vertices around the polygon, so the ring
# may cross itself -- confirm that this is acceptable for the drawings.
polygon_data = {}

for t, rel_data in topic_vecs.items():

    polygon_data[t] = {}

    for d, vecs in rel_data.items():
        sim_vecs = list(vecs['sim'].items())

        polygon_data[t][d] = {
            "poly": g.Polygon([(v[1][0], v[1][1]) for v in sim_vecs]),
            "words": [v[0] for v in sim_vecs]
        }

In [81]:
print ("Creating polygon / DXF data")

# Writes one DXF file per topic and text:
#   ../data/analyzed/dxf/religion-specific/<topic>/<topic>-<text>.dxf
# Each file holds the raw polygon (dotted, layer INTERNAL), its convex hull
# (continuous, layer OUTLINE) and one text label per word (layer TEXT).
for topic in polygon_data:

    print(topic)
    all_meta = polygon_data[topic]

    # CONFIGURE THE FILE PATHS FOR SAVING
    # One folder per topic. makedirs also creates missing parent folders and,
    # with exist_ok=True, does not fail when the folder is already there.
    folder_path = '../data/analyzed/dxf/religion-specific/{}'.format(topic)
    os.makedirs(folder_path, exist_ok=True)

    for rel in all_meta:

        # grab the specific religions meta data
        rel_meta = all_meta[rel]

        # all the words
        rel_words = rel_meta['words']

        # grab the convex hull and regular polygon data
        rel_convex = list(rel_meta['poly'].convex_hull.exterior.coords)
        rel_poly = list(rel_meta['poly'].exterior.coords)

        #####################################
        # DXF
        #####################################

        drawing = dxf.drawing('{}/{}-{}.dxf'.format(folder_path, topic, rel))

        line = dxf.polyline(linetype='DOT', layer ='INTERNAL')
        outline = dxf.polyline(linetype='CONTINUOUS', layer = 'OUTLINE')
        line.add_vertices(rel_poly)
        outline.add_vertices(rel_convex)

        text_layer = dxf.layer('TEXT')
        drawing.layers.add(text_layer)

        # ADD THE TEXT
        # zip() stops at the shorter sequence, so each word is paired with one
        # polygon coordinate and any extra coordinates get no label.
        for label_word, label_point in zip(rel_words, rel_poly):
            label = dxf.text(label_word, label_point, height=0.0125, rotation=0, layer = 'TEXT')
            drawing.add(label)

        # Close the lines
        outline.close()
        line.close()

        drawing.add(outline)
        drawing.add(line)
        drawing.save()

Creating polygon / DXF data
god
heaven
hell
love
hate
free
light
darkness
peace
war
life
death
man
woman
child
eat
