In [1]:
import os
import logging
import json
import re
import pandas as pd
import numpy as np
from collections import Counter
from functools import reduce
from itertools import chain, combinations

from nltk import word_tokenize, pos_tag
from nltk.corpus import stopwords 
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem.snowball import SnowballStemmer
from string import punctuation

import gensim
from gensim.parsing.preprocessing import STOPWORDS
from gensim.corpora import Dictionary
from gensim.utils import smart_open, simple_preprocess
from gensim.models.word2vec import Word2Vec
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
LDA = gensim.models.ldamodel.LdaModel


from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn import preprocessing
from matplotlib import pyplot as plt

import logging
logging.basicConfig(format='%(levelname)s : %(message)s', level=logging.INFO)
logging.root.level = logging.INFO  # ipython sometimes messes up the logging setup; restore

## Data 
Getting the data from the texts.

In [2]:
ot_df = pd.read_csv("../data/old-testament-verses.csv")
nt_df = pd.read_csv("../data/new-testament-verses.csv")

#rename from AyahText to VerseText
q_df = pd.read_csv("../data/quran-verses.csv")
q_df.columns = ['DatabaseID', 'SuraID', 'VerseID', 'VerseText']

In [3]:
otv = ot_df[ot_df['VerseText'].str.contains("believe")]
ntv = nt_df[nt_df['VerseText'].str.contains("believe")]
qv = q_df[q_df['VerseText'].str.contains("believe")]

In [5]:
def get_lines(df):
    return list(df["VerseText"])

In [6]:
all_stops = [w for w in STOPWORDS]

In [7]:
# make the stop words the most common words...?
new_stops = ['hath', "'s",'an', 'let','behold', 'went','o','hast','thine','like','thing','things','quot','and', 'in', 'thou', 'thee', 'thy', 'unto', 'ye', 'said', 'saith', 'shall', 'shalt', 'yea', 'thereof']
all_stops += new_stops

In [8]:
# Helper functions
regex = re.compile('[%s]' % re.escape(punctuation))

def strip_punc(s):  # From Vinko's solution, with fix.
    return regex.sub('', s)

def clean_text(text):
    # basic nlp clean up
    lm = WordNetLemmatizer()
    st = SnowballStemmer("english")

    base = [strip_punc(t).lower() for t in word_tokenize(text)
            if t not in punctuation
            and t.lower() not in all_stops]
    
    lemmatized = [lm.lemmatize(w) for w in base]
#     stemmed = [st.stem(w) for w in lemmatized]
    no_chars = [w for w in lemmatized if len(w) > 1]
    
    return no_chars

## Vocab preparation

Constructing a holistic Doc2Vec model for all of the texts put together. Basically, this is just Word2Vec but tagging each word with which religion it belongs to (according to a probability).

In [9]:
def get_vocab(df):
    vocab = []
    for l in get_lines(df):
        tokens = tokenize(l)
        vocab += tokens
    return set(vocab)

In [10]:
def get_shared(sets):
    # Finds the intersection of all the input sets
    return reduce((lambda set1,set2: set1&set2), sets)

In [11]:
def get_sym_diffs(sets):
    # Finds the symm etric difference of a list of sets
    return reduce((lambda set1, set2: set1.symmetric_difference(set2)), sets)

In [12]:
def get_all_uniques(sets):
    # Gets the unique elements in each set
    # return a dictionary with labels of each
    intersection = get_shared(sets)
    sym_diffs = get_sym_diffs(sets)
    return sym_diffs - intersection

In [13]:
def get_unique_vocabs(vocabs_dict):
    all_uniques = get_all_uniques(vocabs_dict.values())
    unique_dict = {}
    
    for tag, vocab in vocabs_dict.items():
        unique_dict[tag] = []
        for w in vocab:
            if w in all_uniques:
                unique_dict[tag].append(w)
    
    return unique_dict

In [14]:
def build_dict(vocab, tag):
    d = {}
    for w in vocab:
        d[w] = tag
    return d

In [15]:
def merge(dicts): 
    super_dict = {}
    for d in dicts:
        for k, v in d.items():
            super_dict[k] = v
    return super_dict

In [16]:
def have_common(l1,l2):
    result = False
    for x in l1: 
        for y in l2: 
            if x == y:
                print("Got same:", x, y)
                result = True
                return result  

## Another Word2Vec attempt.

In [17]:
def normalize(l):
    lmin = min(l)
    lmax = max(l)
    return [(v-lmin)/(lmax-lmin) for v in l]

In [18]:
def sort_dict(d, rev):
    return sorted(d.items(), key=lambda kv: kv[1], reverse=rev)

In [19]:
def get_word2vec_docs(df):
    verses = list(df["VerseText"])
    docs = [clean_text(v) for v in verses]
    return docs

In [20]:
def normalize_origin(model, origin_word):
    
    # gets the original vector for the chosen origin word
    origin_vec = model.wv.get_vector(origin_word)
    
    # used to calculate the new origin
    zero_vec = np.zeros(origin_vec.shape)
    
    # vector to shift each point by everything by
    transformation_vec = zero_vec - origin_vec
    
    # dict to store all the new vectors
    transformed_vecs = {}
    
    for w in model.wv.vocab:
        # original vector for the word
        w_vec = model.wv.get_vector(w)
        
        # shifted by the transformation
        transformed_w_vec = w_vec + transformation_vec
        
        # store
        transformed_vecs[w] = transformed_w_vec
        
    return transformed_vecs

In [21]:
# get all the documents from each text.
ot_docs = get_word2vec_docs(ot_df)
nt_docs = get_word2vec_docs(nt_df)
q_docs = get_word2vec_docs(q_df)

In [42]:
# Global hyper parameters
hp = {
    "size": 150, # size of the one-hot-encoded word vectors
    "window": 5, # context size
    "min_count": 2,
    "workers": 4,
    "iter": 10
}

In [43]:
def get_model(docs):
    model = gensim.models.Word2Vec(
        docs,
        size=hp["size"],
        window=hp["window"],
        min_count=hp["min_count"],
        workers=hp["workers"])
    
    # initialize similarities
    model.train(docs, total_examples=len(docs), epochs=50)
    return model

## Aligning models to compare similar words across religions

Note that for the new Gensim versions, calls for .index2word, .vocab, .syn0 and .syn0norm should be replaced with .wv.index2word, .wv.vocab, .wv.syn0 and .wv.syn0norm respectively.

In [44]:
def align_models(models, words=None):
    """
    Intersect any number of gensim models.
    Generalized from original two-way intersection.
    
    Only the shared vocabulary between them is kept.
    If 'words' is set (as list or set), then the vocabulary is intersected with this list as well.
    Indices are re-organized from 0..N in order of descending frequency (=sum of counts from both m1 and m2).
    These indices correspond to the new syn0 and syn0norm objects in both gensim models:
        -- so that Row 0 of m1.syn0 will be for the same word as Row 0 of m2.syn0
        -- you can find the index of any word on the .index2word list: model.index2word.index(word) => 2
    The .vocab dictionary is also updated for each model, preserving the count but updating the index.
    """

    # Get the vocab for each model
    vocabs = [set(m.wv.vocab.keys()) for m in models]

    # Find the common vocabulary
    common_vocab = reduce((lambda vocab1,vocab2: vocab1&vocab2), vocabs)
    if words: common_vocab&=set(words)

    # If no alignment necessary because vocab is identical...
    
    # This was generalized from:
    # if not vocab_m1-common_vocab and not vocab_m2-common_vocab:
    #   return (m1,m2)
    if all(not vocab-common_vocab for vocab in vocabs):
        print("All identical!")
        return models
        
    # Otherwise sort by frequency (summed for both)
    common_vocab = list(common_vocab)
    common_vocab.sort(key=lambda w: sum([m.wv.vocab[w].count for m in models]),reverse=True)
    
    # Then for each model...
    for m in models:
        
        # Replace old vectors_norm array with new one (with common vocab)
        indices = [m.wv.vocab[w].index for w in common_vocab]
                
        old_arr = m.wv.vectors_norm
                
        new_arr = np.array([old_arr[index] for index in indices])
        m.wv.vectors_norm = m.wv.syn0 = new_arr

        # Replace old vocab dictionary with new one (with common vocab)
        # and old index2word with new one
        m.wv.index2word = common_vocab
        old_vocab = m.wv.vocab
        new_vocab = {}
        for new_index,word in enumerate(common_vocab):
            old_vocab_obj=old_vocab[word]
            new_vocab[word] = gensim.models.word2vec.Vocab(index=new_index, count=old_vocab_obj.count)
        m.wv.vocab = new_vocab

    return models

In [45]:
def measure_semantic_shift(model1,model2,word,k=25,verbose=False):
    """
    Basic implementation of William Hamilton (@williamleif) et al's measure of semantic change
    proposed in their paper "Cultural Shift or Linguistic Drift?" (https://arxiv.org/abs/1606.02821),
    which they call the "local neighborhood measure." They find this measure better suited to understand
    the semantic change of nouns owing to "cultural shift," or changes in meaning "local" to that word,
    rather than global changes in language ("linguistic drift") use that are better suited to a
    Procrustes-alignment method (also described in the same paper.)
    
    Arguments are:
    - `model1`, `model2`: Are gensim word2vec models.
    - `word` is a sting representation of a given word.
    - `k` is the size of the word's neighborhood (# of its closest words in its vector space).
    """
    # Import function for cosine distance
    from scipy.spatial.distance import cosine
    
    # Check that this word is present in both models
    if not word in model1.wv.vocab or not word in model2.wv.vocab:
        print("!! Word %s not present in both models." % word)
        return None
    
    # Get the two neighborhoods
    neighborhood1 = [w for w,c in model1.most_similar(word,topn=k)]
    neighborhood2 = [w for w,c in model2.most_similar(word,topn=k)]
    
    # Print?
    if verbose:
        print('>> Neighborhood of associations of the word "%s" in model1:' % word)
        print(', '.join(neighborhood1))
        print('>> Neighborhood of associations of the word "%s" in model2:' % word)
        print(', '.join(neighborhood2))
    
    # Get the 'meta' neighborhood (both combined)
    meta_neighborhood = list(set(neighborhood1)|set(neighborhood2))
    
    # Filter the meta neighborhood so that it contains only words present in both models
    meta_neighborhood = [w for w in meta_neighborhood if w in model1.wv.vocab and w in model2.wv.vocab]
    
    # For both models, get a similarity vector between the focus word and all of the words in the meta neighborhood
    vector1 = [model1.similarity(word,w) for w in meta_neighborhood]
    vector2 = [model2.similarity(word,w) for w in meta_neighborhood]
    
    # Compute the cosine distance *between* those similarity vectors
    dist=cosine(vector1,vector2)
    
    # Return this cosine distance -- a measure of the relative semantic shift for this word between these two models
    return dist
    
"""
Example usage:
model1 = [a gensim model I have for text published in the 1750s]
model2 = [a gensim model I have for text published in the 1850s]
# The word 'god' does not change much in meaning:
    In [61]: measure_semantic_shift_by_neighborhood(model1,model2,'god',k=10,verbose=True)
    
    >> Neighborhood of associations of the word "god" in model1:
    almighty, jehovah, creator, uncreated, omniscient, logos, righteousness, christ, redeemer, salvation
    >> Neighborhood of associations of the word "god" in model2:
    almighty, heaven, jehovah, creator, redeemer, christ, divine, righteousness, providence, saviour
    
    Out[61]: 0.011609088245951749
# The word 'matter' does, moving from meaning mainly the "matter" of the universe to "what is the matter":
    In [62]: measure_semantic_shift_by_neighborhood(model1,model2,'matter',k=10,verbose=True)
    
    >> Neighborhood of associations of the word "matter" in model1:
    cohesion, sediment, menstruum, purulent, conceivable, gelatinous, morbific, compression, cerebellum, divisible
    >> Neighborhood of associations of the word "matter" in model2:
    matters, question, subject, affair, substance, concernment, concerns, questions, controversy, discussion
    
    Out[62]: 0.0847526073498025
# The word 'station' changes even more, moving from meaning one's social rank or "station", to a train station:
    In [63]: measure_semantic_shift_by_neighborhood(model1,model2,'station',k=10,verbose=True)
    
    >> Neighborhood of associations of the word "station" in model1:
    stations, dation, sphere, employments, deg, vocation, personate, lowest, district, apprenticeship
    >> Neighborhood of associations of the word "station" in model2:
    stations, train, posts, position, situation, town, carriage, stationed, rank, cab
    
    Out[63]: 0.14173381265358098
"""

'\nExample usage:\nmodel1 = [a gensim model I have for text published in the 1750s]\nmodel2 = [a gensim model I have for text published in the 1850s]\n# The word \'god\' does not change much in meaning:\n    In [61]: measure_semantic_shift_by_neighborhood(model1,model2,\'god\',k=10,verbose=True)\n    \n    >> Neighborhood of associations of the word "god" in model1:\n    almighty, jehovah, creator, uncreated, omniscient, logos, righteousness, christ, redeemer, salvation\n    >> Neighborhood of associations of the word "god" in model2:\n    almighty, heaven, jehovah, creator, redeemer, christ, divine, righteousness, providence, saviour\n    \n    Out[61]: 0.011609088245951749\n# The word \'matter\' does, moving from meaning mainly the "matter" of the universe to "what is the matter":\n    In [62]: measure_semantic_shift_by_neighborhood(model1,model2,\'matter\',k=10,verbose=True)\n    \n    >> Neighborhood of associations of the word "matter" in model1:\n    cohesion, sediment, menstruum

# Create and train the models on each text

In [46]:
ot_model = get_model(ot_docs)
ot_model.init_sims()

nt_model = get_model(nt_docs)
nt_model.init_sims()

q_model = get_model(q_docs)
q_model.init_sims()

INFO : collecting all words and their counts
INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO : PROGRESS: at sentence #10000, processed 97904 words, keeping 5861 word types
INFO : PROGRESS: at sentence #20000, processed 182562 words, keeping 8873 word types
INFO : collected 9377 word types from a corpus of 214613 raw words and 23145 sentences
INFO : Loading a fresh vocabulary
INFO : min_count=2 retains 6304 unique words (67% of original 9377, drops 3073)
INFO : min_count=2 leaves 211540 word corpus (98% of original 214613, drops 3073)
INFO : deleting the raw counts dictionary of 9377 items
INFO : sample=0.001 downsamples 44 most-common words
INFO : downsampling leaves estimated 184285 word corpus (87.1% of prior 211540)
INFO : estimated required memory for 6304 words and 150 dimensions: 10716800 bytes
INFO : resetting layer weights
INFO : training model with 4 workers on 6304 vocabulary and 150 features, using sg=0 hs=0 sample=0.001 negative=5 window=5
INFO

INFO : worker thread finished; awaiting finish of 0 more threads
INFO : EPOCH - 14 : training on 214613 raw words (184306 effective words) took 0.1s, 1806085 effective words/s
INFO : worker thread finished; awaiting finish of 3 more threads
INFO : worker thread finished; awaiting finish of 2 more threads
INFO : worker thread finished; awaiting finish of 1 more threads
INFO : worker thread finished; awaiting finish of 0 more threads
INFO : EPOCH - 15 : training on 214613 raw words (184063 effective words) took 0.1s, 1993109 effective words/s
INFO : worker thread finished; awaiting finish of 3 more threads
INFO : worker thread finished; awaiting finish of 2 more threads
INFO : worker thread finished; awaiting finish of 1 more threads
INFO : worker thread finished; awaiting finish of 0 more threads
INFO : EPOCH - 16 : training on 214613 raw words (184245 effective words) took 0.1s, 1838086 effective words/s
INFO : worker thread finished; awaiting finish of 3 more threads
INFO : worker thr

INFO : EPOCH - 36 : training on 214613 raw words (184242 effective words) took 0.1s, 1659943 effective words/s
INFO : worker thread finished; awaiting finish of 3 more threads
INFO : worker thread finished; awaiting finish of 2 more threads
INFO : worker thread finished; awaiting finish of 1 more threads
INFO : worker thread finished; awaiting finish of 0 more threads
INFO : EPOCH - 37 : training on 214613 raw words (184155 effective words) took 0.1s, 1728176 effective words/s
INFO : worker thread finished; awaiting finish of 3 more threads
INFO : worker thread finished; awaiting finish of 2 more threads
INFO : worker thread finished; awaiting finish of 1 more threads
INFO : worker thread finished; awaiting finish of 0 more threads
INFO : EPOCH - 38 : training on 214613 raw words (184194 effective words) took 0.1s, 2045611 effective words/s
INFO : worker thread finished; awaiting finish of 3 more threads
INFO : worker thread finished; awaiting finish of 2 more threads
INFO : worker thr

INFO : training on a 302755 raw words (260337 effective words) took 0.2s, 1122825 effective words/s
INFO : training model with 4 workers on 3417 vocabulary and 150 features, using sg=0 hs=0 sample=0.001 negative=5 window=5
INFO : worker thread finished; awaiting finish of 3 more threads
INFO : worker thread finished; awaiting finish of 2 more threads
INFO : worker thread finished; awaiting finish of 1 more threads
INFO : worker thread finished; awaiting finish of 0 more threads
INFO : EPOCH - 1 : training on 60551 raw words (52125 effective words) took 0.0s, 1320550 effective words/s
INFO : worker thread finished; awaiting finish of 3 more threads
INFO : worker thread finished; awaiting finish of 2 more threads
INFO : worker thread finished; awaiting finish of 1 more threads
INFO : worker thread finished; awaiting finish of 0 more threads
INFO : EPOCH - 2 : training on 60551 raw words (52011 effective words) took 0.0s, 1454377 effective words/s
INFO : worker thread finished; awaiting f

INFO : worker thread finished; awaiting finish of 2 more threads
INFO : worker thread finished; awaiting finish of 1 more threads
INFO : worker thread finished; awaiting finish of 0 more threads
INFO : EPOCH - 22 : training on 60551 raw words (52022 effective words) took 0.0s, 1518385 effective words/s
INFO : worker thread finished; awaiting finish of 3 more threads
INFO : worker thread finished; awaiting finish of 2 more threads
INFO : worker thread finished; awaiting finish of 1 more threads
INFO : worker thread finished; awaiting finish of 0 more threads
INFO : EPOCH - 23 : training on 60551 raw words (52222 effective words) took 0.0s, 1453532 effective words/s
INFO : worker thread finished; awaiting finish of 3 more threads
INFO : worker thread finished; awaiting finish of 2 more threads
INFO : worker thread finished; awaiting finish of 1 more threads
INFO : worker thread finished; awaiting finish of 0 more threads
INFO : EPOCH - 24 : training on 60551 raw words (52026 effective wo

INFO : worker thread finished; awaiting finish of 0 more threads
INFO : EPOCH - 44 : training on 60551 raw words (52023 effective words) took 0.0s, 1491130 effective words/s
INFO : worker thread finished; awaiting finish of 3 more threads
INFO : worker thread finished; awaiting finish of 2 more threads
INFO : worker thread finished; awaiting finish of 1 more threads
INFO : worker thread finished; awaiting finish of 0 more threads
INFO : EPOCH - 45 : training on 60551 raw words (52036 effective words) took 0.0s, 1538246 effective words/s
INFO : worker thread finished; awaiting finish of 3 more threads
INFO : worker thread finished; awaiting finish of 2 more threads
INFO : worker thread finished; awaiting finish of 1 more threads
INFO : worker thread finished; awaiting finish of 0 more threads
INFO : EPOCH - 46 : training on 60551 raw words (52183 effective words) took 0.0s, 1482262 effective words/s
INFO : worker thread finished; awaiting finish of 3 more threads
INFO : worker thread fi

INFO : worker thread finished; awaiting finish of 3 more threads
INFO : worker thread finished; awaiting finish of 2 more threads
INFO : worker thread finished; awaiting finish of 1 more threads
INFO : worker thread finished; awaiting finish of 0 more threads
INFO : EPOCH - 8 : training on 56975 raw words (48077 effective words) took 0.0s, 1500407 effective words/s
INFO : worker thread finished; awaiting finish of 3 more threads
INFO : worker thread finished; awaiting finish of 2 more threads
INFO : worker thread finished; awaiting finish of 1 more threads
INFO : worker thread finished; awaiting finish of 0 more threads
INFO : EPOCH - 9 : training on 56975 raw words (48080 effective words) took 0.0s, 1459367 effective words/s
INFO : worker thread finished; awaiting finish of 3 more threads
INFO : worker thread finished; awaiting finish of 2 more threads
INFO : worker thread finished; awaiting finish of 1 more threads
INFO : worker thread finished; awaiting finish of 0 more threads
INFO

INFO : worker thread finished; awaiting finish of 1 more threads
INFO : worker thread finished; awaiting finish of 0 more threads
INFO : EPOCH - 30 : training on 56975 raw words (48116 effective words) took 0.0s, 1585680 effective words/s
INFO : worker thread finished; awaiting finish of 3 more threads
INFO : worker thread finished; awaiting finish of 2 more threads
INFO : worker thread finished; awaiting finish of 1 more threads
INFO : worker thread finished; awaiting finish of 0 more threads
INFO : EPOCH - 31 : training on 56975 raw words (48003 effective words) took 0.0s, 1530616 effective words/s
INFO : worker thread finished; awaiting finish of 3 more threads
INFO : worker thread finished; awaiting finish of 2 more threads
INFO : worker thread finished; awaiting finish of 1 more threads
INFO : worker thread finished; awaiting finish of 0 more threads
INFO : EPOCH - 32 : training on 56975 raw words (47972 effective words) took 0.0s, 1636036 effective words/s
INFO : worker thread fi

In [47]:
# INTERSECT THE MODELS ON THE COMMON VOCABULARY AND ALIGN.
intersected_model = align_models([ot_model,nt_model,q_model])



In [49]:
# REASSIGN NEW INTERSECTED MODELS
ot_model = intersected_model[0]
nt_model = intersected_model[1]
q_model = intersected_model[2]

In [53]:
ot_model.most_similar(positive=['believe'])

  """Entry point for launching an IPython kernel.


[('revealed', 0.5445148944854736),
 ('declared', 0.5360902547836304),
 ('understood', 0.5340793132781982),
 ('understand', 0.497624009847641),
 ('wonder', 0.4894428849220276),
 ('talk', 0.46386152505874634),
 ('deceive', 0.4562837481498718),
 ('dream', 0.45255041122436523),
 ('declare', 0.44632279872894287),
 ('sorcerer', 0.4412204921245575)]

In [55]:
nt_model.most_similar(positive=['believe'])

  """Entry point for launching an IPython kernel.


[('plainly', 0.600974440574646),
 ('doubt', 0.5910286903381348),
 ('behaved', 0.5832509398460388),
 ('glorified', 0.4768655300140381),
 ('vengeance', 0.47010985016822815),
 ('seek', 0.4594820439815521),
 ('testify', 0.45367687940597534),
 ('salvation', 0.4528679847717285),
 ('marvel', 0.4439432621002197),
 ('try', 0.4341239929199219)]

In [56]:
q_model.most_similar(positive=['believe'])

  """Entry point for launching an IPython kernel.


[('prevail', 0.5654722452163696),
 ('faith', 0.5633742213249207),
 ('believeth', 0.5428281426429749),
 ('altogether', 0.5104032754898071),
 ('revealed', 0.5058085322380066),
 ('reckoned', 0.5042601823806763),
 ('sell', 0.4980141818523407),
 ('hearken', 0.4923272728919983),
 ('obey', 0.4836070239543915),
 ('righteousness', 0.47314804792404175)]

In [57]:
# comparing old testament to new testament, then new testament to quran because of the chronology of the religions
word = 'light'
ot_to_nt = measure_semantic_shift(ot_model,nt_model,word,k=25,verbose=True)
nt_to_q = measure_semantic_shift(nt_model,q_model,word,k=25,verbose=True)

>> Neighborhood of associations of the word "light" in model1:
darkness, shining, lamp, path, dark, brightness, night, cloud, star, walking, step, prayer, shine, countenance, clear, moon, withdraw, lightning, sun, blind, walk, spot, shadow, understand, lighten
>> Neighborhood of associations of the word "light" in model2:
darkness, shine, lighten, shadow, shining, sun, dark, moon, brightness, chamber, pain, heat, hidden, lack, walk, folly, burning, aware, suddenly, blindness, glorious, giveth, burned, lower, fourth
>> Neighborhood of associations of the word "light" in model1:
darkness, shine, lighten, shadow, shining, sun, dark, moon, brightness, chamber, pain, heat, hidden, lack, walk, folly, burning, aware, suddenly, blindness, glorious, giveth, burned, lower, fourth
>> Neighborhood of associations of the word "light" in model2:
darkness, depth, lamp, walk, glass, glorious, star, moon, leg, esteem, run, lead, leaveth, guide, sun, astray, equal, blind, leaf, shining, pleasing, book, 



In [58]:
ot_to_nt, nt_to_q

(0.270130335481935, 0.2840591278427892)

# Dimensionality reduction

In [59]:
pca = PCA(n_components=2)

ot_X = ot_model[ot_model.wv.vocab]
ot_reduced = pca.fit_transform(ot_X)

nt_X = nt_model[nt_model.wv.vocab]
nt_reduced = pca.fit_transform(nt_X)

q_X = q_model[q_model.wv.vocab]
q_reduced = pca.fit_transform(q_X)

  This is separate from the ipykernel package so we can avoid doing imports until
  
  if __name__ == '__main__':


In [60]:
wv_dict = {}
wv_dict["ot"] = {}
wv_dict["nt"] = {}
wv_dict["q"] = {}

for i, word in enumerate(list(ot_model.wv.vocab)):
    wv_dict["ot"][word] = [float(ot_reduced[i][0]),float(ot_reduced[i][1])]
    
for i, word in enumerate(list(nt_model.wv.vocab)):
    wv_dict["nt"][word] = [float(nt_reduced[i][0]),float(nt_reduced[i][1])]
    
for i, word in enumerate(list(q_model.wv.vocab)):
    wv_dict["q"][word] = [float(q_reduced[i][0]),float(q_reduced[i][1])]

## Polygon construction from Vectors

In [61]:
from shapely import geometry as g
from dxfwrite import DXFEngine as dxf
from dxfwrite.const import CENTER

In [62]:
topics = ['faith','sight', 'birth', 'prophet', 'prophecy', 'trust', 'remember', 'experience', 'dream', 'god', 'heaven', 'hell', 'love', 'hate', 'free', 'vision', 'believe',  'light', 'forget', 'darkness', 'peace', 'war', 'life', 'death', 'man', 'woman', 'child', 'eat']

In [63]:
topic_vecs = {}

for t in topics:
    topic_vecs[t] = {}

    # go through each religion
    for rel in wv_dict:
        
        model = None
        if rel == 'ot':
            model = ot_model
        elif rel == 'nt':
            model = nt_model
        elif rel == 'q':
            model = q_model
            
        most_sim = model.most_similar(positive=[t], topn=5)
        topic_vecs[t][rel] = {
            "vec": wv_dict[rel][t],
            "sim": {}
        }
        
        for w in most_sim:
            wv = wv_dict[rel][w[0]]
            topic_vecs[t][rel]["sim"][w[0]] = wv_dict[rel][w[0]]



In [64]:
topic_vecs['dream']

{'ot': {'vec': [-0.19729824364185333, 0.0063003734685480595],
  'sim': {'interpretation': [-0.30307742953300476, -0.04603069648146629],
   'doubt': [-0.0785498097538948, 0.13480640947818756],
   'vision': [-0.14620348811149597, -0.08507151901721954],
   'sorcerer': [-0.31668439507484436, 0.08682235330343246],
   'believe': [-0.43068820238113403, -0.10825461894273758]}},
 'nt': {'vec': [0.2861081063747406, -0.48295286297798157],
  'sim': {'joseph': [0.11754206568002701, -0.5034158229827881],
   'secretly': [0.12591545283794403, -0.4297703802585602],
   'daughter': [0.09747837483882904, -0.34379300475120544],
   'young': [0.2273416817188263, -0.15055856108665466],
   'appeared': [0.26974475383758545, -0.4258229434490204]}},
 'q': {'vec': [-0.35638347268104553, -0.3528476655483246],
  'sim': {'withered': [-0.44579216837882996, -0.26631423830986023],
   'seven': [-0.5516380667686462, -0.0783514603972435],
   'diligently': [-0.22533950209617615, -0.32870155572891235],
   'figure': [-0.29273

In [65]:
# Normalize the vectors around each chosen word so every chosen word becomes (0,0)
for t in topic_vecs:
    rel_data = topic_vecs[t]
    for d in rel_data:
        vecs = rel_data[d]
        
        # this is what you'll normalize around
        topic_vec = vecs['vec']
        zero_vec = np.zeros(len(topic_vec))
        
        transform_vec = zero_vec - topic_vec
        
        closest = vecs['sim'] 
        for word in closest:
            
            # grab the original vector
            og_sim_vec = closest[word]
            
            # transform the vector so it's centered around the current topic
            transformed_sim_vec = og_sim_vec + transform_vec
            
            print(transformed_sim_vec)
            
            transformed_sim_vec *= 5
            
            print(transformed_sim_vec)
            print("\n")
            
            # reassign the transformed vector
            closest[word] = list(transformed_sim_vec)
        
        # finally, reassign the original vector...
        vecs['vec'] = list(zero_vec)

[ 0.04155232 -0.08129367]
[ 0.2077616  -0.40646835]


[0.02935514 0.34414011]
[0.14677569 1.72070055]


[0.12654585 0.01668817]
[0.63272923 0.08344087]


[-0.19904684  0.11736503]
[-0.99523418  0.58682515]


[-0.07173096  0.30160221]
[-0.35865478  1.50801106]


[-0.11522597  0.07545491]
[-0.57612985  0.37727457]


[-0.07025802  0.11172661]
[-0.35129011  0.55863304]


[-0.10364413  0.21499231]
[-0.51822066  1.07496154]


[-0.20689237  0.05624179]
[-1.03446186  0.28120894]


[0.14560908 0.1473939 ]
[0.7280454 0.7369695]


[0.15197644 0.01621872]
[0.75988218 0.08109361]


[ 0.02575868 -0.19168114]
[ 0.12879342 -0.95840571]


[ 0.00542241 -0.52220769]
[ 0.02711207 -2.61103846]


[-0.14672446 -0.2110628 ]
[-0.73362231 -1.05531402]


[-0.20358497 -0.09813835]
[-1.01792485 -0.49069177]


[0.01288436 0.14432516]
[0.06442182 0.72162581]


[-0.22280198  0.10735411]
[-1.11400988  0.53677056]


[-0.1662228   0.31254993]
[-0.83111402  1.56274967]


[-0.05894364  0.28806271]
[-0.29471818  1.44031357

In [66]:
topic_vecs['dream']

{'ot': {'vec': [0.0, 0.0],
  'sim': {'interpretation': [-0.5288959294557571, -0.26165534975007176],
   'doubt': [0.5937421694397926, 0.6425301800481975],
   'vision': [0.2554737776517868, -0.456859462428838],
   'sorcerer': [-0.5969307571649551, 0.402609899174422],
   'believe': [-1.1669497936964035, -0.5727749620564282]}},
 'nt': {'vec': [0.0, 0.0],
  'sim': {'joseph': [-0.842830203473568, -0.10231480002403259],
   'secretly': [-0.8009632676839828, 0.26591241359710693],
   'daughter': [-0.9431486576795578, 0.6957992911338806],
   'young': [-0.29383212327957153, 1.6619715094566345],
   'appeared': [-0.08181676268577576, 0.2856495976448059]}},
 'q': {'vec': [0.0, 0.0],
  'sim': {'withered': [-0.4470434784889221, 0.4326671361923218],
   'seven': [-0.9762729704380035, 1.3724810257554054],
   'diligently': [0.6552198529243469, 0.12073054909706116],
   'figure': [0.3182573616504669, 0.9817350655794144],
   'clay': [-0.4415428638458252, 1.3858495652675629]}}}

In [40]:
"""
word {
    "ot": [],
    "nt"
}
"""
polygon_data = {}

for t in topic_vecs:
    
    rel_data = topic_vecs[t]
    polygon_data[t] = {}
    
    for d in rel_data:
        vecs = rel_data[d]
        
        ref_vec = vecs['vec']
        sim_vecs = list(vecs['sim'].items())
                
        polygon_data[t][d] = {
            "poly": g.Polygon([(0,0)] + [(v[1][0], v[1][1]) for v in sim_vecs]),
            "words": [t] + [v[0] for v in sim_vecs]
        }

In [67]:
print ("Creating polygon / DXF data")

for topic in polygon_data:
    
    print(topic)
    all_meta = polygon_data[topic]
    
    for rel in all_meta:
        
        # grab the specific religions meta data
        rel_meta = all_meta[rel]
                
        # all the words
        rel_words = rel_meta['words']
        
        # grab the convex hull and regular polygon data 
        rel_convex = list(rel_meta['poly'].convex_hull.exterior.coords)
        rel_poly = list(rel_meta['poly'].exterior.coords)
        
        #####################################
        # DXF 
        #####################################
        
        # CONFIGURE THE FILE PATHS FOR SAVING
        folder_path = '../data/analyzed/dxf/religion-specific/{}'.format(topic)

        if not os.path.exists(folder_path):
            os.mkdir(folder_path)
            
        # ADD THE OUTSIDE POLYGON
        drawing = dxf.drawing('../data/analyzed/dxf/religion-specific/{}/{}-{}.dxf'.format(topic,topic,rel))
        outline = dxf.polyline(linetype='CONTINUOUS', layer = 'OUTLINE')
        outline.add_vertices(rel_convex)
        
        text_layer = dxf.layer('TEXT')
        drawing.layers.add(text_layer)

        # ADD THE TEXT AND CONNECTING LINES
        for p in zip(rel_words, rel_poly):
            t = dxf.text(p[0], p[1], height=0.11, rotation=0, layer = 'TEXT')
            line = dxf.line(p[1], (0.0, 0.0))
            line['linetype'] = "DASHED2"
            
            drawing.add(line)
            drawing.add(t)
               
        # ADD THE ORIGIN POINT
#         origin_word = dxf.text(topic, (0.0,0.0), height=0.025, rotation=0, layer = 'TEXT')
#         drawing.add(origin_word)
        origin = dxf.circle(0.025, (0.0, 0.0))
        drawing.add(origin)
        
        # Close the lines
        outline.close()
        drawing.add(outline)
        drawing.save()

Creating polygon / DXF data
faith
sight
birth
prophet
prophecy
trust
remember
experience
dream
god
heaven
hell
love
hate
free
vision
believe
light
forget
darkness
peace
war
life
death
man
woman
child
eat


In [68]:
print ("Creating polygon / DXF data")
from dxfwrite import DXFList
for topic in polygon_data:
    
    print(topic)
    all_meta = polygon_data[topic]
    
    # CONFIGURE THE FILE PATHS FOR SAVING
    folder_path = '../data/analyzed/dxf/religion-specific-joined-blocks/'

    if not os.path.exists(folder_path):
        os.mkdir(folder_path)
        
    drawing = dxf.drawing('../data/analyzed/dxf/religion-specific-joined-blocks/{}.dxf'.format(topic))

    for rel in all_meta:
        
        # MAIN GROUPING
        # ADD ENTITIES TO THE BLOCK
        print("{}-{}".format(rel,topic))
        block = dxf.block(name="{}-{}".format(rel,topic))
        entities = DXFList()
#         entities.name = "{}-{}".format(rel,topic)
        
#         rel_layer = dxf.layer(rel)
#         tl = dxf.layer('text')
#         drawing.layers.add(rel_layer)
#         rel_layer.add(tl)
     
        
        # grab the specific religions meta data
        rel_meta = all_meta[rel]
                
        # all the words
        rel_words = rel_meta['words']
        
        # grab the convex hull and regular polygon data 
        rel_convex = list(rel_meta['poly'].convex_hull.exterior.coords)
        rel_poly = list(rel_meta['poly'].exterior.coords)
        
        #####################################
        # DXF 
        #####################################
            
        # ADD THE OUTSIDE POLYGON
        outline = dxf.polyline(linetype='CONTINUOUS', layer = 'OUTLINE_{}'.format(rel))
        outline.add_vertices(rel_convex)
        
        text_layer = dxf.layer('TEXT_{}'.format(rel))
        drawing.layers.add(text_layer)

        # ADD THE TEXT AND CONNECTING LINES
        for p in zip(rel_words, rel_poly):
            t = dxf.text(p[0], p[1], height=0.11, rotation=0, layer = "TEXT_{}".format(rel))
            
            line = dxf.line(p[1], (0.0, 0.0))
            line['linetype'] = "DASHED2"
            line['layer'] = "LINES_{}".format(rel)
            
            entities.append(line)
            entities.append(t)
               
        # ADD THE ORIGIN POINT
#         origin_word = dxf.text(topic, (0.0,0.0), height=0.025, rotation=0, layer = 'TEXT')
#         drawing.add(origin_word)
        origin = dxf.circle(0.025, (0.0, 0.0), layer = "ORIGIN_{}".format(rel))
        entities.append(origin)
        
        # Close the lines
        outline.close()
        drawing.add(outline)
        
        block.add(entities)
        drawing.blocks.add(block)
        
        # create a block-reference
        blockref = dxf.insert(blockname="{}-{}".format(rel,topic), insert=(0, 0))
        # add block-reference to drawing
        drawing.add(blockref)
        
#         print(block.name)

    
    print(drawing.blocks)
    
    drawing.save()

Creating polygon / DXF data
faith
ot-faith
nt-faith
q-faith
<dxfwrite.sections.Blocks object at 0x1a2c8be748>
sight
ot-sight
nt-sight
q-sight
<dxfwrite.sections.Blocks object at 0x1a2e711a20>
birth
ot-birth
nt-birth
q-birth
<dxfwrite.sections.Blocks object at 0x1a2c61de48>
prophet
ot-prophet
nt-prophet
q-prophet
<dxfwrite.sections.Blocks object at 0x1a2c66b630>
prophecy
ot-prophecy
nt-prophecy
q-prophecy
<dxfwrite.sections.Blocks object at 0x1a2e6e7470>
trust
ot-trust
nt-trust
q-trust
<dxfwrite.sections.Blocks object at 0x1a2e6e7fd0>
remember
ot-remember
nt-remember
q-remember
<dxfwrite.sections.Blocks object at 0x1a2c8bed30>
experience
ot-experience
nt-experience
q-experience
<dxfwrite.sections.Blocks object at 0x1a2c674be0>
dream
ot-dream
nt-dream
q-dream
<dxfwrite.sections.Blocks object at 0x1a2c66deb8>
god
ot-god
nt-god
q-god
<dxfwrite.sections.Blocks object at 0x1a2c8be470>
heaven
ot-heaven
nt-heaven
q-heaven
<dxfwrite.sections.Blocks object at 0x1a2c6016a0>
hell
ot-hell
nt-hell
q