In [275]:
import os
import logging
import json
import re
import pandas as pd
import numpy as np
from collections import Counter
from functools import reduce
from itertools import chain, combinations

from nltk import word_tokenize, pos_tag
from nltk.corpus import stopwords 
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem.snowball import SnowballStemmer
from string import punctuation

import gensim
from gensim.parsing.preprocessing import STOPWORDS
from gensim.corpora import Dictionary
from gensim.utils import smart_open, simple_preprocess
from gensim.models.word2vec import Word2Vec
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
LDA = gensim.models.ldamodel.LdaModel


from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn import preprocessing
from matplotlib import pyplot as plt

import logging
logging.basicConfig(format='%(levelname)s : %(message)s', level=logging.INFO)
logging.root.level = logging.INFO  # ipython sometimes messes up the logging setup; restore

## Data 
Getting the data from the texts.

In [276]:
ot_df = pd.read_csv("../data/old-testament-verses.csv")
nt_df = pd.read_csv("../data/new-testament-verses.csv")

#rename from AyahText to VerseText
q_df = pd.read_csv("../data/quran-verses.csv")
q_df.columns = ['DatabaseID', 'SuraID', 'VerseID', 'VerseText']

In [277]:
def get_lines(df):
    return list(df["VerseText"])

In [278]:
all_stops = [w for w in STOPWORDS]

In [279]:
# make the stop words the most common words...?
new_stops = ['hath', "'s",'an', 'let','behold', 'went','o','hast','thine','like','thing','things','quot','and', 'in', 'thou', 'thee', 'thy', 'unto', 'ye', 'said', 'saith', 'shall', 'shalt', 'yea', 'thereof']
all_stops += new_stops

In [329]:
# Helper functions
regex = re.compile('[%s]' % re.escape(punctuation))

def strip_punc(s):  # From Vinko's solution, with fix.
    return regex.sub('', s)

def clean_text(text):
    # basic nlp clean up
    lm = WordNetLemmatizer()
    st = SnowballStemmer("english")

    base = [strip_punc(t).lower() for t in word_tokenize(text)
            if t not in punctuation
            and t.lower() not in all_stops]
    
    lemmatized = [lm.lemmatize(w) for w in base]
#     stemmed = [st.stem(w) for w in lemmatized]
    no_chars = [w for w in lemmatized if len(w) > 1]
    
    return no_chars

## Vocab preparation

Constructing a holistic Doc2Vec model for all of the texts put together. Basically, this is just Word2Vec but tagging each word with which religion it belongs to (according to a probability).

In [330]:
def get_vocab(df):
    vocab = []
    for l in get_lines(df):
        tokens = tokenize(l)
        vocab += tokens
    return set(vocab)

In [331]:
def get_shared(sets):
    # Finds the intersection of all the input sets
    return reduce((lambda set1,set2: set1&set2), sets)

In [332]:
def get_sym_diffs(sets):
    # Finds the symm etric difference of a list of sets
    return reduce((lambda set1, set2: set1.symmetric_difference(set2)), sets)

In [333]:
def get_all_uniques(sets):
    # Gets the unique elements in each set
    # return a dictionary with labels of each
    intersection = get_shared(sets)
    sym_diffs = get_sym_diffs(sets)
    return sym_diffs - intersection

In [334]:
def get_unique_vocabs(vocabs_dict):
    
    all_uniques = get_all_uniques(vocabs_dict.values())
    unique_dict = {}
    
    for tag, vocab in vocabs_dict.items():
        unique_dict[tag] = []
        for w in vocab:
            if w in all_uniques:
                unique_dict[tag].append(w)
    
    return unique_dict

In [335]:
def build_dict(vocab, tag):
    d = {}
    for w in vocab:
        d[w] = tag
    return d

In [336]:
def merge(dicts): 
    super_dict = {}
    for d in dicts:
        for k, v in d.items():
            super_dict[k] = v
    return super_dict

In [337]:
def have_common(l1,l2):
    result = False
    for x in l1: 
        for y in l2: 
            if x == y:
                print("Got same:", x, y)
                result = True
                return result  

## Another pathetic Word2Vec attempt.
Fuck me up

In [338]:
def normalize(l):
    lmin = min(l)
    lmax = max(l)
    return [(v-lmin)/(lmax-lmin) for v in l]

In [339]:
def sort_dict(d, rev):
    return sorted(d.items(), key=lambda kv: kv[1], reverse=rev)

In [340]:
def get_word2vec_docs(df):
    verses = list(df["VerseText"])
    docs = [clean_text(v) for v in verses]
    return docs

In [341]:
def normalize_origin(model, origin_word):
    
    # gets the original vector for the chosen origin word
    origin_vec = model.wv.get_vector(origin_word)
    
    # used to calculate the new origin
    zero_vec = np.zeros(origin_vec.shape)
    
    # vector to shift each point by everything by
    transformation_vec = zero_vec - origin_vec
    
    # dict to store all the new vectors
    transformed_vecs = {}
    
    for w in model.wv.vocab:
        # original vector for the word
        w_vec = model.wv.get_vector(w)
        
        # shifted by the transformation
        transformed_w_vec = w_vec + transformation_vec
        
        # store
        transformed_vecs[w] = transformed_w_vec
        
    return transformed_vecs

In [342]:
# get all the documents from each text.
ot_docs = get_word2vec_docs(ot_df)
nt_docs = get_word2vec_docs(nt_df)
q_docs = get_word2vec_docs(q_df)

In [343]:
# Global hyper parameters
hp = {
    "size": 150, # size of the one-hot-encoded word vectors
    "window": 20, # context size
    "min_count": 2,
    "workers": 4,
    "iter": 10
}

In [344]:
def get_model(docs):
    model = gensim.models.Word2Vec(
        docs,
        size=hp["size"],
        window=hp["window"],
        min_count=hp["min_count"],
        workers=hp["workers"])

    model.train(docs, total_examples=len(docs), epochs=50)
    return model

# Create and train the models on each text

In [345]:
ot_model = get_model(ot_docs)

INFO : collecting all words and their counts
INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO : PROGRESS: at sentence #10000, processed 97904 words, keeping 5861 word types
INFO : PROGRESS: at sentence #20000, processed 182562 words, keeping 8873 word types
INFO : collected 9377 word types from a corpus of 214613 raw words and 23145 sentences
INFO : Loading a fresh vocabulary
INFO : min_count=2 retains 6304 unique words (67% of original 9377, drops 3073)
INFO : min_count=2 leaves 211540 word corpus (98% of original 214613, drops 3073)
INFO : deleting the raw counts dictionary of 9377 items
INFO : sample=0.001 downsamples 44 most-common words
INFO : downsampling leaves estimated 184285 word corpus (87.1% of prior 211540)
INFO : estimated required memory for 6304 words and 150 dimensions: 10716800 bytes
INFO : resetting layer weights
INFO : training model with 4 workers on 6304 vocabulary and 150 features, using sg=0 hs=0 sample=0.001 negative=5 window=20
INF

INFO : worker thread finished; awaiting finish of 0 more threads
INFO : EPOCH - 14 : training on 214613 raw words (184236 effective words) took 0.2s, 876683 effective words/s
INFO : worker thread finished; awaiting finish of 3 more threads
INFO : worker thread finished; awaiting finish of 2 more threads
INFO : worker thread finished; awaiting finish of 1 more threads
INFO : worker thread finished; awaiting finish of 0 more threads
INFO : EPOCH - 15 : training on 214613 raw words (184123 effective words) took 0.2s, 920216 effective words/s
INFO : worker thread finished; awaiting finish of 3 more threads
INFO : worker thread finished; awaiting finish of 2 more threads
INFO : worker thread finished; awaiting finish of 1 more threads
INFO : worker thread finished; awaiting finish of 0 more threads
INFO : EPOCH - 16 : training on 214613 raw words (184254 effective words) took 0.2s, 939389 effective words/s
INFO : worker thread finished; awaiting finish of 3 more threads
INFO : worker thread

INFO : EPOCH - 36 : training on 214613 raw words (184209 effective words) took 0.2s, 888826 effective words/s
INFO : worker thread finished; awaiting finish of 3 more threads
INFO : worker thread finished; awaiting finish of 2 more threads
INFO : worker thread finished; awaiting finish of 1 more threads
INFO : worker thread finished; awaiting finish of 0 more threads
INFO : EPOCH - 37 : training on 214613 raw words (184264 effective words) took 0.2s, 951444 effective words/s
INFO : worker thread finished; awaiting finish of 3 more threads
INFO : worker thread finished; awaiting finish of 2 more threads
INFO : worker thread finished; awaiting finish of 1 more threads
INFO : worker thread finished; awaiting finish of 0 more threads
INFO : EPOCH - 38 : training on 214613 raw words (184217 effective words) took 0.2s, 932075 effective words/s
INFO : worker thread finished; awaiting finish of 3 more threads
INFO : worker thread finished; awaiting finish of 2 more threads
INFO : worker thread

In [346]:
nt_model = get_model(nt_docs)

INFO : collecting all words and their counts
INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO : collected 5152 word types from a corpus of 60551 raw words and 7957 sentences
INFO : Loading a fresh vocabulary
INFO : min_count=2 retains 3417 unique words (66% of original 5152, drops 1735)
INFO : min_count=2 leaves 58816 word corpus (97% of original 60551, drops 1735)
INFO : deleting the raw counts dictionary of 5152 items
INFO : sample=0.001 downsamples 59 most-common words
INFO : downsampling leaves estimated 52063 word corpus (88.5% of prior 58816)
INFO : estimated required memory for 3417 words and 150 dimensions: 5808900 bytes
INFO : resetting layer weights
INFO : training model with 4 workers on 3417 vocabulary and 150 features, using sg=0 hs=0 sample=0.001 negative=5 window=20
INFO : worker thread finished; awaiting finish of 3 more threads
INFO : worker thread finished; awaiting finish of 2 more threads
INFO : worker thread finished; awaiting finish of

INFO : worker thread finished; awaiting finish of 3 more threads
INFO : worker thread finished; awaiting finish of 2 more threads
INFO : worker thread finished; awaiting finish of 1 more threads
INFO : worker thread finished; awaiting finish of 0 more threads
INFO : EPOCH - 15 : training on 60551 raw words (52053 effective words) took 0.1s, 622021 effective words/s
INFO : worker thread finished; awaiting finish of 3 more threads
INFO : worker thread finished; awaiting finish of 2 more threads
INFO : worker thread finished; awaiting finish of 1 more threads
INFO : worker thread finished; awaiting finish of 0 more threads
INFO : EPOCH - 16 : training on 60551 raw words (51905 effective words) took 0.1s, 693458 effective words/s
INFO : worker thread finished; awaiting finish of 3 more threads
INFO : worker thread finished; awaiting finish of 2 more threads
INFO : worker thread finished; awaiting finish of 1 more threads
INFO : worker thread finished; awaiting finish of 0 more threads
INFO

INFO : worker thread finished; awaiting finish of 1 more threads
INFO : worker thread finished; awaiting finish of 0 more threads
INFO : EPOCH - 37 : training on 60551 raw words (52013 effective words) took 0.1s, 714138 effective words/s
INFO : worker thread finished; awaiting finish of 3 more threads
INFO : worker thread finished; awaiting finish of 2 more threads
INFO : worker thread finished; awaiting finish of 1 more threads
INFO : worker thread finished; awaiting finish of 0 more threads
INFO : EPOCH - 38 : training on 60551 raw words (52089 effective words) took 0.1s, 829113 effective words/s
INFO : worker thread finished; awaiting finish of 3 more threads
INFO : worker thread finished; awaiting finish of 2 more threads
INFO : worker thread finished; awaiting finish of 1 more threads
INFO : worker thread finished; awaiting finish of 0 more threads
INFO : EPOCH - 39 : training on 60551 raw words (52099 effective words) took 0.1s, 1025131 effective words/s
INFO : worker thread fini

In [347]:
q_model = get_model(q_docs)

INFO : collecting all words and their counts
INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO : collected 5703 word types from a corpus of 56975 raw words and 6236 sentences
INFO : Loading a fresh vocabulary
INFO : min_count=2 retains 3466 unique words (60% of original 5703, drops 2237)
INFO : min_count=2 leaves 54738 word corpus (96% of original 56975, drops 2237)
INFO : deleting the raw counts dictionary of 5703 items
INFO : sample=0.001 downsamples 45 most-common words
INFO : downsampling leaves estimated 48112 word corpus (87.9% of prior 54738)
INFO : estimated required memory for 3466 words and 150 dimensions: 5892200 bytes
INFO : resetting layer weights
INFO : training model with 4 workers on 3466 vocabulary and 150 features, using sg=0 hs=0 sample=0.001 negative=5 window=20
INFO : worker thread finished; awaiting finish of 3 more threads
INFO : worker thread finished; awaiting finish of 2 more threads
INFO : worker thread finished; awaiting finish of

INFO : worker thread finished; awaiting finish of 3 more threads
INFO : worker thread finished; awaiting finish of 2 more threads
INFO : worker thread finished; awaiting finish of 1 more threads
INFO : worker thread finished; awaiting finish of 0 more threads
INFO : EPOCH - 15 : training on 56975 raw words (48245 effective words) took 0.1s, 786063 effective words/s
INFO : worker thread finished; awaiting finish of 3 more threads
INFO : worker thread finished; awaiting finish of 2 more threads
INFO : worker thread finished; awaiting finish of 1 more threads
INFO : worker thread finished; awaiting finish of 0 more threads
INFO : EPOCH - 16 : training on 56975 raw words (48116 effective words) took 0.1s, 774424 effective words/s
INFO : worker thread finished; awaiting finish of 3 more threads
INFO : worker thread finished; awaiting finish of 2 more threads
INFO : worker thread finished; awaiting finish of 1 more threads
INFO : worker thread finished; awaiting finish of 0 more threads
INFO

INFO : worker thread finished; awaiting finish of 1 more threads
INFO : worker thread finished; awaiting finish of 0 more threads
INFO : EPOCH - 37 : training on 56975 raw words (48217 effective words) took 0.1s, 766064 effective words/s
INFO : worker thread finished; awaiting finish of 3 more threads
INFO : worker thread finished; awaiting finish of 2 more threads
INFO : worker thread finished; awaiting finish of 1 more threads
INFO : worker thread finished; awaiting finish of 0 more threads
INFO : EPOCH - 38 : training on 56975 raw words (48220 effective words) took 0.1s, 814816 effective words/s
INFO : worker thread finished; awaiting finish of 3 more threads
INFO : worker thread finished; awaiting finish of 2 more threads
INFO : worker thread finished; awaiting finish of 1 more threads
INFO : worker thread finished; awaiting finish of 0 more threads
INFO : EPOCH - 39 : training on 56975 raw words (48203 effective words) took 0.1s, 741423 effective words/s
INFO : worker thread finis

# Dimensionality reduction

In [348]:
pca = PCA(n_components=2)

ot_X = ot_model[ot_model.wv.vocab]
ot_reduced = pca.fit_transform(ot_X)

nt_X = nt_model[nt_model.wv.vocab]
nt_reduced = pca.fit_transform(nt_X)

q_X = q_model[q_model.wv.vocab]
q_reduced = pca.fit_transform(q_X)

  This is separate from the ipykernel package so we can avoid doing imports until
  
  if __name__ == '__main__':


In [366]:
wv_dict = {}
wv_dict["ot"] = {}
wv_dict["nt"] = {}
wv_dict["q"] = {}

for i, word in enumerate(list(ot_model.wv.vocab)):
    wv_dict["ot"][word] = [float(ot_reduced[i][0]),float(ot_reduced[i][1])]
    
for i, word in enumerate(list(nt_model.wv.vocab)):
    wv_dict["nt"][word] = [float(nt_reduced[i][0]),float(nt_reduced[i][1])]
    
for i, word in enumerate(list(q_model.wv.vocab)):
    wv_dict["q"][word] = [float(q_reduced[i][0]),float(q_reduced[i][1])]

## Polygon construction from Vectors

In [355]:
from shapely import geometry as g
from dxfwrite import DXFEngine as dxf
from dxfwrite.const import CENTER

In [356]:
topics = ['god', 'heaven', 'hell', 'love', 'hate', 'free', 'light', 'darkness', 'peace', 'war', 'life', 'death', 'man', 'woman', 'child', 'eat']

In [452]:
topic_vecs = {}

for t in topics:
    topic_vecs[t] = {}

    # go through each religion
    for rel in wv_dict:
        
        model = None
        if rel == 'ot':
            model = ot_model
        elif rel == 'nt':
            model = nt_model
        elif rel == 'q':
            model = q_model
            
        most_sim = model.most_similar(positive=[t], topn=5)
        topic_vecs[t][rel] = {
            "vec": wv_dict[rel][t],
            "sim": {}
        }
        
#         print(wv_dict)
        for w in most_sim:
            wv = wv_dict[rel][w[0]]
            topic_vecs[t][rel]["sim"][w[0]] = wv_dict[rel][w[0]]



In [453]:
topic_vecs

{'god': {'ot': {'vec': [-2.27060866355896, -3.132342576980591],
   'sim': {'lord': [-2.1962146759033203, -2.4795944690704346],
    'worship': [-1.1756192445755005, -1.30728280544281],
    'worshipped': [1.4347145557403564, -2.2107770442962646],
    'jealous': [-1.7164735794067383, -2.051520824432373],
    'sworn': [-2.2373297214508057, -3.513554573059082]}},
  'nt': {'vec': [-2.013201951980591, -0.5630126595497131],
   'sim': {'amen': [-2.368495225906372, -0.653245210647583],
    'saviour': [-2.4734528064727783, -1.298343539237976],
    'sceptre': [-0.3421207666397095, 0.306600421667099],
    'salvation': [-3.8624179363250732, -2.0293500423431396],
    'fellowship': [-1.4601202011108398, -1.3096134662628174]}},
  'q': {'vec': [-1.507064700126648, -3.598748207092285],
   'sim': {'worship': [4.105205059051514, -2.5894370079040527],
    'centre': [-0.022862084209918976, -1.2361050844192505],
    'fashion': [-0.3592420220375061, -0.3114739954471588],
    'idol': [-0.19077439606189728, 0.61

In [454]:
# normalize around each chosen word...
for t in topic_vecs:
    rel_data = topic_vecs[t]
    for d in rel_data:
        vecs = rel_data[d]
        
        # this is what you'll normalize around
        topic_vec = vecs['vec']
        zero_vec = np.zeros(len(topic_vec))
        
        transform_vec = zero_vec - topic_vec
        
        closest = vecs['sim'] 
        for word in closest:
            
            # grab the original vector
            og_sim_vec = closest[word]
            
            # transform the vector so it's centered around the current topic
            transformed_sim_vec = og_sim_vec + transform_vec
            
            # reassign the transformed vector
            closest[word] = list(transformed_sim_vec)
        
        # finally, reassign the original vector...
        vecs['vec'] = list(zero_vec)

In [455]:
topic_vecs

{'god': {'ot': {'vec': [0.0, 0.0],
   'sim': {'lord': [0.07439398765563965, 0.6527481079101562],
    'worship': [1.0949894189834595, 1.8250597715377808],
    'worshipped': [3.7053232192993164, 0.9215655326843262],
    'jealous': [0.5541350841522217, 1.0808217525482178],
    'sworn': [0.0332789421081543, -0.3812119960784912]}},
  'nt': {'vec': [0.0, 0.0],
   'sim': {'amen': [-0.35529327392578125, -0.09023255109786987],
    'saviour': [-0.4602508544921875, -0.7353308796882629],
    'sceptre': [1.6710811853408813, 0.8696130812168121],
    'salvation': [-1.8492159843444824, -1.4663373827934265],
    'fellowship': [0.553081750869751, -0.7466008067131042]}},
  'q': {'vec': [0.0, 0.0],
   'sim': {'worship': [5.612269759178162, 1.0093111991882324],
    'centre': [1.484202615916729, 2.3626431226730347],
    'fashion': [1.1478226780891418, 3.2872742116451263],
    'idol': [1.3162903040647507, 4.217291414737701],
    'passion': [1.4657179452478886, 3.199876219034195]}}},
 'heaven': {'ot': {'vec':

In [468]:
"""
word {
    "ot": [],
    "nt"
}
"""
polygon_data = {}

for t in topic_vecs:
    
    rel_data = topic_vecs[t]
    polygon_data[t] = {}
    
    for d in rel_data:
        vecs = rel_data[d]
        
        ref_vec = vecs['vec']
        sim_vecs = list(vecs['sim'].items())
                
        polygon_data[t][d] = {
            "poly": g.Polygon([(v[1][0], v[1][1]) for v in sim_vecs]),
            "words": [v[0] for v in sim_vecs]
        }

In [470]:
print ("Creating polygon / DXF data")

for topic in polygon_data:
    
    print(topic)
    all_meta = polygon_data[topic]
    
    for rel in all_meta:
        
        # grab the specific religions meta data
        rel_meta = all_meta[rel]
        
        # all the words
        rel_words = rel_meta['words']
        
        # grab the convex hull and regular polygon data 
        rel_convex = list(rel_meta['poly'].convex_hull.exterior.coords)
        rel_poly = list(rel_meta['poly'].exterior.coords)
        
        
        #####################################
        # DXF 
        #####################################
        
        # CONFIGURE THE FILE PATHS FOR SAVING
        folder_path = '../data/analyzed/dxf/religion-specific/{}'.format(topic)

        if not os.path.exists(folder_path):
            os.mkdir(folder_path)
            
        drawing = dxf.drawing('../data/analyzed/dxf/religion-specific/{}/{}-{}.dxf'.format(topic,topic,rel))
        
        line = dxf.polyline(linetype='DOT', layer ='INTERNAL')
        outline = dxf.polyline(linetype='CONTINUOUS', layer = 'OUTLINE')
        line.add_vertices(rel_poly)
        outline.add_vertices(rel_convex)
        
        text_layer = dxf.layer('TEXT')
        drawing.layers.add(text_layer)

        # ADD THE TEXT
        for p in zip(rel_words, rel_poly):
            t = dxf.text(p[0], p[1], height=0.05, rotation=0, layer = 'TEXT')
            drawing.add(t)
            
        # Close the lines
        outline.close()
        line.close()
        
        drawing.add(outline)
        drawing.add(line)
        drawing.save()

Creating polygon / DXF data
god
heaven
hell
love
hate
free
light
darkness
peace
war
life
death
man
woman
child
eat
