In [1]:
import sys
import csv
import random
import time
import re
import nltk
from scipy.spatial.distance import cdist
import numpy as np
from __future__ import division
import pprint

In [2]:
import pandas as pd
import gensim
from gensim import corpora, models, similarities
from gensim.models.doc2vec import Doc2Vec, FAST_VERSION, TaggedDocument

1.Data Input

In [3]:
df=pd.read_csv('ngoall.csv')

In [4]:
titles=list(df[['Name in IRS Master File','EIN']].apply(lambda x: '_'.join(x), axis=1))

In [5]:
missions=[]
for title in titles:
    with open('ngomission/'+title+'.txt', encoding='utf-8') as f:
        txt=f.read()
        missions.append(txt)

2.Data Processing

In [35]:
stoplist = set('for a of the and to in'.split())

In [103]:
def tokenize(text):
    # first tokenize by sentence, then by word to ensure that punctuation is caught as it's own token
    tokens = [word.lower() for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    doc = []
    # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)
    for token in tokens:
        if token not in stoplist and re.search('[a-zA-Z]', token):
            doc.append(token)
    return doc
docs=[]
for i in missions:
    doc=tokenize(i)
    docs.append(doc)

In [65]:
from collections import defaultdict
frequency = defaultdict(int)
for doc in docs:
    for token in doc:
        frequency[token] +=1
docs = [[token for token in doc if frequency[token] > 1]for doc in docs]

3.Data Exploration

In [105]:
docs[:1]

[['‘aha',
  'pūnana',
  'leo',
  'is',
  'family-based',
  'educational',
  'organization',
  'dedicated',
  'revitalization',
  'hawaiian',
  'language',
  'pūnana',
  'leo',
  'language',
  'nest',
  'provides',
  'cultural',
  'educational',
  'environment',
  'that',
  'emulates',
  'hawaiian',
  'language',
  'culture',
  'pūnana',
  'leo',
  'is',
  'member',
  'consortium',
  'an',
  'internationally',
  'recognized',
  'association',
  'schools',
  'organizations',
  'university',
  'programs',
  'hawaii',
  'dedicated',
  'reestablishing',
  'hawaiian',
  'as',
  'first',
  'main',
  'language',
  'home',
  'pūnana',
  'leo',
  'family',
  'initiates',
  'provides',
  'nurtures',
  'various',
  'hawaiian',
  'language',
  'environments',
  'our',
  'families',
  'are',
  'living',
  'essence',
  'these',
  'environments',
  'we',
  'find',
  'our',
  'strength',
  'our',
  'spirituality',
  'love',
  'our',
  'language',
  'love',
  'our',
  'people',
  'love',
  'our',
  'lan

In [67]:
len(docs)

8197

4.Feature Generation

In [106]:
bigram = gensim.models.Phrases(docs)
bigrammed = (bigram[docs])
trigram = gensim.models.Phrases(bigrammed)
trigrammed = (trigram[bigrammed])

5.Modelling

Word to Vector

In [107]:
start = time.time()
model = gensim.models.Word2Vec(trigrammed, workers=4, batch_words=10000)

for iteration in range(10):
    model.train(trigrammed)

vocab_matrix = model.syn0
vocabulary = model.index2word

model.save('MissionsW2V')
end = time.time()
print(end - start)

485.30168104171753


Doc to Vector

6.Evaluation

In [108]:
w2v = gensim.models.Word2Vec.load('MissionsW2V')

In [109]:
w2v['education']

array([-0.47883525,  1.39619672, -1.11317945, -0.69235086, -2.61523104,
        1.79159832, -0.9246214 , -0.62084085,  0.8141697 ,  0.27155262,
       -0.52180171, -0.4197911 ,  1.2754041 ,  1.40926552, -0.17711297,
        1.82477164,  2.10545039,  0.39466086, -0.38332242, -2.45149159,
       -1.30801129, -0.52910906, -2.52472878,  0.8167603 , -1.39252615,
       -0.60964322,  0.6083827 ,  0.38423875,  0.31054568,  1.5834558 ,
       -0.72586739, -0.33901188,  1.77654541, -0.53230882,  0.68429178,
       -1.74581039,  2.62307525, -2.88806415, -2.43315721,  0.69085068,
        1.36657429, -1.761127  , -1.27641737, -0.94843   , -2.31261659,
       -0.41608807, -1.59536123,  0.99103069, -1.20516801,  0.87436265,
       -2.33065391, -0.07764923, -3.6261549 ,  0.07998333, -1.67541242,
        1.87558091,  1.10198617, -1.5891521 ,  0.41734961, -0.213735  ,
       -1.21290421,  0.36418834,  1.36866987,  0.26018289, -0.97716707,
       -0.47556475,  0.55697936, -2.16553831, -0.81634772, -0.45

In [110]:
w2v.similarity('women', 'family')

0.045308951966923909

In [111]:
w2v.similarity('women', 'children')

0.38699667578156438

In [112]:
w2v.most_similar(positive=['women', 'family'], negative=['men'], topn=10)

[('child', 0.35865381360054016),
 ('safety', 0.3565200865268707),
 ('health_safety', 0.3312123417854309),
 ('parent', 0.3298790752887726),
 ('parenting', 0.303944855928421),
 ('domestic_violence', 0.30320364236831665),
 ('economic_security', 0.30248308181762695),
 ('special', 0.30185288190841675),
 ('financial_stability', 0.29760926961898804),
 ('women_girls', 0.29752638936042786)]

In [113]:
w2v.most_similar(positive=['women', 'victim'], topn=10)

[('rape', 0.5302175879478455),
 ('pregnant_women', 0.5212306976318359),
 ('sexually', 0.5152134895324707),
 ('domestic_violence', 0.5142741799354553),
 ('pregnancies', 0.5021679401397705),
 ('women_children', 0.49617135524749756),
 ('persons', 0.4905802607536316),
 ('transitional_housing', 0.48104992508888245),
 ('child_victims', 0.46751460433006287),
 ('teenagers', 0.4657415747642517)]