In [21]:
import json
import gensim
import nltk
import string
import scipy
from nltk.corpus import stopwords
import numpy as np
#from tqdm import tqdm

In [10]:
# Load the raw corpus: a nested mapping school -> person -> text
# (that is how the tokenization cell indexes it).
# The context manager closes the file handle as soon as parsing is done
# instead of leaving it open until garbage collection.
with open('data.json') as fp:
    data = json.load(fp)

In [5]:
def tokenization(text):
    """
    Lowercase a text string, strip its punctuation and return the tokens
    that are not English stop words.

    Processing steps:
        1. Lowercase the text.
        2. Remove possessive "'s" suffixes, i.e. an "'s" that is at the end
           of the text or followed by anything other than a letter or digit
           (space, newline, punctuation, ...). Every other apostrophe is
           dropped without inserting a space, so "don't" becomes "dont".
        3. Replace every other punctuation character with a space.
        4. Tokenize the cleaned text with nltk.word_tokenize and discard
           the tokens found in nltk's English stop word list.

    Args:
        text: a text string
    Returns:
        a list of tokens(words)
    """
    # A set makes each stop-word lookup O(1) instead of scanning a list.
    stop_words = set(stopwords.words('english'))
    text = text.lower()
    length = len(text)
    pieces = []  # collected characters; joined once at the end
    i = 0
    while i < length:
        c = text[i]
        if c == "'":
            # Slicing (rather than indexing) keeps this safe when the
            # apostrophe is the last character of the text.
            is_possessive = (
                text[i + 1:i + 2] == 's'
                and (i + 2 == length or not text[i + 2].isalnum())
            )
            # Skip the 's' of a possessive too; a lone apostrophe is just dropped.
            i += 2 if is_possessive else 1
        else:
            # Punctuation becomes a space so adjacent words stay separated.
            pieces.append(' ' if c in string.punctuation else c)
            i += 1
    #tokenization by nltk
    tokens = nltk.word_tokenize(''.join(pieces))
    #remove stop words in tokens
    return [word for word in tokens if word not in stop_words]

In [14]:
# Tokenize every person's text, keeping the school -> person nesting of `data`.
tokenized_data = {}
for school, people in data.items():
    tokenized_data[school] = {
        person: tokenization(text) for person, text in people.items()
    }

In [15]:
# Write the tokenized corpus to tokenized_data.json.
serialized_tokens = json.dumps(tokenized_data)
with open('tokenized_data.json', 'w') as out_file:
    out_file.write(serialized_tokens)

In [12]:
# Load the pretrained Google News word2vec vectors from the binary-format
# file 'GoogleNews-vectors-negative300.bin' as gensim KeyedVectors.
# `model` is what the embedding cells below use to look up word vectors.
model = gensim.models.KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)

In [13]:
def embedding(tokens, model):
    """
    Sum the word vectors of a list of tokens into a single vector.

    Tokens that the model does not contain are skipped.

    Args:
        tokens: a list of strings representing the tokens
        model: a pretrained word2vec gensim model; any object that supports
            ``word in model`` and ``model[word]`` works
    Returns:
        vec: a np array holding the element-wise sum of the vectors of all
            known tokens, or None when no token is known to the model
            (which includes an empty token list).
    """
    vec = None
    for word in tokens:
        # Test membership on the model itself instead of `model.vocab`:
        # the `vocab` attribute no longer exists in gensim 4, while
        # `word in model` is supported by KeyedVectors in old and new releases.
        if word not in model:
            continue
        word_vec = model[word]
        # `vec + word_vec` builds a new array, so the vectors stored in the
        # model are never modified in place.
        vec = word_vec if vec is None else vec + word_vec
    return vec

In [16]:
# Turn each person's token list into a single vector, keeping the
# school -> person nesting of `tokenized_data`.
# `processed_data` is created here so the cell also runs on a fresh kernel,
# and the loop walks `tokenized_data` itself (the mapping it actually reads)
# rather than depending on the raw `data` dict still being in memory.
# A value is None when none of that person's tokens are in the model.
processed_data = {}
for school, people in tokenized_data.items():
    processed_data[school] = {
        person: embedding(tokens, model) for person, tokens in people.items()
    }

In [18]:
# Sanity check: list the people that have an entry under the 'CMU' school.
print(processed_data['CMU'].keys())

dict_keys(['Den%C3%A9e_Benton', 'Kathleen_Carley', 'Greg_Mottola', 'Steven_Bochco', 'Nik_Bonaddio', 'Keith_Lockhart', 'Roberta_Klatzky', 'Megan_Hilty', 'Jeffrey_Mylett', 'Sada_Thompson', 'Lars_Peter_Hansen', 'James_Cromwell', 'John_Robert_Anderson_(psychologist)', 'William_A._Barnett', 'Gabriel_Macht', 'Tamara_Tunie', 'Allan_Meltzer', 'Lori_Rom', 'Jeffrey_Zaslow', 'Frank_Converse', 'Gaius_Charles', 'Merton_Miller', 'David_Hornsby', 'Alan_Perlis', 'Guy_L._Steele_Jr.', 'Renee_Elise_Goldsberry', 'Lourdes_Benedicto', 'Harry_Shum', 'John_Pasquin', 'Clifford_Shull', 'John_Wells_(TV_producer)', 'Jack_Klugman', 'John_Currin', 'Otto_Stern', 'Henry_Mazer', 'Lou_Scheimer', 'Robert_V._Rice', 'Drew_D._Perkins', 'Rhys_Coiro', 'Charles_Erwin_Wilson', 'Paul_Lauterbur', 'Leslie_Valiant', 'Edward_C._Prescott', 'Herb_Gardner', 'David_Haskell', 'Arthur_Lubin', 'Ivan_Sutherland', 'Joe_Manganiello', 'Sydney_Kamlager', 'Daniel_Nagin', 'Pradeep_Sindhu', 'Richard_Duffin', 'Franco_Modigliani', 'Teresa_Heinz', '

In [22]:
# Save processed_data to vec_data.npz, one archive entry per top-level key (school).
# NOTE(review): each value is a dict of person -> vector rather than an array, so
# numpy presumably stores it as a pickled object array; reading an entry back
# would then need np.load(..., allow_pickle=True) — confirm.
np.savez('vec_data.npz', **processed_data)

In [23]:
# Re-open the archive written by np.savez above.
# Each entry was saved from a dict (person -> vector), which numpy stores as an
# object array; with the default allow_pickle=False, listing the keys works but
# reading any entry (e.g. test['CMU']) raises ValueError.
# Enabling pickle is acceptable here only because the file was produced by this
# notebook; do not use allow_pickle=True on files from untrusted sources.
test = np.load('vec_data.npz', allow_pickle=True)


In [24]:
# List the entry names stored in the reloaded archive (one per school).
print(test.keys())

['CMU', 'UCLA', 'Stanford', 'Harvard', 'Berklee']
