In [78]:
import pandas as pd
import xml.etree.ElementTree as ET
from lxml import etree
import keras
from keras.models import Sequential
from keras.layers import Dense, Activation, core
import string
from collections import defaultdict
import gensim
import numpy as np

In [79]:
def xml2df(xml_data):
    """
    Parses a two-level XML document into a pandas DataFrame.

    Each child of the root element becomes one row and each sub-element of
    that child becomes one cell, keyed by the sub-element's tag. Columns
    appear in the order in which their tags are first encountered. A record
    that lacks a tag gets a missing value in that column instead of shifting
    its remaining values into the wrong columns.

    Parameters
    ----------
    xml_data (str or file object) : path to, or open handle on, the XML source

    Returns
    -------
    pandas.DataFrame : one row per record, one column per distinct tag; cell
        values are the sub-elements' text (None for empty elements)

    """
    root = ET.parse(xml_data).getroot()
    all_records = []
    headers = []
    for child in root:
        # Key every value by its tag so rows stay aligned even when records
        # omit or reorder fields. If a tag repeats inside one record, the
        # last occurrence wins.
        record = {}
        for subchild in child:
            record[subchild.tag] = subchild.text
            if subchild.tag not in headers:
                headers.append(subchild.tag)
        all_records.append(record)
    return pd.DataFrame(all_records, columns=headers)

In [80]:
# Load the feed export into a DataFrame (one row per record under the XML root).
# The path is relative to the notebook's working directory.
df = xml2df("MathFeedsDataAll.xml")

In [81]:
# Preview the first rows to confirm the XML parsed into the expected columns.
df.head()

Unnamed: 0,URL,Title,Text,Domain,blurb,date,image,isbn,kicker,price,timesDeleted,timesEmailed,timesOpened,timesSaved,timesShared,timestamp,wordtitle
0,http://blogs.ams.org/mathgradblog/2017/01/05/d...,Up to Date Blog Content for JMM 2017,Looking for blog content about the 2017 Joint ...,blogs.ams.org,Looking for blog content about the 2017 Joint ...,01/05/17,,,GRADUATE STUDENT BLOG,,1.0,,4.0,,,5774489532.046694,uptodateblogcontentforjmm2017
1,http://mrhonner.com/archives/17215,Math Photo: A Dodecagon of Octagons « Mr Honner,I’d never looked closely at the Parachute Jump...,mrhonner.com,I'd never looked closely at the Parachute Jump...,09/18/16,http://MrHonner.com/wp-content/uploads/2016/09...,,NOTABLE,,,,,,,5783496448.673001,mathphotoadodecagonofoctagons
2,https://mathbabe.org/2017/03/21/guest-post-the...,Guest post: the age of algorithms | mathbabe,Artie has kindly allowed me to post his though...,mathbabe.org,Artie has kindly allowed me to post his though...,03/21/17,,,BLOG,,2.0,2.0,11.0,1.0,,5767987618.719831,guestposttheageofalgorithms
3,http://www.scientificamerican.com/podcast/epis...,Sean M. Carroll Looks at The Big Picture - Sci...,Steve Mirsky: Welcome to Scientific American's...,scientificamerican.com,Caltech theoretical physicist Sean M. Carroll ...,05/12/16,https://www.scientificamerican.com/sciam/cache...,,,,,,,,,5795107200.0,seanmcarrolllooksatthebigpicture
4,http://the-japan-news.com/news/article/0003176002,The Japan News,Not found\n\nThe requested server cannot be ac...,the-japan-news.com,The education ministry will open research cent...,09/13/16,http://the-japan-news.com/modules/img/logo_ogp...,,BIG DATA,,,,,,,5784392943.842338,educationministrytopromoteuseofbigdata


In [82]:
def clean_word(w):
    """
    Returns a normalized copy of a single token.

    Characters that are not in string.printable are dropped, the punctuation
    characters ( ) " . ? ! , ; are trimmed from both ends, and the result is
    lower-cased.

    Parameters
    ----------
    w (str) : the raw token

    """
    edge_punctuation = "()\".?!,;"
    kept_chars = [ch for ch in w if ch in string.printable]
    trimmed = "".join(kept_chars).strip(edge_punctuation)
    return trimmed.lower()

def clean_text_list(doc):
    """
    Returns the document as a list of cleaned tokens, in document order.

    The document is split on whitespace and every token is passed through
    clean_word; tokens that clean down to the empty string are kept.

    Parameters
    ----------
    doc (str) : a string representation of the document

    """
    return list(map(clean_word, doc.split()))
    
def word_pairs(doc, window=3):
    """
    Returns a list of (word, neighbour) 2-tuples, one for every position that
    lies within 'window' tokens of each token in the document.

    Both elements of each tuple are cleaned with clean_word. Pairs are emitted
    in document order of the first word and, for a given first word, in
    document order of the neighbour. A token is never paired with its own
    position.

    Parameters
    ----------
    doc (str) : a string representation of the document
    window (int) : how many words to the left and right should be considered 'nearby'

    """

    tokens = [clean_word(raw) for raw in doc.split()]
    last = len(tokens) - 1
    pairs = []
    for center, word in enumerate(tokens):
        # Clamp the window to the document boundaries.
        lo = max(center - window, 0)
        hi = min(center + window, last)
        for other in range(lo, hi + 1):
            if other != center:
                pairs.append((word, tokens[other]))
    return pairs

def nearby(doc, window=3):
    """
    Returns a dictionary mapping every word to the set of words that appear
    within 'window' words to either side of it. This is to say, if window = 3,
    then the 3 words before and the 3 words after each occurrence of a word
    are added to that word's set.

    Keys and set members are cleaned with clean_word. A word only gets an
    entry once it has at least one neighbour.

    Parameters
    ----------
    doc (str) : a string representation of the document
    window (int) : how many words to the left and right should be considered 'nearby'

    """

    tokens = [clean_word(raw) for raw in doc.split()]
    last = len(tokens) - 1
    nearby_words = defaultdict(set)
    for center, word in enumerate(tokens):
        # Clamp the window to the document boundaries.
        lo = max(center - window, 0)
        hi = min(center + window, last)
        for other in range(lo, hi + 1):
            if other != center:
                nearby_words[word].add(tokens[other])
    return nearby_words

def vec_of_words(doc):
    """
    Returns a sorted numpy array holding each distinct cleaned word of the
    given document exactly once.

    Parameters
    ----------
    doc (str) : a string representation of the document

    """

    distinct = {clean_word(token) for token in doc.split()}
    ordered = sorted(distinct)
    return np.array(ordered)

def one_hot(word, all_words):
    """
    Returns a one-hot numpy array of the position of 'word' in 'all_words'.

    The result has the same shape as 'all_words', with 1 at every position
    whose entry equals 'word' and 0 elsewhere. If 'word' is not present the
    result is all zeros.

    Parameters
    ----------
    word (str) : the word for which to calculate the one-hot vector
    all_words (numpy array) : vector containing all words in your dictionary

    """

    w = np.zeros(all_words.shape)
    # Compare against the vocabulary that was passed in. The previous code
    # compared against the module-level name `word_vec` (a function), which
    # never matches and always produced an all-zero vector.
    w[np.where(all_words == word)] = 1
    return w

def output_vec(word, nearby, all_words):
    """
    Returns the expected output vector for the given word.

    The result has the same shape as 'all_words'. Every position whose entry
    is one of the word's nearby words holds 1 / (number of nearby words);
    every other position holds 0. Nearby words that are missing from
    'all_words' contribute nothing, so the vector can sum to less than 1.
    A word with no nearby words (or no entry in 'nearby') yields all zeros.

    Parameters
    ----------
    word (str) : the word for which to calculate the output vector
    nearby (dict) : dictionary of words mapping to their nearby words
    all_words (numpy array) : vector containing all words in your dictionary

    """

    output = np.zeros(all_words.shape)
    # .get keeps plain dicts from raising KeyError and keeps defaultdicts from
    # silently growing a new empty entry for an unknown word.
    neighbours = nearby.get(word, ())
    if not neighbours:
        return output
    # Float literal so the share is not truncated to 0 by integer division.
    value = 1.0 / len(neighbours)
    for w in neighbours:
        # Index into the vocabulary that was passed in, not the module-level
        # `word_vec` function the previous code compared against.
        output[np.where(all_words == w)] = value
    return output

def word_vec(word, nearby_pairs, all_words, features=300):
    """
    Trains and returns a small neural network for the given word.

    The network is Dense(features) -> Dense(len(all_words)) -> softmax,
    compiled with categorical cross-entropy and SGD. It is fitted on one
    sample per pair in 'nearby_pairs' whose first element is 'word': the input
    is the one-hot vector of 'word' and the target is the one-hot vector of
    the pair's second element.

    NOTE(review): the fitted model is returned, not a vector; presumably the
    caller reads the embedding out of the first Dense layer's weights --
    confirm against the caller.

    Parameters
    ----------
    word (str) : the word for which to train the network
    nearby_pairs (iterable) : (word, neighbour) pairs; only those whose first
        element equals 'word' are used
    all_words (numpy array) : vector containing all words in your dictionary
    features (int) : number of units in the hidden layer

    Returns
    -------
    keras Sequential model, already fitted

    Raises
    ------
    ValueError : if 'nearby_pairs' holds no pair that starts with 'word'

    """

    vocab_size = len(all_words)
    contexts = [pair[1] for pair in nearby_pairs if pair[0] == word]
    if not contexts:
        raise ValueError("no training pairs found for word %r" % (word,))

    model = Sequential()
    # The unit count is passed positionally: Keras 1 names this argument
    # `output_dim` and Keras 2 names it `units`, but it is the first
    # positional argument in both.
    model.add(Dense(features, input_dim=vocab_size))
    model.add(Dense(vocab_size))
    model.add(Activation('softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='sgd')

    # Every sample shares the same input, so encode it once. The length comes
    # from the vocabulary instead of the previously hard-coded 561.
    target_input = one_hot(word, all_words).reshape((vocab_size,))
    x = np.tile(target_input, (len(contexts), 1))
    y = np.array([one_hot(context, all_words) for context in contexts])

    model.fit(x, y, batch_size=1, verbose=0)
    return model

In [83]:
# `.values` returns the same NumPy array that `Series.as_matrix()` did;
# `as_matrix` is deprecated and no longer exists in pandas >= 1.0.
text = df['Text'].values
# Missing 'Text' cells are not strings, so they are skipped here.
sentences = [clean_text_list(doc) for doc in text if isinstance(doc, str)]

In [84]:
# Train word embeddings on the cleaned token lists: 300 dimensions, a context
# window of 5 tokens, ignoring words that occur fewer than 5 times.
# NOTE(review): `size` is the pre-4.0 gensim keyword (gensim >= 4.0 calls it
# `vector_size`), and no seed is passed, so results will differ between runs.
model = gensim.models.Word2Vec(sentences, size=300, window=5, min_count=5)

In [85]:
# Sanity check: the nearest neighbours of 'my' should mostly be other pronouns.
model.wv.most_similar(positive=['my'])

[('your', 0.6211767792701721),
 ('her', 0.5886389017105103),
 ('myself', 0.5515678524971008),
 ('his', 0.5409917831420898),
 ('their', 0.5047512054443359),
 ('reformer', 0.467762291431427),
 ('i', 0.4655497074127197),
 ('onos', 0.458649218082428),
 ('mine', 0.45262473821640015),
 ('writing', 0.4515327513217926)]

In [86]:
# One (n_tokens, vector_size) matrix per document, holding the embedding of
# every in-vocabulary token in document order. The reshape keeps a document
# with no in-vocabulary tokens two-dimensional (0 rows) instead of collapsing
# it to shape (0,), which would break row-wise operations such as
# np.linalg.norm(doc, axis=1).
docs = [
    np.array([model.wv[word] for word in clean_text_list(doc) if word in model.wv])
      .reshape(-1, model.vector_size)
    for doc in text if isinstance(doc, str)
]

In [88]:
# Scale every token vector (row) of every document to unit Euclidean length.
# keepdims=True leaves the norms as a column so they broadcast across each row.
normal_docs = [doc / np.linalg.norm(doc, axis=1, keepdims=True) for doc in docs]

In [89]:
# Spot-check one document's matrix of normalized token vectors.
normal_docs[5]

array([[-0.05034369, -0.02842665,  0.05167367, ..., -0.1579048 ,
         0.07773382, -0.013701  ],
       [-0.02047904, -0.0269089 ,  0.04469975, ..., -0.02642511,
        -0.00900096, -0.01874059],
       [ 0.09048381, -0.02032686,  0.00130163, ..., -0.09201123,
         0.07367431, -0.11802816],
       ..., 
       [ 0.0207019 , -0.0496803 ,  0.06034489, ...,  0.00143243,
        -0.01272031, -0.01636854],
       [-0.01359427,  0.04085743, -0.06062559, ..., -0.14256397,
         0.0051433 , -0.01588947],
       [ 0.02121915,  0.09510662,  0.03920494, ..., -0.06651177,
         0.08185749, -0.15797204]], dtype=float32)