In [18]:
import os

import pickle
import re

from cStringIO import StringIO
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter, PDFPageAggregator
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage

import nltk as nltk

#lda
from spacy.en import English
from nltk.tokenize import RegexpTokenizer
from stop_words import get_stop_words
from nltk.stem.snowball import SnowballStemmer
from gensim import corpora, models
import gensim
import pdb

#Bag of Words
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

#TfIdf
import gensim

#Word2Vect
from gensim.models.doc2vec import TaggedDocument

#ldaVis
import pyLDAvis.gensim

#spacy
import spacy
nlp = spacy.load('en')

In [19]:
def get_text(fname, pages=None):
        if not pages:
            pagenums = set()
        else:
            pagenums = set(pages)

        output = StringIO()
        manager = PDFResourceManager()
        converter = TextConverter(manager, output, laparams=LAParams())
        interpreter = PDFPageInterpreter(manager, converter)

        infile = file(fname, 'rb')
        for page in PDFPage.get_pages(infile, pagenums):
            interpreter.process_page(page)
        infile.close()
        converter.close()
        text = output.getvalue()
        output.close
        return unicode(text,'utf-8')

In [20]:
from nltk import ngrams
def runTextAnalysis(txt):
    # create English stop words list
    en_stop = get_stop_words('en')
    # Create p_stemmer of class PorterStemmer
    p_stemmer = SnowballStemmer('english')
    
    wnl = nltk.WordNetLemmatizer()
    sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    txtAnalysis = {
        'rawText':txt,
        'abstract': txt.lower().split('introduction')[0].split('abstract'),
        'tokens': [i for i in nltk.word_tokenize(txt) if not i.isdigit() if len(i)>2],
        'stopped_tokens':[i for i in nltk.word_tokenize(txt) if not i.isdigit() if not i in en_stop],
        'stemmed_tokens':0,
        'num_words': 0,
        'words': 0,
        'vocab': 0,
        'lemmatizedVocab': 0,
        'senteces': 0
    }  
    txtAnalysis['stemmed_tokens'] = [p_stemmer.stem(i) for i in txtAnalysis['stopped_tokens']]
    txtAnalysis['num_words'] = len(txtAnalysis['tokens'])
    txtAnalysis['words'] = [w for w in txtAnalysis['stopped_tokens']]
    txtAnalysis['vocab'] = sorted(set(txtAnalysis['words']))
    txtAnalysis['lemmatizedVocab'] = [wnl.lemmatize(t) for t in txtAnalysis['vocab']]
    txtAnalysis['sentences'] = sent_tokenizer.tokenize(txt)
    #convert to nltk Text
    text = nltk.Text(txtAnalysis['tokens'])
    #Collocations are very interesting but just prints text to screen need to retrieve this somehow.
    #collocations = text.collocations()
    return txtAnalysis


In [187]:
def loadTextData():
    fileList = []
    for file in os.listdir("."):
        if file.endswith(".pdf"):
#         if file.endswith('16475329441669445062.pdf'):
          fileList.append(file)
    return fileList

In [22]:
def bagOfWords(documents):
    vectorizer = CountVectorizer(min_df=5, max_df=.95)
    print type(documents)
    doc_term_matrix = vectorizer.fit_transform([document for document in documents])
    vocab = np.array(vectorizer.get_feature_names())
    print('\nVocabulary:')
    print vocab
    
    print('\nDocument Term Matrix')
    formatted_row = '{:>12}' * (len(documents) +1)
#     print('\n',formatted_row.format('Word',1),'\n')
    for word, item in zip(vocab, doc_term_matrix.T):
        output = [str(x) for x in item.data]
        print "word: " + word
        print output

In [23]:
def calculateTFidf(docAnalytics,parameter):
     #We store the tokenized text in a vector
        tokText = []
        for each in docAnalytics:
            tokText.append(each[parameter])
        #We map each word to a number
        dictionary = gensim.corpora.Dictionary(tokText)
        num_words = len(dictionary)
        #Create a corpus: List of bag of words
        corpus = [dictionary.doc2bow(eachtokText) for eachtokText in tokText]
        tf_idf = gensim.models.TfidfModel(corpus)
        #Create the simiarity measure Object
        sims = gensim.similarities.Similarity('./similarityStorage',tf_idf[corpus],num_features=num_words)
        return {"sims":sims,"dict":dictionary,"tf_idf":tf_idf}
        

In [24]:
def get_lda(textAnalytics,parameter):
    tokenizer = RegexpTokenizer(r'\w+')

    # create English stop words list
    en_stop = get_stop_words('en')

    # Create p_stemmer of class PorterStemmer
    p_stemmer = SnowballStemmer('english')
    # create sample documents
    texts = []
    for eachDocument in textAnalytics:
        texts.append(eachDocument[parameter])

    # turn our tokenized documents into a id <-> term dictionary
    dictionary = corpora.Dictionary(texts)
    # convert tokenized documents into a document-term matrix
    corpus = [dictionary.doc2bow(text) for text in texts]
    #Store to visualize
    pickle.dump(corpus,open('corpus.pkl','wb'))
    dictionary.save('dictionary.gensim')

    # generate LDA model
    ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=9, id2word = dictionary, passes=100)
    #Store to visualize
    ldamodel.save('20top_100Tok.gensim')

    return ldamodel

In [25]:
 def get_topics(lda,num_topics, num_words):
        doc_topics = []
        topics = lda.print_topics(num_topics=num_topics, num_words=num_words)
        return topics

In [26]:
def word2VectTraining(textAnalysis,parameter):
        """Training Word Vectors"""
        txt = []
        for each in textAnalysis:
            txt.append(each[parameter])
        model = gensim.models.word2vec.Word2Vec(txt,min_count=10,size=300)
        return model

In [27]:
def word2VectDocumentTraining(textAnalysis,parameter):
        """Training Document Word Vectors
        Needs to run with raw Text for expected results"""
        tagged_documents = []
        for i, doc in enumerate(textAnalysis):
            tagged_documents.append(TaggedDocument(doc[parameter],["doc_{}".format(i)]))
        d2v_model = gensim.models.doc2vec.Doc2Vec(tagged_documents,size=300)
        return d2v_model

In [28]:
def LDAvis(model):
    dictionary = gensim.corpora.Dictionary.load('dictionary.gensim')
    corpus = pickle.load(open("corpus.pkl","rb"))
    lda = gensim.models.ldamodel.LdaModel.load(model)
    print dictionary 
    print lda
    lda_display = pyLDAvis.gensim.prepare(lda,corpus,dictionary,sort_topics=False)
    return lda_display

In [31]:
def loadPickle(name):
    return pickle.load(open(name,'rb'))

def savePickle(data,name):
    pickle.dump(data,open(name,'wb'))

In [210]:
from nltk import ngrams
from nltk import regexp_tokenize
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

def runTextAnalysisSpacey(txt):
    """Text Processing workflow for documents"""
    def show_ents(ents):
        """Return an entity array for a document"""
        entitiesArray = []
#         print ("Print first entity ........")
        entities = list(ents)
        for entity in entities:
            entitiesArray.append((entity.label_,' '.join(t.orth_ for t in entity)))
#         if entities:
#             print(entities[0].label_,' '.join(t.orth_ for t in entities[0]))
        return entitiesArray
    def parse_ents(txtAnalysis):
        """Parses entities in text into entity name"""
        all_ents = []
        parsed_tokens = txtAnalysis['tokens'][:]
        for entities in txtAnalysis['entities']:
            all_ents.append(entities[0])
        all_ents = set(all_ents)

        for eachEntType in all_ents:
            entType = set([entity[1] for entity in txtAnalysis['entities'] if entity[0] == eachEntType])
#             print eachEntType
#             print entType
            for index, token in enumerate(txtAnalysis['tokens']):
                if token in entType:
                    parsed_tokens[index] = entType
                else:
                    parsed_tokens[index] = token
        return parsed_tokens
    def is_float(s):
        """Custom function needed for detecting float numbers"""
        try:
            float(s)
            return True
        except ValueError:
            return False
    def sentimentAnalysis(txt):
        """Simple function to detect sentiment of a sentence"""
        sentimentArray = []
        for sentence in list(txt):
            vs = analyzer.polarity_scores(sentence.text)
            sentimentArray.append((sentence,vs))
        return sentimentArray
    
    #set txt to unicode and lower case
    txt = txt.lower()
    # create English stop words list
    en_stop = get_stop_words('en')
    
    parser = English()
    #Convert to SPACEY text
    doc = nlp(txt)
    tokens = parser(txt)
    txtAnalysis = {
        'rawText':txt,
        'abstract':False,
        'conclusion': False,
        'references': False,
        'tokens': [token.orth_.lower() for token in tokens if not token.orth_.isspace() if not token.orth_.lower() in en_stop],
        'sents' : list(doc.sents),
        'lemma' : [token.lemma_.lower() for token in tokens if not token.orth_.isspace()],
        'entities': show_ents(doc.ents),
        'parsed_entities' : False
    }
    #We store the different sections if I can parse them
    if len(a['rawText'].split('introduction'))>1:
        txtAnalysis['abstract'] = {'text': nlp(''.join(txtAnalysis['rawText'].split('introduction')[0])),"sent_analysis":False}
        txtAnalysis['abstract']['sent_analysis'] = sentimentAnalysis(txtAnalysis['abstract']['text'].sents)
    if len(regexp_tokenize(txtAnalysis['rawText'], pattern=('conclusion(.*?)references')))>0:
        txtAnalysis['conclusion'] = {'text': nlp(''.join(regexp_tokenize(txtAnalysis['rawText'], pattern=('conclusion(.*?)references')))),"sent_analysis":False}
        txtAnalysis['conclusion']['sent_analysis'] = sentimentAnalysis(txtAnalysis['abstract']['text'].sents)
    if len(txtAnalysis['rawText'].split('references'))>1:
        txtAnalysis['references'] = {'text': nlp(''.join(txtAnalysis['rawText'].split('references')[1])),"sent_analysis":False}
    txtAnalysis['parsed_entities'] = parse_ents(txtAnalysis)
    #We delete the numbers from the tokens
    txtAnalysis['tokens'] = [token for token in txtAnalysis['tokens'] if not token.isdigit() if not is_float(token) if not token in [',','.','[',']','-',';','(',')',':','=']]
    return txtAnalysis

In [135]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
def sentimentAnalysis(txt):
    sentimentArray = []
    for sentence in list(a['conclusion'].sents):
        vs = analyzer.polarity_scores(sentence.text)
        sentimentArray.append((sentence,vs))
    return sentimentArray

In [134]:

analyzer = SentimentIntensityAnalyzer()

# list(a['conclusion'].sents)[0].text
for sentence in list(a['conclusion'].sents):
    vs = analyzer.polarity_scores(sentence.text)
#     print "\n"
#     print("{:-<65} {}".format(sentence, str(vs)))
    print vs

{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}
{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}
{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}
{'neg': 0.0, 'neu': 0.903, 'pos': 0.097, 'compound': 0.4215}
{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}
{'neg': 0.0, 'neu': 0.939, 'pos': 0.061, 'compound': 0.0772}
{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}
{'neg': 0.09, 'neu': 0.738, 'pos': 0.172, 'compound': 0.25}
{'neg': 0.0, 'neu': 0.828, 'pos': 0.172, 'compound': 0.6597}
{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}
{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}
{'neg': 0.123, 'neu': 0.636, 'pos': 0.241, 'compound': 0.3818}
{'neg': 0.0, 'neu': 0.417, 'pos': 0.583, 'compound': 0.6369}


In [149]:
sents = sentimentAnalysis(a['conclusion'].sents)
for sent in sents:
    if sent[1]['pos']>0.2:
        print sent[0]
        print "\n\n"
        print sent[1]

our ap-
proach obtains more reasonable articulation patterns and is better
in solving the double counting problem.



{'neg': 0.123, 'neu': 0.636, 'pos': 0.241, 'compound': 0.3818}
best viewed in color.





{'neg': 0.0, 'neu': 0.417, 'pos': 0.583, 'compound': 0.6369}


In [205]:
a = runTextAnalysisSpacey(text[0])

In [203]:
def sentimentAnalysis(txt):
    """Simple function to detect sentiment of a sentence"""
    sentimentArray = []
    for sentence in list(txt):
        vs = analyzer.polarity_scores(sentence.text)
        sentimentArray.append((sentence,vs))
    return sentimentArray

peyt = sentimentAnalysis(a['conclusion']['text'].sents)
print peyt

[(

this paper has proposed a multi-source deep model for
pose estimation., {'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}), (it non-linearly integrates three information
sources: appearance score, deformation and appearance
mixture type., {'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}), (these information sources are used for de-
scribing different aspects of the single modality data, which
is the image data in our pose estimation approach., {'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}), (exten-
sive experimental comparisons on three public benchmark
datasets show that the proposed model obviously improves
the pose estimation accuracy and outperforms the state of
the art., {'neg': 0.0, 'neu': 0.903, 'pos': 0.097, 'compound': 0.4215}), (since this model is a post-processing of informa-
tion sources, it is very ﬂexible in terms of integrating with
existing approaches that use different information sources,
features, or articulation models., {'neg': 0.0, 'neu': 1.0

In [113]:
list(a['conclusion'].sents)[2]

these information sources are used for de-
scribing different aspects of the single modality data, which
is the image data in our pose estimation approach.

# Main code: We load the pdfs, run the analytics on them.

In [188]:
files = loadTextData()
print files
print len(files)
text = []
textAnalytics = []
count = 0
for each in files:
    text.append(get_text(each))
    analysis = runTextAnalysisSpacey(text[count])
    textAnalytics.append(analysis)
    print count
    count = count + 1
print text

['10627391380307827290.pdf', '10871829039953032247.pdf', '11203799587472723180.pdf', '12474236712650393344.pdf', '12604916758840857978.pdf', '13187742643561364870.pdf', '13446248872976146902.pdf', '13911076262768119041.pdf', '14330626598474812204.pdf', '1451703473279047534.pdf', '14774739962596400760.pdf', '15677418431280773725.pdf', '16475329441669445062.pdf', '1712352881354258034.pdf', '17268489282982414918.pdf', '17554513562881542355.pdf', '2866321627490206847.pdf', '2973992380342580480.pdf', '3286245540879480275.pdf', '3905919677086105959.pdf', '394128052107362798.pdf', '4097536640488718528.pdf', '4520651819365506698.pdf', '5700939689460967442.pdf', '6558128843129001214.pdf', '6902252272651036995.pdf', '7106912543889350821.pdf', '74307944265552935.pdf', '8255440757806230750.pdf', '8535617555901469791.pdf', '9056558950588383944.pdf', 'document.pdf', 'Multimodal Deep Learning.pdf', 'test.pdf', 'test1.pdf', 'test2.pdf']
36
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23


In [209]:
for doc in textAnalytics:
    if doc['abstract'] != False:
        for sent in doc['abstract']['sent_analysis']:
            if sent[1] > 0.2:
                print sent
        

(multi-source deep learning for human pose estimation

wanli ouyang xiao chu xiaogang wang

department of electronic engineering, the chinese university of hong kong

wlouyang@ee.cuhk.edu.hk, xgwang@ee.cuhk.edu.hk

abstract

visual appearance score, appearance mixture type and
deformation are three important information sources for
human pose estimation., {'neg': 0.0, 'neu': 0.959, 'pos': 0.041, 'compound': 0.2023})
(this paper proposes to build a
multi-source deep model in order to extract non-linear
representation from these different aspects of information
sources. with the deep model, the global,, {'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0})
(high-order hu-
man body articulation patterns in these information sources
are extracted for pose estimation., {'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0})
(the task for estimat-
ing body locations and the task for human detection are
jointly learned using a uniﬁed deep model., {'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound

## TFIDF

In [25]:
tfidf = calculateTFidf(textAnalytics,'lemmatizedVocab')
testQuery = textAnalytics[0]['lemmatizedVocab']
#testQuery =['deep','learning','machine','speech']
#Create dictionary and transform to tf_idf vector
testQuery_doc_bow = tfidf['dict'].doc2bow(testQuery)
#print testQuery_doc_bow
#print testQuery_doc_bow
# test_tf_idf = calculateTFidf(testQuery,False)
testQuery_tf_idf = tfidf['tf_idf'][testQuery_doc_bow]
vector = tfidf['sims'][testQuery_tf_idf]
print vector

[ 1.00000012  0.03390312  0.05097634  0.03979823  0.0481088   0.03399898
  0.04389757  0.03610792  0.05845335  0.04130838  0.03907927  0.06473102
  0.07154746  0.05932158  0.0378841   0.08124913  0.04131949  0.04432919
  0.05244407  0.04781858  0.00496908  0.05387572  0.0481088   0.05585184
  0.03395354  0.01902193  0.02988975]


In [None]:
## WORD TO VECT FOR DOCUMENTS

In [270]:
word2vecDoc = word2VectDocumentTraining(textAnalytics,'tokens')
testVector = word2vecDoc.infer_vector(textAnalytics[0]['tokens'])
word2vecDoc.docvecs.most_similar([testVector])

[('doc_0', 0.8861734867095947),
 ('doc_19', 0.8507816195487976),
 ('doc_3', 0.8429484963417053),
 ('doc_25', 0.8331794738769531),
 ('doc_21', 0.8325570821762085),
 ('doc_8', 0.8274330496788025),
 ('doc_20', 0.8213350772857666),
 ('doc_17', 0.8203819394111633),
 ('doc_12', 0.8165305256843567),
 ('doc_23', 0.8138983249664307)]

In [259]:
## Get The topics

In [271]:
ldaFunction = get_lda(textAnalytics,'tokens')

In [272]:
topics = get_topics(ldaFunction,10,4)
print topics

[(0, u'0.010*"domain" + 0.009*"cnn" + 0.007*"image" + 0.007*"e"'), (1, u'0.000*"learning" + 0.000*"deep" + 0.000*"speech" + 0.000*"processing"'), (2, u'0.011*"features" + 0.011*"audio" + 0.010*"data" + 0.009*"deep"'), (3, u'0.015*"model" + 0.013*"image" + 0.011*"images" + 0.009*"neural"'), (4, u'0.016*"text" + 0.010*"image" + 0.009*"images" + 0.008*"language"'), (5, u'0.009*"view" + 0.009*"cca" + 0.008*"correlation" + 0.007*"learning"'), (6, u'0.016*"learning" + 0.012*"speech" + 0.011*"\u201c" + 0.011*"\u201d"'), (7, u'0.021*"speech" + 0.021*"learning" + 0.020*"deep" + 0.016*"processing"'), (8, u'0.012*"feedback" + 0.006*"e" + 0.006*"n" + 0.005*"t"')]


In [262]:
lda_display = LDAvis('20top_100.gensim')
pyLDAvis.display(lda_display)

Dictionary(17809 unique tokens: [u'i|W', u'Andreas', u'LATEX', u'nunnery', u'tweenthelastlayerofthenetworkandthesoftmaxlayer']...)
LdaModel(num_terms=16675, num_topics=9, decay=0.5, chunksize=2000)


IndexError: index 16675 is out of bounds for axis 1 with size 16675

In [89]:
lda_display = LDAvis('20top_100Tok.gensim')
pyLDAvis.display(lda_display)

Dictionary(14280 unique tokens: [u'orthogon', u'fig-', u'statistical/machin', u'l.a.n.', u'yellow']...)
LdaModel(num_terms=14280, num_topics=9, decay=0.5, chunksize=2000)


In [15]:
a = nltk.Text(textAnalytics[0]['tokens'])

In [16]:
asd = a.collocations()

pose estimation; information sources; ... ...; deep model; mixture
type; mixture types; shown Fig; body locations; mod- els; human pose;
body parts; Best viewed; IEEE Trans; viewed color; hidden layers; high
response; 46.6 83.1; 85.8 76.5; L.leg U.arm; Method Torso


In [38]:
asd = ngrams(a, 3)

In [18]:
for each in textAnalytics[0]['tokens']:
    if each is not each.isdigit():
        print "####"
        print each

####
Multi-source
####
Deep
####
Learning
####
for
####
Human
####
Pose
####
Estimation
####
Wanli
####
Ouyang
####
Xiao
####
Chu
####
Xiaogang
####
Wang
####
Department
####
Electronic
####
Engineering
####
The
####
Chinese
####
University
####
Hong
####
Kong
####
wlouyang
####
ee.cuhk.edu.hk
####
xgwang
####
ee.cuhk.edu.hk
####
Abstract
####
Visual
####
appearance
####
score
####
appearance
####
mixture
####
type
####
and
####
deformation
####
are
####
three
####
important
####
information
####
sources
####
for
####
human
####
pose
####
estimation
####
This
####
paper
####
proposes
####
build
####
multi-source
####
deep
####
model
####
order
####
extract
####
non-linear
####
representation
####
from
####
these
####
different
####
aspects
####
information
####
sources
####
With
####
the
####
deep
####
model
####
the
####
global
####
high-order
####
hu-
####
man
####
body
####
articulation
####
patterns
####
these
####
information
####
sources
####
are
####
extracted
####
for
####
pose

In [20]:
a ="60.3"
a.isdigit()

  def _ipython_display_formatter_default(self):
  def _formatters_default(self):
  def _deferred_printers_default(self):
  def _singleton_printers_default(self):
  def _type_printers_default(self):
  def _singleton_printers_default(self):
  def _type_printers_default(self):
  def _deferred_printers_default(self):


False

In [40]:
for grams in asd:
    print grams

(u'Multi-source', u'Deep', u'Learning')
(u'Deep', u'Learning', u'for')
(u'Learning', u'for', u'Human')
(u'for', u'Human', u'Pose')
(u'Human', u'Pose', u'Estimation')
(u'Pose', u'Estimation', u'Wanli')
(u'Estimation', u'Wanli', u'Ouyang')
(u'Wanli', u'Ouyang', u'Xiao')
(u'Ouyang', u'Xiao', u'Chu')
(u'Xiao', u'Chu', u'Xiaogang')
(u'Chu', u'Xiaogang', u'Wang')
(u'Xiaogang', u'Wang', u'Department')
(u'Wang', u'Department', u'Electronic')
(u'Department', u'Electronic', u'Engineering')
(u'Electronic', u'Engineering', u'The')
(u'Engineering', u'The', u'Chinese')
(u'The', u'Chinese', u'University')
(u'Chinese', u'University', u'Hong')
(u'University', u'Hong', u'Kong')
(u'Hong', u'Kong', u'wlouyang')
(u'Kong', u'wlouyang', u'ee.cuhk.edu.hk')
(u'wlouyang', u'ee.cuhk.edu.hk', u'xgwang')
(u'ee.cuhk.edu.hk', u'xgwang', u'ee.cuhk.edu.hk')
(u'xgwang', u'ee.cuhk.edu.hk', u'Abstract')
(u'ee.cuhk.edu.hk', u'Abstract', u'Visual')
(u'Abstract', u'Visual', u'appearance')
(u'Visual', u'appearance', u'score'

In [38]:
# def tokens_to_root(token):
#     """
#     Walk up the syntactic tree, collecting tokens to the root of the given `token`.
#     :param token: Spacy token
#     :return: list of Spacy tokens
#     """
#     tokens_to_r = []
#     while token.head is not token:
#         tokens_to_r.append(token)
#         token = token.head
#         tokens_to_r.append(token)

#     return tokens_to_r

# # For every token in document, print it's tokens to the root
# for token in doc:
#     print('{} --> {}'.format(token, tokens_to_root(token)))

# # Print dependency labels of the tokens
# for token in doc:
#     print('-> '.join(['{}-{}'.format(dependent_token, dependent_token.dep_) for dependent_token in tokens_to_root(token)]))

# doc = nlp.make_doc(text[0])

# tokens_to_root(doc[5])

doc = par
for token in doc:
    print(token, ',', token.prob)

(Multi, ',', -19.579313278198242)
(-, ',', -5.202415943145752)
(source, ',', -8.812856674194336)
(Deep, ',', -19.579313278198242)
(Learning, ',', -19.579313278198242)
(for, ',', -4.91396951675415)
(Human, ',', -11.173282623291016)
(Pose, ',', -19.579313278198242)
(Estimation, ',', -19.579313278198242)
(

, ',', -4.458347320556641)
(Wanli, ',', -19.579313278198242)
(Ouyang, ',', -19.579313278198242)
(Xiao, ',', -19.579313278198242)
(Chu, ',', -19.579313278198242)
(Xiaogang, ',', -19.579313278198242)
(Wang, ',', -19.579313278198242)
(

, ',', -4.458347320556641)
(Department, ',', -11.11874008178711)
(of, ',', -4.1284637451171875)
(Electronic, ',', -19.579313278198242)
(Engineering, ',', -19.579313278198242)
(,, ',', -3.3914804458618164)
(The, ',', -5.774222373962402)
(Chinese, ',', -10.189560890197754)
(University, ',', -10.51649284362793)
(of, ',', -4.1284637451171875)
(Hong, ',', -19.579313278198242)
(Kong, ',', -19.579313278198242)
(

, ',', -4.458347320556641)
(wlouyang@ee.cuhk.edu.h

In [48]:
parser = English()
doc = nlp(text[0])
sents = list(doc.sents)

In [44]:
for sent in doc.sents:
    print "###"
    print sent

###
Multi-source Deep Learning for Human Pose Estimation

Wanli Ouyang Xiao Chu Xiaogang Wang

Department of Electronic Engineering, The Chinese University of Hong Kong

wlouyang@ee.cuhk.edu.hk, xgwang@ee.cuhk.edu.hk

Abstract

Visual appearance score, appearance mixture type and
deformation are three important information sources for
human pose estimation.
###
This paper proposes to build a
multi-source deep model in order to extract non-linear
representation from these different aspects of information
sources.
###
With the deep model, the global, high-order hu-
man body articulation patterns in these information sources
are extracted for pose estimation.
###
The task for estimat-
ing body locations and the task for human detection are
jointly learned using a uniﬁed deep model.
###
The proposed
approach can be viewed as a post-processing of pose esti-
mation results and can ﬂexibly integrate with existing meth-
ods by taking their information sources as input.
###
By extract-
ing the 

In [29]:
from nltk import ngrams
def runTextAnalysisSpacey(txt):
    def show_ents(ents):
        """Return an entity array for a document"""
        entitiesArray = []
#         print ("Print first entity ........")
        entities = list(ents)
        for entity in entities:
            entitiesArray.append((entity.label_,' '.join(t.orth_ for t in entity)))
#         if entities:
# #             print(entities[0].label_,' '.join(t.orth_ for t in entities[0]))
        return entitiesArray
    def parse_ents(txtAnalysis):
        """Parses entities in text into entity name"""
        all_ents = []
        parsed_tokens = txtAnalysis['tokens'][:]
        for entities in txtAnalysis['entities']:
            all_ents.append(entities[0])
        all_ents = set(all_ents)

        for eachEntType in all_ents:
            entType = set([entity[1] for entity in txtAnalysis['entities'] if entity[0] == eachEntType])
#             print eachEntType
#             print entType
            for index, token in enumerate(txtAnalysis['tokens']):
                if token in entType:
                    parsed_tokens[index] = entType
                else:
                    parsed_tokens[index] = token
        return parsed_tokens
    def is_float(s):
        """Custom function needed for detecting float numbers"""
        try:
            float(s)
            return True
        except ValueError:
            return False
            
    # create English stop words list
    en_stop = get_stop_words('en')
#     # Create p_stemmer of class PorterStemmer
#     p_stemmer = SnowballStemmer('english')
    
#     wnl = nltk.WordNetLemmatizer()
#     sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
#     txtAnalysis = {
#         'rawText':txt,
#         'abstract': txt.lower().split('introduction')[0].split('abstract'),
#         'tokens': [i for i in nltk.word_tokenize(txt) if not i.isdigit() if len(i)>2],
#         'stopped_tokens':[i for i in nltk.word_tokenize(txt) if not i.isdigit() if not i in en_stop],
#         'stemmed_tokens':0,
#         'num_words': 0,
#         'words': 0,
#         'vocab': 0,
#         'lemmatizedVocab': 0,
#         'senteces': 0
#     }  
#     txtAnalysis['stemmed_tokens'] = [p_stemmer.stem(i) for i in txtAnalysis['stopped_tokens']]
#     txtAnalysis['num_words'] = len(txtAnalysis['tokens'])
#     txtAnalysis['words'] = [w for w in txtAnalysis['stopped_tokens']]
#     txtAnalysis['vocab'] = sorted(set(txtAnalysis['words']))
#     txtAnalysis['lemmatizedVocab'] = [wnl.lemmatize(t) for t in txtAnalysis['vocab']]
#     txtAnalysis['sentences'] = sent_tokenizer.tokenize(txt)
#     #convert to nltk Text
#     text = nltk.Text(txtAnalysis['tokens'])
#     #Collocations are very interesting but just prints text to screen need to retrieve this somehow.
#     #collocations = text.collocations()
    
    parser = English()
    #Convert to SPACEY text
    doc = nlp(txt)
    tokens = parser(txt)
    txtAnalysis = {
        'tokens': [token.orth_.lower() for token in tokens if not token.orth_.isspace() if not token.orth_.lower() in en_stop],
        'sents' : list(doc.sents),
        'lemma' : [token.lemma_.lower() for token in tokens if not token.orth_.isspace()],
        'entities': show_ents(doc.ents),
        'parsed_entities' : False
    }
    txtAnalysis['parsed_entities'] = parse_ents(txtAnalysis)
    #We delete the numbers from the tokens
    txtAnalysis['tokens'] = [token for token in txtAnalysis['tokens'] if not token.isdigit() if not is_float(token) if not token in [',','.','[',']','-',';','(',')',':','=']]
    return txtAnalysis

In [273]:
a=runTextAnalysisSpacey(text[0])

In [274]:
a['tokens']

[u'multi',
 u'source',
 u'deep',
 u'learning',
 u'human',
 u'pose',
 u'estimation',
 u'wanli',
 u'ouyang',
 u'xiao',
 u'chu',
 u'xiaogang',
 u'wang',
 u'department',
 u'electronic',
 u'engineering',
 u'chinese',
 u'university',
 u'hong',
 u'kong',
 u'wlouyang@ee.cuhk.edu.hk',
 u'xgwang@ee.cuhk.edu.hk',
 u'abstract',
 u'visual',
 u'appearance',
 u'score',
 u'appearance',
 u'mixture',
 u'type',
 u'deformation',
 u'three',
 u'important',
 u'information',
 u'sources',
 u'human',
 u'pose',
 u'estimation',
 u'paper',
 u'proposes',
 u'build',
 u'multi',
 u'source',
 u'deep',
 u'model',
 u'order',
 u'extract',
 u'non',
 u'linear',
 u'representation',
 u'different',
 u'aspects',
 u'information',
 u'sources',
 u'deep',
 u'model',
 u'global',
 u'high',
 u'order',
 u'hu-',
 u'man',
 u'body',
 u'articulation',
 u'patterns',
 u'information',
 u'sources',
 u'extracted',
 u'pose',
 u'estimation',
 u'task',
 u'estimat-',
 u'ing',
 u'body',
 u'locations',
 u'task',
 u'human',
 u'detection',
 u'jointly',

In [225]:
"03.523".isdigit()

False

In [263]:
print get_stop_words('en')

[u'a', u'about', u'above', u'after', u'again', u'against', u'all', u'am', u'an', u'and', u'any', u'are', u"aren't", u'as', u'at', u'be', u'because', u'been', u'before', u'being', u'below', u'between', u'both', u'but', u'by', u"can't", u'cannot', u'could', u"couldn't", u'did', u"didn't", u'do', u'does', u"doesn't", u'doing', u"don't", u'down', u'during', u'each', u'few', u'for', u'from', u'further', u'had', u"hadn't", u'has', u"hasn't", u'have', u"haven't", u'having', u'he', u"he'd", u"he'll", u"he's", u'her', u'here', u"here's", u'hers', u'herself', u'him', u'himself', u'his', u'how', u"how's", u'i', u"i'd", u"i'll", u"i'm", u"i've", u'if', u'in', u'into', u'is', u"isn't", u'it', u"it's", u'its', u'itself', u"let's", u'me', u'more', u'most', u"mustn't", u'my', u'myself', u'no', u'nor', u'not', u'of', u'off', u'on', u'once', u'only', u'or', u'other', u'ought', u'our', u'ours', u'ourselves', u'out', u'over', u'own', u'same', u"shan't", u'she', u"she'd", u"she'll", u"she's", u'should', u"