In [1]:
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
from pdfminer.converter import XMLConverter, HTMLConverter, TextConverter
from pdfminer.layout import LAParams
from cStringIO import StringIO

from xml.dom import minidom

import numpy as np
import math

import string
import re

import functools

In [2]:
# Path of the PDF whose first page is analysed below.
# Another sample that was tried earlier (kept for reference, not used):
#   '/Users/HWLee/Desktop/papers/theory of relativity/0108042v1-60df3449.pdf'
# NOTE(review): absolute local path -- adjust to your own machine before running.
pdf_path = '/Users/HWLee/Desktop/papers/exoplanet/0209382v1-1a0ebf0b.pdf'

## Defining classes

In [3]:
class Box(object):
    """Rectangular region described by a DOM node of the XML layout dump.

    The geometry is read from the node's ``bbox`` attribute, a comma-separated
    string whose first four fields are used as ``x0,y0,x1,y1``; the identifier
    is read from the ``id`` attribute.

    Fallbacks when information is missing:
      * no ``id`` attribute  -> ``getId()`` returns ``-1``
      * no ``bbox`` attribute -> size and center are both ``[0, 0]``
    """

    def __init__(self, xml_obj):
        """Wrap ``xml_obj`` (a DOM node) and cache its id, size and center."""
        self.xmlObj = xml_obj

        # ``attributes`` is None on non-element nodes and an empty (falsy)
        # map on elements without attributes; both mean "no id".
        attributes = xml_obj.attributes
        if attributes and 'id' in attributes.keys():
            self.objId = attributes['id'].value
        else:
            self.objId = -1

        boxSize = self.calBoxSize(xml_obj)
        self.width = boxSize[0]
        self.height = boxSize[1]
        self.center = self.calBoxCenter(xml_obj)

    def getWidth(self):
        """Return the box width (x1 - x0)."""
        return self.width

    def getHeight(self):
        """Return the box height (y1 - y0)."""
        return self.height

    def getBoxCenter(self):
        """Return the box center as a 2-element numpy array ``[x, y]``."""
        return self.center

    def getId(self):
        """Return the value of the ``id`` attribute, or ``-1`` if absent."""
        return self.objId

    def getBoxSize(self):
        """Return ``[width, height]`` as a numpy array."""
        return np.array([self.getWidth(), self.getHeight()])

    def getXMLObject(self):
        """Return the wrapped DOM node."""
        return self.xmlObj

    @staticmethod
    def _parseBBox(xml_obj):
        """Return the ``bbox`` attribute as a float array ``[x0, y0, x1, y1]``.

        Returns ``None`` when the node has no attributes at all (e.g. a
        document or text node) or no ``bbox`` attribute.
        """
        attributes = xml_obj.attributes
        if not attributes or 'bbox' not in attributes.keys():
            return None
        pointsValues = attributes['bbox'].value.split(',')
        # Only the first four fields are meaningful; extra fields are ignored.
        return np.array([float(pointsValues[i]) for i in range(4)])

    @staticmethod
    def calBoxSize(xml_obj):
        """Return ``[width, height]`` of the node's bbox, or ``[0, 0]`` if none."""
        bbox = Box._parseBBox(xml_obj)
        if bbox is None:
            return np.array([0, 0])
        return bbox[2:] - bbox[:2]

    @staticmethod
    def calBoxCenter(xml_obj):
        """Return the center ``[x, y]`` of the node's bbox, or ``[0, 0]`` if none."""
        bbox = Box._parseBBox(xml_obj)
        if bbox is None:
            return np.array([0, 0])
        return (bbox[2:] + bbox[:2]) / 2

In [4]:
class Text(Box):
    """A ``<text>`` element: one glyph run with optional font information."""

    def __init__(self, text_xml_obj):
        """Read the ``font`` and ``size`` attributes of ``text_xml_obj``.

        Missing attributes leave the defaults: ``''`` for the font and ``-1``
        for the size.
        """
        super(Text, self).__init__(text_xml_obj)
        self.textFont = ''
        self.textSize = -1

        attributes = text_xml_obj.attributes
        if attributes:
            names = attributes.keys()
            if 'font' in names:
                self.textFont = attributes['font'].value
            if 'size' in names:
                # Stored exactly as found in the XML (a string); callers
                # convert it with float() when they need a number.
                self.textSize = attributes['size'].value

    def getTextFont(self):
        """Return the ``font`` attribute value, or ``''`` if absent."""
        return self.textFont

    def getTextSize(self):
        """Return the raw ``size`` attribute value, or ``-1`` if absent."""
        return self.textSize

    def getInnerXml(self):
        """Return the character data of the first child node as ``str``.

        Returns ``''`` when the element has no children or when the first
        child carries no ``nodeValue`` (so the literal text ``'None'`` is
        never produced).
        """
        firstChild = self.xmlObj.firstChild
        if firstChild is None or firstChild.nodeValue is None:
            return ''
        return str(firstChild.nodeValue)

In [5]:
class TextLine(Box):
    """A ``<textline>`` element: an ordered list of ``Text`` children."""

    def __init__(self, textLine_xml_obj):
        """Wrap every descendant ``<text>`` element in a ``Text`` object."""
        super(TextLine, self).__init__(textLine_xml_obj)
        # A real list (not a lazy iterator) because callers use len() and
        # indexing on it.
        self.texts = [Text(node) for node in textLine_xml_obj.getElementsByTagName('text')]

    def getTexts(self):
        """Return the list of ``Text`` objects in document order."""
        return self.texts

    def getAvgFontSize(self):
        """Return the mean font size of the line, truncated to 3 decimals.

        Only texts with a positive size take part in the average (texts
        without a ``size`` attribute report ``-1``).  Returns ``-1`` when the
        line has no text with a positive size.
        """
        fontsizes = [float(text.getTextSize()) for text in self.texts]
        fontsizes = [size for size in fontsizes if size > 0]
        if not fontsizes:
            return -1
        # int() truncates (does not round) to three decimal places.
        return float(int(sum(fontsizes) / len(fontsizes) * 1000)) / 1000

    def getLineText(self):
        """Return the concatenated text of all children (``''`` if none)."""
        return ''.join(text.getInnerXml() for text in self.texts)

In [6]:
class TextBox(Box):
    """A ``<textbox>`` element: an ordered list of ``TextLine`` children."""

    def __init__(self, textBox_xml_obj):
        """Wrap every descendant ``<textline>`` element in a ``TextLine``."""
        super(TextBox, self).__init__(textBox_xml_obj)
        self.textLines = [TextLine(node) for node in textBox_xml_obj.getElementsByTagName('textline')]

    def getTextLines(self):
        """Return the list of ``TextLine`` objects in document order."""
        return self.textLines

    def getAvgFontSize(self):
        """Return the mean of the lines' average font sizes, truncated to 3 decimals.

        Lines reporting a non-positive size are ignored.  Returns ``-1`` when
        no line has a positive size.
        """
        fontsizes = [float(textline.getAvgFontSize()) for textline in self.textLines]
        fontsizes = [size for size in fontsizes if size > 0]
        if not fontsizes:
            return -1
        # int() truncates (does not round) to three decimal places.
        return float(int(sum(fontsizes) / len(fontsizes) * 1000)) / 1000

    def merge_box(self, box):
        """Append the text lines of ``box`` to this box (in place).

        Only the line list grows; the geometry cached from this box's own
        ``bbox`` is not recomputed.
        """
        # Copy first so that merging a box into itself cannot loop forever.
        self.textLines.extend(list(box.getTextLines()))

    def toString(self):
        """Return the concatenated text of all lines (``''`` if none)."""
        return ''.join(textline.getLineText() for textline in self.textLines)

In [7]:
class Page(Box):
    """A ``<page>`` element and the text boxes found on it."""

    def __init__(self, page_xml_obj):
        """Collect the page's ``<textbox>`` elements as ``TextBox`` objects.

        Only the leading part of the ``<textbox>`` elements is kept; the
        trailing ``len // 2`` elements are dropped.
        NOTE(review): presumably the dropped half are the duplicate
        ``<textbox>`` references that pdfminer emits inside ``<layout>`` --
        confirm against the XML dump.
        """
        super(Page, self).__init__(page_xml_obj)
        textBoxes = page_xml_obj.getElementsByTagName('textbox')
        # Compute the number to keep explicitly: a negative-stop slice such as
        # ``[:-(n // 2)]`` turns into ``[:-0]`` (i.e. empty) when n == 1.
        keepCount = len(textBoxes) - len(textBoxes) // 2
        self.textBoxes = [TextBox(node) for node in textBoxes[:keepCount]]

    def getTextBoxes(self):
        """Return the list of ``TextBox`` objects kept for this page."""
        return self.textBoxes

## Extracting the first page

In [8]:
# Open the PDF in binary mode.  open() is preferred over calling the file()
# constructor directly.  The handle must stay open while pages_iterator is
# consumed in the next cell.
fp = open(pdf_path, 'rb')

# The converter writes its XML rendering of each processed page into retstr.
rsrcmgr = PDFResourceManager()
retstr = StringIO()
codec = 'utf-8'
laparams = LAParams()
device = XMLConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
interpreter = PDFPageInterpreter(rsrcmgr, device)

# NOTE(review): presumably a lazy iterator over the document's pages -- confirm.
pages_iterator = PDFPage.get_pages(fp)

In [9]:
# Render only the first page and capture just the XML written for it.
# first_page stays None if the document yields no pages.
first_page = None
for page in pages_iterator:
    # Remember where the buffer ends before rendering, so that only the text
    # appended by this page is read back afterwards.
    read_position = retstr.tell()
    interpreter.process_page(page)
    retstr.seek(read_position, 0)
    first_page = retstr.read()
    # Drop every character outside string.printable so the fragment can be
    # handed to the XML parser.  The join always produces a string.
    first_page = ''.join(ch for ch in first_page if ch in string.printable)
    break

# Nothing else is read from the PDF after the first page; release the handle.
fp.close()

## Parsing xml structure

In [10]:
# Parse the captured XML fragment and wrap its first <page> element.
first_page_dom = minidom.parseString(first_page)
first_page_xml = first_page_dom.getElementsByTagName('page')[0]
first_page_info = Page(first_page_xml)

## Define the center/edge alignment check

In [11]:
def _is_aligned_center_or_edge(box_obj=None, tol=0, page_centerX=0):
    if box_obj:
        if abs(box_obj.getBoxCenter()[0] - page_centerX) < tol: return True
        if abs(box_obj.getBoxCenter()[0] - box_obj.getBoxSize()[0]/2) < tol: return True
        if abs(box_obj.getBoxCenter()[0] + box_obj.getBoxSize()[0]/2 - page_centerX*2) < tol: return True
    
    return False

## Define the title-candidate filter

In [12]:
def _might_be_title(textbox, page_centerX, max_lines=3, tol=20):
    """Heuristically decide whether ``textbox`` could be the paper title.

    A candidate must have at most ``max_lines`` text lines and be horizontally
    centered or edge-aligned within ``tol`` (see
    ``_is_aligned_center_or_edge``).

    Parameters
    ----------
    textbox : object exposing ``getTextLines()``, ``getBoxCenter()`` and
        ``getBoxSize()``; a falsy value yields ``False``.
    page_centerX : x coordinate of the page center.
    max_lines : maximum number of lines a title may span (default 3).
    tol : alignment tolerance, in the same units as the bbox coordinates
        (default 20).
    """
    if not textbox:
        return False
    if len(textbox.getTextLines()) > max_lines:
        return False
    return _is_aligned_center_or_edge(box_obj=textbox, tol=tol, page_centerX=page_centerX)

In [13]:
centerX = first_page_info.getBoxCenter()[0]
textboxes = first_page_info.getTextBoxes()

# One boolean per text box, then keep only the boxes flagged as title-like.
is_title_like = functools.partial(_might_be_title, page_centerX=centerX)
might_be_title = [is_title_like(textbox) for textbox in textboxes]
title_cands = [textbox for flag, textbox in zip(might_be_title, textboxes) if flag]

# Debug view: ((text, average font size), font of the first glyph).
dbg_set = [
    ((cand.toString(), cand.getAvgFontSize()),
     cand.getTextLines()[0].getTexts()[0].getTextFont())
    for cand in title_cands
]
dbg_set

[((' \n', 5.0), u'Times-Roman'),
 ((' \n \n', 5.0), u'Times-Roman'),
 (('What can exoplanets tell us about our Solar System?\n', 10.759),
  u'XUOWSW+CMBX12'),
 (('Our analysis suggests that Jupiter is more typical than indicated by previous\nanalyses, including our own (LG). For example, in Fig. 2, our  = 1.6 slope is\nslightly steeper than the  = 1.5 found in LG and is steeper than the   1.0\n',
   10.832),
  u'YDZIUJ+CMR10'),
 (('1\n', 10.909), u'YDZIUJ+CMR10')]

## Aggregating adjacent elements that share the same font size and font type

In [14]:
def _have_same_fontsize_n_fonttype(a, b):
    afont = a.getTextLines()[0].getTexts()[0].getTextFont()
    bfont = b.getTextLines()[0].getTexts()[0].getTextFont()
    if afont != bfont: return False
    if a.getAvgFontSize() != b.getAvgFontSize(): return False
    
    return True

def _get_sets_to_be_aggregated(title_cands):
    """Find runs of neighbouring candidates that share font and font size.

    Adjacent candidates are compared pairwise with
    ``_have_same_fontsize_n_fonttype``; every maximal run of matching
    neighbours becomes one group.

    Returns
    -------
    list of ``(start_index, count)`` tuples, one per group, in increasing
    order of ``start_index``.  ``count`` is the number of candidates in the
    group (always >= 2).  The list is empty when no two neighbours match,
    including when there are fewer than two candidates.
    """
    groups = []
    run_start = None  # index of the first member of the run being scanned

    for idx in range(len(title_cands) - 1):
        if _have_same_fontsize_n_fonttype(title_cands[idx], title_cands[idx + 1]):
            if run_start is None:
                run_start = idx
        elif run_start is not None:
            # Candidate idx is the last member of the run that just ended.
            groups.append((run_start, idx - run_start + 1))
            run_start = None

    if run_start is not None:
        # The final run extends to the last candidate.
        groups.append((run_start, len(title_cands) - run_start))

    return groups

def _aggregate(title_cands, aggre_arrays):
    titles = title_cands
    if len(aggre_arrays) > 1:
        added_targets = reduce(lambda x,y: np.append(x[0], y[0]), aggre_arrays)
    else:
        added_targets = [ aggre_arrays[0][0] ]
    
    added_items = map(lambda x: np.r_[ x[0]+1:x[0]+x[1] ], aggre_arrays)
    remove_indices = reduce(lambda x,y: np.append(x, y), added_items)
    
    map(lambda target,items:
            map(lambda item: titles[target].merge_box(titles[item]), items), 
        added_targets, 
        added_items)
    
    need_be_removed = [False] * len(title_cands)
    for x in remove_indices: need_be_removed[x] = True
    titles = zip(titles, need_be_removed)
    
    return map(lambda x: x[0], filter(lambda x: not x[1], titles))

In [15]:
# Merge neighbouring candidates that share font and size, then show the text
# of what is left.  Note: this rebinds title_cands to the merged list.
aggre_arrays = _get_sets_to_be_aggregated(title_cands)
title_cands = _aggregate(title_cands, aggre_arrays)
[cand.toString() for cand in title_cands]

[' \n \n \n',
 'What can exoplanets tell us about our Solar System?\n',
 'Our analysis suggests that Jupiter is more typical than indicated by previous\nanalyses, including our own (LG). For example, in Fig. 2, our  = 1.6 slope is\nslightly steeper than the  = 1.5 found in LG and is steeper than the   1.0\n',
 '1\n']

## Sorting candidates by font size (largest first)

In [16]:
# Order a copy of the candidates by average font size, largest first, and
# display each candidate's text next to its size.
title_cands_sorted = list(title_cands)
title_cands_sorted.sort(key=lambda cand: cand.getAvgFontSize(), reverse=True)
[(cand.toString(), cand.getAvgFontSize()) for cand in title_cands_sorted]

[('1\n', 10.909),
 ('Our analysis suggests that Jupiter is more typical than indicated by previous\nanalyses, including our own (LG). For example, in Fig. 2, our  = 1.6 slope is\nslightly steeper than the  = 1.5 found in LG and is steeper than the   1.0\n',
  10.832),
 ('What can exoplanets tell us about our Solar System?\n', 10.759),
 (' \n \n \n', 5.0)]