In [105]:
from operator import itemgetter
import fitz
import json


def fonts(doc, granularity=False):
    """Extracts fonts and their usage in PDF documents.

    :param doc: PDF document to iterate through
    :type doc: <class 'fitz.fitz.Document'>
    :param granularity: also use 'font', 'flags' and 'color' to discriminate text
    :type granularity: bool

    :rtype: [(font_size, count), (font_size, count}], dict
    :return: most used fonts sorted by count, font style information
    """
    styles = {}
    font_counts = {}

    for page in doc:
        blocks = page.get_text("dict")["blocks"]
        for b in blocks:  # iterate through the text blocks
            if b['type'] == 0:  # block contains text
                for l in b["lines"]:  # iterate through the text lines
                    for s in l["spans"]:  # iterate through the text spans
                        if granularity:
                            identifier = "{0}_{1}_{2}_{3}".format(s['size'], s['flags'], s['font'], s['color'])
                            styles[identifier] = {'size': s['size'], 'flags': s['flags'], 'font': s['font'],
                                                  'color': s['color']}
                        else:
                            identifier = "{0}".format(s['size'])
                            styles[identifier] = {'size': s['size'], 'font': s['font']}

                        font_counts[identifier] = font_counts.get(identifier, 0) + 1  # count the fonts usage

    font_counts = sorted(font_counts.items(), key=itemgetter(1), reverse=True)

    if len(font_counts) < 1:
        raise ValueError("Zero discriminating fonts found!")

    return font_counts, styles


def font_tags(font_counts, styles):
    """Returns dictionary with font sizes as keys and tags as value.

    :param font_counts: (font_size, count) for all fonts occuring in document
    :type font_counts: list
    :param styles: all styles found in the document
    :type styles: dict

    :rtype: dict
    :return: all element tags based on font-sizes
    """
    p_style = styles[font_counts[0][0]]  # get style for most used font by count (paragraph)
    p_size = p_style['size']  # get the paragraph's size

    # sorting the font sizes high to low, so that we can append the right integer to each tag
    font_sizes = []
    for (font_size, count) in font_counts:
        font_sizes.append(float(font_size))
    font_sizes.sort(reverse=True)

    # aggregating the tags for each font size
    idx = 0
    size_tag = {}
    for size in font_sizes:
        idx += 1
        if size == p_size:
            idx = 0
            size_tag[size] = '<p>'
        if size > p_size:
            size_tag[size] = '<h{0}>'.format(idx)
        elif size < p_size:
            size_tag[size] = '<s{0}>'.format(idx)

    return size_tag


def headers_para(doc, size_tag):
    """Scrapes headers & paragraphs from PDF and return texts with element tags.

    :param doc: PDF document to iterate through
    :type doc: <class 'fitz.fitz.Document'>
    :param size_tag: textual element tags for each size
    :type size_tag: dict

    :rtype: list
    :return: texts with pre-prended element tags
    """
    header_para = []  # list with headers and paragraphs
    first = True  # boolean operator for first header
    previous_s = {}  # previous span

    for page in doc:
        blocks = page.get_text("dict")["blocks"]
        for b in blocks:  # iterate through the text blocks
            if b['type'] == 0:  # this block contains text

                # REMEMBER: multiple fonts and sizes are possible IN one block

                block_string = ""  # text found in block
                for l in b["lines"]:  # iterate through the text lines
                    for s in l["spans"]:  # iterate through the text spans
                        if s['text'].strip():  # removing whitespaces:
                            if first:
                                previous_s = s
                                first = False
                                block_string = size_tag[s['size']] + s['text']
                            else:
                                if s['size'] == previous_s['size']:

                                    if block_string and all((c == "|") for c in block_string):
                                        # block_string only contains pipes
                                        block_string = size_tag[s['size']] + s['text']
                                    if block_string == "":
                                        # new block has started, so append size tag
                                        block_string = size_tag[s['size']] + s['text']
                                    else:  # in the same block, so concatenate strings
                                        block_string += " " + s['text']

                                else:
                                    header_para.append(block_string)
                                    block_string = size_tag[s['size']] + s['text']

                                previous_s = s

                    # new block started, indicating with a pipe
                    block_string += "|"

                header_para.append(block_string)

    return header_para


def main():

    document = '/Users/yasminsarkhosh/Downloads/storage/IZXYGYNV/Azadi m.fl. - 2023 - ALL-IN A Local GLobal Graph-Based DIstillatioN Mo.pdf'
    doc = fitz.open(document)

    font_counts, styles = fonts(doc, granularity=False)

    size_tag = font_tags(font_counts, styles)

    elements = headers_para(doc, size_tag)

    with open("doc.json", 'w') as json_out:
        json.dump(elements, json_out)


if __name__ == '__main__':
    main()

['<h1>ALL-IN: A Local GLobal Graph-Based| DIstillatioN Model for Representation| Learning of Gigapixel Histopathology| Images With Application In Cancer Risk| Assessment|', '', '<p>Puria Azadi', '<s3>1', '<p>, Jonathan Suderman', '<s3>1', '<p>, Ramin Nakhli', '<s3>1', '<p>, Katherine Rich', '<s3>1', '<p>,| Maryam Asadi', '<s3>1', '<p>, Sonia Kung', '<s3>2', '<p>, Htoo Oo', '<s3>2', '<p>, Mira Keyes', '<s3>1', '<p>, Hossein Farahani', '<s3>1', '<p>,| Calum MacAulay', '<s3>3', '<p>, Larry Goldenberg', '<s3>2', '<p>, Peter Black', '<s3>2', '<p>, and Ali Bashashati', '<s3>1(', '<h3>B', '<s3>)|', '', '<s5>1', '<s1> University of British Columbia, Vancouver, BC, Canada| ali.bashashati@ubc.ca|', '<s5>2', '<s1> Vancouver Prostate Centre, Vancouver, BC, Canada|', '<s5>3', '<s1> BC Cancer Agency, Vancouver, BC, Canada|', '<s1>Abstract.  The utility of machine learning models in histopathology| image analysis for disease diagnosis has been extensively studied. How-| ever, eﬀorts to stratify patie

In [5]:
#!pip install pymupdf


In [6]:
import fitz

In [7]:
my_path = '/Users/yasminsarkhosh/Downloads/storage/IZXYGYNV/Azadi m.fl. - 2023 - ALL-IN A Local GLobal Graph-Based DIstillatioN Mo.pdf'
doc = fitz.open(my_path)

In [None]:
import PyPDF2
import re
import os

for foldername,subfolders,files in os.walk(r"C:/my_path"):
    for file in files:
        # open the pdf file
        object = PyPDF2.PdfFileReader(os.path.join(foldername,file))

        # get number of pages
        NumPages = object.getNumPages()

        # define keyterms
        String = "New York State Real Property Law"

        # extract text and do the search
        for i in range(0, NumPages):
            PageObj = object.getPage(i)
            print("this is page " + str(i)) 
            Text = PageObj.extractText() 
            # print(Text)
            ResSearch = re.search(String, Text)
            print(ResSearch)

In [26]:
for page in doc:
    text = page.get_text()
    print(text)

ALL-IN: A Local GLobal Graph-Based
DIstillatioN Model for Representation
Learning of Gigapixel Histopathology
Images With Application In Cancer Risk
Assessment
Puria Azadi1, Jonathan Suderman1, Ramin Nakhli1, Katherine Rich1,
Maryam Asadi1, Sonia Kung2, Htoo Oo2, Mira Keyes1, Hossein Farahani1,
Calum MacAulay3, Larry Goldenberg2, Peter Black2, and Ali Bashashati1(B)
1 University of British Columbia, Vancouver, BC, Canada
ali.bashashati@ubc.ca
2 Vancouver Prostate Centre, Vancouver, BC, Canada
3 BC Cancer Agency, Vancouver, BC, Canada
Abstract. The utility of machine learning models in histopathology
image analysis for disease diagnosis has been extensively studied. How-
ever, eﬀorts to stratify patient risk are relatively under-explored. While
most current techniques utilize small ﬁelds of view (so-called local fea-
tures) to link histopathology images to patient outcome, in this work we
investigate the combination of global (i.e., contextual) and local features
in a graph-based neural

In [27]:
output = page.get_text("blocks")
output

[(334.6109924316406,
  31.01311492919922,
  399.40130615234375,
  39.979515075683594,
  'ALL-IN\n775\n',
  0,
  0),
 (53.57623291015625,
  56.92424774169922,
  399.344970703125,
  230.27182006835938,
  '26. Son, B., Lee, S., Youn, H., Kim, E., Kim, W., Youn, B.: The role of tumor microen-\nvironment in therapeutic resistance. Oncotarget 8(3), 3933 (2017)\n27. Srinidhi, C.L., Ciga, O., Martel, A.L.: Deep neural network models for computa-\ntional histopathology: a survey. Med. Image Anal. 67, 101813 (2021)\n28. Tang, S., Chen, D., Bai, L., Liu, K., Ge, Y., Ouyang, W.: Mutual crf-gnn for few-\nshot learning. In: Proceedings of the IEEE/CVF Conference on Computer Vision\nand Pattern Recognition, pp. 2329–2339 (2021)\n29. Wetstein, S.C., et al.: Deep learning-based breast cancer grading and survival anal-\nysis on whole-slide histopathology images. Sci. Rep. 12(1), 1–12 (2022)\n30. Xu, K., Hu, W., Leskovec, J., Jegelka, S.: How powerful are graph neural networks?\narXiv preprint arXiv:1810

In [28]:
for page in doc:
    output = page.get_text("blocks")                   
    previous_block_id = 0 # Set a variable to mark the block id
    for block in output:
        if block[6] == 0: # We only take the text
            if previous_block_id != block[5]: # Compare the block number 
                print("\n")
                print(block[4])



Puria Azadi1, Jonathan Suderman1, Ramin Nakhli1, Katherine Rich1,
Maryam Asadi1, Sonia Kung2, Htoo Oo2, Mira Keyes1, Hossein Farahani1,
Calum MacAulay3, Larry Goldenberg2, Peter Black2, and Ali Bashashati1(B)



1 University of British Columbia, Vancouver, BC, Canada
ali.bashashati@ubc.ca
2 Vancouver Prostate Centre, Vancouver, BC, Canada
3 BC Cancer Agency, Vancouver, BC, Canada



Abstract. The utility of machine learning models in histopathology
image analysis for disease diagnosis has been extensively studied. How-
ever, eﬀorts to stratify patient risk are relatively under-explored. While
most current techniques utilize small ﬁelds of view (so-called local fea-
tures) to link histopathology images to patient outcome, in this work we
investigate the combination of global (i.e., contextual) and local features
in a graph-based neural network for patient risk stratiﬁcation. The pro-
posed network not only combines both ﬁne and coarse histological pat-
terns but also utilizes their in

In [11]:
#!pip install Unidecode

In [46]:
from unidecode import unidecode 

output = []

for page in doc:

    output += page.get_text("blocks")

previous_block_id = 0 # Set a variable to mark the block id

for block in output:

     if block[6] == 0: # We only take the text
          if previous_block_id != block[5]: # Compare the block number 
              print("\n")

          plain_text = unidecode(block[4])
          print(plain_text)

ALL-IN: A Local GLobal Graph-Based
DIstillatioN Model for Representation
Learning of Gigapixel Histopathology
Images With Application In Cancer Risk
Assessment



Puria Azadi1, Jonathan Suderman1, Ramin Nakhli1, Katherine Rich1,
Maryam Asadi1, Sonia Kung2, Htoo Oo2, Mira Keyes1, Hossein Farahani1,
Calum MacAulay3, Larry Goldenberg2, Peter Black2, and Ali Bashashati1(B)



1 University of British Columbia, Vancouver, BC, Canada
ali.bashashati@ubc.ca
2 Vancouver Prostate Centre, Vancouver, BC, Canada
3 BC Cancer Agency, Vancouver, BC, Canada



Abstract. The utility of machine learning models in histopathology
image analysis for disease diagnosis has been extensively studied. How-
ever, efforts to stratify patient risk are relatively under-explored. While
most current techniques utilize small fields of view (so-called local fea-
tures) to link histopathology images to patient outcome, in this work we
investigate the combination of global (i.e., contextual) and local features
in a graph-b

In [13]:
#!pip install pandas

Defaulting to user installation because normal site-packages is not writeable
[33mDEPRECATION: textract 1.6.5 has a non-standard dependency specifier extract-msg<=0.29.*. pip 24.1 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of textract or contact the author to suggest that they release a version with a conforming dependency specifiers. Discussion can be found at https://github.com/pypa/pip/issues/12063[0m[33m
[0m

In [73]:
import pandas as pd
df = pd.DataFrame(output, columns=["x0", "y0", "x1", "y1", "text", "block_no", "type"])
df.head(50)

Unnamed: 0,x0,y0,x1,y1,text,block_no,type
0,81.351791,52.889397,371.718781,138.980408,ALL-IN: A Local GLobal Graph-Based\nDIstillati...,0,0
1,57.673523,161.491257,394.792328,197.548752,"Puria Azadi1, Jonathan Suderman1, Ramin Nakhli...",1,0
2,108.621002,205.397125,344.299713,252.046631,"1 University of British Columbia, Vancouver, B...",2,0
3,81.91748,262.790161,371.039948,431.53241,Abstract. The utility of machine learning mode...,3,0
4,81.918381,435.911743,353.175232,456.658844,Keywords: Histopathology · Risk Assessment · G...,4,0
5,53.577,458.899597,147.77124,470.866669,1\nIntroduction\n,5,0
6,53.576962,480.973267,399.473389,564.84845,The examination of tissue and cells using micr...,6,0
7,53.576996,579.446106,399.331268,599.374817,Supplementary Information The online version c...,7,0
8,53.576946,607.915588,343.219025,632.050476,"c\n⃝ The Author(s), under exclusive license to...",8,0
9,39.402,31.013115,130.415466,39.979515,766\nP. Azadi et al.\n,0,0


In [49]:
df['text'] = df['text'].apply(lambda x: unidecode(x))
df = df.drop(df[df['type'] == '1' ].index)

In [51]:
block_dict = {}

page_num = 1

for page in doc: # Iterate all pages in the document
      file_dict = page.get_text('dict') # Get the page dictionary 
      block = file_dict['blocks'] # Get the block information
      block_dict[page_num] = block # Store in block dictionary
      page_num += 1 # Increase the page value by 1

In [52]:
import re 

spans = pd.DataFrame(columns=['xmin', 'ymin', 'xmax', 'ymax', 'text', 'tag'])

rows = []

for page_num, blocks in block_dict.items():
    for block in blocks:
        if block['type'] == 0:
            for line in block['lines']:
                for span in line['spans']:
                    xmin, ymin, xmax, ymax = list(span['bbox'])
                    font_size = span['size']
                    text = unidecode(span['text'])
                    span_font = span['font']
                    is_upper = False
                    is_bold = False 

                    if "bold" in span_font.lower():
                        is_bold = True 

                    if re.sub("[\(\[].*?[\)\]]", "", text).isupper():
                        is_upper = True

                    if text.replace(" ","") !=  "":
                        rows.append((xmin, ymin, xmax, ymax, text, is_upper, is_bold, span_font, font_size))

span_df = pd.DataFrame(rows, columns=['xmin','ymin','xmax','ymax', 'text', 'is_upper','is_bold','span_font', 'font_size'])

In [54]:
import numpy as np
span_scores = []
span_num_occur = {}
special = '[(_:/,#%\=@)]'

for index, span_row in span_df.iterrows():
    score = round(span_row.font_size)
    text = span_row.text

    if not re.search(special, text):
        if span_row.is_bold:
            score +=1 

        if span_row.is_upper:
            score +=1

    span_scores.append(score)

values, counts = np.unique(span_scores, return_counts=True)

In [70]:
import numpy as np

values, counts = np.unique(span_scores, return_counts=True)

style_dict = {}

for value, count in zip(values, counts):
    style_dict[value] = count

sorted(style_dict.items(), key=lambda x: x[1])


[(5, 4),
 (14, 5),
 (12, 13),
 (8, 33),
 (6, 38),
 (11, 94),
 (7, 321),
 (9, 342),
 (10, 570)]

In [56]:
p_size = max(style_dict, key=style_dict.get)

idx = 0
tag = {}

for size in sorted(values, reverse = True):
    idx += 1

    if size == p_size:
        idx = 0
        tag[size] = 'p'

    if size > p_size:
        tag[size] = 'h{0}'.format(idx)

    if size < p_size:
        tag[size] = 's{0}'.format(idx)

In [57]:
span_tags = [tag[score] for score in span_scores]
span_df['tag'] = span_tags

In [58]:
headings_list = []
text_list = []
tmp = []
heading = ''                                                                                                                

for index, span_row in span_df.iterrows():
    text = span_row.text
    tag = span_row.tag

    if 'h' in tag:
        headings_list.append(text)
        text_list.append('\n'.join(tmp))
        tmp = []
        heading = text

    else:
        tmp.append(text)

text_list.append('\n'.join(tmp))
text_list = text_list[1:]
text_df = pd.DataFrame(zip(headings_list, text_list),columns=['heading', 'content'] )

In [72]:
headings_list

['ALL-IN: A Local GLobal Graph-Based',
 'DIstillatioN Model for Representation',
 'Learning of Gigapixel Histopathology',
 'Images With Application In Cancer Risk',
 'Assessment',
 'B',
 ' *',
 ' *',
 '1',
 'Introduction',
 '2',
 'Related Works',
 '3',
 'Method',
 ' P',
 ' R',
 ' Z',
 ' R',
 ' P',
 'G',
 'P',
 ' A',
 ' P',
 ' A',
 ' M',
 ' M',
 ' Z',
 'G',
 'Z',
 ' K',
 'S',
 ' R',
 ' P',
 ' G',
 'S',
 'G',
 'X',
 'S',
 'T',
 'X',
 'A',
 '.I',
 '.X',
 ' I',
 ' X',
 ' R',
 ' X',
 ' R',
 ' P',
 ' X',
 ' Z',
 ' K',
 ' C',
 ' R',
 'S',
 ' R',
 'S',
 ' C',
 '.X',
 'C',
 'X',
 '.W',
 '.W',
 ' W',
 'R',
 'C',
 '.A',
 '.C',
 'C',
 ' .D',
 '.C',
 ' D',
 ' A',
 ' A',
 ' R',
 'C',
 '.A',
 '.C',
 'C',
 '.D',
 '.C',
 ' R',
 ' R',
 ' R',
 'R',
 'C',
 '.C',
 'C',
 ' .C',
 'I',
 'K',
 ' I',
 ' X',
 ' R',
 ' S',
 ' R',
 ' P',
 'W',
 ' R',
 ' R',
 ' X',
 ' S',
 'W',
 'W',
 ' W',
 ' W',
 '4',
 'Experiments and Results',
 'AMIL [',
 '5',
 'Conclusion',
 'References']

In [60]:
text_df.to_csv('text_df.csv', index=False)

In [16]:
df.head(20)

Unnamed: 0,text
0,ALL-IN: A Local GLobal Graph-Based\nDIstillati...
1,"Puria Azadi1, Jonathan Suderman1, Ramin Nakhli..."
2,"1 University of British Columbia, Vancouver, B..."
3,Abstract. The utility of machine learning mode...
4,Keywords: Histopathology * Risk Assessment * G...
5,1\nIntroduction\n
6,The examination of tissue and cells using micr...
7,Supplementary Information The online version c...
8,"c\n The Author(s), under exclusive license to ..."
9,766\nP. Azadi et al.\n


In [17]:
df.to_csv('output.csv', index = False)

In [18]:
df

Unnamed: 0,text
0,ALL-IN: A Local GLobal Graph-Based\nDIstillati...
1,"Puria Azadi1, Jonathan Suderman1, Ramin Nakhli..."
2,"1 University of British Columbia, Vancouver, B..."
3,Abstract. The utility of machine learning mode...
4,Keywords: Histopathology * Risk Assessment * G...
...,...
105,"8. Carbonneau, M.A., Cheplygina, V., Granger, ..."
106,"11. Cooperberg,\nM.R.,\net\nal.:\nOutcomes\nof..."
107,"18. Liu, W., He, Q., He, X.: Weakly supervised..."
108,ALL-IN\n775\n


In [19]:
df

Unnamed: 0,text
0,ALL-IN: A Local GLobal Graph-Based\nDIstillati...
1,"Puria Azadi1, Jonathan Suderman1, Ramin Nakhli..."
2,"1 University of British Columbia, Vancouver, B..."
3,Abstract. The utility of machine learning mode...
4,Keywords: Histopathology * Risk Assessment * G...
...,...
105,"8. Carbonneau, M.A., Cheplygina, V., Granger, ..."
106,"11. Cooperberg,\nM.R.,\net\nal.:\nOutcomes\nof..."
107,"18. Liu, W., He, Q., He, X.: Weakly supervised..."
108,ALL-IN\n775\n
