### Libraries and functions

In [None]:
# web scraping: parsing saved HTML pages and extracting text from them
from bs4 import BeautifulSoup

# get the pdf files or url from the web
import requests

# input output operations
import io

#!pip install xhtml2pdf requests
#!pip install lxml

# regular expressions, dataframes, URL handling and numerics
import regex as re
import pandas as pd
from urllib.request import urlopen
from urllib import request as urllib2
import numpy as np

***

## Main function for extracting paper metadata from saved HTML pages into a pandas DataFrame
***


In [None]:
# show every row when a dataframe is displayed (None removes the row limit)
pd.options.display.max_rows = None

In [None]:
#helper used as an href filter: keeps the links whose target contains "chapter/"
def has_doi(href):
    """Tell whether *href* points to a chapter page.

    A falsy *href* (``None`` or an empty string) is handed back unchanged.
    Otherwise the result of searching *href* for ``chapter/`` is returned:
    a match object when the text is present, ``None`` when it is not.
    """
    if not href:
        return href
    return re.search("chapter/", href)

In [None]:
#main mining function returning the initial dataframe

# Page-range labels that are never treated as papers (the original filter
# listed them as front/back matter). They are compared against the whole
# label: a substring test would also reject real papers, because e.g. '1-1'
# is contained in '11-13'.
_EXCLUDED_PAGE_RANGES = frozenset({
    '1-1', '463-463', '135-135', '649-649', '247-247', '281-281',
    '369-369', '433-433', '509-509', '583-583', '757-757', '535-535',
})

# Page ranges rejected as back matter, but only on the last listing page
# of a volume.
_BACK_MATTER_PAGE_RANGES = (
    '781-785', '787-791', '767-771', '797-801', '801-806',
    '813-818', '687-690', '739-743', '791-795',
)


def _is_paper_page_range(page_range, is_last_page):
    """Return True when a page-range label should be kept as a paper entry."""
    label = page_range.strip()

    # 'x' presumably marks roman-numeral front-matter pages -- TODO confirm.
    # The previous code chained these literals with `and`, which only ever
    # tested the last one ('535-535'); every listed value is checked now.
    if 'x' in label or label in _EXCLUDED_PAGE_RANGES:
        return False

    # back matter is only filtered on the last listing page
    if is_last_page and any(pages in label for pages in _BACK_MATTER_PAGE_RANGES):
        return False

    bounds = label.split("-")                      #'618-627' -> ['618', '627']
    try:
        # only ranges whose last page is more than one after the first are kept
        return int(bounds[1]) - int(bounds[0]) > 1
    except (ValueError, IndexError):
        # not two plain integers: keep labels in the form C<some number>
        return "C" in label


def mining(html_doc, year, current_page, all_pages, part):
    """Extract the paper metadata of one saved listing page into a dataframe.

    Parameters
    ----------
    html_doc : path of the saved html document (read as ISO-8859-1).
    year : value stored in the 'Year of publication' column of every row.
    current_page, all_pages : number of this listing page and the number of
        listing pages; both are converted with ``int`` and the back-matter
        filter is only applied when they are equal (last page).
    part : value stored in the 'Part of publication' column of every row.

    Returns
    -------
    pandas.DataFrame with the columns 'Title', 'Authors', 'Page numbers',
    'DOI', 'Year of publication' and 'Part of publication'.

    Raises
    ------
    ValueError
        If the number of titles, authors, page ranges and DOIs found in the
        document differ, so that they cannot be combined into rows.
    """
    #opening the html document (copy pasted and saved as a .doc file);
    #the with-block closes the file as soon as it has been parsed
    with open(html_doc, "r", encoding="ISO-8859-1") as doc:
        soup = BeautifulSoup(doc, 'html.parser')

    #every tag whose href contains "chapter/" carries a title and a DOI
    chapter_links = soup.find_all(href=has_doi)
    titles = [link.get_text() for link in chapter_links]
    #the DOI is the part of the link target that follows "chapter/"
    doi_str = [link['href'].partition('chapter/')[2] for link in chapter_links]

    #the author names are the content of the author list items
    authors_str = [
        item.decode_contents()
        for item in soup.find_all("li", class_="c-author-list__item")
    ]

    #an element holding page numbers looks like this:
    #   <div class="c-meta"><span class="c-meta__item u-display-inline-block"
    #   data-test="page-number"> Pages 618-627</span> </div>
    is_last_page = int(current_page) == int(all_pages)
    page_numbers_str = []
    for meta in soup.find_all('div', class_="c-meta"):
        #drops the text in front of the page numbers and the last character
        page_range = meta.get_text()[6:-1]                      #618-627
        if _is_paper_page_range(page_range, is_last_page):
            page_numbers_str.append(page_range)

    #fail with a readable message instead of the generic pandas length error
    counts = {
        'titles': len(titles),
        'authors': len(authors_str),
        'page ranges': len(page_numbers_str),
        'DOIs': len(doi_str),
    }
    if len(set(counts.values())) != 1:
        raise ValueError(
            f"Cannot build rows from {html_doc!r}: column lengths differ {counts}"
        )

    #year and part of the publication are the same for every row
    year_of_pub = [year] * len(titles)
    part_of_pub = [part] * len(titles)

    #creating the column names and content for the dataframe
    data = {'Title': titles,
            'Authors': authors_str,
            'Page numbers': page_numbers_str,
            'DOI': doi_str,
            'Year of publication': year_of_pub,
            'Part of publication': part_of_pub}

    return pd.DataFrame(data)

In [None]:
#helper function that merges the per-page dataframes and saves the result
def data_together(data, year):
    """Concatenate the dataframes in *data* and write them to a csv file.

    The merged frame gets a fresh 0..n-1 index and is written to
    ``database_miccai_<year>.csv`` in the current working directory.
    The merged frame is also returned.
    """
    merged = pd.concat(data, sort=False, ignore_index=True)
    csv_name = f'database_miccai_{year!s}.csv'
    merged.to_csv(csv_name)
    return merged

In [None]:
#miccai 2023 papers: 10 volumes in total

# one saved listing page per entry; the tuple gives the number of listing
# pages of volumes 1..10 (volume 7 has 5 pages, all others have 4)
miccai = [
    f'/Users/yasminsarkhosh/Documents/miccai2023 papers/miccai2023 vol {volume:02d} page {page} of {pages_in_volume}.doc'
    for volume, pages_in_volume in enumerate((4, 4, 4, 4, 4, 4, 5, 4, 4, 4), start=1)
    for page in range(1, pages_in_volume + 1)
]

In [None]:
# mine every saved listing page; the page number, the page count and the
# volume number are read from fixed positions at the end of each file name
data = [
    mining(path, 2023, path[-10], path[-5], path[-18:-16])
    for path in miccai
]

# merge all pages into one dataframe and save it as a csv file
data_together(data, 2023)