In [11]:
import os
from io import StringIO

import camelot
import pandas as pd
import tabula
from pdfminer.converter import PDFPageAggregator
from pdfminer.converter import TextConverter
from pdfminer.layout import (
    LAParams,
    LTContainer,
    LTTextLine,
)
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfpage import PDFTextExtractionNotAllowed
from pdfminer.pdfparser import PDFParser

In [2]:
def get_objs(layout, results):
    """Recursively collect every text line found under ``layout``.

    Args:
        layout: A pdfminer layout object. Anything that is not an
            ``LTContainer`` is ignored.
        results: List that is extended in place. Each ``LTTextLine`` found
            contributes one dict with the keys ``'bbox'``, ``'text'`` and
            ``'type'``.

    Returns:
        None. The collected records are delivered through ``results``.
    """
    if isinstance(layout, LTContainer):
        for child in layout:
            if isinstance(child, LTTextLine):
                record = {
                    'bbox': child.bbox,
                    'text': child.get_text(),
                    'type': type(child),
                }
                results.append(record)
            # Always descend, whether or not the child itself was recorded.
            get_objs(child, results)

In [3]:
def main(path):
    """Print every text line pdfminer finds in the PDF at ``path``, page by page.

    For each page a banner line is printed, followed by one dict per text
    line (as produced by ``get_objs``).

    Args:
        path: Filesystem path of the PDF to open in binary mode.

    Raises:
        PDFTextExtractionNotAllowed: If ``document.is_extractable`` is false.
    """
    with open(path, "rb") as pdf_file:
        document = PDFDocument(PDFParser(pdf_file))
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed
        # Layout-analysis options are described at
        # https://pdfminersix.readthedocs.io/en/latest/api/composable.html#
        layout_params = LAParams(all_texts=True)
        resource_manager = PDFResourceManager()
        aggregator = PDFPageAggregator(resource_manager, laparams=layout_params)
        page_interpreter = PDFPageInterpreter(resource_manager, aggregator)
        for page in PDFPage.create_pages(document):
            page_interpreter.process_page(page)
            page_layout = aggregator.get_result()
            text_lines = []  # fresh list for each page
            print('objs-------------------------')
            get_objs(page_layout, text_lines)
            for record in text_lines:
                print(record)

In [2]:
# Directory that holds the input PDFs, and its entries in sorted order.
PDF_DIR = './pdfs/'
pdfs = sorted(os.listdir(PDF_DIR))

In [16]:
# Read the tables on all pages of the last PDF (sorted order) with tabula in
# lattice mode. The result is indexed by position in the following cell.
df = tabula.read_pdf(
    PDF_DIR + pdfs[-1],
    pages='all',
    lattice=True,
)

Got stderr: 10 10, 2022 10:37:21 午後 org.apache.fontbox.ttf.CmapSubtable processSubtype14
警告: Format 14 cmap table is not supported and will be ignored
10 10, 2022 10:37:21 午後 org.apache.fontbox.ttf.CmapSubtable processSubtype14
警告: Format 14 cmap table is not supported and will be ignored



In [27]:
# Shape of the first element returned by tabula.read_pdf above.
df[0].shape

(2, 6)

In [5]:
# Dump the pdfminer text lines of the first PDF (sorted order).
main(PDF_DIR + pdfs[0])

objs-------------------------
{'bbox': (247.43975999999998, 710.9700680000001, 294.23976, 720.3300680000001), 'text': '暴風域半径\n', 'type': <class 'pdfminer.layout.LTTextLineHorizontal'>}
{'bbox': (327.599736, 710.9700680000001, 374.39973599999996, 720.3300680000001), 'text': '強風域半径\n', 'type': <class 'pdfminer.layout.LTTextLineHorizontal'>}
{'bbox': (266.160072, 692.4840919999999, 275.520072, 701.8440919999999), 'text': 'km\n', 'type': <class 'pdfminer.layout.LTTextLineHorizontal'>}
{'bbox': (346.32004800000004, 692.4840919999999, 355.680048, 701.8440919999999), 'text': 'km\n', 'type': <class 'pdfminer.layout.LTTextLineHorizontal'>}
{'bbox': (408.00026399999996, 716.2500440000001, 478.20026399999995, 725.6100440000001), 'text': '大きさ・強さ 等\n', 'type': <class 'pdfminer.layout.LTTextLineHorizontal'>}
{'bbox': (401.04048, 697.6442599999999, 429.12048000000004, 707.0042599999999), 'text': '大きさ\n', 'type': <class 'pdfminer.layout.LTTextLineHorizontal'>}
{'bbox': (457.800456, 697.6442599999999, 

In [7]:
# Parse the first PDF (sorted order) with Camelot using its default settings.
tables = camelot.read_pdf(PDF_DIR + pdfs[0])

# Disabled two-column variant of the row dump, kept for reference:
# for ix in tables[0].df.index:
#     print(ix, tables[0].df.loc[ix][0], '|', tables[0].df.loc[ix][1])

In [18]:
# Print each row of the first Camelot table with its cells joined by ' | '.
# The frame and its column count do not change while iterating, so both are
# looked up once instead of on every row.
tb = tables[0].df
col = tb.shape[1]
for ix in tb.index:
    # A single .loc[row, column] lookup replaces the chained .loc[ix][r].
    li = [tb.loc[ix, r] for r in range(col)]
    print(' | '.join(li))

月 日 時 | 中心位置 |  | hPa | 中心 最大
気圧 風速
m/s | 暴風域半径
km | 強風域半径
km | 大きさ・強さ 等
 | 緯度 | 経度 |  |  |  |  | 大きさ
強さ
5
9
21
10
03
09
15
21
11
03
09
15
21
12
03
09
15
21
13
00
03
06
09
12
15
18
21
14
00
03
06
09
12
15
18
21
15
03
09
15
21
16
03 | 11.8
N
12.3
13.1
14.1
15.3
16.2
17.1
17.6
18.0
18.4
18.9
19.5
20.0
20.2
20.4
20.7
20.9
21.6
22.1
22.6
23.1
23.6
24.4
25.0
25.6
26.2
26.7
27.2
27.2
27.4
27.4
27.7
27.7 | 120.0
E
119.6
119.5
119.3
119.1
119.1
119.1
119.1
119.2
119.4
119.9
120.5
121.0
121.1
121.3
121.5
121.7
122.2
122.6
123.2
123.9
124.5
125.2
125.8
126.5
127.2
128.0
128.4
129.1
130.7
131.9
133.5
134.6 | 1004
1004
1004
1004
1002
1002
1000
998
998
996
994
994
994
992
990
990
985
985
990
990
990
990
990
990
990
992
996
998
1000
1000
1002
1002
1004 | --
--
--
--
--
--
18
18
18
20
20
20
20
23
23
23
25
25
23
23
23
23
23
23
23
23
20
18
--
--
--
--
-- | ---
 
 
---
 
  ---
 
  ---
 
  ---
 
 
---
 
  ---
 
  ---
 
  ---
 
 
---
 
  ---
 
  ---
 
  ---
 
 
---
 
  ---
 
  ---
 
  ---
 
  ---
 
  ---


In [32]:
# Split the cell at row label 2, column label 1 of the first table into lines.
tables[0].df.loc[2, 1].split('\n')

['11.8',
 'N',
 '12.3',
 '13.1',
 '14.1',
 '15.3',
 '16.2',
 '17.1',
 '17.6',
 '18.0',
 '18.4',
 '18.9',
 '19.5',
 '20.0',
 '20.2',
 '20.4',
 '20.7',
 '20.9',
 '21.6',
 '22.1',
 '22.6',
 '23.1',
 '23.6',
 '24.4',
 '25.0',
 '25.6',
 '26.2',
 '26.7',
 '27.2',
 '27.2',
 '27.4',
 '27.4',
 '27.7',
 '27.7']

In [22]:
def exTextPdf(fPath):
    """Extract the text of every page of a PDF and return it split into lines.

    The file, the text buffer and the converter are released even when
    pdfminer raises while parsing, so a failed run does not leak the open
    file handle.

    Args:
        fPath: Filesystem path of the PDF, opened in binary mode.

    Returns:
        list[str]: The converter's accumulated text split on ``'\\n'``.
    """
    rm = PDFResourceManager()
    lap = LAParams()
    with open(fPath, 'rb') as f, StringIO() as outf:
        dev = TextConverter(rm, outf, laparams=lap)
        try:
            iprlr = PDFPageInterpreter(rm, dev)
            for page in PDFPage.get_pages(f):
                iprlr.process_page(page)
            # Read the buffer before the with-block closes it.
            contents = outf.getvalue()
        finally:
            dev.close()
    return contents.split('\n')

In [24]:
# Path of the first PDF (sorted order) inside PDF_DIR.
fPath = PDF_DIR + pdfs[0]

In [25]:
# Show the extracted text lines of the first PDF as the cell output.
exTextPdf(fPath)

['暴風域半径',
 '',
 '強風域半径',
 '',
 'km',
 '',
 'km',
 '',
 '大きさ・強さ 等',
 '',
 '大きさ',
 '',
 '強さ',
 '',
 ' 熱帯低気圧発生',
 '',
 '台風0101号（0101 CIMARON)',
 '',
 '位\u3000  置\u3000  表',
 '',
 '月 日 時',
 '',
 '中心位置',
 '',
 '緯度',
 '',
 '経度',
 '',
 '5',
 '',
 '9 21 11.8 N 120.0 E 1004',
 '1004',
 '10 03 12.3',
 '1004',
 '09 13.1',
 '1004',
 '15 14.1',
 '1002',
 '21 15.3',
 '1002',
 '11 03 16.2',
 '1000',
 '09 17.1',
 '998',
 '15 17.6',
 '998',
 '21 18.0',
 '996',
 '12 03 18.4',
 '994',
 '09 18.9',
 '994',
 '15 19.5',
 '994',
 '21 20.0',
 '992',
 '13 00 20.2',
 '990',
 '03 20.4',
 '990',
 '06 20.7',
 '985',
 '09 20.9',
 '985',
 '12 21.6',
 '990',
 '15 22.1',
 '990',
 '18 22.6',
 '990',
 '21 23.1',
 '990',
 '14 00 23.6',
 '990',
 '03 24.4',
 '990',
 '06 25.0',
 '990',
 '09 25.6',
 '992',
 '12 26.2',
 '996',
 '15 26.7',
 '998',
 '18 27.2',
 '1000',
 '21 27.2',
 '1000',
 '15 03 27.4',
 '1002',
 '09 27.4',
 '1002',
 '15 27.7',
 '1004',
 '21 27.7',
 '',
 '119.6',
 '119.5',
 '119.3',
 '119.1',
 '119.1',
 '119.1'

In [27]:
# Same extraction for the last PDF (sorted order) inside PDF_DIR.
fPath1 = PDF_DIR + pdfs[-1]
exTextPdf(fPath1)

['2022年台風第18号  ROKE (2218)',
 '',
 '位\u3000  置\u3000  表\u3000\u3000\u3000（速報値）',
 '',
 '（日本時）',
 '',
 '中心位置',
 '',
 '月 日 時 緯度',
 '',
 '経度',
 '',
 '中心 最大',
 '気圧 風速',
 'hPa m/s',
 '',
 'km',
 '',
 '暴風域半径',
 '',
 '強風域半径',
 '',
 '9 28 21 23.0 N 131.7 E 1000',
 '1000',
 '998',
 '998',
 '994',
 '994',
 '994',
 '994',
 '990',
 '990',
 '990',
 '990',
 '980',
 '980',
 '975',
 '975',
 '975',
 '975',
 '980',
 '985',
 '990',
 '990',
 '992',
 '992',
 '992',
 '992',
 '992',
 '992',
 '992',
 '992',
 '992',
 '',
 '29 00 23.4',
 '03 23.8',
 '06 24.5',
 '09 24.9',
 '12 25.3',
 '13 25.4',
 '14 25.5',
 '15 25.7',
 '16 25.9',
 '17 26.0',
 '18 26.1',
 '21 26.6',
 '30 00 27.0',
 '03 27.6',
 '06 27.9',
 '09 28.2',
 '12 28.6',
 '15 28.9',
 '18 29.6',
 '21 30.2',
 '1 00 30.6',
 '03 31.1',
 '06 31.3',
 '09 31.7',
 '12 32.3',
 '15 33.0',
 '18 33.6',
 '21 34.0',
 '2 00 34.7',
 '03 35.0',
 '',
 '131.2',
 '131.7',
 '131.7',
 '131.6',
 '131.8',
 '131.9',
 '131.9',
 '132.0',
 '132.3',
 '132.4',
 '132.6',
 '133.1',
 '1

['暴風域半径',
 '',
 '強風域半径',
 '',
 'km',
 '',
 'km',
 '',
 '大きさ・強さ 等',
 '',
 '大きさ',
 '',
 '強さ',
 '',
 ' 熱帯低気圧発生',
 '',
 '台風0101号（0101 CIMARON)',
 '',
 '位\u3000  置\u3000  表',
 '',
 '月 日 時',
 '',
 '中心位置',
 '',
 '緯度',
 '',
 '経度',
 '',
 '5',
 '',
 '9 21 11.8 N 120.0 E 1004',
 '1004',
 '10 03 12.3',
 '1004',
 '09 13.1',
 '1004',
 '15 14.1',
 '1002',
 '21 15.3',
 '1002',
 '11 03 16.2',
 '1000',
 '09 17.1',
 '998',
 '15 17.6',
 '998',
 '21 18.0',
 '996',
 '12 03 18.4',
 '994',
 '09 18.9',
 '994',
 '15 19.5',
 '994',
 '21 20.0',
 '992',
 '13 00 20.2',
 '990',
 '03 20.4',
 '990',
 '06 20.7',
 '985',
 '09 20.9',
 '985',
 '12 21.6',
 '990',
 '15 22.1',
 '990',
 '18 22.6',
 '990',
 '21 23.1',
 '990',
 '14 00 23.6',
 '990',
 '03 24.4',
 '990',
 '06 25.0',
 '990',
 '09 25.6',
 '992',
 '12 26.2',
 '996',
 '15 26.7',
 '998',
 '18 27.2',
 '1000',
 '21 27.2',
 '1000',
 '15 03 27.4',
 '1002',
 '09 27.4',
 '1002',
 '15 27.7',
 '1004',
 '21 27.7',
 '',
 '119.6',
 '119.5',
 '119.3',
 '119.1',
 '119.1',
 '119.1'