## 正規表達法斷句

In [1]:
import re
# Sample news paragraph (Traditional Chinese) used as the input text for the
# regular-expression sentence-splitting cell that follows. The literal starts
# with two newlines and ends with one.
article = '''

【綜合報導】違建真害命！違建大火連續燒，近2周內，光雙北市至少有10人因密集隔間違建火警喪命，新北市府日前展開大動作，強拆一棟2層樓加蓋成6層樓、隔成158間房分租的離譜違建，強調明起全面執行頂加分租套房拆除。台北市長柯文哲昨也表示，考慮拿掉前市長陳水扁下的「違建特赦令」，涉及公安的大型違建，不論是既存違建或是新違建，都要依法處置。目前雙北共有29萬戶列管違建，若不改善，恐將面臨拆除命運。
'''

In [4]:
# Break the article into clauses at every full-width '！', '，' or '。'.
# The delimiters themselves are dropped from the result.
re.compile('！|，|。').split(article)

['\n\n【綜合報導】違建真害命',
 '違建大火連續燒',
 '近2周內',
 '光雙北市至少有10人因密集隔間違建火警喪命',
 '新北市府日前展開大動作',
 '強拆一棟2層樓加蓋成6層樓、隔成158間房分租的離譜違建',
 '強調明起全面執行頂加分租套房拆除',
 '台北市長柯文哲昨也表示',
 '考慮拿掉前市長陳水扁下的「違建特赦令」',
 '涉及公安的大型違建',
 '不論是既存違建或是新違建',
 '都要依法處置',
 '目前雙北共有29萬戶列管違建',
 '若不改善',
 '恐將面臨拆除命運',
 '\n']

## 使用Jieba

In [5]:
! pip install jieba



In [8]:
import jieba
# Segment the headline with jieba's default mode and print one token per line.
headline_tokens = jieba.cut('大巨蛋案對市府同仁下封口令? 柯P否認')
for token in headline_tokens:
    print(token)

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\User\AppData\Local\Temp\jieba.cache
Loading model cost 1.176 seconds.
Prefix dict has been built succesfully.


大
巨蛋
案對
市府
同仁
下
封口令
?
 
柯
P
否認


In [10]:
# Default-mode segmentation, displayed as a single '/'-separated string.
segments = jieba.cut('大巨蛋案對市府同仁下封口令? 柯P否認')
'/'.join(segments)

'大/巨蛋/案對/市府/同仁/下/封口令/?/ /柯/P/否認'

In [12]:
# cut_all=True: the recorded output contains overlapping words
# (封口 / 封口令 / 口令) instead of a single segmentation.
segments_all = jieba.cut('大巨蛋案對市府同仁下封口令? 柯P否認', cut_all=True)
'/'.join(segments_all)

'大/巨蛋/案/對/市府/同仁/下/封口/封口令/口令////柯/P/否/認'

In [13]:
# Default mode again, for side-by-side comparison with the cut_all=True result.
segments = jieba.cut('大巨蛋案對市府同仁下封口令? 柯P否認')
'/'.join(segments)

'大/巨蛋/案對/市府/同仁/下/封口令/?/ /柯/P/否認'

## 讀取使用者字典

In [18]:
# Load the entries of userdict.txt (in the working directory) as a jieba user
# dictionary. The next cell's recorded output keeps 大巨蛋 and 柯P as single tokens.
jieba.load_userdict('userdict.txt')

In [19]:
# Same headline, segmented after the user dictionary has been loaded.
segments = jieba.cut('大巨蛋案對市府同仁下封口令? 柯P否認')
'/'.join(segments)

'大巨蛋/案對/市府/同仁/下/封口令/?/ /柯P/否認'

## 抓出詞性

In [20]:
import jieba.posseg as pseg
# Part-of-speech tagging: each item yielded by pseg.cut exposes the token text
# as .word and its tag as .flag; print them side by side.
words = pseg.cut('大巨蛋案對市府同仁下封口令? 柯P否認')
for tagged in words:
    print(tagged.word, tagged.flag)

大巨蛋 n
案 ng
對 p
市府 n
同仁 nr
下 f
封口令 n
? x
  x
柯P n
否認 v


In [21]:
# Keep the headline in a variable; the keyword-extraction cells reuse it.
sentence = '大巨蛋案對市府同仁下封口令? 柯P否認'
# tokenize() yields (word, start, end) tuples; print each tuple unchanged.
for span in jieba.tokenize(sentence):
    print(span)

('大巨蛋', 0, 3)
('案對', 3, 5)
('市府', 5, 7)
('同仁', 7, 9)
('下', 9, 10)
('封口令', 10, 13)
('?', 13, 14)
(' ', 14, 15)
('柯P', 15, 17)
('否認', 17, 19)


In [23]:
import jieba.analyse
# Extract only the single highest-ranked keyword from the headline.
tags = jieba.analyse.extract_tags(sentence, topK=1)
print(tags)

['封口令']


In [24]:
import jieba.analyse
# Same extraction, but candidates are limited to tokens tagged 'nr'.
tags = jieba.analyse.extract_tags(sentence, topK=1, allowPOS=['nr'])
print(tags)

['同仁']


## 擴充字典
- https://www.moedict.tw/

- https://zh.wikipedia.org/wiki/%E5%94%90%E7%B4%8D%E5%BE%B7%C2%B7%E5%B7%9D%E6%99%AE

## 使用新聞關鍵字

In [26]:
import requests
from bs4 import BeautifulSoup
# Fetch the news article. The timeout stops a stalled connection from hanging
# the kernel, and raise_for_status() turns an HTTP error into an exception
# instead of letting an error page be parsed (which would yield no keywords).
res = requests.get('http://news.ltn.com.tw/news/business/breakingnews/2272811', timeout=10)
res.raise_for_status()
soup = BeautifulSoup(res.text, 'html.parser')

In [29]:
# Text of every <a> found under an element with class "keyword".
keywords = [anchor.text for anchor in soup.select('.keyword a')]

In [31]:
# Append the scraped keywords to userdict.txt, one per line. A newline is
# written first, presumably so the first keyword starts on its own line even
# if the file's last line was not terminated.
with open('userdict.txt', mode='a', encoding='utf-8') as dict_file:
    dict_file.write('\n')
    dict_file.writelines(keyword + '\n' for keyword in keywords)

## 取得保險詞彙

In [37]:
import requests
from bs4 import BeautifulSoup
# Bilingual glossary listing; the page to fetch is selected via the POSTed form.
url = 'https://www.ib.gov.tw/ch/home.jsp?id=59&parentpath=0,6&mcustomize='
payload = {
    'id': '59',
    'contentid': '59',
    'parentpath': '0,6',
    'mcustomize': 'bilingual_list.jsp',
    'ckeyword': '請輸入中文關鍵字',
    'ekeyword': '請輸入英文關鍵字',
    'page': '3',
}

# The timeout keeps a stalled connection from hanging the kernel, and
# raise_for_status() stops the cell on an HTTP error, so an error page is
# never parsed and nothing bogus is appended to the dictionary.
res = requests.post(url, data=payload, timeout=10)
res.raise_for_status()

soup = BeautifulSoup(res.text, 'html.parser')
# Text of every element with class "bich_name_con".
keywords = [ele.text for ele in soup.select('.bich_name_con')]

# Append the terms to userdict.txt, one per line, after a separating newline.
with open('userdict.txt', 'a', encoding='utf-8') as f:
    f.write('\n')
    f.writelines(keyword + '\n' for keyword in keywords)

## 將 PDF 轉成文字

In [38]:
! pip install pdfminer3k



In [40]:
# Download the PDF. raise_for_status() stops the cell on an HTTP error so that
# an error body is never saved under the name fubon.pdf; the timeout keeps a
# stalled connection from hanging the kernel.
res = requests.get('https://www.fubon.com/life/public_info/public_info_04/5-12.pdf', timeout=30)
res.raise_for_status()
with open('fubon.pdf', 'wb') as f:
    f.write(res.content)

In [None]:
# PDFParser and PDFDocument are used below, so this import must be active
# (it was commented out, which makes the cell fail with NameError).
from pdfminer.pdfparser import PDFParser, PDFDocument
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LAParams, LTTextBox, LTTextLine

# Collect the text of every text box / text line, in page order.
text_chunks = []

# The PDF is read lazily while pages are processed, so all work stays inside
# the `with` block; the file is closed even if parsing raises.
with open('fubon.pdf', 'rb') as fp:
    parser = PDFParser(fp)
    doc = PDFDocument()
    # Parser and document must be linked to each other before initialising.
    parser.set_document(doc)
    doc.set_parser(parser)
    doc.initialize('')  # empty password
    rsrcmgr = PDFResourceManager()
    laparams = LAParams()
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    # Process each page contained in the document.
    for page in doc.get_pages():
        interpreter.process_page(page)
        layout = device.get_result()
        for lt_obj in layout:
            if isinstance(lt_obj, (LTTextBox, LTTextLine)):
                text_chunks.append(lt_obj.get_text())

# Full extracted text of the PDF (previously `s` was left as an empty string).
s = ''.join(text_chunks)

## 將文字轉成 Word 檔案

In [44]:
! pip install python-docx

Collecting python-docx
  Downloading python-docx-0.8.6.tar.gz (5.3MB)
Building wheels for collected packages: python-docx
  Running setup.py bdist_wheel for python-docx: started
  Running setup.py bdist_wheel for python-docx: finished with status 'done'
  Stored in directory: C:\Users\User\AppData\Local\pip\Cache\wheels\cc\74\10\42b00d7d6a64cf21f194bfef9b94150009ada880f13c5b2ad3
Successfully built python-docx
Installing collected packages: python-docx
Successfully installed python-docx-0.8.6


In [45]:
from docx import Document
from docx.shared import Inches

# Build a small demo document: a title, a paragraph with bold/italic runs,
# a level-1 heading, three styled paragraphs and a page break.
document = Document()

document.add_heading('Document Title', 0)

# Runs let a single paragraph mix differently formatted pieces of text.
p = document.add_paragraph('A plain paragraph having some ')
p.add_run('bold').bold = True
p.add_run(' and some ')
p.add_run('italic.').italic = True

document.add_heading('Heading, level 1', level=1)

# Styles are looked up by their name ("Intense Quote", "List Bullet",
# "List Number"). The space-less forms are style ids, whose lookup is
# deprecated in python-docx and emits a warning.
document.add_paragraph('Intense quote', style='Intense Quote')

document.add_paragraph(
    'first item in unordered list', style='List Bullet'
)
document.add_paragraph(
    'first item in ordered list', style='List Number'
)


document.add_page_break()

document.save('demo.docx')



## 將 PDF 轉成 Word 檔案

In [46]:
# PDFParser and PDFDocument are used below, so this import must be active
# (it was commented out, which makes the cell fail with NameError on a fresh kernel).
from pdfminer.pdfparser import PDFParser, PDFDocument
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LAParams, LTTextBox, LTTextLine
from docx import Document
from docx.shared import Inches

# Word document that receives the text extracted from the PDF.
document = Document()


# The PDF is read lazily while pages are processed, so all work stays inside
# the `with` block; the file is closed even if parsing raises.
with open('fubon.pdf', 'rb') as fp:
    parser = PDFParser(fp)
    doc = PDFDocument()
    # Parser and document must be linked to each other before initialising.
    parser.set_document(doc)
    doc.set_parser(parser)
    doc.initialize('')  # empty password
    rsrcmgr = PDFResourceManager()
    laparams = LAParams()
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    # Process each page contained in the document.
    for page in doc.get_pages():
        interpreter.process_page(page)
        layout = device.get_result()
        # One Word paragraph per text box / text line found on the page.
        for lt_obj in layout:
            if isinstance(lt_obj, (LTTextBox, LTTextLine)):
                document.add_paragraph(lt_obj.get_text())
        # Mirror the PDF pagination: a page break follows every source page.
        document.add_page_break()
document.save('fubon.docx')

