In [28]:
import requests
import random
import json
import pprint
import time
from datetime import datetime
from bs4 import BeautifulSoup

class Crawler:
    pttUrl = "https://www.ptt.cc"

    def __init__(self, board):
        self.board = board
        self.Articles = []
        self.boardUrl = "/bbs/{}".format(board)
        self.contents = []
#         request = requests.get(Crawler.pttUrl + self.boardUrl, cookies = {"over18": "1"})
#         if request.status_code == 404:
#             print("No such board")
#             return
#         pageText = crawler.pageText.split("r-list-sep")[0]
#         self.pageText = request.text
        
    def getToday(self, date=None, pageNext=None, keyword=None, num_posts=1000):
        if pageNext == None:
            pageNext = self.boardUrl
        request = requests.get(Crawler.pttUrl + pageNext, cookies = {"over18": "1"})
        time.sleep(random.random()*3)
        if request.status_code == 404:
            print("No such board")
            return
        self.pageText = request.text
        soup = BeautifulSoup(self.pageText, "lxml")
        pageNext = soup.find("div", "btn-group btn-group-paging").find_all("a")[1].attrs["href"]
        if date == None:
            date = datetime(2020, 1, 1) 
        pageText = self.pageText.split("r-list-sep")[0]
        soup = BeautifulSoup(pageText, "lxml")
        for post in soup.select("div.r-ent"):
            url = post.find("div", "title").a
            if url == None:
                continue
            else:
                url = post.find("div", "title").a.attrs["href"]
            articleTxt = requests.get(Crawler.pttUrl + url, cookies = {"over18": "1"}).text
            articleSoup = BeautifulSoup(articleTxt, "lxml")
            title = articleSoup.find("title").text  # 標題
            content = articleSoup.find("meta", property="og:description").get("content")  # 文章內容
            if title == None or content == None:
                continue
            if keyword != None:
                if title.find(keyword) < 0 and content.find(keyword) < 0:
                    continue
            print(Crawler.pttUrl + url)
#             print(type(title))
            self.contents.append(title + ":" + content)
#             self.contents.append(content)
            push_list = []
            for push in articleSoup.find_all("span", class_="push-content"):
                if len(push.text[2:]) <= 0:
                    continue
                push_list.append(push.text[2:])  # 推文
            self.contents.append(push_list)  
            time_str = post.find("div", "date").text.strip()
            time_obj = datetime.strptime(time_str+"/2020", '%m/%d/%Y')
            #if True:  # 一篇文章
            if time_obj < date:  # 不是今天的文章
                return
        else:
            if len(self.contents)/2 > num_posts:
                return
            print(pageNext)
            self.getToday(date, pageNext, keyword, num_posts)
            
class ArticleInfo:
    def __init__(self, **kwargs):
        self.title = kwargs.get('title', None)
        self.author = kwargs.get('author', None)
        self.url = kwargs.get('url', None)
        self.time = kwargs.get('time', None)
        self.push_message = dict()
        self.content = None
        self.res = None

    def data_process(info, board):
        data = []
        for index, article in enumerate(info):
            data.append(
                {
                    "作者": article.author,
                    "標題": article.title,
                    "日期": article.time,
                    "ip": article.ip,
                    "內文": article.content,
                    "推文": article.push_text
                }
            )
         
        json_data = json.dumps(data, indent=4, sort_keys=True, ensure_ascii=False)
        file_name = 'data-{}-{}.json'.format(board, datetime.datetime.now().strftime('%Y%m%d%H%M%S'))
        with open(file_name, 'a', encoding='utf-8') as f:
            f.write(json_data)


In [29]:
board = "car"
crawler = Crawler(board)
crawler.getToday(keyword="車", num_posts=1)

https://www.ptt.cc/bbs/car/M.1608121523.A.3A7.html
https://www.ptt.cc/bbs/car/M.1608122075.A.EF1.html
https://www.ptt.cc/bbs/car/M.1608122485.A.945.html
https://www.ptt.cc/bbs/car/M.1608125443.A.061.html
https://www.ptt.cc/bbs/car/M.1608127448.A.ABB.html
https://www.ptt.cc/bbs/car/M.1608127699.A.CAA.html
https://www.ptt.cc/bbs/car/M.1608129161.A.E6E.html
https://www.ptt.cc/bbs/car/M.1608130140.A.7AF.html
https://www.ptt.cc/bbs/car/M.1608159874.A.804.html
https://www.ptt.cc/bbs/car/M.1608166715.A.F4E.html
https://www.ptt.cc/bbs/car/M.1608167175.A.117.html


In [9]:
print(len(crawler.contents))

16


In [10]:
import tensorflow as tf
from ckiptagger import data_utils, construct_dictionary, WS, POS, NER

import os
import sys

# Suppress as many warnings as possible
# os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
# from tensorflow.python.util import deprecation
# deprecation._PRINT_DEPRECATION_WARNINGS = False
# import tensorflow as tf
# tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
ws = WS("./data", disable_cuda=False)
pos = POS("./data", disable_cuda=False)
ner = NER("./data", disable_cuda=False)

In [11]:
word_sentence_list = ws(
    crawler.contents,
    sentence_segmentation=True, # To consider delimiters
    segment_delimiter_set = {",", "。", ":", "?", "!", ";", ".", "（", "）", "", "()", " [",
        "] ", ":", "", "》"}, # This is the defualt set of delimiters
#     recommend_dictionary = dictionary1, # words in this dictionary are encouraged
#     coerce_dictionary = dictionary2, # words in this dictionary are forced
)

pos_sentence_list = pos(word_sentence_list)

entity_sentence_list = ner(word_sentence_list, pos_sentence_list)

del ws
del pos
del ner

In [12]:
def print_word_pos_sentence(word_sentence, pos_sentence):
    assert len(word_sentence) == len(pos_sentence)
    for word, pos in zip(word_sentence, pos_sentence):
        print(f"{word}({pos})", end="\u3000")
    print()
    return
    
for i, sentence in enumerate(word_sentence_list):
    print()
    print(f"'{sentence}'")
    print_word_pos_sentence(word_sentence_list[i],  pos_sentence_list[i])
    for entity in sorted(entity_sentence_list[i]):
        print(entity)


'['「', '問題', '」', '三', '代', 'elantra', '冷氣', '壓縮機', ' - ', '看板', ' car - ', '批踢踢', '實業坊', ':', '現代', '三', '代', 'elantra', '冷氣', '壓縮機', '和', '後面', '5', '.', '6', '代', '有', '一樣', '嗎', '？', '還是', '和', '那', '個', '車款', '共用', '可', '直上', '？', '\n', '因為', '車子', '少', '開', '但', '車況', '很', '好', '，', '只是', '壓縮機', '壞掉', '而已', '，', '但', '車子', '沒', '什麼', '殘值', '，', '想', '說', '殺', '肉', '的', '用', '一', '用', '\n', '就', '好', '。', '\n--\n', '※ ', '發信站', ':', ' ', '批', '踢踢', '實業坊', '(', 'ptt', '.', 'cc', ')', ',', ' ', '來自', ':', ' 123', '.', '110', '.', '250', '.', '154', ' ', '(', '臺灣', ')', '\n']'
「(PARENTHESISCATEGORY)　問題(Na)　」(PARENTHESISCATEGORY)　三(Neu)　代(Na)　elantra(FW)　冷氣(Na)　壓縮機(Na)　 - (FW)　看板(Na)　 car - (FW)　批踢踢(Na)　實業坊(Nc)　:(COLONCATEGORY)　現代(Nd)　三(Neu)　代(Na)　elantra(FW)　冷氣(Na)　壓縮機(Na)　和(Caa)　後面(Ncd)　5(Neu)　.(PERIODCATEGORY)　6(Neu)　代(Na)　有(V_2)　一樣(VH)　嗎(T)　？(QUESTIONCATEGORY)　還是(D)　和(P)　那(Nep)　個(Nf)　車款(Na)　共用(VC)　可(D)　直上(VCL)　？(QUESTIONCATEGORY)　
(WHITESPACE)　因為(Cbb)　車子(Na)　少(D)　開(VC)　但(Cbb)　車況(

In [13]:
import pandas as pd

count_list = []
# count_dict = {}
for e in entity_sentence_list:
    for i in e:
#         count_dict[i[3]] = count_dict.get(i[3], 0) + 1
#         df = pd.concat([df, pd.DataFrame(i[3])])
        count_list.append(i[3])

In [14]:
from collections import Counter
entity = Counter(count_list)
entity.most_common(5)

[('一年半', 2), ('8萬', 2), ('5.6', 1), ('韓國大盤館', 1), ('和泰', 1)]

In [15]:
df = pd.DataFrame(count_list, columns=["entity"])
text = df.entity.value_counts()
text.head(30)

一年半       2
8萬        2
5.6       1
6500      1
5萬        1
兩年        1
桃園        1
6.5-7萬    1
韓國大盤館     1
福特        1
中古        1
2021年     1
國際牌       1
1萬        1
四五天       1
95.7      1
6.5       1
新竹地區      1
先清算       1
東西都       1
8千        1
五年八年      1
38%11年    1
民進黨       1
1萬多       1
和泰        1
不明顯       1
107.5     1
新竹        1
台北        1
Name: entity, dtype: int64

In [16]:
from mlxtend.frequent_patterns import fpgrowth
from mlxtend.preprocessing import TransactionEncoder
import pandas as pd

fq_count_lists = []
# count_dict = {}
for e in entity_sentence_list:
    fq_count_list = []
    for i in e:
#         count_dict[i[3]] = count_dict.get(i[3], 0) + 1
#         df = pd.concat([df, pd.DataFrame(i[3])])
        fq_count_list.append(i[3])
    fq_count_lists.append(fq_count_list)
te = TransactionEncoder()
te_ary = te.fit(fq_count_lists).transform(fq_count_lists)
df = pd.DataFrame(te_ary, columns=te.columns_)
from mlxtend.frequent_patterns import fpgrowth
  
fpgrowth(df, min_support=0.05, use_colnames=True)

Unnamed: 0,support,itemsets
0,0.0625,(5.6)
1,0.0625,(韓國大盤館)
2,0.0625,(和泰)
3,0.0625,(民進黨)
4,0.0625,(先清算)
...,...,...
8473,0.0625,"(107.5, 95.7)"
8474,0.0625,"(107.5, 中部, 北部)"
8475,0.0625,"(107.5, 北部, 95.7)"
8476,0.0625,"(107.5, 中部, 95.7)"


In [17]:
python app.py  car  10

SyntaxError: invalid syntax (<ipython-input-17-e5dccad10135>, line 1)