In [27]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from html.parser import HTMLParser
from urllib.request import urlopen
from urllib import parse
from bs4 import BeautifulSoup

# We are going to create a class called LinkParser that inherits some
# methods from HTMLParser which is why it is passed into the definition
class LinkParser(HTMLParser):  # used by the Spider class
    """HTML parser that collects the absolute URL of every ``<a href=...>``.

    Attributes:
        links: absolute URLs collected so far, in document order.
        rules: optional dict; when it has a ``'link_prefix'`` entry (an
            iterable of URL prefixes, absolute or relative to ``baseUrl``),
            only links starting with one of those prefixes are kept.
        baseUrl: URL that relative ``href`` values are resolved against.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Defined here so that feed() can be used directly, without going
        # through get_Content_Links() first.
        self.links = []
        self.rules = None
        self.baseUrl = ''

    def _is_allowed(self, url):
        """Return True when *url* satisfies the optional link_prefix rule."""
        prefixes = (self.rules or {}).get('link_prefix')
        if prefixes is None:
            return True
        # Prefixes may be relative, so resolve them the same way as links.
        absolute_prefixes = tuple(
            parse.urljoin(self.baseUrl, prefix) for prefix in prefixes
        )
        return url.startswith(absolute_prefixes)

    def handle_starttag(self, tag, attrs):
        """Record the target of each anchor tag (HTMLParser callback).

        A relative href such as ``somepage.html`` is combined with the base
        URL to give an absolute URL such as
        ``http://www.netinstructions.com/somepage.html``.
        """
        if tag != 'a':
            return
        for key, value in attrs:
            if key != 'href':
                continue
            # A value-less href attribute is reported as None; treat it like
            # an empty reference, which resolves to the base URL itself.
            new_url = parse.urljoin(self.baseUrl, value or '')
            if self._is_allowed(new_url):
                self.links.append(new_url)

    def get_Content_Links(self, url, rules=None):
        """Fetch *url* and return ``(html_string, links)``.

        Args:
            url: page to download; also used as the base for relative links.
            rules: optional filter dict, see the class docstring.

        Returns:
            A tuple of the decoded HTML text and the list of absolute links
            found in it, or ``("", [])`` when the response is not
            ``text/html`` (JavaScript, CSS, PDF, ...).

        Raises:
            urllib.error.URLError: propagated from ``urlopen``.
        """
        self.reset()  # drop any buffered input from a previous page
        self.links = []
        self.rules = rules
        self.baseUrl = url

        # The context manager closes the connection even on early return.
        with urlopen(url) as response:
            headers = response.headers
            # get_content_type() ignores parameters, so the usual
            # "text/html; charset=utf-8" header is recognised as HTML.
            if headers.get_content_type() != 'text/html':
                return "", []
            charset = headers.get_content_charset() or 'utf-8'
            html_bytes = response.read()

        # feed() needs str, not bytes.  Undecodable bytes are replaced so a
        # single bad character does not abort the whole crawl.
        try:
            html_string = html_bytes.decode(charset, errors='replace')
        except LookupError:
            # The server declared a charset Python does not know.
            html_string = html_bytes.decode('utf-8', errors='replace')

        self.feed(html_string)
        return html_string, self.links


class Spider:
    """Small crawler that fetches seed pages and prints their text content.

    Attributes:
        baseUrl: list of ``(url, level)`` tuples used as the crawl queue seed.
        rules: dict passed to ``LinkParser`` to filter discovered links
            (``'link_prefix'`` -> list of allowed URL prefixes).
        callback: optional callable stored for the caller; this class does
            not invoke it.
        count: counter initialised to 0; not updated by this class.
    """

    def __init__(self, baseUrl=None, rules=None, callback=None):
        """Store the crawl configuration, falling back to built-in defaults.

        Args:
            baseUrl: list of ``(url, level)`` tuples; defaults to a single
                Oriental Daily article at level 0.
            rules: link filter dict; defaults to the Oriental Daily
                ``china_world`` section prefix.
            callback: optional callable kept in ``self.callback``.
        """
        # Alternative targets kept for reference:
        # self.baseUrl = baseUrl or [('http://hkm.appledaily.com/list.php?category_guid=10829391&category=instant', 0)] # news link
        # self.baseUrl = baseUrl or [('http://hkm.appledaily.com/detail.php?guid=55369858&category_guid=10829391&category=instant&issue=20160717', 0)]
        # self.rules = rules or {'link_prefix': ['http://hkm.appledaily.com/detail.php']}
        # self.baseUrl = baseUrl or [('http://orientaldaily.on.cc/cnt/main/20160701/index.html', 0)] # news link
        self.baseUrl = baseUrl or [('http://orientaldaily.on.cc/cnt/finance/20160717/00202_001.html', 0)]
        self.rules = rules or {'link_prefix': ['http://orientaldaily.on.cc/cnt/china_world/']}

        self.callback = callback  # callback function
        print('Inited a AppleSpider, baseUrls =', self.baseUrl[0])
        self.count = 0

    def setCallback(self, callback):
        """Replace the stored callback."""
        self.callback = callback

    def extract_content_orientaldaily(self, html, url):
        """Print the paragraphs, sub-headings and title found in *html*.

        Args:
            html: page source as a string (may be empty).
            url: address the page came from; currently unused.

        The body text and last-update time are not extracted yet, so they
        are printed as ``''`` and ``None``.
        """
        soup = BeautifulSoup(html, 'html.parser')
        content = ''
        lastUpdateTime = None

        # A list selects several tag names.  A second positional string
        # would be interpreted as a CSS class filter instead, i.e.
        # find_all('p', 'h3') only matches <p class="h3">.
        for node in soup.find_all(['p', 'h3']):
            print (node.getText())

        # soup.title is None for pages without <title>, e.g. the empty
        # string returned for non-HTML responses.
        title = soup.title.get_text() if soup.title is not None else ''

        print ( 'title = ', title)
        print ('content = ', content)
        print ('lastUpdateTime = ', lastUpdateTime)

    def crawl(self, maxLevel=0):
        """Visit queued pages in order until the queue is empty or a page
        deeper than *maxLevel* is reached.

        Args:
            maxLevel: highest link depth to visit; seed pages are level 0.
        """
        print('[Crawl] Page to visit = ', self.baseUrl)
        # Work on a copy so the configured seed list is never modified.
        pagesToVisit = list(self.baseUrl)

        while pagesToVisit:
            # Start from the beginning of our collection of pages to visit.
            url, levelVisited = pagesToVisit[0]
            if levelVisited > maxLevel:
                print ('[Crawl] levelVisited = ', levelVisited, ' reached maxLevel =', maxLevel, ', Break ..')
                break
            pagesToVisit.pop(0)
            print('[Crawl] levelVisited = ', levelVisited, " Visiting:", url)

            # Fetch the page: returns (html text, links found on the page).
            parser = LinkParser()
            data, _links = parser.get_Content_Links(url, self.rules)
            print ('++++++++++++++++++++++++++++++++++++++++++++++++')

            # Parse the html to print its headings and paragraphs.
            self.extract_content_orientaldaily(data, url)

            # NOTE: link following is currently disabled.  To enable it,
            # queue the discovered links one level deeper:
            # pagesToVisit += [(link, levelVisited + 1) for link in _links]
            
# Smoke run: build a Spider with its default seed URL and rules, then crawl
# the seed level only (this performs a real network request).
spider = Spider()
spider.crawl(maxLevel=0)

Inited a AppleSpider, baseUrls = ('http://orientaldaily.on.cc/cnt/finance/20160717/00202_001.html', 0)
[Crawl] Page to visit =  [('http://orientaldaily.on.cc/cnt/finance/20160717/00202_001.html', 0)]
[Crawl] levelVisited =  0  Visiting: http://orientaldaily.on.cc/cnt/finance/20160717/00202_001.html
++++++++++++++++++++++++++++++++++++++++++++++++
樓價回升，反價、封盤又變回市場常態，有業主「畀票都唔賣」，令買家乾着急，樓市「癲」峰時的荒誕現象重演。沙田好運中心有買家恐防有人「截胡」，漏夜到機場接業主機，交票求成交。大埔新達廣場罕有海景戶，買家為求心頭好「唔睇樓」都即畀票，業主反而「潛水」多日。
樓市再現狂態，業主「吊高嚟賣」屢見不鮮，更有買家比業主更「離地」。
好運中心418萬高價沽

美聯吳梓鋒表示，沙田好運中心榆林閣一個兩房戶，實用面積423方呎，業主與買家口頭議價成功，業主表明去完旅行返港後落實成交，但買家恐防有人捷足先登，或業主轉趨猶豫，連夜與代理趕到機場接業主機，即時交票簽約，最終得償所願以418萬元高市價承接。

美聯楊浩然指，大埔新達廣場4座高層戶，實用面積489方呎，屬罕有享吐露港海景的優質單位，有買家心儀該類單位。甫聽到市場有放盤，無睇樓也寫支票予業主求購，但業主一聽代理「有票在手」，竟突然「潛水」，電話或WhatsApp等一律不覆，數天後才致電代理表明要加價放盤。
此外，近日不少業主放盤個案亦為之瘋狂。美聯鄺啟鋒指，東涌映灣園9座一個中層戶，實用面績757方呎。業主以600萬元放售，惟睇樓客太多，業主表明要「篩選吓」，合適的才准睇樓。
美聯劉浩勤稱，將軍澳維景灣畔5座低層戶，業主開價575萬元，曾反價至580萬元「試水溫」，但有買家有興趣時，業主卻指：「我志在見吓票、見吓人咁，出到580萬元都唔賣。」及後有買家想睇樓亦遭拒絕。
中原亞太區住宅部總裁陳永傑指，英國脫歐公投後，環球貨幣匯價波動，對本港樓市而言

