# 爬蟲模組
利用Facebook graph api在臉書粉絲團擷取訪客貼文
### 安裝所需套件

In [1]:
!pip3 install requests



### 設定存取TOKEN
注意：該TOKEN會過期 因此需要定期更新

In [2]:
access_token="EAACEdEose0cBACXRazrzG3tLncLGJBHZCcPY2IZC7pgZBGwOK57ECGRg1Wogtnpj97Vwq1duK5610LumBTpaMMHthX6JlZAZClOc3eQQYuUD5y0SynEpegKN63nZCUofiCVHgwqyJFZAnEIVm9SRim8WWCr5bvYq8rGg8ZBJ8GPZByhOo9N23ZBI85DDusl1XhIAXA5Xr9aztdVQZDZD"

### 設定爬蟲程式參數
若設定為-1則為全部抓取

In [3]:
# 貼文抓取數量
post_limit=1000
# 評論抓取數量
comment_limit=1000
# 輸出檔案位置
output_file_path="/Users/Steve/PycharmProjects/Phantacy/result_all.json"
# 粉絲團ID
fan_page_id="818773044827025"

### 主程式

In [4]:
import datetime
import io
import json
import re

import threading

import requests
time_format = "%Y-%m-%dT%H:%M:%S+0000"


def get_request_url(api_version, request_object, request_field, facebook_access_token):
    api_url_path = 'https://graph.facebook.com'
    request_format = '{}/{}/{}/?fields={}&access_token={}'
    request_url = request_format \
        .format(api_url_path, api_version, request_object, request_field, facebook_access_token)
    return request_url


def write_json_to_result_file(py_object, file_name):
    with io.open(file_name, 'w', encoding='utf-8') as result_file:
        data = json.dumps(py_object, default=lambda o: o.__dict__,
                          sort_keys=True, indent=4, ensure_ascii=False)
        result_file.write(data)


class Post:
    prev_created_time = None

    def __init__(self, pid, pwid, time, pcontent, fpr, pcids):
        create_date = datetime.datetime.strptime(time, time_format)
        self.post_id = pid
        self.post_writer_id = pwid
        self.post_created_time = time
        self.post_content = pcontent
        self.fan_page_reply = fpr
        self.post_comment_ids = pcids
        # self.tokens = tokenizer.tokenize(pcontent)
        time_elapsed = Post.prev_created_time - create_date if Post.prev_created_time is not None else None
        self.time_interval = int(time_elapsed.total_seconds()) if time_elapsed is not None else None

    @staticmethod
    def set_prev_created_time(prev):
        Post.prev_created_time = datetime.datetime.strptime(prev, time_format)

    def __repr__(self):
        return u"文章 id: {}  [{} ({})]   [全部的評論: {}] [粉絲團回覆: {}]\n{}\n".format(self.post_id,
                                                                              self.post_created_time,
                                                                              self.time_interval,
                                                                              len(
                                                                                  self.post_comment_ids),
                                                                              self.fan_page_reply,
                                                                              self.post_content
                                                                              )


class FanPageCrawler:
    # do not set this over 100
    api_request_post_max_count = 50
    api_request_review_max_count = 50

    def __init__(self, token, fid, output_ui):
        self.output_ui = output_ui
        self.access_token = token
        self.fan_page_id = fid
        self.request_fields = "visitor_posts.limit(1){comments.limit(2){from},created_time,message,from}"

    def generate_post_request_url(self, posts_limit, comments_limit):
        if posts_limit > self.api_request_post_max_count or posts_limit == -1:
            posts_limit = self.api_request_post_max_count
        if comments_limit > self.api_request_review_max_count or comments_limit == -1:
            comments_limit = self.api_request_post_max_count
        self.request_fields = self.request_fields \
            .replace("(1)", "({})".format(posts_limit)) \
            .replace("(2)", "({})".format(comments_limit))

        request_url = get_request_url(api_version='v2.10',
                                      request_object=self.fan_page_id,
                                      request_field=self.request_fields,
                                      facebook_access_token=self.access_token)
        return request_url

    def crawl_posts(self, output_file_path, posts_limit=-1, comments_limit=-1):
        self.print_to_output("開始抓取資料")
        request_url = self.generate_post_request_url(posts_limit, comments_limit)
        response = requests.get(request_url).json()
        if 'error' in response:
            self.print_to_output('存取以下url錯誤: \n{}'.format(request_url))
            self.print_to_output('錯誤訊息: {}'.format(response['error']['message']))
            exit(1)
        post_output_list = []
        for pid, post in enumerate(self.get_all_elements(response['visitor_posts'], '貼文已抓取.', posts_limit)):
            if 'message' in post and 'comments' in post:
                post_id = str(post['id'])
                post_writer_id = int(post['from'][u'id'])
                post_created_time = str(post['created_time'])
                post_content = self.message_pre_process(post['message'])
                post_comments = post['comments']
                # Get all reviewer ids

                post_comment_ids = [int(comment['from'][u'id']) for comment in
                                    self.get_all_elements(post_comments,
                                                          '則評論已從貼文[{pid}]抓取'.format(pid=pid + 1),
                                                          comments_limit)]

                fan_page_reply = True if int(self.fan_page_id) in post_comment_ids else False
                post_output_list.append(
                    Post(post_id, post_writer_id, post_created_time, post_content, fan_page_reply, post_comment_ids))
                Post.set_prev_created_time(post_created_time)
        for post in post_output_list:
            self.print_to_output(post)

        self.print_to_output('抓取完畢，寫入檔案中')
        write_json_to_result_file(post_output_list, output_file_path)
        self.print_to_output('寫入檔案完畢，可開始分析')

    def message_pre_process(self, message):
        message = str(message).replace('\n', '')
        message = re.sub("http[s]?:\/\/[^\/\s]+\/[^\s]+", "", message)
        return message

    def get_all_elements(self, response, message, limit):
        result_list = []
        while 'paging' in response:
            result_list.extend(response['data'])
            if len(result_list) > limit != -1:
                break
            if 'next' in response['paging']:
                next_request_url = response['paging']['next']
                # self.print_to_output() next_request_url
                response = requests.get(next_request_url).json()
                # self.print_to_output() 'Currently fetch {} {}s'.format(len(result_list), element_name)
            else:
                break
        number = len(result_list) if limit == -1 else limit
        self.print_to_output("總共 {} {}".format(number, message))
        return result_list[:number]

    def print_to_output(self, text):
        try:
            self.output_ui.refresh_text(text)
        except:
            print(text)




class CrawlerTask(threading.Thread):
    def __init__(self, output_ui):
        super(CrawlerTask, self).__init__()
        self.is_complete=False
        self.output_ui = output_ui

    def print_to_output(self, text):
        try:
            self.output_ui.refresh_text(text)
        except:
            print(text)


    def run(self):
        crawler = FanPageCrawler(token=access_token,
                                 fid=fan_page_id, output_ui=self.output_ui)
        crawler.crawl_posts(posts_limit=post_limit,
                            comments_limit=comment_limit,
                            output_file_path=output_file_path)
        self.is_complete=True





呼叫主程式開始執行

In [5]:
if __name__ == '__main__':
    CrawlerTask(None).run()

開始抓取資料
總共 1000 貼文已抓取.
總共 1000 則評論已從貼文[1]抓取
總共 1000 則評論已從貼文[17]抓取
總共 1000 則評論已從貼文[19]抓取
總共 1000 則評論已從貼文[38]抓取
總共 1000 則評論已從貼文[39]抓取
總共 1000 則評論已從貼文[42]抓取
總共 1000 則評論已從貼文[45]抓取
總共 1000 則評論已從貼文[50]抓取
總共 1000 則評論已從貼文[55]抓取
總共 1000 則評論已從貼文[57]抓取
總共 1000 則評論已從貼文[59]抓取
總共 1000 則評論已從貼文[60]抓取
總共 1000 則評論已從貼文[66]抓取
總共 1000 則評論已從貼文[69]抓取
總共 1000 則評論已從貼文[71]抓取
總共 1000 則評論已從貼文[73]抓取
總共 1000 則評論已從貼文[74]抓取
總共 1000 則評論已從貼文[81]抓取
總共 1000 則評論已從貼文[82]抓取
總共 1000 則評論已從貼文[85]抓取
總共 1000 則評論已從貼文[86]抓取
總共 1000 則評論已從貼文[88]抓取
總共 1000 則評論已從貼文[94]抓取
總共 1000 則評論已從貼文[99]抓取
總共 1000 則評論已從貼文[100]抓取
總共 1000 則評論已從貼文[103]抓取
總共 1000 則評論已從貼文[106]抓取
總共 1000 則評論已從貼文[109]抓取
總共 1000 則評論已從貼文[112]抓取
總共 1000 則評論已從貼文[114]抓取
總共 1000 則評論已從貼文[117]抓取
總共 1000 則評論已從貼文[118]抓取
總共 1000 則評論已從貼文[121]抓取
總共 1000 則評論已從貼文[122]抓取
總共 1000 則評論已從貼文[123]抓取
總共 1000 則評論已從貼文[124]抓取
總共 1000 則評論已從貼文[126]抓取
總共 1000 則評論已從貼文[127]抓取
總共 1000 則評論已從貼文[128]抓取
總共 1000 則評論已從貼文[130]抓取
總共 1000 則評論已從貼文[131]抓取
總共 1000 則評論已從貼文[132]抓取
總共 1000 則評論已從貼文[134]抓取
總共 1000 則評論已從貼

總共 1000 則評論已從貼文[775]抓取
總共 1000 則評論已從貼文[776]抓取
總共 1000 則評論已從貼文[777]抓取
總共 1000 則評論已從貼文[779]抓取
總共 1000 則評論已從貼文[780]抓取
總共 1000 則評論已從貼文[781]抓取
總共 1000 則評論已從貼文[782]抓取
總共 1000 則評論已從貼文[783]抓取
總共 1000 則評論已從貼文[785]抓取
總共 1000 則評論已從貼文[787]抓取
總共 1000 則評論已從貼文[789]抓取
總共 1000 則評論已從貼文[791]抓取
總共 1000 則評論已從貼文[792]抓取
總共 1000 則評論已從貼文[793]抓取
總共 1000 則評論已從貼文[795]抓取
總共 1000 則評論已從貼文[796]抓取
總共 1000 則評論已從貼文[797]抓取
總共 1000 則評論已從貼文[799]抓取
總共 1000 則評論已從貼文[801]抓取
總共 1000 則評論已從貼文[802]抓取
總共 1000 則評論已從貼文[803]抓取
總共 1000 則評論已從貼文[806]抓取
總共 1000 則評論已從貼文[808]抓取
總共 1000 則評論已從貼文[809]抓取
總共 1000 則評論已從貼文[811]抓取
總共 1000 則評論已從貼文[812]抓取
總共 1000 則評論已從貼文[813]抓取
總共 1000 則評論已從貼文[815]抓取
總共 1000 則評論已從貼文[816]抓取
總共 1000 則評論已從貼文[817]抓取
總共 1000 則評論已從貼文[818]抓取
總共 1000 則評論已從貼文[819]抓取
總共 1000 則評論已從貼文[820]抓取
總共 1000 則評論已從貼文[822]抓取
總共 1000 則評論已從貼文[823]抓取
總共 1000 則評論已從貼文[824]抓取
總共 1000 則評論已從貼文[825]抓取
總共 1000 則評論已從貼文[827]抓取
總共 1000 則評論已從貼文[828]抓取
總共 1000 則評論已從貼文[829]抓取
總共 1000 則評論已從貼文[830]抓取
總共 1000 則評論已從貼文[833]抓取
總共 1000 則評論已從貼文[837]抓取
總共 1000 則評論