In [2]:
from apify_client import ApifyClient
import pandas as pd
import re
import os
import argparse
import urllib.request
from urllib.parse import urlparse, urlunparse


def get_api_client():
    APIFY_API_KEY = os.getenv('APIFY_API_TOKEN')
    # if APIFY_API_KEY is None:
    #     raise ValueError('API_TOKEN environment variable not set')
    APIFY_API_KEY = "apify_api_3X6i9qAVnWmMrvPM0VrtU3HFvxfg1c2MMrI6"

    # Initialize the ApifyClient with your API token
    client = ApifyClient(APIFY_API_KEY)

    return client


def get_actors(client):
    actors_list = client.actors().list()
    fb_posts_scraper_id, fb_comments_scraper_id = "", ""
    for actor in actors_list.items:
        if actor['name'] == 'facebook-posts-scraper':
            fb_posts_scraper_id = actor['id']
        if actor['name'] == 'facebook-comments-scraper':
            fb_comments_scraper_id = actor['id']

    return fb_posts_scraper_id, fb_comments_scraper_id


def get_comments(client, results_Limit=25, comments_limit=50):
    fb_posts_scraper_id, fb_comments_scraper_id = get_actors(client)

    # Get last 25 posts from HK01
    input_get_post = {
        "startUrls": [{ "url": "https://www.facebook.com/hk01wemedia/" }],
        "resultsLimit": results_Limit,
    }
    # Run the Actor and wait for it to finish
    run_get_post = client.actor(fb_posts_scraper_id).call(run_input=input_get_post)

    # Fetch and print Actor results from the run's dataset (if there are any)
    output_urls = []
    for item in client.dataset(run_get_post["defaultDatasetId"]).iterate_items():
        if "01新聞" in item.get("text", ""):
            output_urls.append({"url": item["url"]})


    # Prepare the Actor input
    input_get_comments = {
        "startUrls": output_urls,
        "resultsLimit": comments_limit,
        "includeNestedComments": False,
        "viewOption": "RANKED_UNFILTERED",
    }
    # Run the Actor and wait for it to finish
    run_get_comments = client.actor(fb_comments_scraper_id).call(run_input=input_get_comments)

    # Fetch Actor results from the run's dataset and create a DataFrame
    items = list(client.dataset(run_get_comments["defaultDatasetId"]).iterate_items())
    df = pd.DataFrame(items)

    return df


def extract_and_expand_bityl_link(post_title):
    match = re.search(r"全文：(.*?)\n", post_title)
    if match:
        bityl_link = match.group(1)
        try:
            with urllib.request.urlopen(bityl_link) as response:
                parsed_url = urlparse(response.geturl())
                clean_url = urlunparse(parsed_url._replace(query=""))
                return clean_url
        except:
            return None
    else:
        return None


def main(args):
    results_limit = args.results_limit if args is not None else 25

    client = get_api_client()
    df = get_comments(client, results_limit)

    # Add a new column with expanded links
    df["Expanded Link"] = df["postTitle"].apply(extract_and_expand_bityl_link)

    # Save the DataFrame to an Excel file in Colab
    # excel_file_path = "comments_with_expanded_links.xlsx"
    # df.to_excel(excel_file_path, index=False)

    # print("Excel file with expanded links has been successfully created.")
    # print(df)



In [3]:
client = get_api_client()

In [4]:
fb_posts_scraper_id, fb_comments_scraper_id = get_actors(client)

# Get last 25 posts from HK01
input_get_post = {
    "startUrls": [{ "url": "https://www.facebook.com/hk01wemedia/" }],
    "resultsLimit": 5,
}
# Run the Actor and wait for it to finish
run_get_post = client.actor(fb_posts_scraper_id).call(run_input=input_get_post)

# Fetch and print Actor results from the run's dataset (if there are any)
output_urls = []
for item in client.dataset(run_get_post["defaultDatasetId"]).iterate_items():
    if "01新聞" in item.get("text", ""):
        output_urls.append({"url": item["url"]})


# Prepare the Actor input
input_get_comments = {
    "startUrls": output_urls,
    "resultsLimit": 5,
    "includeNestedComments": False,
    "viewOption": "RANKED_UNFILTERED",
}
# Run the Actor and wait for it to finish
run_get_comments = client.actor(fb_comments_scraper_id).call(run_input=input_get_comments)

In [5]:
run_get_comments

{'id': 'gFSSZAiZYiM35LQ0v',
 'actId': 'us5srxAYnsrkgUv2v',
 'userId': 's9KWWmxyr2xr8Zqqr',
 'startedAt': datetime.datetime(2024, 5, 27, 12, 31, 58, 88000, tzinfo=datetime.timezone.utc),
 'finishedAt': datetime.datetime(2024, 5, 27, 12, 32, 12, 200000, tzinfo=datetime.timezone.utc),
 'status': 'SUCCEEDED',
 'statusMessage': 'Finished! Total 6 requests: 6 succeeded, 0 failed.',
 'isStatusMessageTerminal': True,
 'meta': {'origin': 'API',
  'userAgent': 'ApifyClient/1.7.0 (darwin; Python/3.9.19); isAtHome/False'},
 'stats': {'inputBodyLen': 472,
  'rebootCount': 0,
  'restartCount': 0,
  'durationMillis': 13979,
  'resurrectCount': 0,
  'runTimeSecs': 13.979,
  'metamorph': 0,
  'computeUnits': 0.015532222222222222,
  'memAvgBytes': 95237988.79913971,
  'memMaxBytes': 130834432,
  'memCurrentBytes': 122036224,
  'cpuAvgUsage': 13.96185742407352,
  'cpuMaxUsage': 107.11159456118664,
  'cpuCurrentUsage': 5.836574074074074,
  'netRxBytes': 455106,
  'netTxBytes': 82090},
 'options': {'build'

In [11]:
items = list(client.dataset(run_get_comments["defaultDatasetId"]).iterate_items())

In [12]:
items[0]

{'facebookUrl': 'https://www.facebook.com/hk01wemedia/posts/pfbid02ysQ6gSHWVBP74BEcqeRVL6mTFgCHDqBAMMC9AaoKDKxHdmZiKv1sWGiQ82ZkFcsLl',
 'commentUrl': 'https://www.facebook.com/hk01wemedia/posts/pfbid02ysQ6gSHWVBP74BEcqeRVL6mTFgCHDqBAMMC9AaoKDKxHdmZiKv1sWGiQ82ZkFcsLl?comment_id=819532210129127',
 'id': 'Y29tbWVudDo3NTc3ODI5OTY1MjgwMzhfODE5NTMyMjEwMTI5MTI3',
 'feedbackId': 'ZmVlZGJhY2s6NzU3NzgyOTk2NTI4MDM4XzgxOTUzMjIxMDEyOTEyNw==',
 'date': '2024-05-27T12:31:19.000Z',
 'text': '唔好衝動呀！其實呢個世界好美好，除咗病痛外，冇乜嘢係事情解決唔到，祝你早日康復🙏',
 'profilePicture': 'https://scontent.fman2-2.fna.fbcdn.net/v/t1.6435-1/142845230_3589029204500062_8119221974962450491_n.jpg?stp=cp0_dst-jpg_p32x32&_nc_cat=111&ccb=1-7&_nc_sid=5f2048&_nc_ohc=mfZGAn4YlS8Q7kNvgFrm78a&_nc_ht=scontent.fman2-2.fna&oh=00_AYBd4iaawso0oVsChO3_ATC6ibznCFZAzw8pwY3SWbPokg&oe=667BE5FE',
 'profileId': 'pfbid02Hv52PGCoFcZeL1C4JkpGxRSkJtLbqMcq1sUwqTFtBgQkv7Hz4h9pDvUmYKPUVrYvl',
 'profileName': 'Cyrus Lam',
 'likesCount': 0,
 'facebookId': '75778299652803

In [14]:
items[1]

{'facebookUrl': 'https://www.facebook.com/hk01wemedia/posts/pfbid0365gaZ3A1B98X4ae1uDDFKU9c74wev716dtef8hsRRi1monPvLjGATtAXKSh1NXphl',
 'commentUrl': 'https://www.facebook.com/hk01wemedia/posts/pfbid0365gaZ3A1B98X4ae1uDDFKU9c74wev716dtef8hsRRi1monPvLjGATtAXKSh1NXphl?comment_id=389800230037302',
 'id': 'Y29tbWVudDo3NTc3NjIzMTY1MzAxMDZfMzg5ODAwMjMwMDM3MzAy',
 'feedbackId': 'ZmVlZGJhY2s6NzU3NzYyMzE2NTMwMTA2XzM4OTgwMDIzMDAzNzMwMg==',
 'date': '2024-05-27T11:24:31.000Z',
 'text': '願平安🙏🏻',
 'profilePicture': 'https://scontent-lga3-2.xx.fbcdn.net/v/t1.18169-1/533223_10202035453711336_60841803_n.jpg?stp=c37.37.468.468a_cp0_dst-jpg_s32x32&_nc_cat=104&ccb=1-7&_nc_sid=5f2048&_nc_ohc=Oo-pKKK1I-8Q7kNvgEY7dbz&_nc_ht=scontent-lga3-2.xx&oh=00_AYAjSvklnlGsTrRJO3eV0L1Ug9Dg_J-Os6pWGMCGuYbDlQ&oe=667C083A',
 'profileId': 'pfbid0zV18NxsJQX5GdMq2hcC2HZErqJAjSa2cA4UpQAjXusmdDr2jyYyg8Esm1tqypqtjl',
 'profileName': 'Janice Hui',
 'likesCount': 0,
 'facebookId': '757762316530106',
 'postTitle': '【緊貼 01新聞 】女傷者面部、

In [31]:
df = get_comments(client, 5, 5)

ApifyApiError: You must rent a paid Actor in order to run it.

In [28]:
df["expandedLink"] = df["postTitle"].apply(extract_and_expand_bityl_link)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["expandedLink"] = df["postTitle"].apply(extract_and_expand_bityl_link)


In [29]:
df

Unnamed: 0,commentUrl,date,text,postTitle,ExpandedLink,commentId,expandedLink
0,https://www.facebook.com/hk01wemedia/posts/pfb...,2024-05-27T12:31:19.000Z,唔好衝動呀！其實呢個世界好美好，除咗病痛外，冇乜嘢係事情解決唔到，祝你早日康復🙏,【緊貼 01新聞 】何苦要咁傻\n全文：https://bityl.co/Q9NG\n\n旺...,https://www.hk01.com/%E7%AA%81%E7%99%BC/102334...,819532210129127,https://www.hk01.com/%E7%AA%81%E7%99%BC/102334...
1,https://www.facebook.com/hk01wemedia/posts/pfb...,2024-05-27T12:35:29.000Z,🙏🏻🙏🏻,【緊貼 01新聞 】何苦要咁傻\n全文：https://bityl.co/Q9NG\n\n旺...,https://www.hk01.com/%E7%AA%81%E7%99%BC/102334...,986591906268933,https://www.hk01.com/%E7%AA%81%E7%99%BC/102334...
2,https://www.facebook.com/hk01wemedia/posts/pfb...,2024-05-27T12:34:31.000Z,真係服左，咁嘅case都有人比笑\n所以話呢個世界乜衰人都有,【緊貼 01新聞 】何苦要咁傻\n全文：https://bityl.co/Q9NG\n\n旺...,https://www.hk01.com/%E7%AA%81%E7%99%BC/102334...,1623627895137727,https://www.hk01.com/%E7%AA%81%E7%99%BC/102334...
3,https://www.facebook.com/hk01wemedia/posts/pfb...,2024-05-27T12:37:08.000Z,為個男人燒爛曬己塊面值得咩 你毀容佢更只會更加離開你 最緊要係你自己呀,【緊貼 01新聞 】何苦要咁傻\n全文：https://bityl.co/Q9NG\n\n旺...,https://www.hk01.com/%E7%AA%81%E7%99%BC/102334...,803232781768370,https://www.hk01.com/%E7%AA%81%E7%99%BC/102334...
4,https://www.facebook.com/hk01wemedia/posts/757...,2024-05-27T11:24:31.000Z,願平安🙏🏻,【緊貼 01新聞 】女傷者面部、四肢均須貼上敷料、紗布或保鮮紙包紮\n全文：https://...,https://www.hk01.com/%E7%AA%81%E7%99%BC/102329...,389800230037302,https://www.hk01.com/%E7%AA%81%E7%99%BC/102329...
5,https://www.facebook.com/hk01wemedia/posts/757...,2024-05-27T11:08:05.000Z,🙏,【緊貼 01新聞 】女傷者面部、四肢均須貼上敷料、紗布或保鮮紙包紮\n全文：https://...,https://www.hk01.com/%E7%AA%81%E7%99%BC/102329...,368267072424294,https://www.hk01.com/%E7%AA%81%E7%99%BC/102329...
6,https://www.facebook.com/hk01wemedia/posts/757...,2024-05-27T11:40:15.000Z,毀容 慘過當場死,【緊貼 01新聞 】女傷者面部、四肢均須貼上敷料、紗布或保鮮紙包紮\n全文：https://...,https://www.hk01.com/%E7%AA%81%E7%99%BC/102329...,873114331521625,https://www.hk01.com/%E7%AA%81%E7%99%BC/102329...
7,https://www.facebook.com/hk01wemedia/posts/757...,2024-05-27T10:59:27.000Z,比死更難受,【緊貼 01新聞 】女傷者面部、四肢均須貼上敷料、紗布或保鮮紙包紮\n全文：https://...,https://www.hk01.com/%E7%AA%81%E7%99%BC/102329...,1949536242149262,https://www.hk01.com/%E7%AA%81%E7%99%BC/102329...
8,https://www.facebook.com/hk01wemedia/posts/757...,2024-05-27T11:17:21.000Z,願傷者平平安安🙏🏿,【緊貼 01新聞 】女傷者面部、四肢均須貼上敷料、紗布或保鮮紙包紮\n全文：https://...,https://www.hk01.com/%E7%AA%81%E7%99%BC/102329...,1219297022813916,https://www.hk01.com/%E7%AA%81%E7%99%BC/102329...
9,https://www.facebook.com/hk01wemedia/posts/757...,2024-05-27T10:31:32.000Z,係個後備月台掛幾塊板叫主題🥰,【緊貼 01新聞 】仲會推出紀念票\n全文：https://bityl.co/Q9Fk\n\...,https://www.hk01.com/%E7%A4%BE%E6%9C%83%E6%96%...,479097471351472,https://www.hk01.com/%E7%A4%BE%E6%9C%83%E6%96%...


In [24]:
df = df[['commentUrl', 'date', 'text', 'postTitle', 'expandedLink']]

In [25]:
# df["comment_id"] = df.apply(lambda row: [id for id in row['url'].split('?')[1].split('=')[1].split(',') if id not in url_dict.get(row['url'], [])], axis=1)

df["commentId"] = df["commentUrl"].apply(lambda url: url.split('?')[1].split('=')[1])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["commentId"] = df["commentUrl"].apply(lambda url: url.split('?')[1].split('=')[1])


In [26]:
df

Unnamed: 0,commentUrl,date,text,postTitle,ExpandedLink,commentId
0,https://www.facebook.com/hk01wemedia/posts/pfb...,2024-05-27T12:31:19.000Z,唔好衝動呀！其實呢個世界好美好，除咗病痛外，冇乜嘢係事情解決唔到，祝你早日康復🙏,【緊貼 01新聞 】何苦要咁傻\n全文：https://bityl.co/Q9NG\n\n旺...,https://www.hk01.com/%E7%AA%81%E7%99%BC/102334...,819532210129127
1,https://www.facebook.com/hk01wemedia/posts/pfb...,2024-05-27T12:35:29.000Z,🙏🏻🙏🏻,【緊貼 01新聞 】何苦要咁傻\n全文：https://bityl.co/Q9NG\n\n旺...,https://www.hk01.com/%E7%AA%81%E7%99%BC/102334...,986591906268933
2,https://www.facebook.com/hk01wemedia/posts/pfb...,2024-05-27T12:34:31.000Z,真係服左，咁嘅case都有人比笑\n所以話呢個世界乜衰人都有,【緊貼 01新聞 】何苦要咁傻\n全文：https://bityl.co/Q9NG\n\n旺...,https://www.hk01.com/%E7%AA%81%E7%99%BC/102334...,1623627895137727
3,https://www.facebook.com/hk01wemedia/posts/pfb...,2024-05-27T12:37:08.000Z,為個男人燒爛曬己塊面值得咩 你毀容佢更只會更加離開你 最緊要係你自己呀,【緊貼 01新聞 】何苦要咁傻\n全文：https://bityl.co/Q9NG\n\n旺...,https://www.hk01.com/%E7%AA%81%E7%99%BC/102334...,803232781768370
4,https://www.facebook.com/hk01wemedia/posts/757...,2024-05-27T11:24:31.000Z,願平安🙏🏻,【緊貼 01新聞 】女傷者面部、四肢均須貼上敷料、紗布或保鮮紙包紮\n全文：https://...,https://www.hk01.com/%E7%AA%81%E7%99%BC/102329...,389800230037302
5,https://www.facebook.com/hk01wemedia/posts/757...,2024-05-27T11:08:05.000Z,🙏,【緊貼 01新聞 】女傷者面部、四肢均須貼上敷料、紗布或保鮮紙包紮\n全文：https://...,https://www.hk01.com/%E7%AA%81%E7%99%BC/102329...,368267072424294
6,https://www.facebook.com/hk01wemedia/posts/757...,2024-05-27T11:40:15.000Z,毀容 慘過當場死,【緊貼 01新聞 】女傷者面部、四肢均須貼上敷料、紗布或保鮮紙包紮\n全文：https://...,https://www.hk01.com/%E7%AA%81%E7%99%BC/102329...,873114331521625
7,https://www.facebook.com/hk01wemedia/posts/757...,2024-05-27T10:59:27.000Z,比死更難受,【緊貼 01新聞 】女傷者面部、四肢均須貼上敷料、紗布或保鮮紙包紮\n全文：https://...,https://www.hk01.com/%E7%AA%81%E7%99%BC/102329...,1949536242149262
8,https://www.facebook.com/hk01wemedia/posts/757...,2024-05-27T11:17:21.000Z,願傷者平平安安🙏🏿,【緊貼 01新聞 】女傷者面部、四肢均須貼上敷料、紗布或保鮮紙包紮\n全文：https://...,https://www.hk01.com/%E7%AA%81%E7%99%BC/102329...,1219297022813916
9,https://www.facebook.com/hk01wemedia/posts/757...,2024-05-27T10:31:32.000Z,係個後備月台掛幾塊板叫主題🥰,【緊貼 01新聞 】仲會推出紀念票\n全文：https://bityl.co/Q9Fk\n\...,https://www.hk01.com/%E7%A4%BE%E6%9C%83%E6%96%...,479097471351472


In [30]:
del df