In [2]:
from facebook_scraper import get_posts

group: group id, to scrape groups instead of pages. Default is None.

pages: how many pages of posts to request. The first 2 pages may have no results, so try a number greater than 2. Default is 10.

timeout: how many seconds to wait before timing out. Default is 5.

credentials: tuple of user and password to login before requesting the posts. Default is None.

extra_info: bool, if true the function will try to do an extra request to get the post reactions. Default is False.

youtube_dl: bool, use youtube-dl for (high-quality) video extraction. You need to have youtube-dl installed in your environment. Default is False.

post_urls: list, URLs or post IDs to extract posts from. Alternative to fetching based on username.

cookies: one of:
- The path to a file containing cookies in Netscape or JSON format. You can extract cookies from your browser after logging into Facebook with an extension like EditThisCookie (Chrome) or Cookie Quick Manager (Firefox). Make sure that you include both the c_user cookie and the xs cookie; otherwise you will get an InvalidCookies exception.
- A CookieJar.
- A dictionary that can be converted to a CookieJar with cookiejar_from_dict.

options: Dictionary of options. Set options={"comments": True} to extract comments, set options={"reactors": True} to extract the people reacting to the post. Both comments and reactors can also be set to a number to set a limit for the amount of comments/reactors to retrieve. The default limit for comments is 5000 and the default limit for reactors is 3000. Set options={"progress": True} to get a tqdm progress bar while extracting comments and replies. Set options={"allow_extra_requests": False} to disable making extra requests when extracting post data (required for some things like full text and image links). Set options={"posts_per_page": 200} to request 200 posts per page. The default is 4.

In [3]:
# Page identifier used as the default `page` argument of fetch_posts.
PAGE_NAME = 'coherenciaporfavor'

In [155]:
import copy
import yaml
from pymongo import MongoClient, errors
from facebook_scraper import get_posts

DOMAIN = 'localhost'  # MongoDB host name
PORT = '27017'  # kept as a string: it is concatenated into the "host:port" address
PAGE_NAME = "coherenciaporfavor"  # default page handed to get_posts by fetch_posts
DB_NAME = 'scrapes'  # database name looked up on the MongoDB server


def fetch_posts(page=PAGE_NAME, pages=10, **kwargs):
    """Collect everything yielded by ``get_posts`` into a list.

    Args:
        page: Forwarded to ``get_posts`` as its first positional argument.
        pages: Forwarded to ``get_posts`` as ``pages=``.
        **kwargs: Forwarded unchanged to ``get_posts``.

    Returns:
        list: The items produced by ``get_posts``, in iteration order.
    """
    post_iter = get_posts(page, pages=pages, **kwargs)
    return list(post_iter)


def get_objects(posts):
    """Split scraped posts into flat lists of posts, comments and replies.

    The input is deep-copied first, so the caller's ``posts`` are never
    mutated.

    Args:
        posts: List of post dicts. A post's ``comments_full`` entry may be a
            list of comment dicts, empty, ``None`` or missing altogether.
            A post that carries comments must also have ``post_id``; a
            comment that carries replies must also have ``comment_id``.

    Returns:
        tuple: ``(posts, comments, replies)`` where
            * ``posts`` are the copied posts without ``comments_full``,
            * ``comments`` are the comment dicts without ``replies`` and with
              ``_post_id`` set to the parent post's ``post_id``,
            * ``replies`` are the reply dicts with ``_comment_id`` set to the
              parent comment's ``comment_id``.
    """
    flat_posts, flat_comments, flat_replies = [], [], []

    for post in copy.deepcopy(posts):
        # pop() with a default tolerates posts that have no 'comments_full'
        # key at all; `or []` tolerates None / empty values.
        for comment in post.pop('comments_full', None) or []:
            comment['_post_id'] = post['post_id']

            # Same tolerance for comments whose 'replies' is missing or None.
            for reply in comment.pop('replies', None) or []:
                reply['_comment_id'] = comment['comment_id']
                flat_replies.append(reply)

            flat_comments.append(comment)

        flat_posts.append(post)

    return flat_posts, flat_comments, flat_replies


def fetch_scrape_args(filepath='scrape_args.yml'):
    """Load scraper arguments from a YAML file.

    Keys read from the file:
        * ``credentials`` -- mapping with ``user`` and ``password``
        * ``post_urls`` -- list of post URLs / ids
        * ``comments``, ``reactors``, ``posts_per_page`` -- copied into the
          options dict when present

    Missing or empty sections are treated as "not provided" and do not
    raise.

    Args:
        filepath: Path of the YAML file to read.

    Returns:
        list: ``[credentials, post_urls, options]`` where ``credentials`` is a
        ``(user, password)`` tuple or ``None``, ``post_urls`` is the value
        from the file or ``None`` when absent/empty, and ``options`` is a
        dict holding whichever of the option keys the file defines.
    """
    with open(filepath, 'r') as file:
        # An empty YAML document loads as None; treat it as "no arguments".
        args = yaml.safe_load(file) or {}

    # `credentials:` may be missing or left blank (None) in the file.
    creds = args.get('credentials') or {}
    user, password = creds.get('user'), creds.get('password')
    credentials = (user, password) if user and password else None

    post_urls = args.get('post_urls') or None

    options = {
        key: args[key]
        for key in ('comments', 'reactors', 'posts_per_page')
        if key in args
    }
    return [credentials, post_urls, options]

In [139]:
# Extract comments and reactors for every post; keep the progress bar off.
options = dict(comments=True, progress=False, reactors=True)

posts = fetch_posts(PAGE_NAME, pages=10, options=options)

In [156]:
# Flatten the scraped posts into separate post / comment / reply lists.
posts_clean, comments, replies = get_objects(posts)

In [191]:
# pandas is imported here because no earlier cell imports it; without this
# line a fresh-kernel "Run All" fails with NameError on `pd`.
import pandas as pd

# Number of scraped comments per parent post.
pd.DataFrame(comments)["_post_id"].value_counts()

1593778564296339    30
1593028127704716    30
1593315441009318    30
1593351294339066    30
1591223194551876    30
1592920454382150    30
1593711840969678    30
1593287297678799    17
Name: _post_id, dtype: int64

In [189]:
import pandas as pd

# Tabulate the flattened replies; `_comment_id` is the parent comment's id.
pd.DataFrame(replies)

Unnamed: 0,comment_id,comment_url,commenter_id,commenter_url,commenter_name,commenter_meta,comment_text,comment_time,comment_image,_comment_id
0,1593860034288192,https://facebook.com/1593860034288192,100000938826998,,Mercedes Conde,,Horacio Magni:\nViendo como estamos hacen bien...,2021-08-08 14:35:47.121672,,1593807614293434
1,1593783554295840,https://facebook.com/1593783554295840,100021286542760,,Leonardo Adrian,,Mariposa Mariposa y lo decis vos q votastes a ...,2021-08-08 13:26:47.490574,,1593781634296032
2,1593786734295522,https://facebook.com/1593786734295522,1569006583,,Mariposa Mariposa,,Leonardo AdrianSeguro q vote Macri y muy orgul...,2021-08-08 13:26:47.494281,,1593781634296032
3,1593830974291098,https://facebook.com/1593830974291098,100011119179810,,Jem Calca,,"Leonardo Adrian...ahhh,p\nero Macri!😂,vos segu...",2021-08-08 14:26:47.499842,,1593781634296032
4,1593857967621732,https://facebook.com/1593857967621732,100000938826998,,Mercedes Conde,,Mariposa Mariposa: es el comentario de un kk e...,2021-08-08 14:31:47.504341,,1593781634296032
...,...,...,...,...,...,...,...,...,...,...
109,1591963584477837,https://facebook.com/1591963584477837,1426541080,,Cyn Ti,,Ésta vino de uruguay.. igual acá hacen el Mast...,NaT,,1591250884549107
110,1591477841193078,https://facebook.com/1591477841193078,100000795819748,,Sergio Luciano Barrionuevo,,Nazareno Anselmi ..la doble moral zurda de est...,NaT,,1591248341216028
111,1591549387852590,https://facebook.com/1591549387852590,100065515831817,,Judith Adragna,,Nazareno Anselmi Los zurdos no sirven para pro...,NaT,,1591248341216028
112,1591245264549669,https://facebook.com/1591245264549669,100003113936141,,Paula Sosa Alvarez,,Analia Eiras 😒 Tal cual...,NaT,,1591231861217676


In [5]:
# NOTE(review): same signature and behavior as the fetch_posts defined earlier
# in this notebook; this later definition shadows it.
def fetch_posts(page=PAGE_NAME, pages=10, **kwargs):
    """Return the posts yielded by ``get_posts`` as a list.

    ``page`` is passed positionally, ``pages`` by keyword, and any extra
    keyword arguments are forwarded unchanged.
    """
    return [*get_posts(page, pages=pages, **kwargs)]


if __name__ == "__main__":

    # Scraper options. NOTE(review): defined but not used anywhere in this cell.
    options = {
        "comments": True,
        "progress": True,
        "reactors": True,
        # "posts_per_page": 200,
    }

    # Stay None when the server cannot be reached.
    client = None
    db = None

    try:
        # try to instantiate a client instance
        # NOTE(review): credentials are hardcoded; move them to environment
        # variables or a secrets store before sharing this notebook.
        client = MongoClient(
            host=DOMAIN + ":" + PORT,
            serverSelectionTimeoutMS=5000,  # 5 second timeout
            username="admin",
            password="password",
        )
        # server_info() needs a reachable server, so a connection problem
        # raises ServerSelectionTimeoutError here.
        print(f"Server version: {client.server_info()['version']}")

        database_names = client.list_database_names()
        print("\nDatabases:", database_names)

        if DB_NAME not in database_names:
            # MongoDB creates a database lazily on the first write to it.
            print(f"Database '{DB_NAME}' does not exist yet; "
                  "it will be created on first insert.")

        # Database handles are obtained by subscripting the client; the handle
        # must be taken only after the client exists.
        db = client[DB_NAME]

    except errors.ServerSelectionTimeoutError as err:
        print("pymongo ERROR:", err)


pymongo ERROR: No servers found yet, Timeout: 3.0s, Topology Description: <TopologyDescription id: 60d806f8df8d9d6f87ac449b, topology_type: Single, servers: [<ServerDescription ('172.17.0.4', 27017) server_type: Unknown, rtt: None>]>

databases: []
