In [2]:
from facebook_scraper import get_posts

group: group id, to scrape groups instead of pages. Default is None.

pages: how many pages of posts to request, the first 2 pages may have no results, so try with a number greater than 2. Default is 10.

timeout: how many seconds to wait before timing out. Default is 5.

credentials: tuple of user and password to login before requesting the posts. Default is None.

extra_info: bool, if true the function will try to do an extra request to get the post reactions. Default is False.

youtube_dl: bool, use Youtube-DL for (high-quality) video extraction. You need to have youtube-dl installed on your 
environment. Default is False.

post_urls: list, URLs or post IDs to extract posts from. Alternative to fetching based on username.

cookies: One of:
- The path to a file containing cookies in Netscape or JSON format. You can extract cookies from your browser after logging into Facebook with an extension like EditThisCookie (Chrome) or Cookie Quick Manager (Firefox). Make sure that you include both the c_user cookie and the xs cookie; you will get an InvalidCookies exception if you don't.
- A CookieJar.
- A dictionary that can be converted to a CookieJar with cookiejar_from_dict.

options: Dictionary of options. Set options={"comments": True} to extract comments, set options={"reactors": True} to extract the people reacting to the post. Both comments and reactors can also be set to a number to set a limit for the amount of comments/reactors to retrieve. The default limit for comments is 5000 and the default limit for reactors is 3000. Set options={"progress": True} to get a tqdm progress bar while extracting comments and replies. Set options={"allow_extra_requests": False} to disable making extra requests when extracting post data (required for some things like full text and image links). Set options={"posts_per_page": 200} to request 200 posts per page. The default is 4.

In [3]:
import copy

import pandas as pd
import sqlalchemy
import yaml
from facebook_scraper import get_posts

# Host/port/database name — presumably connection settings for the database the
# scrapes are stored in; none of the three is used in the cells shown here.
# TODO(review): confirm and document where they are consumed.
DOMAIN = "localhost"
PORT = "5432"
# Default Facebook page name; used as the default `page` argument of fetch_posts.
PAGE_NAME = "coherenciaporfavor"
DB_NAME = "scrapes"


def fetch_posts(
    page=PAGE_NAME,
    pages=10,
    credentials=None,
    post_urls=None,
    cookies=None,
    options=None,
):
    """Scrape posts with ``facebook_scraper.get_posts`` and return them as a list.

    Two modes, chosen by ``post_urls``:

    * ``post_urls`` given (non-empty): only those posts are fetched; ``page``
      and ``pages`` are not forwarded, and a notice is printed when ``page``
      is set.
    * otherwise: ``page`` is passed positionally to ``get_posts`` together
      with ``pages``.

    Args:
        page: page name to scrape. Defaults to ``PAGE_NAME``.
        pages: how many pages of posts to request.
        credentials: ``(user, password)`` tuple, or ``None``.
        post_urls: list of URLs or post IDs to fetch instead of a page.
        cookies: cookies value forwarded unchanged to ``get_posts``.
        options: dictionary of scraper options (e.g. ``{"comments": True}``).
            ``None`` is replaced by an empty dictionary before the call.

    Returns:
        list: every post yielded by ``get_posts``, fully materialised.
    """
    # get_posts is documented as taking a dictionary of options, so never
    # forward the None default as-is.
    shared_kwargs = {
        "credentials": credentials,
        "cookies": cookies,
        "options": {} if options is None else options,
    }

    if post_urls:
        if page:
            print("Ignoring 'page' argument.")
        return list(get_posts(post_urls=post_urls, **shared_kwargs))

    return list(get_posts(page, pages=pages, **shared_kwargs))


def get_objects(posts):
    """Split scraped posts into flat lists of posts, comments and replies.

    The work is done on a deep copy, so the ``posts`` argument is not modified.

    Args:
        posts: list of post dicts. A post may carry a ``"comments_full"`` list
            of comment dicts, and a comment may carry a ``"replies"`` list of
            reply dicts. A missing, ``None`` or empty value at either level is
            treated as "nothing to extract".

    Returns:
        tuple: ``(posts, comments, replies)`` where

        * each post has had its ``"comments_full"`` key removed,
        * each comment has had its ``"replies"`` key removed and gained
          ``"_post_id"`` (the parent post's ``"post_id"``),
        * each reply gained ``"_comment_id"`` (the parent comment's
          ``"comment_id"``).

    Raises:
        KeyError: if a post with comments has no ``"post_id"``, or a comment
            with replies has no ``"comment_id"``.

    Note:
        Feeding the returned posts back in yields no comments, because the
        ``"comments_full"`` key has already been stripped from them.
    """
    flat_posts, flat_comments, flat_replies = [], [], []

    for post in copy.deepcopy(posts):
        # pop() both reads and strips the nested list; `or []` covers the
        # None / missing cases without a separate branch.
        for comment in post.pop("comments_full", None) or []:
            comment["_post_id"] = post["post_id"]

            for reply in comment.pop("replies", None) or []:
                reply["_comment_id"] = comment["comment_id"]
                flat_replies.append(reply)

            flat_comments.append(comment)

        flat_posts.append(post)

    return flat_posts, flat_comments, flat_replies


def fetch_scrape_args(filepath="scrape_args.yml"):
    """Load the scraping arguments from a YAML file.

    Recognised top-level keys, all optional: ``credentials`` (a mapping with
    ``user`` and ``password``), ``page``, ``pages``, ``post_urls``,
    ``cookies``, and the option keys ``comments``, ``progress``, ``reactors``
    and ``posts_per_page``. A missing key behaves like an empty one.

    Args:
        filepath: path of the YAML file to read.

    Returns:
        list: ``[page, credentials, pages, post_urls, cookies, options]``.
        ``credentials`` is a ``(user, password)`` tuple only when both values
        are truthy, otherwise ``None``. ``page``, ``pages``, ``post_urls`` and
        ``cookies`` are ``None`` when absent or falsy. ``options`` is a dict
        holding whichever option keys are present in the file.

    Raises:
        OSError: if the file cannot be opened.
    """
    with open(filepath, "r") as file:
        # An empty YAML document loads as None; treat it as "no settings".
        args = yaml.safe_load(file) or {}

    # `credentials:` with no value loads as None, hence the `or {}`.
    login = args.get("credentials") or {}
    user, password = login.get("user"), login.get("password")
    credentials = (user, password) if user and password else None

    option_keys = ("comments", "progress", "reactors", "posts_per_page")
    options = {key: args[key] for key in option_keys if key in args}

    return [
        args.get("page") or None,
        credentials,
        args.get("pages") or None,
        args.get("post_urls") or None,
        args.get("cookies") or None,
        options,
    ]

In [5]:
# Load the scrape configuration from ../scrape_args.yml (path is relative to
# the kernel's working directory) and unpack it in the order returned by
# fetch_scrape_args.
page, credentials, pages, post_urls, cookies, options = fetch_scrape_args(
    "../scrape_args.yml"
)

In [8]:
# Manually override the page count loaded from the YAML file.
# NOTE(review): the following cell calls fetch_scrape_args again and rebinds
# `pages`, so on a top-to-bottom run this override is lost — confirm intent.
pages = 2

In [None]:
# End-to-end run: reload the configuration, scrape, then flatten the result.
# This repeats the load cell above and the two cells that follow.
page, credentials, pages, post_urls, cookies, options = fetch_scrape_args(
    "../scrape_args.yml"
)

# cookies are deliberately not forwarded here; presumably because credentials
# are used for login instead — TODO confirm.
posts = fetch_posts(
    page=page,
    pages=pages,
    post_urls=post_urls,
    credentials=credentials,
#    cookies=cookies,
    options=options,
)

# Rebinds `posts` to the flattened posts (without "comments_full"); the raw
# scrape result is no longer reachable under this name afterwards.
posts, comments, replies = get_objects(posts)

In [10]:
# Scrape the posts using the arguments loaded from the YAML file.
# cookies are deliberately not forwarded; presumably because credentials are
# used for login instead — TODO confirm.
posts = fetch_posts(
    page=page,
    pages=pages,
    post_urls=post_urls,
    credentials=credentials,
#    cookies=cookies,
    options=options,
)


Ignoring 'page' argument.




  0%|          | 0/5000 [00:00<?, ?it/s]

  0%|          | 0/21 [00:00<?, ?it/s]

  0%|          | 0/5000 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

In [11]:
# Flatten the scrape into posts / comments / replies.
# NOTE(review): `posts` is rebound to the flattened posts, so re-running this
# cell without re-running the fetch cell feeds already-stripped posts back in.
posts, comments, replies = get_objects(posts)

In [12]:
# Display the flattened posts returned by get_objects.
posts

[{'original_request_url': 1584781918529337,
  'post_url': 'https://m.facebook.com/1584781958529333',
  'post_id': '1584781958529333',
  'text': 'Coherencia Por Favor',
  'post_text': 'Coherencia Por Favor',
  'shared_text': None,
  'time': datetime.datetime(2021, 7, 26, 0, 0),
  'image': 'https://scontent.faep9-1.fna.fbcdn.net/v/t1.6435-9/fr/cp0/e15/q65/217326948_1584781928529336_8731486713886378979_n.jpg?_nc_cat=103&ccb=1-5&_nc_sid=110474&_nc_ohc=uf6JtHOlFNwAX_E4CGQ&_nc_ht=scontent.faep9-1.fna&oh=4260765b26eb82ee3cdc3c9c809dd766&oe=6154332F',
  'image_lowquality': 'https://scontent.faep9-1.fna.fbcdn.net/v/t1.6435-9/fr/cp0/e15/q65/217326948_1584781928529336_8731486713886378979_n.jpg?_nc_cat=103&ccb=1-5&_nc_sid=110474&_nc_ohc=uf6JtHOlFNwAX_E4CGQ&_nc_ht=scontent.faep9-1.fna&oh=4260765b26eb82ee3cdc3c9c809dd766&oe=6154332F',
  'images': ['https://scontent.faep9-1.fna.fbcdn.net/v/t1.6435-9/fr/cp0/e15/q65/217326948_1584781928529336_8731486713886378979_n.jpg?_nc_cat=103&ccb=1-5&_nc_sid=110474

In [13]:
# Display the flattened comments; each carries the "_post_id" of its parent post.
comments

[{'comment_id': '1584803675193828',
  'comment_url': 'https://facebook.com/1584803675193828',
  'commenter_id': '100026229755116',
  'commenter_url': None,
  'commenter_name': 'Nestor Paladino',
  'commenter_meta': None,
  'comment_text': '¿ Nueva "cepa Robin Hood" ..?🏹🤣🏹🤣',
  'comment_time': None,
  'comment_image': None,
  '_post_id': '1584781958529333'},
 {'comment_id': '1584783685195827',
  'comment_url': 'https://facebook.com/1584783685195827',
  'commenter_id': '100005306702370',
  'commenter_url': None,
  'commenter_name': 'Analia Gabriela',
  'commenter_meta': None,
  'comment_text': 'La nueva cepa se llama Hawkeye.',
  'comment_time': None,
  'comment_image': None,
  '_post_id': '1584781958529333'},
 {'comment_id': '1585255635148632',
  'comment_url': 'https://facebook.com/1585255635148632',
  'commenter_id': '100006770348232',
  'commenter_url': None,
  'commenter_name': 'Nora Rossi',
  'commenter_meta': None,
  'comment_text': 'Nora Rossi',
  'comment_time': None,
  'comment

## Posts

original_request_url, post_url, post_id, text, post_text, time, image, images, images_lowquality, images_description, images_lowquality_description, likes, comments, shares, link, user_id, username, user_url, reactions, reaction_count

## Comments

comment_id, comment_url, commenter_id, commenter_name, comment_text, comment_image, _post_id

## Replies

comment_id, comment_url, commenter_id, commenter_name, comment_text, comment_time, comment_image, _comment_id

In [30]:
# Tabular view of the flattened posts, one row per post dict.
pd.DataFrame(posts)

Unnamed: 0,original_request_url,post_url,post_id,text,post_text,shared_text,time,image,image_lowquality,images,...,shared_user_id,shared_username,shared_post_url,available,reactors,w3_fb_url,reactions,reaction_count,image_id,image_ids
0,1584781918529337,https://m.facebook.com/1584781958529333,1584781958529333,Coherencia Por Favor,Coherencia Por Favor,,2021-07-26 00:00:00,https://scontent.faep9-1.fna.fbcdn.net/v/t1.64...,https://scontent.faep9-1.fna.fbcdn.net/v/t1.64...,[https://scontent.faep9-1.fna.fbcdn.net/v/t1.6...,...,,,,True,,,,,,
1,1590404371300425,https://facebook.com/story.php?story_fbid=1590...,1590404371300425,,,,2021-08-03 18:12:25,https://scontent.faep9-2.fna.fbcdn.net/v/t1.64...,https://scontent.faep9-2.fna.fbcdn.net/v/t1.64...,[https://scontent.faep9-2.fna.fbcdn.net/v/t1.6...,...,,,,True,,,,,1590404241300438.0,[1590404241300438]


In [35]:
# Longest comment_url, in characters, across all scraped comments.
pd.DataFrame(comments).comment_url.map(len).max()

37

In [None]:
# TODO(review): leftover column list (was not valid Python) — commenter_id, commenter_name, comment_text, comment_image, _post_id


In [49]:
# For every post column, print its name followed by the length of the longest
# str() representation among its values. Missing values count too: str(None)
# is "None", which is why empty columns report 4.
df = pd.DataFrame(posts)
for col in df.columns:
    print(col)
    print(df[col].apply(lambda x: len(str(x))).max())

original_request_url
16
post_url
77
post_id
16
text
20
post_text
20
shared_text
4
time
19
image
257
image_lowquality
257
images
261
images_description
525
images_lowquality
261
images_lowquality_description
525
video
4
video_duration_seconds
4
video_height
4
video_id
4
video_quality
4
video_size_MB
4
video_thumbnail
4
video_watches
4
video_width
4
likes
3
comments
2
shares
4
link
89
user_id
15
username
20
user_url
60
is_live
5
factcheck
4
shared_post_id
4
shared_time
4
shared_user_id
4
shared_username
4
shared_post_url
4
available
4
reactors
4
w3_fb_url
4
reactions
4
reaction_count
4
image_id
16
image_ids
20


In [65]:
# Show only the first 20 columns of df, all rows.
df.iloc[:, :20]

Unnamed: 0,original_request_url,post_url,post_id,text,post_text,shared_text,time,image,image_lowquality,images,images_description,images_lowquality,images_lowquality_description,video,video_duration_seconds,video_height,video_id,video_quality,video_size_MB,video_thumbnail
0,1584781918529337,https://m.facebook.com/1584781958529333,1584781958529333,Coherencia Por Favor,Coherencia Por Favor,,2021-07-26 00:00:00,https://scontent.faep9-1.fna.fbcdn.net/v/t1.64...,https://scontent.faep9-1.fna.fbcdn.net/v/t1.64...,[https://scontent.faep9-1.fna.fbcdn.net/v/t1.6...,,[https://scontent.faep9-1.fna.fbcdn.net/v/t1.6...,"[Puede ser una imagen de texto que dice ""Yo co...",,,,,,,
1,1590404371300425,https://facebook.com/story.php?story_fbid=1590...,1590404371300425,,,,2021-08-03 18:12:25,https://scontent.faep9-2.fna.fbcdn.net/v/t1.64...,https://scontent.faep9-2.fna.fbcdn.net/v/t1.64...,[https://scontent.faep9-2.fna.fbcdn.net/v/t1.6...,"[Puede ser una imagen de texto que dice ""ANSES...",[https://scontent.faep9-2.fna.fbcdn.net/v/t1.6...,"[Puede ser una imagen de texto que dice ""ANSES...",,,,,,,


In [45]:
# Count missing values per column of df.
# NOTE(review): the saved output lists comment columns and this cell's
# execution count (45) precedes the cell that builds df from posts (49), so df
# apparently held the comments frame when this ran; on a top-to-bottom run it
# reports on the posts frame instead — confirm which one is intended.
df.isna().sum()

comment_id         0
comment_url        0
commenter_id       0
commenter_url     51
commenter_name     0
commenter_meta    51
comment_text       0
comment_time      51
comment_image     45
_post_id           0
dtype: int64

In [52]:
# Display the DataFrame currently bound to df.
df

Unnamed: 0,original_request_url,post_url,post_id,text,post_text,shared_text,time,image,image_lowquality,images,...,shared_user_id,shared_username,shared_post_url,available,reactors,w3_fb_url,reactions,reaction_count,image_id,image_ids
0,1584781918529337,https://m.facebook.com/1584781958529333,1584781958529333,Coherencia Por Favor,Coherencia Por Favor,,2021-07-26 00:00:00,https://scontent.faep9-1.fna.fbcdn.net/v/t1.64...,https://scontent.faep9-1.fna.fbcdn.net/v/t1.64...,[https://scontent.faep9-1.fna.fbcdn.net/v/t1.6...,...,,,,True,,,,,,
1,1590404371300425,https://facebook.com/story.php?story_fbid=1590...,1590404371300425,,,,2021-08-03 18:12:25,https://scontent.faep9-2.fna.fbcdn.net/v/t1.64...,https://scontent.faep9-2.fna.fbcdn.net/v/t1.64...,[https://scontent.faep9-2.fna.fbcdn.net/v/t1.6...,...,,,,True,,,,,1590404241300438.0,[1590404241300438]


In [53]:
# Inspect the "time" column of df (values and dtype).
df.time

0   2021-07-26 00:00:00
1   2021-08-03 18:12:25
Name: time, dtype: datetime64[ns]

In [55]:
# Tabular view of the flattened comments, one row per comment dict.
pd.DataFrame(comments)

Unnamed: 0,comment_id,comment_url,commenter_id,commenter_url,commenter_name,commenter_meta,comment_text,comment_time,comment_image,_post_id
0,1584803675193828,https://facebook.com/1584803675193828,100026229755116,,Nestor Paladino,,"¿ Nueva ""cepa Robin Hood"" ..?🏹🤣🏹🤣",,,1584781958529333
1,1584783685195827,https://facebook.com/1584783685195827,100005306702370,,Analia Gabriela,,La nueva cepa se llama Hawkeye.,,,1584781958529333
2,1585255635148632,https://facebook.com/1585255635148632,100006770348232,,Nora Rossi,,Nora Rossi,,https://media1.tenor.co/images/90b44a6dab0064c...,1584781958529333
3,1584876181853244,https://facebook.com/1584876181853244,1436352840,,Nuñez Daniel,,Nuñez Daniel,,https://media.tenor.co/images/cb41642a609f2480...,1584781958529333
4,1584843918523137,https://facebook.com/1584843918523137,1316246105,,Maria Fernanda Torres,,Maria Fernanda Torres,,,1584781958529333
5,1584863355187860,https://facebook.com/1584863355187860,1306980338,,Gladys Haydee Corbetto,,Gladys Haydee Corbetto,,,1584781958529333
6,1585454831795379,https://facebook.com/1585454831795379,100002383804525,,Myriam Ochoa,,Myriam Ochoa,,https://media1.tenor.co/images/28c2e402103a281...,1584781958529333
7,1584829095191286,https://facebook.com/1584829095191286,100003025108562,,Emilce Bello,,Emilce Bello,,,1584781958529333
8,1585569948450534,https://facebook.com/1585569948450534,1507737021,,Monica Lupiani,,🤦🤦🤦,,,1584781958529333
9,1585697945104401,https://facebook.com/1585697945104401,1266234466,,Adriana Maria Cagnoni,,Adriana Maria Cagnoni,,,1584781958529333
