In [2]:
import requests
import pandas as pd
from datetime import datetime
from utils import authenticate_API

ModuleNotFoundError: No module named 'utils'

In [3]:
def df_from_response_for_posts(res):
    """
    Convert a Reddit listing response into a dataframe with one row per post.

    Arg : res, the response of the query made to the Reddit API (a GET query). Its JSON body
        must expose the listed items under ['data']['children'].
    Returns : a dataframe containing the info for each extracted post, that is (for now):
    - subreddit: subreddit name;
    - title: post title;
    - selftext: post body;
    - author: the post's 'author' field (presumably the username; the 'author_fullname'
        id is NOT extracted);
    - upvote_ratio;
    - created_utc: publication time, formatted in UTC as 'YYYY-MM-DDTHH:MM:SSZ';
    - num_comments;
    - id: post ID;
    - kind: type prefix, for posts of a subreddit it is "t3", "t1" for a comment...
    An empty listing yields an empty dataframe that still carries these columns.
    Raises : KeyError if the JSON body or one of its items lacks an expected field.
    """
    # Local import: the file only imports `datetime` from the datetime module.
    from datetime import timezone

    columns = [
        'subreddit',
        'title',
        'selftext',
        'author',
        'upvote_ratio',
        'created_utc',
        'num_comments',
        'id',
        'kind',
    ]

    # Collect plain dicts and build the dataframe once: DataFrame.append was removed
    # in pandas 2.0 and copied the whole frame on every call.
    records = []
    for post in res.json()['data']['children']:
        post_data = post['data']
        # Convert the epoch timestamp explicitly in UTC so the trailing 'Z' is truthful;
        # a naive fromtimestamp() would use the local timezone of the machine.
        created = datetime.fromtimestamp(post_data['created_utc'], tz=timezone.utc)
        records.append({
            'subreddit': post_data['subreddit'],
            'title': post_data['title'],
            'selftext': post_data['selftext'],
            'author': post_data['author'],
            'upvote_ratio': post_data['upvote_ratio'],
            'created_utc': created.strftime('%Y-%m-%dT%H:%M:%SZ'),
            'num_comments': post_data['num_comments'],
            'id': post_data['id'],
            'kind': post['kind'],
        })

    # Passing `columns` keeps the schema and column order stable even with no records.
    return pd.DataFrame(records, columns=columns)

In [4]:
def posts_extraction(subreddit:str="r/worldnews"):
    """
    Function to extract the posts from a subreddit page, 100 at a time, until the API
    returns a page with fewer than 100 posts.

    Args :
    - subreddit: name of the subreddit under format "r/name", by default "r/worldnews".
    Creates : a csv file (path relative to the current working directory) named
        dataAPI_whole_"subreddit name"_"date and hour".csv, with one row per extracted post
        and the columns produced by df_from_response_for_posts. If no post could be
        fetched, the file is still written but contains no rows.
    Returns : None.
    Raises : requests.HTTPError if the API answers with an error status code.
    """

    # We first authenticate to the API
    headers = authenticate_API()

    # Batches are collected in a list and concatenated once at the end:
    # DataFrame.append was removed in pandas 2.0.
    frames = []
    params = {'limit': 100}

    # At each loop, we extract up to 100 posts with their info
    while True:
        # make request
        res = requests.get(f"https://oauth.reddit.com/{subreddit}",
                           headers=headers,
                           params=params)
        # Fail loudly on an HTTP error instead of hitting a KeyError while parsing the body
        res.raise_for_status()

        # get dataframe from response
        new_df = df_from_response_for_posts(res)

        # An empty page means there is nothing left to fetch; it also has no last row
        # to paginate from, so stop before indexing into it.
        if new_df.empty:
            break

        frames.append(new_df)

        # A page shorter than the requested limit is the last one
        if len(new_df) < 100:
            break

        # take the final row of the batch and build its fullname ("kind_id"),
        # which tells the API where the next page starts
        last_row = new_df.iloc[-1]
        params['after'] = last_row['kind'] + '_' + last_row['id']

    # pd.concat refuses an empty list, so fall back to an empty dataframe
    data = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

    # Save final dataframe to csv file,
    # name_format: dataAPI_whole_"subreddit name"_"date and hour".csv
    filename = f'dataAPI_whole_{subreddit.replace("/","_")}_'+ datetime.today().strftime('%Y-%m-%d_%H-%M') + ".csv"

    data.to_csv(filename, sep = ',')