# Notebook pour réaliser l'extraction des posts et commentaires
### Adapté pour être facilement utilisable sur Jupyter dans GCP

In [1]:
import re
import requests

import os
import pandas as pd
from datetime import datetime

## Importer un fichier depuis le bucket

In [2]:
from google.cloud import storage

def download_blob(bucket_name, source_blob_name, destination_file_name):
    """Copy one object of a Cloud Storage bucket to a local file.

    Args:
        bucket_name: ID of the GCS bucket, e.g. "your-bucket-name".
        source_blob_name: ID of the GCS object, e.g. "storage-object-name".
        destination_file_name: path the object is downloaded to,
            e.g. "local/path/to/file".
    """
    # `Bucket.blob` (unlike `Bucket.get_blob`) only builds a client-side
    # handle and retrieves nothing from Cloud Storage; no additional data is
    # needed here, so it is the preferred call.
    handle = storage.Client().bucket(bucket_name).blob(source_blob_name)
    handle.download_to_filename(destination_file_name)

## Fonction d'authentification à l'API Reddit

In [3]:
def authenticate_API():
    """
    Authenticate to the Reddit API and build the headers for later queries.

    The login information is read from the first line of
    ``notebooks/jupyter/authentication_file.txt`` in the ``the-clean-project``
    bucket, which is first downloaded to the local file ``auth_file.txt``.
    That line must hold exactly six comma-separated fields, in this order:
    client id ('personal use script'), secret token ('token'), grant type,
    username, password, user agent.

    Returns:
        dict: the ``User-Agent`` and ``Authorization`` headers needed for
        future API queries.

    Raises:
        ValueError: if the line does not hold exactly six fields.
        requests.HTTPError: if the token endpoint answers with an HTTP error.
        RuntimeError: if the answer carries no ``access_token``.
    """
    # Fetch the file containing the login infos
    download_blob('the-clean-project', 'notebooks/jupyter/authentication_file.txt', 'auth_file.txt')

    with open('auth_file.txt', 'r') as f:
        # Drop the line terminator before splitting: otherwise it stays glued
        # to the last field (the user agent) and ends up in a header value.
        fields = f.readline().rstrip("\r\n").split(",")
    client_id, secret_token, grant_type, username, password, user_agent = fields

    # note that client_id refers to 'personal use script' and secret_token to 'token'
    auth = requests.auth.HTTPBasicAuth(client_id, secret_token)

    # here we pass our login method (password), username, and password
    data = {'grant_type': grant_type,
            'username': username,
            'password': password}

    # setup our header info, which gives reddit a brief description of our app
    headers = {'User-Agent': user_agent}

    # send our request for an OAuth token; the timeout keeps a stalled
    # connection from blocking the notebook forever
    res = requests.post('https://www.reddit.com/api/v1/access_token',
                        auth=auth, data=data, headers=headers, timeout=30)
    res.raise_for_status()

    # convert response to JSON and pull access_token value
    payload = res.json()
    if 'access_token' not in payload:
        # Report only the error field, never the credentials that were sent
        raise RuntimeError(
            f"Reddit authentication failed: {payload.get('error', 'no access_token in response')}"
        )

    # add authorization to our headers dictionary
    headers['Authorization'] = f"bearer {payload['access_token']}"
    return headers


## Le code d'extraction des posts

In [4]:
def df_from_response_for_posts(res):
    """
    Convert a response to a dataframe, in the context of POSTS extraction.

    Arg:
        res: the response of the GET query made to the Reddit API; only its
            ``json()`` method is used, and the posts are read from
            ``['data']['children']``.

    Returns:
        A dataframe with one row per extracted post and the columns:
        - subreddit: subreddit name;
        - title: post title;
        - author: value of the post's ``author`` field;
        - upvote_ratio;
        - created_utc: publication time in UTC, formatted
          ``%Y-%m-%dT%H:%M:%SZ``;
        - num_comments;
        - id: post ID;
        - kind: type prefix of the item (e.g. "t3").
        The post body (``selftext``) is not extracted. The dataframe is empty,
        without columns, when the response holds no post.
    """
    from datetime import timezone

    rows = []
    for post in res.json()['data']['children']:
        info = post['data']
        # The epoch is converted explicitly in UTC: the naive conversion
        # yields local time, which contradicts the trailing "Z".
        created = datetime.fromtimestamp(info['created_utc'], tz=timezone.utc)
        rows.append({
            'subreddit': info['subreddit'],
            'title': info['title'],
            'author': info['author'],
            'upvote_ratio': info['upvote_ratio'],
            'created_utc': created.strftime('%Y-%m-%dT%H:%M:%SZ'),
            'num_comments': info['num_comments'],
            'id': info['id'],
            'kind': post['kind'],
        })

    # Build the frame in one go instead of one concat per post
    return pd.DataFrame(rows)


def posts_extraction(subreddit:str="r/worldnews"):
    """
    Extract the first posts from a subreddit page and save them to a csv file.

    A single listing request is made with ``limit=10``.

    Args:
        subreddit: name of the subreddit under format "r/name",
            by default "r/worldnews".

    Returns:
        str: path of the csv file that was written, i.e.
        ``gs://the-clean-project/posts_files/dataAPI_whole_<subreddit>_<YYYY-MM-DD>.csv``
        where "/" in the subreddit name is replaced by "_". The columns are
        those produced by ``df_from_response_for_posts``.

    Raises:
        requests.HTTPError: if the listing request answers with an HTTP error.
    """
    # We first authenticate to the API
    headers = authenticate_API()

    # NOTE: pagination is not implemented, only the first page is fetched.
    # To scan further, repeat the request with params['after'] set to
    # "<kind>_<id>" of the last post received, as long as a page comes back full.
    params = {'limit': 10}

    # make request
    res = requests.get(f"https://oauth.reddit.com/{subreddit}",
                       headers=headers,
                       params=params,
                       timeout=30)
    res.raise_for_status()

    # get dataframe from response (empty when the listing holds no post)
    data = df_from_response_for_posts(res)

    # Save final dataframe to csv file,
    # name_format: dataAPI_whole_"subreddit name"_"date".csv
    path = "gs://the-clean-project/posts_files/"
    filename = os.path.join(path, f'dataAPI_whole_{subreddit.replace("/","_")}_'+ datetime.today().strftime('%Y-%m-%d') + ".csv")

    data.to_csv(filename, sep=',')

    return filename

## Le code d'extraction des commentaires

In [5]:
def df_replies(resjson, df_comments, id_post, rang, headers):
    """
    Append one node of the comment tree, and everything below it, to
    ``df_comments``.

    Query: comments

    Params:
        - resjson: dict, one element of a ``children`` list of the response;
          only its ``'data'`` entry is read
        - df_comments: dataframe holding the rows collected so far
        - id_post: str, written in the ``id`` column of every row
        - rang: int, depth recorded for this node; its replies are recorded
          with ``rang + 1``
        - headers: dict, forwarded to ``children_extraction``

    Return: the dataframe with the rows of this node and of its replies added
    """
    node = resjson['data']

    # add content of the comment in the dataframe
    if "body" in node:
        row = pd.DataFrame({
            'id': id_post,
            'contenu': node['body'],
            'type': "Comment",
            'rang': rang
        }, index=[1])
        df_comments = pd.concat([df_comments, row], ignore_index=True)

    # A node carrying a `children` list of ids (presumably a "more"
    # placeholder - it has no body and no replies) is resolved through the
    # morechildren query. This is done once, and only when ids are present.
    # NOTE(review): `rang + 1` is passed here and df_from_response_for_children
    # adds 1 again when it records the rows - confirm the intended depth.
    if node.get('children'):
        df_comments = children_extraction(df_comments, node['id'], node['children'], rang + 1, headers, id_post)

    # recursive to add replies; `replies` is "" when there is none
    replies = node.get('replies')
    if replies:
        for reply in replies['data']['children']:
            df_comments = df_replies(reply, df_comments, id_post, rang + 1, headers)

    return df_comments

def df_from_response_for_comments(res, headers):
    """
    Extract the post and its comments from the response of a comments query.

    Query: comments

    Params:
        - res: response object; its ``json()`` must return a two-element
          list, the post listing first and the comment listing second
        - headers: dict, forwarded to ``df_replies``

    Return: a dataframe with the columns ``id``, ``contenu``, ``type`` and
    ``rang``. The first two rows (type "Post", rang 0) hold the title and the
    selftext of the post; the comments follow, starting at rang 1.
    """
    # Parse the body once instead of once per field
    payload = res.json()
    post = payload[0]['data']['children'][0]['data']
    id_post = post['id']

    # title, then content of the post
    df_comments = pd.DataFrame([
        {'id': id_post, 'contenu': post['title'], 'type': "Post", 'rang': 0},
        {'id': id_post, 'contenu': post['selftext'], 'type': "Post", 'rang': 0},
    ])

    # loop to add content of the comments
    for comment in payload[1]['data']['children']:
        df_comments = df_replies(comment, df_comments, id_post, 1, headers)
    return df_comments

def df_from_response_for_children(res, id_post, rang, headers):
    """
    Extract the comments contained in the response of a morechildren query.

    Query: morechildren

    Params:
        - res: response object; the comments are read from
          ``json()['json']['data']['things']``
        - id_post: str, written in the ``id`` column of every row
        - rang: int, the rows are recorded with ``rang + 1``
        - headers: dict, forwarded to ``children_extraction``

    Return: a dataframe with the columns ``id``, ``contenu``, ``type`` and
    ``rang``; it is empty when the response has no ``json`` entry, no ``data``
    or no ``things``.
    """
    # initialize temp dataframe for batch of data in response
    df_children = pd.DataFrame()

    # Parse the body once instead of once per access
    payload = res.json()
    if 'json' not in payload:
        return df_children

    # An answer may come without `data` (e.g. only `errors`): nothing to add
    things = payload['json'].get('data', {}).get('things', [])

    for thing in things:
        info = thing['data']
        if 'body' in info:
            row = pd.DataFrame({
                'id': id_post,
                'contenu': info['body'],
                'type': "Comment",
                'rang': rang + 1
            }, index=[1])
            df_children = pd.concat([df_children, row], ignore_index=True)
        else:
            # No body: the item lists further ids, fetched with another query
            df_children = children_extraction(df_children, info['id'], info['children'], rang + 1, headers, id_post)

    return df_children

def children_extraction(df_comments, link_id, liste_children, rang, headers, id_post):
    """
    Extract the comments that were not accessible with the first query,
    using the 'morechildren' query of the API.

    Params:
        - df_comments: dataframe holding the rows collected so far
        - link_id: str, not used (the ``link_id`` sent to the API is built
          from ``id_post``); kept so that existing calls stay valid
        - liste_children: list of str, ids of the comments to fetch. The list
          is consumed in place: it is empty when the function returns.
        - rang: int, forwarded to ``df_from_response_for_children``
        - headers: dict, headers sent with each query
        - id_post: str, id of the post the comments belong to

    Return: the dataframe with the new comments added
    """
    while liste_children:
        # Take the next batch of at most 100 ids and remove it from the list
        batch = liste_children[:100]
        del liste_children[:100]

        # The ids are sent as one comma-delimited string
        params = {'link_id': f"t3_{id_post}", 'children': ",".join(batch), 'api_type': "json"}

        # make request - over https, since the headers carry the bearer token
        res = requests.get("https://oauth.reddit.com/api/morechildren",
                           headers=headers, params=params, timeout=30)

        # get dataframe from response
        new_df = df_from_response_for_children(res, id_post, rang, headers)

        # append new_df to data
        df_comments = pd.concat([df_comments, new_df], ignore_index=True)

    return df_comments

def comments_extraction(subreddit:str="r/worldnews", id_post:str="w7bl8z"):
    """
    Extract the comments of a post and store them in a csv file.

    The file is written under
    ``gs://the-clean-project/comments_files/testmultiFichiers/`` and named
    ``dataAPI_comments_<subreddit>_<id_post>_<YYYY-MM-DD>.csv``, "/" in the
    subreddit name being replaced by "_". Nothing is returned.

    Params:
        - subreddit: str, name of the subreddit under format "r/name"
        - id_post: str, id of the post
    """
    # Authentication comes first: every query needs the headers
    auth_headers = authenticate_API()

    # Query the comments of the post
    url = f"https://oauth.reddit.com/{subreddit}/comments/{id_post}"
    response = requests.get(url, headers=auth_headers, params={'limit': None})

    # Turn the response into rows and stack them on an empty frame
    comments = pd.concat(
        [pd.DataFrame(), df_from_response_for_comments(response, auth_headers)],
        ignore_index=True,
    )

    # Build the destination name and write the csv file
    day = datetime.today().strftime('%Y-%m-%d')
    target = os.path.join(
        "gs://the-clean-project/comments_files/testmultiFichiers/",
        f'dataAPI_comments_{subreddit.replace("/","_")}_{id_post}_' + day + ".csv",
    )
    comments.to_csv(target, sep=',')



## L'extraction complète

In [6]:
import csv
import time

def extraction_complete(subreddit:str="r/worldnews"):
    """
    Given a subreddit, automatically retrieve:
    - its posts;
    - the comments attached to each of these posts.
    (The list of contributors and their recent activity were dropped: too
    many API requests for now.)

    Param:
        subreddit: str, name of the subreddit under format "r/subreddit_name"

    Output:
        Saves the extracted data in csv files: one for the posts, written by
        ``posts_extraction``, and one per post for its comments, written by
        ``comments_extraction``. Nothing is returned.

    No pause is inserted between the requests.

    TODO:
        - also keep the json?
        - organize the file system where the files are saved
    """
    # Extract the posts of the subreddit, saved in a csv file
    posts_filename = posts_extraction(subreddit)

    # Read the ids back as plain text. Left to type inference, an id made of
    # digits only would come back as a number and an id such as "null" or
    # "nan" as a missing value, which breaks the queries built from it.
    posts = pd.read_csv(posts_filename,
                        dtype={"id": str},
                        keep_default_na=False,
                        storage_options={"token": "cloud"})

    # No "id" column means no post was saved: nothing to extract
    if "id" in posts.columns:
        # for each post, extract its comments (saved in a csv file)
        for id_post in posts["id"]:
            comments_extraction(subreddit, id_post)

In [7]:
# extraction_complete("r/fcbasel")

## Import des données des 30 plus gros subreddits

In [8]:
# Names of 30 subreddits, under format "r/name" (the format expected by
# extraction_complete). The list is not referenced by the code shown below.
top30Subreddits = ["r/announcements", "r/funny", "r/AskReddit", "r/gaming", "r/aww",
                "r/Music", "r/pics", "r/science", "r/worldnews", "r/videos",
                "r/todayilearned", "r/movies", "r/news", "r/Showerthoughts", "r/EarthPorn",
                "r/gifs", "r/IAmA", "r/food", "r/askscience", "r/Jokes",
                "r/LifeProTips", "r/explainlikeimfive", "r/Art", "r/books", "r/mildlyinteresting",
                "r/nottheonion", "r/DIY", "r/sports", "r/blog", "r/space"]

In [9]:
# Display the current local time before the extraction cell
# (presumably to time the run - compare with the timestamp displayed after it)
str(datetime.now())

'2022-09-09 08:06:48.986827'

In [None]:
# Run the whole extraction (posts, then the comments of each post) on r/worldnews
extraction_complete("r/worldnews")

In [None]:
# Display the current local time after the extraction cell
str(datetime.now())

'2022-09-09 08:29:44.073743'