# Bibliothèque pour l'extraction des posts et commentaires Reddit

* __Description__: Bibliothèque des fonctions utilisées pour extraire les posts et les commentaires de Reddit en vue de leur sauvegarde dans BigQuery
* __Source__: API de REDDIT
* __Output__: Tables comment et post (BigQuery) 
* __Auteur__: Corentin TIMAL et Camille MATTHIEU
* __Date de création__: 03/08/2022
* __Date de mise à jour__: 14/09/2022

### Import des bibliothèques

In [2]:
import csv
import os
import re
import sqlite3
import sys
import time
from datetime import datetime, timezone

import numpy as np
import pandas as pd
import requests
from google.cloud import bigquery
from google.cloud import storage
from pyspark.sql import SparkSession

### Création du client pour la connexion à bigquery

In [None]:
# Module-level BigQuery client, created once when this cell runs. No project or
# credentials are passed explicitly, so the library's defaults apply.
client = bigquery.Client()

### Import d'un fichier depuis le bucket

In [None]:
def download_blob(bucket_name, source_blob_name, destination_file_name):
    """
    Copy one object from a Cloud Storage bucket to a local file.

    Args :
    - bucket_name: name of the bucket holding the object
    - source_blob_name: path of the object inside the bucket
    - destination_file_name: local path the content is written to
    """
    gcs = storage.Client()
    remote_object = gcs.bucket(bucket_name).blob(source_blob_name)
    remote_object.download_to_filename(destination_file_name)


### Authentification pour l'API Reddit

In [None]:
def authenticate_API():
    """
    Authenticate to the Reddit API and build the headers for later queries.

    The credentials are read from 'auth_file.txt', which is first downloaded
    from the 'the-clean-project' bucket. The first line of that file must hold
    six comma-separated fields, in this order:
    client_id, secret_token, grant_type, username, password, user_agent.

    Returns : a dict holding the 'User-Agent' and 'Authorization' headers.
    Raises : ValueError if the first line does not hold exactly six fields,
    KeyError if the token response has no 'access_token' entry.
    """
    # Fetch the file containing the login infos from the bucket
    download_blob('the-clean-project', 'notebooks/jupyter/authentication_file.txt', 'auth_file.txt')

    with open('auth_file.txt', 'r') as f:
        # Drop the line terminator before splitting: it would otherwise stay
        # attached to the last field (user_agent) and end up inside the
        # 'User-Agent' header value.
        credentials_line = f.readline().rstrip("\r\n")

    client_id, secret_token, grant_type, username, password, user_agent = credentials_line.split(",")

    # note that client_id refers to 'personal use script' and secret_token to 'token'
    auth = requests.auth.HTTPBasicAuth(client_id, secret_token)

    # here we pass our login method (password), username, and password
    data = {'grant_type': grant_type,
            'username': username,
            'password': password}

    # setup our header info, which gives reddit a brief description of our app
    headers = {'User-Agent': user_agent}

    # send our request for an OAuth token
    res = requests.post('https://www.reddit.com/api/v1/access_token',
                        auth=auth, data=data, headers=headers)

    # convert response to JSON and pull access_token value
    token = res.json()['access_token']

    # add authorization to our headers dictionary
    return {**headers, 'Authorization': f"bearer {token}"}


## POSTS

### Extraction des posts depuis la requête

In [None]:
def df_from_response_for_posts(res, date_extraction):
    """
    Convert the JSON of a posts listing into a dataframe (one row per post).

    Args :
    - res: response of the GET query made to the Reddit API; only its
      json() method is used
    - date_extraction: value stored as-is in the 'extraction_utc' column

    Returns : a dataframe containing the info for each extracted post, or an
    empty dataframe when the listing has no children
    """
    rows = []

    # Collect one dict per post and build the dataframe once at the end,
    # instead of growing a dataframe with one concat per post.
    for post in res.json()['data']['children']:
        info = post['data']
        # Interpret the epoch value explicitly as UTC; without tz the result
        # would be expressed in the local timezone of the machine.
        created = datetime.fromtimestamp(info['created_utc'], tz=timezone.utc)
        rows.append({
            'id_post': info['id'],
            'id_subreddit': info['subreddit_id'],
            'subreddit': info['subreddit'],
            # 'author_fullname' is not always present in the payload
            'id_author': info.get('author_fullname'),
            'author': info['author'],
            'num_comments': info['num_comments'],
            'subreddit_subscribers': info['subreddit_subscribers'],
            'upvote_ratio': info['upvote_ratio'],
            'ups': info['ups'],
            'downs': info['downs'],
            'score': info['score'],
            'created_utc': created.strftime('%Y-%m-%d %H:%M:%S'),
            'extraction_utc': date_extraction,
            'kind': post['kind'],
            'score_jigsaw': None,
        })

    return pd.DataFrame(rows)


### Extraction complète des posts

In [None]:
def posts_extraction(subreddit:str):
    """
    Extract the posts of a subreddit page and insert them into BigQuery.

    The listing is read page by page (up to 100 posts per request); each new
    page is requested after the last post of the previous one. Nothing is
    inserted when no post could be fetched.

    Args :
    - subreddit: name of the subreddit under format "name"

    NOTE(review): `date_extraction` and `insertion_bigquery` are not defined in
    this function; they are expected to exist at module level - confirm.
    """
    # We first authenticate to the API
    headers = authenticate_API()

    params = {'limit': 100}
    pages = []

    # At each loop, we extract up to 100 posts with their info
    while True:
        res = requests.get(f"https://oauth.reddit.com/r/{subreddit}",
                           headers=headers,
                           params=params)
        new_df = df_from_response_for_posts(res, date_extraction)

        # An empty page means there is nothing left to fetch (empty subreddit,
        # or the previous page held exactly the last 100 posts). Reading its
        # last row would raise IndexError and lose the pages already fetched.
        if new_df.empty:
            break

        pages.append(new_df)

        # A page shorter than the limit is the last one
        if len(new_df) < 100:
            break

        # Fullname ("kind_id") of the final row, used as the 'after' cursor
        last_row = new_df.iloc[-1]
        params['after'] = last_row['kind'] + '_' + last_row['id_post']

    if not pages:
        return

    # Data insertion
    data = pd.concat(pages, ignore_index=True)
    insertion_bigquery('dwh', 'post', data)


## COMMENTS

In [None]:
def df_info_commentaires(resjson, id_post, author):
    """
    Build a one-row dataframe describing a single comment.

    Params :
        - resjson : dict whose 'data' entry holds the comment fields
        - id_post : id of the post the comment belongs to
        - author : id of the comment's author (or None), stored in 'id_author'

    Return : a one-row dataframe (index label 1) with the comment information

    NOTE: 'extraction_utc' is read from the module-level `date_extraction`
    name, which must be defined before this function is called.
    """
    comment = resjson['data']

    # Interpret the epoch value explicitly as UTC; without tz the result would
    # be expressed in the local timezone of the machine.
    created = datetime.fromtimestamp(comment['created_utc'], tz=timezone.utc)

    return pd.DataFrame({
        'id_comment': comment['id'],
        'id_post': id_post,
        'id_author': author,
        'author': comment['author'],
        # placeholders left by Reddit for removed content are blanked out
        'content': comment['body'].replace("[supprimé]","").replace("[effacé]",""),
        'type_content': 'Comment',
        'ups': comment['ups'],
        'downs': comment['downs'],
        'score': comment['score'],
        'created_utc': created.strftime('%Y-%m-%d %H:%M:%S'),
        'extraction_utc': date_extraction,
        'score_jigsaw': None,
    }, index=[1])


### Extraction des réponses aux commentaires

In [None]:
def df_replies(resjson, df_comments, id_post, headers):
    """
    Add a comment and, recursively, all its replies to the comments dataframe.

    A node with a 'body' is a comment: it is appended to df_comments. A node
    without a 'body' is a placeholder listing comment ids that were not
    returned by the request; those ids are appended (once) to the module-level
    `listeChildren` list so they can be fetched later.

    Params :
        - resjson : dict, one node of the comments tree
        - df_comments : dataframe of the comments collected so far
        - id_post : str, id of the post being processed
        - headers : request headers, only passed along to the recursive calls

    Return : the dataframe completed with the comment and its replies
    """
    global listeChildren

    node = resjson['data']

    if "body" in node:
        # 'author_fullname' is not always present in the payload
        df_add = df_info_commentaires(resjson, id_post, node.get('author_fullname'))
        df_comments = pd.concat([df_comments, df_add], ignore_index=True)
    else:
        # Ids of the comments not reachable with the first request. They are
        # recorded here only: a placeholder has no 'replies' key either, and
        # adding them a second time for that reason would duplicate every id.
        listeChildren += node.get('children', [])

    # 'replies' is an empty string when the comment has no answer
    replies = node.get('replies')
    if replies:
        for reply in replies['data']['children']:
            df_comments = df_replies(reply, df_comments, id_post, headers)

    return df_comments


### Extraction des commentaires depuis la requête

In [None]:
def df_info_titre_post(res, id_post, author, typeContent):
    """
    Build a one-row dataframe for the title or for the body of a post.

    Params :
        - res : response of the comments request; only its json() method is
          used, and the post is read from the first child of its first element
        - id_post : id of the post
        - author : id of the post's author (or None), stored in 'id_author'
        - typeContent : "title" to extract the title, anything else to extract
          the body ('selftext') of the post

    Return : a one-row dataframe (index label 1). 'id_comment' is the post id
    followed by "t" for a title row and by "p" for a body row.

    NOTE: 'extraction_utc' is read from the module-level `date_extraction`
    name, which must be defined before this function is called.
    """
    # Parse the response once instead of once per extracted field
    post = res.json()[0]['data']['children'][0]['data']

    # The two kinds of rows only differ by these three values
    if typeContent == "title":
        suffix, source_field, label = "t", 'title', 'Title'
    else:
        suffix, source_field, label = "p", 'selftext', 'Post'

    # Interpret the epoch value explicitly as UTC; without tz the result would
    # be expressed in the local timezone of the machine.
    created = datetime.fromtimestamp(post['created_utc'], tz=timezone.utc)

    return pd.DataFrame({
        'id_comment': post['id'] + suffix,
        'id_post': id_post,
        'id_author': author,
        'author': post['author'],
        # placeholders left by Reddit for removed content are blanked out
        'content': post[source_field].replace("[supprimé]","").replace("[effacé]",""),
        'type_content': label,
        'ups': post['ups'],
        'downs': post['downs'],
        'score': post['score'],
        'created_utc': created.strftime('%Y-%m-%d %H:%M:%S'),
        'extraction_utc': date_extraction,
        'score_jigsaw': None,
    }, index=[1])

In [None]:
def df_from_response_for_comments(res, headers, id_post):
    """
    Extract the title, the body and the comments of a post from a response.

    Params :
        - res : response of the comments request; its JSON is a two-element
          list, the post being read from the first element and the comments
          from the second
        - headers : request headers, passed along to df_replies
        - id_post : str, id of the post

    Return : a dataframe with one row for the title, one row for the body of
    the post, then the rows produced by df_replies for each top-level comment
    """
    # Parse the response once instead of once per access
    payload = res.json()

    # The author is the same for the title row and the body row;
    # 'author_fullname' is not always present in the payload.
    author = payload[0]['data']['children'][0]['data'].get('author_fullname')

    df_comments = pd.concat(
        [
            df_info_titre_post(res, id_post, author, "title"),
            df_info_titre_post(res, id_post, author, "post"),
        ],
        ignore_index=True,
    )

    # loop to add the content of the comments (and, recursively, their replies)
    for comment in payload[1]['data']['children']:
        df_comments = df_replies(comment, df_comments, id_post, headers)

    return df_comments

### Extraction des enfants

In [None]:
def df_from_response_for_children(res, id_post, rang, headers):
    """
    Extract the comments contained in the response of a 'more children' query.

    Only the entries of res.json()['json']['data']['things'] that have a 'body'
    are kept; entries without one are ignored.

    Params :
        - res : response of the request; only its json() method is used
        - id_post : str, id of the post the comments belong to
        - rang : int, not used by this function
        - headers : not used by this function

    Return : a dataframe with the comments information, empty when the
    response has no 'json' entry or no comment
    """
    # Parse the response once instead of once per access
    payload = res.json()
    if 'json' not in payload:
        return pd.DataFrame()

    # Collect the one-row frames and concatenate them once at the end
    frames = []
    for thing in payload['json']['data']['things']:
        fields = thing['data']
        if 'body' in fields:
            # 'author_fullname' is not always present in the payload
            frames.append(df_info_commentaires(thing, id_post, fields.get('author_fullname')))

    if not frames:
        return pd.DataFrame()

    return pd.concat(frames, ignore_index=True)


In [None]:
def children_extraction(df_comments, link_id, rang, headers, id_post):
    """
    Fetch the comments that were not returned by the first request.

    The ids waiting in the module-level `listeChildren` list are sent to the
    'more children' endpoint of the API in batches of at most 100. The list is
    emptied in place as the batches are taken.

    Params :
        - df_comments : dataframe the new comments are appended to
        - link_id : not used; the link id sent is built from id_post
        - rang : int, passed along to df_from_response_for_children
        - headers : request headers, including the authorization
        - id_post : str, id of the post (without the "t3_" prefix)

    Return : the dataframe with the new comments; df_comments itself when
    there was no id to fetch
    """
    global listeChildren

    frames = []

    while listeChildren:
        # Take up to 100 ids and remove them from the pending list
        batch = listeChildren[:100]
        del listeChildren[:100]

        # The API expects a comma-delimited list of ids
        params = {'link_id': "t3_" + id_post, 'children': ",".join(batch), 'api_type': "json"}

        # make request (https: the headers carry the bearer token)
        res = requests.get("https://oauth.reddit.com/api/morechildren",
                           headers=headers, params=params)

        # get dataframe from response
        frames.append(df_from_response_for_children(res, id_post, rang, headers))

    if not frames:
        return df_comments

    return pd.concat([df_comments, *frames], ignore_index=True)


### Extraction complète des commentaires

In [None]:
def comments_extraction(subreddit:str, id_post:str):
    """
    Extract the title, the body and all the comments of a post.

    Params :
        - subreddit : str, inserted as-is in the request URL
        - id_post : str, id of the post (without the "t3_" prefix)

    Return : a dataframe with the rows extracted from the first request,
    followed by the rows fetched through children_extraction

    NOTE(review): the URL is built as "/{subreddit}/comments/{id_post}", with
    no "r/" prefix, whereas posts_extraction uses "/r/{subreddit}" - confirm
    the format expected for `subreddit` here.
    """
    # Authentication comes first: every request needs the headers
    headers = authenticate_API()

    # Single request returning the post and its comments tree
    res = requests.get(
        f"https://oauth.reddit.com/{subreddit}/comments/{id_post}",
        headers=headers,
        params={'limit': None},
    )

    # Rows directly available in the response
    first_pass = df_from_response_for_comments(res, headers, id_post)
    data = pd.concat([pd.DataFrame(), first_pass], ignore_index=True)

    # Rows for the comments that the first request did not return
    remaining = children_extraction(pd.DataFrame(), "t3_" + id_post, 1, headers, id_post)

    return pd.concat([data, remaining], ignore_index=True)