In [24]:
import os
import sys
import tempfile
import time

import pandas as pd
import requests

In [25]:
# Install/upgrade google-cloud-storage using the interpreter that backs this
# kernel (sys.executable), so the package lands in the same environment the
# notebook imports from rather than whichever `pip` happens to be on PATH.
import sys
!{sys.executable} -m pip install --upgrade google-cloud-storage

Requirement already up-to-date: google-cloud-storage in /Users/willyraedy/anaconda3/lib/python3.7/site-packages (1.21.0)


In [26]:
# Set GOOGLE_APPLICATION_CREDENTIALS for this kernel (presumably a service-account key file — confirm).
# NOTE(review): this is an absolute, machine-specific path; the notebook cannot run on
# another machine until it is changed. Consider exporting the variable outside the notebook.
%env GOOGLE_APPLICATION_CREDENTIALS=/Users/willyraedy/Sync/SideProjects/RedditResearch/credentials-ebeb319739c4.json

env: GOOGLE_APPLICATION_CREDENTIALS=/Users/willyraedy/Sync/SideProjects/RedditResearch/credentials-ebeb319739c4.json


In [27]:
# Google Cloud Storage client library (installed/upgraded by the pip cell above).
from google.cloud import storage

# Module-level client shared by upload_file().
# NOTE(review): presumably authenticates via GOOGLE_APPLICATION_CREDENTIALS set above — confirm.
storage_client = storage.Client()

In [79]:
# setup
# Chunk size (in subreddit records) at which fetch_all() hands the accumulated batch to process_chunk().
UPLOAD_SIZE = 10000
# Pattern looked for in each subreddit's description_html by filter_network_reddits().
IN_NETWORK_STRING = 'latinoamerica' # 'AskReddit'
# Debug switch. When True the functions print intermediate state and process_chunk()
# SKIPS the uploads (it only uploads when LOGGING is False), i.e. it acts as a dry run.
LOGGING = False

In [80]:
def fetch_page(after):
    """Fetch one page (up to 100 entries) of Reddit's subreddit listing.

    Args:
        after: Pagination cursor taken from the previous page's result;
            pass '' to request the first page.

    Returns:
        dict with three keys:
            new_after: the ``data.after`` cursor of the response (fetch_all
                stops paginating once this is None).
            new_reddits: list with the ``data`` dict of every listing child.
            number_returned: the ``data.dist`` value of the response.

    Raises:
        requests.exceptions.HTTPError: the server answered with a 4xx/5xx status.
        requests.exceptions.RequestException: connection problems or timeout.
        ValueError: the response body is not valid JSON.
        KeyError: the JSON does not have the expected listing structure.
    """
    resp = requests.get(
        'https://www.reddit.com/reddits.json',
        # Let requests build the query string so the cursor is URL-encoded.
        params={'limit': 100, 'after': after},
        headers={'User-Agent': 'script:melis-thesis:v0.0.1 (by /u/wilburRay)'},
        # Without a timeout a stalled connection would block the scrape forever.
        timeout=30,
    )
    # Surface rate limiting / server errors as a clear HTTPError instead of
    # an obscure JSON or KeyError further down.
    resp.raise_for_status()

    listing = resp.json()['data']
    return dict(
        new_after=listing['after'],
        new_reddits=[child['data'] for child in listing['children']],
        number_returned=listing['dist'],
    )

In [81]:
def upload_file(dataframe, gcp_filename):
    """Serialise a DataFrame to CSV and upload it to the ``meli_thesis`` bucket.

    Args:
        dataframe: pandas DataFrame to upload; written with ``to_csv`` defaults
            (so the index is included as the first column).
        gcp_filename: Name of the destination blob inside the bucket.

    Raises:
        Whatever ``to_csv`` or the storage client raises; the temporary file is
        removed in either case.
    """
    # Use a unique temp file rather than a fixed name in the working directory,
    # so an unrelated file of that name is never overwritten. The '.csv' suffix
    # mirrors the extension of the previous fixed file name.
    fd, temp_filename = tempfile.mkstemp(suffix='.csv')
    os.close(fd)  # to_csv opens the path itself; only the name is needed
    try:
        dataframe.to_csv(temp_filename)

        bucket = storage_client.get_bucket('meli_thesis')
        blob = bucket.blob(gcp_filename)
        blob.upload_from_filename(temp_filename)
    finally:
        # Clean up even when serialisation or the upload fails.
        os.remove(temp_filename)

In [82]:
def filter_network_reddits(dataframe):
    """Keep only the rows whose ``description_html`` contains IN_NETWORK_STRING.

    Rows with a missing description count as non-matching (``na=False``).
    Returns the selected rows of ``dataframe`` with their original index.
    """
    descriptions = dataframe['description_html']
    is_in_network = descriptions.str.contains(IN_NETWORK_STRING, na=False)
    return dataframe[is_in_network]

In [83]:
def process_chunk(list_of_reddits):
    """Upload one chunk of scraped subreddits, raw and filtered.

    Two CSVs are produced per chunk:
        raw/{first_id}-{last_id}-reddits-{count}        every record in the chunk
        network/{first_id}-{last_id}-reddits-{matches}  records kept by
                                                        filter_network_reddits()

    When LOGGING is True nothing is uploaded; the target file names and the
    first rows of each frame are printed instead.

    Args:
        list_of_reddits: list of dicts, each with at least an ``id`` key and the
            ``description_html`` key used by the filter. An empty list is a no-op.
    """
    if not list_of_reddits:
        # Nothing to upload, and there are no ids to build the file names from.
        return

    # upload raw results
    df = pd.DataFrame(list_of_reddits)

    from_id = list_of_reddits[0]['id']
    # The range in the file name ends at the LAST record of the chunk
    # (index 1 was the second record and failed on single-element chunks).
    to_id = list_of_reddits[-1]['id']
    count = len(list_of_reddits)
    raw_gcp_filename = f'raw/{from_id}-{to_id}-reddits-{count}'

    if LOGGING:
        print(f'raw_gcp_filename: {raw_gcp_filename}')
        print(f'raw_head: {df.head(5)}')
    else:
        upload_file(df, raw_gcp_filename)

    # filter out in network reddits
    # upload in network reddits
    in_network = filter_network_reddits(df)

    in_network_count = in_network.shape[0]
    network_gcp_filename = f'network/{from_id}-{to_id}-reddits-{in_network_count}'

    if LOGGING:
        print(f'in_network_filename: {network_gcp_filename}')
        print(f'network_head: {in_network.head(5)}')
    else:
        upload_file(in_network, network_gcp_filename)
    

In [84]:
def fetch_all(after_arg = '', max_retries = 5):
    """Page through the subreddit listing and upload it in chunks.

    Each page from fetch_page() is reduced to the fields id, display_name,
    public_description, description_html and subscribers. Once at least
    UPLOAD_SIZE records have accumulated they are passed to process_chunk()
    and the accumulator is reset. Whatever is left when pagination ends
    (or when the scrape gives up) is processed as a final chunk.

    Args:
        after_arg: Cursor to start from; '' starts at the first page. Use the
            cursor printed on give-up to resume an interrupted scrape.
        max_retries: Number of consecutive failed iterations (fetch, parsing or
            upload) tolerated before the loop stops instead of retrying forever.

    Returns:
        The records of the final chunk only; records from earlier chunks have
        already been handed to process_chunk() and are not included.

    Raises:
        KeyboardInterrupt: always propagated.
        Exception: any error inside the loop is re-raised when LOGGING is True;
            otherwise it is printed and the iteration is retried. Errors from the
            final process_chunk() call always propagate.
    """
    after = after_arg
    all_reddits = []
    total_scraped = 0
    consecutive_failures = 0

    while after is not None:
        if LOGGING:
            print('###### new batch #####')
        try:
            result = fetch_page(after)
            # Build the trimmed records before touching any state, so a
            # malformed page leaves the accumulator and cursor unchanged.
            page_records = [
                dict(
                    id=r['id'],
                    display_name=r['display_name'],
                    public_description=r['public_description'],
                    description_html=r['description_html'],
                    subscribers=r['subscribers'],
                )
                for r in result['new_reddits']
            ]
            all_reddits += page_records
            after = result['new_after']
            total_scraped += result['number_returned']
            time.sleep(1)  # stay polite between successful requests

            if LOGGING:
                print(f'after: {after}')
                print(pd.DataFrame(all_reddits).tail(3))
                print(f'total scraped: {total_scraped}')

            # ">=" rather than "% == 0": a single short page must not prevent
            # every later upload, and an empty accumulator must not trigger one.
            if len(all_reddits) >= UPLOAD_SIZE:
                print('***** uploading data *****')
                process_chunk(all_reddits)
                all_reddits = []

            consecutive_failures = 0
        except KeyboardInterrupt:
            raise
        except Exception as e:
            print(repr(e))
            if LOGGING:
                # Bare raise keeps the original exception and traceback.
                raise
            consecutive_failures += 1
            if consecutive_failures >= max_retries:
                print(f'giving up after {consecutive_failures} consecutive failures; resume with after={after!r}')
                break
            # Back off before retrying instead of hammering the API in a tight loop.
            time.sleep(min(2 ** consecutive_failures, 60))

    if len(all_reddits) > 0:
        print('processing last chunk')
        process_chunk(all_reddits)
    print('done')
    return all_reddits

In [85]:
# Run the scrape from the first page. Uploads happen inside fetch_all(); the list
# returned here only holds the records of the final chunk, because the accumulator
# is reset after every intermediate upload.
all_reddits = fetch_all()

processing last chunk
done


In [86]:
# Load the records returned by the scrape into a DataFrame for ad-hoc inspection.
df = pd.DataFrame(all_reddits)

In [87]:
# (rows, columns) of the inspected frame.
df.shape

(3673, 5)

In [92]:
# Select the row(s) for the subreddit whose display_name is exactly 'mexico'.
# NOTE(review): the saved output of the next cell shows an 'Unnamed: 0' column that the
# df built from all_reddits would not have, and execution counts jump from 87 to 92 —
# presumably df was reassigned in cells no longer in the notebook; confirm before relying
# on Restart & Run All.
mex = df[df['display_name'] == 'mexico']

In [93]:
# Display the matching row(s).
mex.head()

Unnamed: 0,description_html,display_name,id,public_description,subscribers
1161,"&lt;!-- SC_OFF --&gt;&lt;div class=""md""&gt;&lt...",mexico,2qhv7,MÉXICO,181956
