In [1]:
from google.cloud import bigquery
import google.auth
import polars as pl
from tqdm.auto import tqdm

##### Get credentials to access reddit dataset

In [2]:
# Two ways to authenticate:
#
# (a) Application Default Credentials (ADC) - what this cell uses
#     - install the gcloud cli : https://cloud.google.com/sdk/docs/install
#     - create the credential file : gcloud auth application-default login
#
# (b) a service account key file
#     from google.oauth2 import service_account
#     credentials = service_account.Credentials.from_service_account_file('/path/to/key.json')
#     docs : https://google-auth.readthedocs.io/en/master/user-guide.html

# Project that the BigQuery client is pinned to
project_id = 'cmst-reddit-analysis'

# `project` receives the second value returned by google.auth.default();
# the client below is created with project_id rather than with it.
credentials, project = google.auth.default()
client = bigquery.Client(
    project=project_id,
    credentials=credentials,
)



In [9]:
table_id = 'cmst-reddit-analysis.comments.2016_9_politics'

# Columns to keep; each one must exist in the table schema (verified below)
keys = ['body', 'created_utc', 'id', 'subreddit']

rows_iter = client.list_rows(table_id)  # add max_results=10 for a quick trial run

# Build the field-name -> row-position map from the schema reported by BigQuery
# instead of typing it by hand, so it cannot drift from the real table layout.
columns = {field.name: position for position, field in enumerate(rows_iter.schema)}

# Fail before the long download starts if a requested column does not exist
missing = [key for key in keys if key not in columns]
if missing:
    raise KeyError("columns {} not found in table {}".format(missing, table_id))

df_dict = {key: [] for key in keys}

# Let tqdm drive the iteration and take the total from the iterator itself,
# so the bar is sized to the table and is closed when the loop finishes.
for row in tqdm(rows_iter, total=rows_iter.total_rows):
    for key in keys:
        df_dict[key].append(row[columns[key]])

# Rows containing a null in any kept column are discarded
political_df = pl.DataFrame(df_dict).drop_nulls()
political_df.head()

  0%|          | 0/3650000 [00:00<?, ?it/s]

body,created_utc,id,subreddit
str,i64,str,str
"""So will Trump …",1473788168,"""d7l92vl""","""Republican"""
"""Hello there. I…",1475171434,"""d876i6l""","""NeutralPolitic…"
"""&gt;North Kore…",1473438677,"""d7ftb4m""","""neutralnews"""
"""The incentives…",1474911650,"""d82wm1o""","""NeutralPolitic…"
"""&gt; almost ce…",1474989129,"""d846a85""","""Republican"""


In [11]:
# Number of rows in political_df
political_df.height

3650430

In [12]:
# Persist the comments as a comma-separated file.
# NOTE(review): the destination is an absolute path on an external volume - confirm it exists on your machine.
political_df.write_csv(
    '/Volumes/PortableSSD/css/data/processed/politics_comments_2016_9.csv',
    separator=",",
)

##### Download sample political comments table

In [4]:
# Table holding the sampled political comments
table_id = 'cmst-reddit-analysis.politics_in_sports.sample_2016_2021_political_comments'

# Pull every row into memory; pass max_results=10 to list_rows for a quick trial run
rows_iter = client.list_rows(table_id)
rows = [row for row in rows_iter]
print("Downloaded {} rows from table {}".format(len(rows), table_id))

KeyboardInterrupt: 

Let's look at a sample

In [33]:
# Show the first downloaded row: its values plus the field-name -> index mapping it carries
print(rows[0])

Row(('er1nbsy', 't3_c04mtp', 't1_er1n13t', 1560434110, 'AOC', 'politics_2019', 'politics', 0, 'Lol. I was asking about the other thing.\n\nDont you think you are being manipulative trying to conflate closed borders with dehumanization?', 'Just_WoW_Things', 0), {'id': 0, 'link_id': 1, 'parent_id': 2, 'created_utc': 3, 'subreddit': 4, 'category': 5, 'super_category': 6, 'score': 7, 'body': 8, 'author': 9, 'gilded': 10})


Store all the columns

In [34]:
# Position of every field inside a downloaded row, listed in row order
columns = {
    name: position
    for position, name in enumerate([
        'id',
        'link_id',
        'parent_id',
        'created_utc',
        'subreddit',
        'category',
        'super_category',
        'score',
        'body',
        'author',
        'gilded',
    ])
}

Let's keep id, created time, subreddit, category, super category, body and author, and create a dataframe to store the data.

In [80]:
# Fields to retain; every entry has to be a key of `columns`
keys = ['id', 'created_utc', 'subreddit', 'category', 'super_category', 'body', 'author']

# One list per retained field, filled by reading that field's position in each row
df_dict = {
    key: [row[columns[key]] for row in rows]
    for key in keys
}

# Rows with a null in any retained field are dropped
political_df = pl.DataFrame(df_dict).drop_nulls()
political_df.head()

id,created_utc,subreddit,category,super_category,body,author
str,i64,str,str,str,str,str
"""er1nbsy""",1560434110,"""AOC""","""politics_2019""","""politics""","""Lol. I was ask…","""Just_WoW_Thing…"
"""gq5rux6""",1615164573,"""AOC""","""politics_2019""","""politics""","""stfu Liberal.""","""gbsedillo20"""
"""gj3jztp""",1610531437,"""AOC""","""politics_2019""","""politics""",""" &gt;Fuck that…","""64590949354397…"
"""esrc8yq""",1562221646,"""AOC""","""politics_2019""","""politics""","""Cauliflower is…","""TobiKato"""
"""gyjpk7u""",1621327024,"""AOC""","""politics_2019""","""politics""","""If they build …","""the_lonely_gam…"


##### Save as csv

In [84]:
# Write the sampled political comments out as a comma-separated file
political_df.write_csv(
    '~/sports-language-in-politics/data/processed/politics_sample.csv',
    separator=",",
)

##### Download sample sports comments table and save

Follow the same steps as for the political comments table.

In [93]:
# Table holding the sampled sports comments
table_id = 'cmst-reddit-analysis.sports_language_in_politics.sample_sports_comments'

# Pull every row into memory; pass max_results=10 to list_rows for a quick trial run
rows_iter = client.list_rows(table_id)
rows = list(rows_iter)
print("Downloaded {} rows from table {}".format(len(rows), table_id))

# Read each field by NAME rather than through the positional `columns` map:
# that map was written for the political comments table, and a different
# column order here would silently store the wrong values under each key.
# A field that is absent from this table raises KeyError instead.
# `keys` is the list of retained fields defined in an earlier cell.
df_dict = {key: [row[key] for row in rows] for key in keys}

# Rows with a null in any retained field are dropped
sports_df = pl.DataFrame(df_dict).drop_nulls()

sports_df.write_csv('~/sports-language-in-politics/data/processed/sports_sample.csv', separator=",")

Downloaded 4306780 rows from table cmst-reddit-analysis.sports_language_in_politics.sample_sports_comments
