In [15]:
from google.cloud import bigquery
import google.auth
import polars as pl

##### Get credentials to access reddit dataset

In [16]:
# Two ways to authenticate against BigQuery:
#
# 1. Application Default Credentials (ADC)
#    - install the gcloud CLI : https://cloud.google.com/sdk/docs/install
#    - create the credential file : gcloud auth application-default login
#
# 2. Service account key file
#    from google.oauth2 import service_account
#    credentials = service_account.Credentials.from_service_account_file('/path/to/key.json')
#    docs : https://google-auth.readthedocs.io/en/master/user-guide.html

# This notebook uses option 1. google.auth.default() also hands back a project
# value, but the client below is created with the explicit project_id instead.
project_id = 'cmst-reddit-analysis'
credentials, project = google.auth.default()
client = bigquery.Client(project=project_id, credentials=credentials)



##### Download from political comments table

In [None]:
table_id = 'cmst-reddit-analysis.politics_in_sports.comments_2015-21_politics_and_sports'

# Upper bound on the number of rows requested from the table in one run.
# NOTE(review): this cell used to be annotated "10m at a time" and the CSV written
# further down is named politics_main_10m.csv, yet the limit passed here is
# 20 million. Confirm which of the two is intended before re-running.
MAX_ROWS = 20_000_000

# list() consumes the iterator, so every fetched row is held in memory at once.
rows_iter = client.list_rows(table_id, max_results=MAX_ROWS)
rows = list(rows_iter)
print("Downloaded {} rows from table {}".format(len(rows), table_id))

In [None]:
# Number of rows currently held in `rows` (the list built from the table download).
len(rows)

##### Download sample political comments table

Let's look at a sample

In [6]:
# Show the first downloaded row to see which fields it carries and in what order.
print(rows[0])

Row(('hg3k6ci', 't3_pvt2eg', 't1_hg05a93', 1633870611, 'eu', 'politics_2019', 'politics', 0, "&gt; Under the handover treaty\n\nIt is entirely unrealistic for the UK to set any kind of conditions, when HK issue was illegal from the Chinese standpoint. That land is Chinese and they will do with it what they will, if you like it or not.\n\n&gt;Do you think the EU is stupid enough to trust the CCP after that\n\nYou forget that the factor of Opium wars is very much alive in China and their aspiration to power is partly grounded in the need to prevent any such occurences in the future. Franky, it is irrelevant what EU thinks on this. You should ask yourself does EU inspire any trust or credibility to start with, because I think it doesn't. You can't complain about foreign human rights violations next to the elephant in the room, the US with their wars of aggression. EU is allied with a declared enemy of China and accomplice in almost every American post-ww2 deed in some way. They themselves

Map every column name to its position within a row

In [8]:
# Column names listed so that each name's position in the tuple is the index
# used to read that field out of a downloaded row.
_column_order = (
    'id',
    'link_id',
    'parent_id',
    'created_utc',
    'subreddit',
    'category',
    'super_category',
    'score',
    'body',
    'author',
    'gilded',
)

# name -> positional index
columns = {name: position for position, name in enumerate(_column_order)}

Let's keep id, created time, subreddit, category, super category, body, and author, and create a DataFrame to store the data.

In [9]:
# Columns to keep. Every entry must also be a key of `columns`, because the
# extraction below looks each one up there to find its index within a row.
keys = ['id', 'created_utc', 'subreddit', 'category', 'super_category', 'body', 'author']

# Fail fast with a readable message if `keys` and `columns` have drifted apart,
# rather than a bare KeyError during extraction (or no error at all when `rows`
# happens to be empty).
unknown_keys = [key for key in keys if key not in columns]
if unknown_keys:
    raise KeyError("keys not present in columns: {}".format(unknown_keys))

# One list per kept column, filled by reading that column's index out of every row.
df_dict = {key: [row[columns[key]] for row in rows] for key in keys}

# drop_nulls() discards every row that has a null in any of the kept columns.
political_df = pl.DataFrame(df_dict).drop_nulls()
political_df.head()

id,created_utc,subreddit,category,super_category,body,author
str,i64,str,str,str,str,str
"""hg3k6ci""",1633870611,"""eu""","""politics_2019""","""politics""","""&gt; Under the…","""delete013"""
"""esqrmpn""",1562208395,"""AOC""","""politics_2019""","""politics""","""Yet another de…","""CaptainAssPlun…"
"""fqhvq8d""",1589385433,"""AOC""","""politics_2019""","""politics""","""http://www.chi…","""RanDomino5"""
"""fq6h3em""",1589131399,"""AOC""","""politics_2019""","""politics""","""https://www.no…","""RanDomino5"""
"""ga2pgeh""",1603646163,"""AOC""","""politics_2019""","""politics""","""He has a hard …","""ChiraqBluline"""


In [10]:
# Row count of the cleaned frame, i.e. what is left after drop_nulls().
political_df.height

10000000

##### Save as csv

In [11]:
# Write the cleaned political comments out as a comma-separated file.
political_csv_path = '~/sports-language-in-politics/data/processed/politics_main_10m.csv'
political_df.write_csv(political_csv_path, separator=",")

##### Download sample sports comments table and save

Follow the same steps as for the political comments table.

In [93]:
# Relies on `client`, `keys` and `columns` defined in earlier cells.
table_id = 'cmst-reddit-analysis.sports_language_in_politics.sample_sports_comments'

# No max_results is passed here, so no row limit is applied to the request.
rows_iter = client.list_rows(table_id)
rows = list(rows_iter)
print("Downloaded {} rows from table {}".format(len(rows), table_id))

# One list per kept column, read out of every row by positional index.
df_dict = {key: [row[columns[key]] for row in rows] for key in keys}

# Build the frame and discard rows that have a null in any kept column.
sports_df = pl.DataFrame(df_dict).drop_nulls()

# Write the cleaned sports comments out as a comma-separated file.
sports_csv_path = '~/sports-language-in-politics/data/processed/sports_sample.csv'
sports_df.write_csv(sports_csv_path, separator=",")

Downloaded 4306780 rows from table cmst-reddit-analysis.sports_language_in_politics.sample_sports_comments
