In [2]:
# Pinned to the release this notebook was last executed against (see the
# install log below) so that a fresh kernel rebuilds the same environment
# instead of pulling whatever version happens to be newest.
%pip install firebase-admin==4.5.0

Collecting firebase-admin
  Downloading firebase_admin-4.5.0-py3-none-any.whl (111 kB)
[K     |████████████████████████████████| 111 kB 9.4 MB/s eta 0:00:01
[?25hCollecting cachecontrol>=0.12.6
  Downloading CacheControl-0.12.6-py2.py3-none-any.whl (19 kB)
Installing collected packages: cachecontrol, firebase-admin
Successfully installed cachecontrol-0.12.6 firebase-admin-4.5.0
Note: you may need to restart the kernel to use updated packages.


In [3]:
import firebase_admin
from firebase_admin import credentials
from firebase_admin import firestore

In [4]:
# Use the application default credentials
cred = credentials.ApplicationDefault()

# initialize_app() raises ValueError when the default app already exists, so
# this cell used to fail whenever it was re-run in a live kernel. get_app()
# raises ValueError only when no default app exists yet, so we initialise on
# the first execution and reuse the existing app on every later one.
try:
    firebase_admin.get_app()
except ValueError:
    firebase_admin.initialize_app(cred, {
      'projectId': 'tldr-278619',
    })

# Firestore client bound to the default Firebase app.
db = firestore.client()

In [5]:
def is_good_tldr(doc_id, doc_dict):
    """Return the "good TL;DR" label for one document.

    Args:
        doc_id: Identifier of the document; only used in the log message.
        doc_dict: Mapping of the document's fields.

    Returns:
        ``True`` when ``doc_dict['skip_reason']`` equals ``'p'`` (a message is
        printed in that case). Otherwise the stored ``'published'`` value is
        returned unchanged, or ``False`` when that key is absent.
    """
    flagged_for_publish = doc_dict.get('skip_reason') == 'p'

    if not flagged_for_publish:
        # No override: fall back to whatever the document itself records.
        return doc_dict.get('published', False)

    print(f'Document {doc_id} should be published')
    return True

def output_doc(writer, doc):
    """Write one document as a CSV row.

    Args:
        writer: Object exposing ``writerow(dict)``; receives a row with the
            keys ``id``, ``title``, ``summary`` and ``good_tldr``.
        doc: Object exposing ``.id`` and ``.to_dict()``.

    Returns:
        None. Nothing is written when ``doc.to_dict()`` is empty or ``None``.

    Raises:
        KeyError: If the document has fields but lacks ``title`` or ``summary``.
    """
    identifier, fields = doc.id, doc.to_dict()

    # Documents without any payload contribute no row.
    if not fields:
        return

    writer.writerow({
        'id': identifier,
        'title': fields['title'],
        'summary': fields['summary'],
        'good_tldr': is_good_tldr(identifier, fields),
    })

In [6]:
# stream() hands back a one-shot iterator: once the export loop has walked it,
# re-running that loop would see no documents and overwrite the CSV with only
# a header. Materialising the snapshots keeps `docs` re-iterable.
# NOTE(review): this holds the whole `urls` collection in memory — assumed to
# be small enough here; confirm.
docs = list(db.collection(u'urls').stream())

In [7]:
import itertools

import logging
import os

import csv


# Dump every document in `docs` to a local CSV, then report how many documents
# were visited. The count covers every document iterated, whether or not
# output_doc() actually wrote a row for it.
LOCAL_CSV_PATH = 'local-summary-training-data.csv'
fieldnames = ['id', 'title', 'summary', 'good_tldr']

docs_count = 0

# newline='' lets the csv module control line endings itself.
with open(LOCAL_CSV_PATH, 'w', newline='') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()

    # enumerate(..., start=1) leaves docs_count at the number of documents
    # seen; it stays 0 when `docs` yields nothing.
    for docs_count, doc in enumerate(docs, start=1):
        output_doc(writer, doc)

print(f'Processed {docs_count} documents')

Document https___arstechnica.com_gaming_2020_09_microsoft-confirms-499-xbox-series-x-for-nov-10_ should be published
Document https___arstechnica.com_gaming_2020_10_facebooks-cloud-gaming-offering-focuses-on-free-to-play-mobile-games_ should be published
Document https___techcrunch.com_2020_07_07_nvidias-ampere-gpus-come-to-google-cloud_amp_ should be published
Document https___techcrunch.com_2020_09_08_general-motors-takes-2-billion-stake-electric-truck-startup-nikola_ should be published
Document https___techcrunch.com_2020_09_14_bytedance-wont-sell-tiktok-to-microsoft-or-oracle_ should be published
Document https___techcrunch.com_2020_09_24_google-meet-and-other-google-services-go-down_ should be published
Document https___thenextweb.com_apps_2020_09_14_oracle-beats-microsofts-bid-for-tiktok_ should be published
Document https___thenextweb.com_plugged_2020_09_09_android-11-here-are-the-8-best-new-features_ should be published
Document https___u.today_1000000-in-bitcoin-fails-to-help

In [9]:
# Preview the first 20 physical lines of the exported CSV.
# Every line read from the file already ends with its own newline, so print()
# must not append a second one — otherwise the preview comes out double-spaced.
with open('local-summary-training-data.csv', 'r') as lines:
    for line in itertools.islice(lines, 20):
        print(line, end='')

id,title,summary,good_tldr

http___techcrunch.com_2017_02_23_website-builder-wix-acquires-art-community-deviantart-for-36m_,Website builder Wix acquires art community DeviantArt for $36M – TechCrunch,"Wix .com has made another acquisition to build out the tools that it provides to users to build and administer websites: it has acquired DeviantArt, an online community for artists, designers and art/design enthusiasts with some 325 million individual pieces of original art and more than 40 million registered members, for $36 million in cash, including $3 million of assumed liabilities. Updated detail related to DeviantArt’s valuation prior to its sale.",False

http___techcrunch.com_2020_07_02_festos-latest-biomimetic-robots-are-a-flying-feathered-bird-and-ball-bottomed-helper-arm_,Festo’s latest biomimetic robots are a flying feathered bird and ball-bottomed helper arm – TechCrunch,"You could be excused for thinking that German robotics company Festo does nothing but put together fabulou

In [26]:
from google.cloud import storage
    
# Upload the locally generated CSV to a Cloud Storage bucket.
# Console view: https://console.cloud.google.com/storage/browser/[bucket-id]/
GCS_PROJECT = 'tldr-news-discovery'
GCS_BUCKET = 'tldr-training-dataset'
GCS_OBJECT_NAME = 'summary-training-data.csv'
LOCAL_CSV = 'local-summary-training-data.csv'

client = storage.Client(project=GCS_PROJECT)
bucket = client.get_bucket(GCS_BUCKET)

# The blob handle names the destination object; the upload sends the file.
blob = bucket.blob(GCS_OBJECT_NAME)
blob.upload_from_filename(LOCAL_CSV)