In [1]:
import os
import subprocess
import requests
import gzip
import warcio
import itertools
import concurrent.futures

import numpy as np
import pandas as pd

from datetime import date, timedelta
from bs4 import BeautifulSoup
import newspaper

In [2]:
# List every object under the CC-NEWS prefix of the commoncrawl bucket.
# `--no-sign-request` makes the AWS CLI send unsigned (anonymous) requests.
list_cmd = [
    "aws", "s3", "ls",
    "--recursive",
    "--no-sign-request",
    "s3://commoncrawl/crawl-data/CC-NEWS/",
]
ccnews_bucket = subprocess.check_output(list_cmd)

In [3]:
# NOTE(review): confirm this endpoint still allows anonymous HTTPS downloads;
# Common Crawl also publishes its data under https://data.commoncrawl.org/.
url_prefix = "https://commoncrawl.s3.amazonaws.com/"

# Get all URLs for CC-NEWS datasets.
# splitlines() copes with both "\n" and "\r\n" and does not rely on the
# listing ending with a newline; blank lines are skipped explicitly.
ccnews_info = [line for line in ccnews_bucket.decode().splitlines()
               if line.strip()]

# The object key is the last whitespace-separated field of each listing line.
ccnews_urls = [url_prefix + info.split()[-1]
               for info in ccnews_info]

In [4]:
# Subset URLs based on dates
start_date = date(2021, 3, 4)
end_date = date(2021, 3, 6)

# Every day from start_date to end_date (inclusive) formatted as YYYYMMDD
date_range = end_date - start_date
dates = [(start_date + timedelta(days=offset)).strftime("%Y%m%d")
         for offset in range(date_range.days + 1)]

# Archive names look like "CC-NEWS-<YYYYMMDDhhmmss>-<serial>.warc.gz".
# Anchor the match on the "CC-NEWS-" prefix: a bare substring test also hits
# unrelated timestamps (e.g. "20191220210304" contains "20210304").
# any() keeps each URL at most once.
day_markers = tuple(f"CC-NEWS-{day}" for day in dates)
ccnews_urls_subset = [url for url in ccnews_urls
                      if any(marker in url for marker in day_markers)]

In [5]:
def download_ccnews_file(url):
    """
    Download one CC-NEWS archive into the local "CC-NEWS warc gz" directory.

    Does not download file if there already exists a corresponding file.

    The response body is streamed in chunks to a temporary "<name>.part" file
    and renamed to its final name only after the transfer has completed, so an
    interrupted or failed download is never mistaken for a finished file on
    the next run.

    Parameters
    ----------
    url : str
        URL of the archive. Its last path component is used as the local
        file name.

    Returns
    -------
    None

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status code.
    requests.RequestException
        On connection problems or timeouts.
    """
    out_dir = "CC-NEWS warc gz"
    f_path = os.path.join(out_dir, url.split('/')[-1])

    if os.path.exists(f_path):
        print(f"File already downloaded: {url}")
        return

    os.makedirs(out_dir, exist_ok=True)
    print(f"Downloading file from:   {url}")

    tmp_path = f_path + ".part"
    try:
        # stream=True avoids holding the whole archive in memory at once
        with requests.get(url, stream=True, timeout=60) as r:
            # Fail loudly instead of saving an HTML/XML error page as an archive
            r.raise_for_status()
            with open(tmp_path, 'wb') as f_out:
                for chunk in r.iter_content(chunk_size=1024 * 1024):
                    f_out.write(chunk)
        os.replace(tmp_path, f_path)
    finally:
        # Only present here if the download did not complete
        if os.path.exists(tmp_path):
            os.remove(tmp_path)

In [6]:
%%time
with concurrent.futures.ProcessPoolExecutor(max_workers=10) as executor:
    executor.map(download_ccnews_file, ccnews_urls_subset[:8])

File already downloaded: https://commoncrawl.s3.amazonaws.com/crawl-data/CC-NEWS/2021/03/CC-NEWS-20210304010722-00535.warc.gzFile already downloaded: https://commoncrawl.s3.amazonaws.com/crawl-data/CC-NEWS/2021/03/CC-NEWS-20210304063408-00538.warc.gz
File already downloaded: https://commoncrawl.s3.amazonaws.com/crawl-data/CC-NEWS/2021/03/CC-NEWS-20210304080104-00539.warc.gz
File already downloaded: https://commoncrawl.s3.amazonaws.com/crawl-data/CC-NEWS/2021/03/CC-NEWS-20210304103139-00541.warc.gzFile already downloaded: https://commoncrawl.s3.amazonaws.com/crawl-data/CC-NEWS/2021/03/CC-NEWS-20210304113444-00542.warc.gz

File already downloaded: https://commoncrawl.s3.amazonaws.com/crawl-data/CC-NEWS/2021/03/CC-NEWS-20210304045648-00537.warc.gz
File already downloaded: https://commoncrawl.s3.amazonaws.com/crawl-data/CC-NEWS/2021/03/CC-NEWS-20210304092001-00540.warc.gz

File already downloaded: https://commoncrawl.s3.amazonaws.com/crawl-data/CC-NEWS/2021/03/CC-NEWS-20210304030328-00536.

In [6]:
def _is_nz_url(url):
    """Return True when the host name of `url` ends in the `.nz` TLD."""
    from urllib.parse import urlparse

    if not url:
        return False
    try:
        host = urlparse(url).hostname
    except ValueError:
        return False
    return bool(host) and host.rstrip('.').endswith('.nz')


def _split_content_type(content_type):
    """Split a Content-Type header into (lower-cased media type, charset).

    A missing header is treated as "text/html"; a missing or empty charset
    parameter defaults to "utf-8".
    """
    if content_type is None:
        return "text/html", "utf-8"

    media_type, _, params = content_type.partition(';')
    charset = "utf-8"
    for param in params.split(';'):
        key, _, value = param.partition('=')
        value = value.strip().strip('"\'')
        if key.strip().lower() == 'charset' and value:
            charset = value
            break
    return media_type.strip().lower(), charset


def process_ccnews_warc_gz(fname):
    """
    Extract the article text of New Zealand HTML pages from one WARC archive.

    Only records of type 'response' are considered ('warcinfo' is metadata for
    the whole WARC batch, 'request' is what the client sent to the server).
    A record is skipped when its target host is not under `.nz`, when its
    Content-Type is not text/html, when its body cannot be decoded with the
    declared charset, or when no text could be extracted from it.

    Parameters
    ----------
    fname : str
        File name of a gzip-compressed WARC file inside "CC-NEWS warc gz".

    Returns
    -------
    list of [str, str, str]
        One `[WARC-Date, WARC-Target-URI, article text]` entry per kept page.
    """
    print(f"Processing: {fname}")
    articles_list = []
    fpath = os.path.join("CC-NEWS warc gz", fname)

    with gzip.open(fpath, 'rb') as f_in:
        for record in warcio.archiveiterator.ArchiveIterator(f_in):
            if record.rec_type != 'response':
                continue

            target_url = record.rec_headers.get_header('WARC-Target-URI')
            # Only take New Zealand URLs (decided on the host name, so a
            # ".nz/" that merely appears in the path does not count)
            if not _is_nz_url(target_url):
                continue

            http_headers = record.http_headers
            content_type = (http_headers.get_header('Content-Type')
                            if http_headers is not None else None)
            doctype, encoding = _split_content_type(content_type)

            # Process article - can only process HTML
            if doctype != 'text/html':
                continue

            try:
                html = record.content_stream().read().decode(encoding)
            except (LookupError, UnicodeDecodeError):
                # Unknown charset name or bytes that do not match it: skip the
                # record rather than reuse the previous record's HTML
                print(f"Unable to be decoded: {target_url}")
                continue

            article = newspaper.Article(url='')
            article.set_html(html)
            article.parse()
            text = article.text
            # Exclude webpage if no text was extracted (empty strings would
            # also turn into NaN once written to and re-read from CSV)
            if not text or not text.strip():
                continue

            warc_date = record.rec_headers.get_header('WARC-Date')
            articles_list.append([
                warc_date,
                target_url,
                text
            ])

    return articles_list

In [7]:
%%time
fnames = [fname for fname in os.listdir("CC-NEWS warc gz") if fname != ".ipynb_checkpoints"][:1]
with concurrent.futures.ProcessPoolExecutor(max_workers=4) as executor:
    articles_raw = list(executor.map(process_ccnews_warc_gz, fnames))

Processing: CC-NEWS-20210306223840-00585.warc.gz
CPU times: user 20.7 ms, sys: 16.1 ms, total: 36.8 ms
Wall time: 54 s


In [30]:
# Merge the per-archive results into a single flat list of rows
articles_list = [row for file_rows in articles_raw for row in file_rows]

# Build a DataFrame with one named column per row field
column_names = ['Datetime', 'URL', 'Text']
articles_df = pd.DataFrame.from_records(articles_list, columns=column_names)

# Write the DataFrame to CSV, leaving out the index
articles_df.to_csv('nz-articles.csv', index=False)

In [None]:
# Summary of the extracted articles: column dtypes and non-null counts
articles_df.info()

In [None]:
# Preview the first rows of the extracted articles
articles_df.head()

In [None]:
# Show the extracted text of the article(s) whose URL contains this slug
url_fragment = "roger-tuivasa-sheck-admits-feeling-weight-lifted-off-his-shoulders-now-switch-rugby-union-confirmed"
matches_fragment = articles_df['URL'].str.contains(url_fragment)
articles_df.loc[matches_fragment, 'Text']

In [None]:
# Third column (position 2) of the row at position 122.
# NOTE(review): the row position is hard-coded and raises IndexError if the
# frame has fewer than 123 rows - confirm this spot check is still wanted.
articles_df.iloc[122, 2]

In [None]:
# Print the URLs of (at most) the first 100 articles; head() stops at the end
# of the frame, so this also works when fewer than 100 rows were extracted.
for url in articles_df['URL'].head(100):
    print(url)

In [41]:
# Debug helper: print the WARC headers of the first few 'response' records of
# one archive, regardless of the site they come from.
fpath = "CC-NEWS warc gz/CC-NEWS-20210304010722-00535.warc.gz"
n_records_to_show = 10

with gzip.open(fpath, 'rb') as f_in:
    # rec_type='warcinfo' is metadata for the entire WARC batch
    # rec_type='request' is the information sent from client to server
    responses = (record
                 for record in warcio.archiveiterator.ArchiveIterator(f_in)
                 if record.rec_type == 'response')

    # islice stops after n_records_to_show records without reading further
    for record in itertools.islice(responses, n_records_to_show):
        print(record.rec_headers)
        print()

WARC/1.0
WARC-Record-ID: <urn:uuid:f0380079-8238-498a-9d6e-1597952b4973>
Content-Length: 75230
WARC-Date: 2021-03-04T01:07:22Z
WARC-Type: response
WARC-Target-URI: https://www.debate.com.mx/culiacan/Busca-Infonavit-subir-el-numero-en-aprobacion-de-creditosen-Sinaloa-20210303-0267.html
Content-Type: application/http; msgtype=response
WARC-Payload-Digest: sha1:FBQBDFRHJMNPE2OOGCXIEVOWWOV3FZDB
WARC-Block-Digest: sha1:GM5KU3RTQOKOAP7XZSAXTSJICAN6TALW


WARC/1.0
WARC-Record-ID: <urn:uuid:3d68adc6-9b2c-4be6-b16b-ac0079c35f15>
Content-Length: 42113
WARC-Date: 2021-03-04T01:07:22Z
WARC-Type: response
WARC-Target-URI: http://www.digitaljournal.com/tech-and-science/science/op-ed-earth-magnetic-pole-flip-info-non-info-and-godawful-info/article/586410
Content-Type: application/http; msgtype=response
WARC-Payload-Digest: sha1:AKDQ5JO73Z6LM3WWWQHW7ZXWPUK4OBBC
WARC-Block-Digest: sha1:RC4T4RHGUUJKUTAMD6EPUDQHGYLCPNDQ


WARC/1.0
WARC-Record-ID: <urn:uuid:fecd28be-80f1-4f8f-84bf-c14d2776e342>
Content-Le