In [1]:
class Document:
    """One document of the collection together with its extracted content.

    Attributes:
        url: document url.
        id: unique document id (described as an int by the original author).
        sz_bytes: document size in bytes before deleting html markup.
        sz_words: number of words in the document before deleting html markup.
        words: list of words in the document after deleting html markup.
        links: list of links in the document.
    """

    def __init__(self, doc_url, doc_id, sz_bytes, sz_words):
        # Metadata handed in by the caller.
        self.url, self.id = doc_url, doc_id
        self.sz_bytes, self.sz_words = sz_bytes, sz_words
        # Content containers start empty; each instance owns its own lists.
        self.words, self.links = [], []

In [2]:
import base64
from bs4 import BeautifulSoup
import pickle
import os

DOCUMENT_TAG = "<document>"
DOC_URL_TAG = "<docURL>"
DOC_ID_TAG = "<docID>"

def preprocess_doc(doc, doc_url, doc_id):
    """Build a Document from raw markup: strip scripts/styles, collect links and words.

    Args:
        doc: raw document markup; its length and whitespace-split token count
            are recorded as the pre-cleanup size statistics.
        doc_url: document url, stored on the result unchanged.
        doc_id: document id, stored on the result unchanged.

    Returns:
        A Document. If the markup cannot be parsed, the returned Document
        carries only the size statistics and has empty `words` and `links`.
    """
    document = Document(doc_url, doc_id, len(doc), len(doc.split()))
    try:
        soup = BeautifulSoup(doc, "html.parser")
    except Exception:
        # Catch only ordinary errors: a bare `except` would also swallow
        # KeyboardInterrupt/SystemExit and make a long run impossible to stop.
        print("incorrect document:", doc_url, doc_id)
        return document
    # Remove <script> and <style> elements so their contents do not show up as words.
    for tag in soup(["script", "style"]):
        tag.extract()
    # Collect the href of every anchor that has one.
    for anchor in soup.find_all('a', href=True):
        document.links.append(anchor["href"])
    document.words = soup.get_text(separator=" ").split()
    return document
    
def preprocess_file(d, f):
    """Convert one collection file into a stream of pickled Document objects.

    The input `d/f` is read as cp1251 text, line by line. Lines starting with
    DOCUMENT_TAG and DOC_URL_TAG carry base64 payloads that are remembered;
    a line starting with DOC_ID_TAG completes the record, at which point the
    payloads are decoded, passed to preprocess_doc, and the result is pickled
    to the output file. The output name is `f` with ".xml" replaced by ".out".

    Args:
        d: directory containing the input file.
        f: input file name inside `d`.
    """
    src_path = os.path.join(d, f)
    print("preprocessing", src_path)
    with open(src_path, "rt", encoding="cp1251") as fin:
        with open(os.path.join(d, f.replace(".xml", ".out")), "wb") as fout:
            for line in fin:
                if line.startswith(DOCUMENT_TAG):
                    # Fixed-width wrapper: 37 leading and 11 trailing characters
                    # surround the payload (presumably opening/closing tags plus
                    # the newline -- confirm against the collection format).
                    encoded_body = line[37:-11]
                    continue
                if line.startswith(DOC_URL_TAG):
                    # 8 leading characters match the length of "<docURL>".
                    encoded_url = line[8:-10]
                    continue
                if not line.startswith(DOC_ID_TAG):
                    continue
                # The id line closes the record; the id is kept as the raw text slice.
                doc_id = line[7:-20]
                document = preprocess_doc(
                    base64.b64decode(encoded_body),
                    base64.b64decode(encoded_url),
                    doc_id,
                )
                pickle.dump(document, fout)

def preprocess_collection(directory):
    """Run preprocess_file on every entry of `directory` whose name ends in ".xml"."""
    xml_names = (name for name in os.listdir(directory) if name.endswith(".xml"))
    for name in xml_names:
        preprocess_file(directory, name)

In [3]:
class BaseDocumentProcessor:
    """Interface for objects passed to process_file / process_collection.

    Both hooks do nothing here; subclasses override them.
    """

    def process(self, document):
        """Handle a single document. The base implementation is a no-op."""
        return None

    def result(self):
        """Report what was accumulated. The base implementation is a no-op."""
        return None

def process_file(d, f, processor):
    """Feed every pickled object stored in `d/f` to `processor.process`.

    The file is read as a sequence of back-to-back pickles until the end of
    the file is reached.

    Args:
        d: directory containing the file.
        f: file name inside `d`.
        processor: object exposing `process(document)`.

    Raises:
        Any exception raised by `processor.process`, or by unpickling for a
        reason other than end-of-file. These are no longer swallowed, so a
        processor bug cannot silently truncate the run.
    """
    path = os.path.join(d, f)
    print("processing", path)
    with open(path, "rb") as fin:
        while True:
            try:
                # NOTE(security): pickle.load can execute arbitrary code; only
                # read files produced by a trusted preprocessing run.
                document = pickle.load(fin)
            except EOFError:
                # Normal termination: no more pickled objects in the file.
                break
            # Kept outside the try so processor errors surface to the caller.
            processor.process(document)

def process_collection(directory, processor):
    """Run process_file with `processor` on every ".out" entry of `directory`."""
    out_names = [name for name in os.listdir(directory) if name.endswith(".out")]
    for name in out_names:
        process_file(directory, name, processor)

In [182]:
# Path (relative to the notebook's working directory) handed to process_collection below.
COLLECTION_DIRECTORY = "byweb" # directory with .out files to process

class PrimaryStatsGetter(BaseDocumentProcessor):
    """Document processor that counts how many documents it has been given."""

    def __init__(self):
        """Start with an empty tally."""
        self._docs_count = 0

    def process(self, document):
        """Register one more document.

        document: Document instance; its contents are not inspected here.
        """
        self._docs_count = self._docs_count + 1

    def result(self):
        """Print the number of documents processed so far."""
        total = self._docs_count
        print("total documents:", total)

# Count the documents in every ".out" file of the collection and print the total.
processor = PrimaryStatsGetter()        
process_collection(COLLECTION_DIRECTORY, processor)
processor.result()

processing byweb\byweb.0.out
processing byweb\byweb.1.out
processing byweb\byweb.2.out
processing byweb\byweb.3.out
processing byweb\byweb.4.out
processing byweb\byweb.5.out
processing byweb\byweb.6.out
processing byweb\byweb.7.out
processing byweb\byweb.8.out
processing byweb\byweb.9.out
total documents: 200000
