In [1]:
class Document:
    """One document of the collection: identity, raw-size figures and extracted content."""

    def __init__(self, doc_url, doc_id, sz_bytes, sz_words):
        """Remember the document's identity and sizes; content lists start empty.

        doc_url  -- document url
        doc_id   -- unique document id
        sz_bytes -- document size in bytes before deleting html markup
        sz_words -- number of words in document before deleting html markup
        """
        # Identity of the document.
        self.url, self.id = doc_url, doc_id
        # Sizes measured before the html markup was removed.
        self.sz_bytes, self.sz_words = sz_bytes, sz_words
        # Words and links found in the document after deleting html markup;
        # two distinct lists, filled in by whoever parses the document.
        self.words, self.links = [], []

In [2]:
import base64
from bs4 import BeautifulSoup
import pickle
import os

DOCUMENT_TAG = "<document>"
DOC_URL_TAG = "<docURL>"
DOC_ID_TAG = "<docID>"

def preprocess_doc(doc, doc_url, doc_id):
    """Strip html markup from one raw document and collect its words and links.

    doc     -- raw document content (preprocess_file passes base64-decoded bytes)
    doc_url -- document url, stored on the result unchanged
    doc_id  -- document id, stored on the result unchanged

    Returns a Document whose sz_bytes / sz_words are measured on the raw input.
    If the parser rejects the input, a message is printed and the Document is
    returned with empty `words` and `links`.
    """
    document = Document(doc_url, doc_id, len(doc), len(doc.split()))
    try:
        soup = BeautifulSoup(doc, "html.parser")
    except Exception:
        # Best effort: one unparsable document must not abort the whole run.
        # `Exception` (not a bare except) so KeyboardInterrupt and SystemExit
        # still stop a long preprocessing job.
        print("incorrect document:", doc_url, doc_id)
        return document
    # Drop script and style elements so their content is not counted as text.
    for tag in soup(["script", "style"]):
        tag.extract()
    document.links.extend(anchor["href"] for anchor in soup.find_all('a', href=True))
    document.words = soup.get_text(separator=" ").split()
    return document
    
def preprocess_file(d, f):
    """Convert one collection file into a stream of pickled Document objects.

    d -- directory containing the input file
    f -- input file name; the output is written to the same directory under
         the same name with its final extension replaced by ".out"

    The input is read line by line as cp1251 text. A line starting with
    DOCUMENT_TAG carries the base64 document body, a line starting with
    DOC_URL_TAG the base64 url, and a line starting with DOC_ID_TAG the id;
    the id line completes a record, which is then preprocessed and pickled.
    """
    src_path = os.path.join(d, f)
    # Swap only the final extension. str.replace(".xml", ".out") would rewrite
    # every ".xml" inside the name and, for a name without ".xml", would yield
    # the input path itself, which opening with "wb" would then truncate.
    dst_path = os.path.join(d, os.path.splitext(f)[0] + ".out")
    print("preprocessing", src_path)
    with open(src_path, "rt", encoding="cp1251") as fin, open(dst_path, "wb") as fout:
        for line in fin:
            # The fixed slice offsets below cut the payload out of each line;
            # presumably they match the collection's exact tag layout — TODO confirm.
            if line.startswith(DOCUMENT_TAG):
                doc = line[37:-11]
            elif line.startswith(DOC_URL_TAG):
                doc_url = line[8:-10]
            elif line.startswith(DOC_ID_TAG):
                doc_id = line[7:-20]
                # NOTE(review): relies on the document and url lines having
                # been seen before the id line of the same record.
                document = preprocess_doc(base64.b64decode(doc), base64.b64decode(doc_url), doc_id)
                pickle.dump(document, fout)

def preprocess_collection(directory):
    """Run preprocess_file on every entry of `directory` whose name ends with ".xml"."""
    xml_names = (name for name in os.listdir(directory) if name.endswith(".xml"))
    for name in xml_names:
        preprocess_file(directory, name)

In [3]:
class BaseDocumentProcessor:
    """Interface for objects that are fed documents one at a time."""

    def process(self, document):
        """Handle one document; the base implementation does nothing."""
        return None

    def result(self):
        """Report what was gathered; the base implementation does nothing."""
        return None

def process_file(d, f, processor):
    print("processing", os.path.join(d, f))
    with open(os.path.join(d, f), "rb") as fin:
        while True:
            try:
                document = pickle.load(fin)
            except:
                break
            processor.process(document)

def process_collection(directory, processor):
    """Run process_file with `processor` on every entry of `directory` whose name ends with ".out"."""
    out_names = (name for name in os.listdir(directory) if name.endswith(".out"))
    for name in out_names:
        process_file(directory, name, processor)

In [4]:
import matplotlib.pyplot as plt
from statistics import mean

# Path handed to process_collection below; relative, so it is resolved
# against the notebook's working directory.
COLLECTION_DIRECTORY = "byweb" # directory with .out files to process

class PrimaryStatsGetter(BaseDocumentProcessor):
    """Gathers per-document size figures and reports their averages and histograms."""

    def __init__(self):
        """Start with a zero document count and empty per-document series."""
        self._docs_count = 0
        self._sz_bytes_html = []  # document.sz_bytes of every processed document
        self._sz_words_html = []  # document.sz_words of every processed document
        self._sz_bytes = []       # summed length of the extracted words, per document
        self._sz_words = []       # number of extracted words, per document

    def process(self, document):
        """Record the size figures of one document.

        document -- object with `sz_bytes`, `sz_words` and `words` attributes
                    (see the Document class)
        """
        self._docs_count += 1
        self._sz_bytes_html.append(document.sz_bytes)
        self._sz_words_html.append(document.sz_words)
        # Same value as len("".join(words)) without building the joined string.
        # NOTE(review): for str words this is a character count (separators
        # excluded); it equals a byte count only in a single-byte encoding.
        self._sz_bytes.append(sum(len(word) for word in document.words))
        self._sz_words.append(len(document.words))

    @staticmethod
    def _plot_hist(values, label, value_range=None):
        """Draw a 100-bin histogram of `values` on a new figure, labelled `label`."""
        plt.figure(figsize=(14, 5))
        plt.hist(values, bins=100, range=value_range)
        plt.legend(labels=(label,), fontsize="x-large")

    def result(self):
        """Print the collected averages and show one histogram per series.

        With no processed documents only the (zero) total is printed, because
        an average of nothing is undefined. Documents whose raw size is zero
        are left out of the size-ratio series instead of dividing by zero.
        """
        print("total documents:", self._docs_count)
        if not self._docs_count:
            # statistics.mean raises StatisticsError on empty data.
            return

        size_ratio = [
            text_size / raw_size
            for text_size, raw_size in zip(self._sz_bytes, self._sz_bytes_html)
            if raw_size
        ]
        print("average doc size in bytes:", round(mean(self._sz_bytes_html), 0))
        print("average text size in bytes:", round(mean(self._sz_bytes), 0))
        print("average word count including html:", round(mean(self._sz_words_html), 0))
        print("average word count:", round(mean(self._sz_words), 0))
        # The ratio series is empty only if every document had zero raw size.
        print("average size ration:", mean(size_ratio) if size_ratio else float("nan"))

        # Fixed x-ranges cut off the long tails so the bulk of each
        # distribution stays readable; the ratio uses the data's own range.
        self._plot_hist(self._sz_bytes_html, "total doc size", (0, 200000))
        self._plot_hist(self._sz_bytes, "total text size", (0, 20000))
        self._plot_hist(self._sz_words_html, "words including html", (0, 10000))
        self._plot_hist(self._sz_words, "words", (0, 4000))
        self._plot_hist(size_ratio, "size ratio")

        plt.show()
        
# Run the statistics pass: feed every ".out" file found in
# COLLECTION_DIRECTORY to a fresh PrimaryStatsGetter, then print the
# averages and show the histograms.
processor = PrimaryStatsGetter()        
process_collection(COLLECTION_DIRECTORY, processor)
processor.result()

processing byweb/byweb.0.out
processing byweb/byweb.9.out
processing byweb/byweb.1.out
processing byweb/byweb.5.out
processing byweb/byweb.8.out
processing byweb/byweb.7.out
processing byweb/byweb.3.out
processing byweb/byweb.2.out
processing byweb/byweb.6.out
processing byweb/byweb.4.out
total documents: 200000
average doc size in bytes: 39504.0
average text size in bytes: 5172.0
average word count including html: 2732.0
average word count: 819.0
average size ration: 0.13424972203267163


<Figure size 1400x500 with 1 Axes>

<Figure size 1400x500 with 1 Axes>

<Figure size 1400x500 with 1 Axes>

<Figure size 1400x500 with 1 Axes>

<Figure size 1400x500 with 1 Axes>