In [1]:
class Document:
    def __init__(self, doc_url, doc_id, sz_bytes, sz_words):
        self.url = doc_url       # document url
        self.id = doc_id         # unique document id (int)
        self.sz_bytes = sz_bytes # document size in bytes before deleting html markup
        self.sz_words = sz_words # number of words in document before deleting html markup
        self.words = []          # list of words in document after deleting html markup
        self.links = []          # lisk of links in document

In [2]:
import base64
from bs4 import BeautifulSoup
import pickle
import os

DOCUMENT_TAG = "<document>"
DOC_URL_TAG = "<docURL>"
DOC_ID_TAG = "<docID>"

def preprocess_doc(doc, doc_url, doc_id):
    document = Document(doc_url, doc_id, len(doc), len(doc.split()))
    try:
        soup = BeautifulSoup(doc, "html.parser")
    except:
        print("incorrect document:", doc_url, doc_id)
        return document
    for script in soup(["script", "style"]):
        script.extract()
    for link in soup.find_all('a', href=True):
        document.links.append(link["href"])
    document.words = soup.get_text(separator=" ").split()
    return document
    
def preprocess_file(d, f):
    print("preprocessing", os.path.join(d, f))
    with open(os.path.join(d, f), "rt", encoding="cp1251") as fin, open(os.path.join(d, f.replace(".xml", ".out")), "wb") as fout:
        for line in fin:
            if line.startswith(DOCUMENT_TAG):
                doc = line[37:-11]
            elif line.startswith(DOC_URL_TAG):
                doc_url = line[8:-10]
            elif line.startswith(DOC_ID_TAG):
                doc_id = line[7:-20]
                document = preprocess_doc(base64.b64decode(doc), base64.b64decode(doc_url), doc_id)
                pickle.dump(document, fout)

def preprocess_collection(directory):
    for file in os.listdir(directory):
        if (file.endswith(".xml")):
            preprocess_file(directory, file)

In [3]:
from tqdm import tqdm
pbar = tqdm(total = 200000 * 3)

  0%|          | 0/600000 [00:00<?, ?it/s]

In [4]:

class BaseDocumentProcessor:
    def process(self, document):
        pass
    def result(self):
        pass

def process_file(d, f, processor):
    print("processing", os.path.join(d, f))
    with open(os.path.join(d, f), "rb") as fin:
        while True:
            pbar.update(1)
            try:
                document = pickle.load(fin)
            except:
                break
            processor.process(document)

def process_collection(directory, processor):
    for file in os.listdir(directory):
        if (file.endswith(".out")):
            process_file(directory, file, processor)

In [5]:
import matplotlib.pyplot as plt
from statistics import mean

COLLECTION_DIRECTORY = "byweb" # directory with .out files to process

class PrimaryStatsGetter(BaseDocumentProcessor):
    def __init__(self):
        """ do all initialization here """
        self._docs_count = 0
        self._sz_bytes_html = []
        self._sz_words_html = []
        self._sz_bytes = []
        self._sz_words = []
    
    def process(self, document):
        """ document: Document (see first cell)
            process each document here """
        self._docs_count += 1
        self._sz_bytes_html.append(document.sz_bytes)
        self._sz_words_html.append(document.sz_words)
        self._sz_bytes.append(len("".join(document.words)))
        self._sz_words.append(len(document.words))
    
    def result(self):
        """ summarize and output all data """
        size_ratio = [x / y for x, y in zip(self._sz_bytes, self._sz_bytes_html)]
        print("total documents:", self._docs_count)
        print("average doc size in bytes:", round(mean(self._sz_bytes_html), 0))
        print("average text size in bytes:", round(mean(self._sz_bytes), 0))
        print("average word count including html:", round(mean(self._sz_words_html), 0))
        print("average word count:", round(mean(self._sz_words), 0))
        print("average size ration:", mean(size_ratio))
        
        plt.figure(figsize=(14, 5))
        plt.hist(self._sz_bytes_html, bins=100, range=(0, 200000))
        plt.legend(labels=("total doc size",), fontsize="x-large")
        
        plt.figure(figsize=(14, 5))
        plt.hist(self._sz_bytes, bins=100, range=(0, 20000))
        plt.legend(labels=("total text size",), fontsize="x-large")
        
        plt.figure(figsize=(14, 5))
        plt.hist(self._sz_words_html, bins=100, range=(0, 10000))
        plt.legend(labels=("words including html",), fontsize="x-large")
        
        plt.figure(figsize=(14, 5))
        plt.hist(self._sz_words, bins=100, range=(0, 4000))
        plt.legend(labels=("words",), fontsize="x-large")
        
        plt.figure(figsize=(14, 5))
        plt.hist(size_ratio, bins=100)
        plt.legend(labels=("size ratio",), fontsize="x-large")
        
        plt.show()
        
processor = PrimaryStatsGetter()        
process_collection(COLLECTION_DIRECTORY, processor)
processor.result()

  0%|          | 209/600000 [00:03<281:46:02,  1.69s/it]

processing byweb/byweb.5.out


  3%|▎         | 20044/600000 [00:33<13:11, 732.92it/s] 

processing byweb/byweb.4.out


  7%|▋         | 40018/600000 [00:59<11:07, 838.68it/s] 

processing byweb/byweb.6.out


 10%|█         | 60138/600000 [01:21<08:42, 1032.90it/s]

processing byweb/byweb.7.out


 13%|█▎        | 80015/600000 [01:40<09:01, 960.62it/s] 

processing byweb/byweb.3.out


 17%|█▋        | 100041/600000 [02:01<07:57, 1047.77it/s]

processing byweb/byweb.2.out


 20%|██        | 120156/600000 [02:19<07:55, 1008.56it/s]

processing byweb/byweb.0.out


 23%|██▎       | 140213/600000 [02:37<06:28, 1184.77it/s]

processing byweb/byweb.1.out


 27%|██▋       | 160144/600000 [02:53<07:06, 1030.11it/s]

processing byweb/byweb.9.out


 30%|███       | 180126/600000 [03:09<07:34, 923.63it/s] 

processing byweb/byweb.8.out


 33%|███▎      | 200010/600000 [03:26<04:08, 1610.12it/s]

total documents: 200000
average doc size in bytes: 39504.0
average text size in bytes: 5172.0
average word count including html: 2732.0
average word count: 819.0
average size ration: 0.13424972203267163


<Figure size 1400x500 with 1 Axes>

<Figure size 1400x500 with 1 Axes>

<Figure size 1400x500 with 1 Axes>

<Figure size 1400x500 with 1 Axes>

<Figure size 1400x500 with 1 Axes>

In [6]:
COLLECTION_DIRECTORY = "byweb" # directory with .out files to process
document_urls = {}

class GetDocUrls(BaseDocumentProcessor):
    def __init__(self):
        """ do all initialization here """
    
    def process(self, document):
        """ document: Document (see first cell)
            process each document here """
        document_url = str(document.url)[2:-1].split("?")[0].split("#")[0]
        if document_url[-1] == '/':
            document_url = document_url[:-1]
        document_urls[document_url] = True
        
    def result(self):
        pass
        
processor = GetDocUrls()        
process_collection(COLLECTION_DIRECTORY, processor)
processor.result()

processing byweb/byweb.5.out


 37%|███▋      | 220134/600000 [03:47<07:04, 895.08it/s] 

processing byweb/byweb.4.out


 40%|████      | 240159/600000 [04:06<05:38, 1063.95it/s]

processing byweb/byweb.6.out


 43%|████▎     | 260180/600000 [04:24<04:39, 1216.73it/s]

processing byweb/byweb.7.out


 47%|████▋     | 280051/600000 [04:42<04:44, 1125.89it/s]

processing byweb/byweb.3.out


 50%|█████     | 300055/600000 [05:02<04:35, 1089.22it/s]

processing byweb/byweb.2.out


 53%|█████▎    | 320168/600000 [05:21<04:06, 1134.35it/s]

processing byweb/byweb.0.out


 57%|█████▋    | 340148/600000 [05:41<03:39, 1185.02it/s]

processing byweb/byweb.1.out


 60%|██████    | 360174/600000 [05:56<03:36, 1106.36it/s]

processing byweb/byweb.9.out


 63%|██████▎   | 380186/600000 [06:15<03:19, 1100.66it/s]

processing byweb/byweb.8.out


 67%|██████▋   | 399957/600000 [06:34<02:56, 1132.03it/s]

In [7]:
from tqdm import tqdm_notebook
import networkx as nx
import operator

COLLECTION_DIRECTORY = "byweb" # directory with .out files to process

class GraphBuider(BaseDocumentProcessor):
    def __init__(self):
        """ do all initialization here """
        self.edge_list = []
    
    def process(self, document):
        """ document: Document (see first cell)
            process each document here """
        links = document.links
        document_url = str(document.url)[2:-1].split("?")[0].split("#")[0]
        if document_url[-1] == '/':
            document_url = document_url[:-1]
        links_absolute_no_params = []
        for link in links:
            if not link:
                continue
            link_absolute_path = link    
            if not (link_absolute_path.startswith("http://") or link_absolute_path.startswith("https://")):
                if link_absolute_path[0] == '.':
                    link_absolute_path = link_absolute_path[2:]
                link_absolute_path = document_url.rsplit("/", 1)[0] + "/" + link_absolute_path
            link_absolute_no_params = link_absolute_path.split("?")[0].split("#")[0]
            if link_absolute_no_params[-1] == '/':
                    link_absolute_no_params = link_absolute_no_params[:-1]
            if not (link_absolute_no_params in links_absolute_no_params):
                links_absolute_no_params.append(link_absolute_no_params)
        for link in links_absolute_no_params:
            if link in document_urls:
                self.edge_list.append((document_url, link))
        
    def result(self):
        graph = nx.DiGraph(self.edge_list)
        top_300_page_rank = sorted(nx.pagerank(graph).items(), key=operator.itemgetter(1), reverse=True)[:300]
        top_300_page_rank = [x[0] for x in top_300_page_rank]
        with open("graph4.csv", "wt") as fout:
            for in_vertex, out_vertex in tqdm_notebook(graph.edges()):
                if in_vertex in top_300_page_rank and out_vertex in top_300_page_rank:
                    fout.write(in_vertex)
                    fout.write(";")
                    fout.write(out_vertex)
                    fout.write('\n')    
        
processor = GraphBuider()        
process_collection(COLLECTION_DIRECTORY, processor)
processor.result()

 67%|██████▋   | 400089/600000 [06:35<07:54, 420.91it/s] 

processing byweb/byweb.5.out


 70%|███████   | 420153/600000 [06:57<03:35, 836.20it/s] 

processing byweb/byweb.4.out


 73%|███████▎  | 440138/600000 [07:18<03:10, 840.19it/s] 

processing byweb/byweb.6.out


 77%|███████▋  | 460149/600000 [07:40<02:34, 906.78it/s] 

processing byweb/byweb.7.out


 80%|████████  | 480053/600000 [08:03<02:15, 882.33it/s] 

processing byweb/byweb.3.out


 83%|████████▎ | 500126/600000 [08:28<02:19, 714.92it/s] 

processing byweb/byweb.2.out


 87%|████████▋ | 520150/600000 [08:51<01:28, 898.12it/s] 

processing byweb/byweb.0.out


 90%|█████████ | 540160/600000 [09:15<01:02, 960.44it/s]

processing byweb/byweb.1.out


 93%|█████████▎| 560130/600000 [09:39<00:52, 753.85it/s] 

processing byweb/byweb.9.out


 97%|█████████▋| 580136/600000 [10:04<00:28, 700.61it/s] 

processing byweb/byweb.8.out


100%|█████████▉| 599947/600000 [10:31<00:00, 798.47it/s]

HBox(children=(IntProgress(value=0, max=364160), HTML(value='')))




600030it [10:50, 798.47it/s]                            