In [1]:
class Document:
    def __init__(self, doc_url, doc_id, sz_bytes, sz_words):
        self.url = doc_url       # document url
        self.id = doc_id         # unique document id (int)
        self.sz_bytes = sz_bytes # document size in bytes before deleting html markup
        self.sz_words = sz_words # number of words in document before deleting html markup
        self.words = []          # list of words in document after deleting html markup
        self.links = []          # lisk of links in document

In [2]:
import base64
from bs4 import BeautifulSoup
import pickle
import os

DOCUMENT_TAG = "<document>"
DOC_URL_TAG = "<docURL>"
DOC_ID_TAG = "<docID>"

def preprocess_doc(doc, doc_url, doc_id):
    document = Document(doc_url, doc_id, len(doc), len(doc.split()))
    try:
        soup = BeautifulSoup(doc, "html.parser")
    except:
        print("incorrect document:", doc_url, doc_id)
        return document
    for script in soup(["script", "style"]):
        script.extract()
    for link in soup.find_all('a', href=True):
        document.links.append(link["href"])
    document.words = soup.get_text(separator=" ").split()
    return document
    
def preprocess_file(d, f):
    print("preprocessing", os.path.join(d, f))
    with open(os.path.join(d, f), "rt", encoding="cp1251") as fin, open(os.path.join(d, f.replace(".xml", ".out")), "wb") as fout:
        for line in fin:
            if line.startswith(DOCUMENT_TAG):
                doc = line[37:-11]
            elif line.startswith(DOC_URL_TAG):
                doc_url = line[8:-10]
            elif line.startswith(DOC_ID_TAG):
                doc_id = line[7:-20]
                document = preprocess_doc(base64.b64decode(doc), base64.b64decode(doc_url), doc_id)
                pickle.dump(document, fout)

def preprocess_collection(directory):
    for file in os.listdir(directory):
        if (file.endswith(".xml")):
            preprocess_file(directory, file)

In [3]:
from tqdm import tqdm
pbar = tqdm(total = 200000 * 3)

class BaseDocumentProcessor:
    def process(self, document):
        pass
    def result(self):
        pass

def process_file(d, f, processor):
    print("processing", os.path.join(d, f))
    with open(os.path.join(d, f), "rb") as fin:
        while True:
            pbar.update(1)
            try:
                document = pickle.load(fin)
            except:
                break
            processor.process(document)

def process_collection(directory, processor):
    for file in os.listdir(directory):
        if (file.endswith(".out")):
            process_file(directory, file, processor)

  0%|          | 0/600000 [00:00<?, ?it/s]

In [4]:
import matplotlib.pyplot as plt
from statistics import mean

COLLECTION_DIRECTORY = "byweb" # directory with .out files to process

class PrimaryStatsGetter(BaseDocumentProcessor):
    def __init__(self):
        """ do all initialization here """
        self._docs_count = 0
        self._sz_bytes_html = []
        self._sz_words_html = []
        self._sz_bytes = []
        self._sz_words = []
    
    def process(self, document):
        """ document: Document (see first cell)
            process each document here """
        self._docs_count += 1
        self._sz_bytes_html.append(document.sz_bytes)
        self._sz_words_html.append(document.sz_words)
        self._sz_bytes.append(len("".join(document.words)))
        self._sz_words.append(len(document.words))
    
    def result(self):
        """ summarize and output all data """
        size_ratio = [x / y for x, y in zip(self._sz_bytes, self._sz_bytes_html)]
        print("total documents:", self._docs_count)
        print("average doc size in bytes:", round(mean(self._sz_bytes_html), 0))
        print("average text size in bytes:", round(mean(self._sz_bytes), 0))
        print("average word count including html:", round(mean(self._sz_words_html), 0))
        print("average word count:", round(mean(self._sz_words), 0))
        print("average size ration:", mean(size_ratio))
        
        plt.figure(figsize=(14, 5))
        plt.hist(self._sz_bytes_html, bins=100, range=(0, 200000))
        plt.legend(labels=("total doc size",), fontsize="x-large")
        
        plt.figure(figsize=(14, 5))
        plt.hist(self._sz_bytes, bins=100, range=(0, 20000))
        plt.legend(labels=("total text size",), fontsize="x-large")
        
        plt.figure(figsize=(14, 5))
        plt.hist(self._sz_words_html, bins=100, range=(0, 10000))
        plt.legend(labels=("words including html",), fontsize="x-large")
        
        plt.figure(figsize=(14, 5))
        plt.hist(self._sz_words, bins=100, range=(0, 4000))
        plt.legend(labels=("words",), fontsize="x-large")
        
        plt.figure(figsize=(14, 5))
        plt.hist(size_ratio, bins=100)
        plt.legend(labels=("size ratio",), fontsize="x-large")
        
        plt.show()
        
#processor = PrimaryStatsGetter()        
#process_collection(COLLECTION_DIRECTORY, processor)
#processor.result()

In [5]:
from tqdm import tqdm_notebook
import networkx as nx
import operator

COLLECTION_DIRECTORY = "byweb" # directory with .out files to process
document_urls = {}

class GetDocUrls(BaseDocumentProcessor):
    def __init__(self):
        """ do all initialization here """
    
    def process(self, document):
        """ document: Document (see first cell)
            process each document here """
        document_url = str(document.url)[2:-1].split("?")[0].split("#")[0]
        if document_url[-1] == '/':
            document_url = document_url[:-1]
        document_urls[document_url] = True
        
    def result(self):
        pass
        
processor = GetDocUrls()        
process_collection(COLLECTION_DIRECTORY, processor)
processor.result()

  0%|          | 99/600000 [00:01<187:50:45,  1.13s/it]

processing byweb/byweb.5.out


  3%|▎         | 20127/600000 [00:22<11:07, 868.96it/s] 

processing byweb/byweb.4.out


  7%|▋         | 40174/600000 [00:42<08:46, 1063.53it/s]

processing byweb/byweb.6.out


 10%|█         | 60005/600000 [00:59<07:47, 1154.32it/s]

processing byweb/byweb.7.out


 13%|█▎        | 80026/600000 [01:20<08:56, 969.24it/s] 

processing byweb/byweb.3.out


 17%|█▋        | 100021/600000 [01:43<08:39, 961.96it/s]

processing byweb/byweb.2.out


 20%|██        | 120040/600000 [02:04<07:14, 1105.56it/s]

processing byweb/byweb.0.out


 23%|██▎       | 140156/600000 [02:26<07:51, 976.27it/s] 

processing byweb/byweb.1.out


 27%|██▋       | 160132/600000 [02:46<07:36, 964.42it/s] 

processing byweb/byweb.9.out


 30%|███       | 180027/600000 [03:06<06:35, 1063.07it/s]

processing byweb/byweb.8.out


 33%|███▎      | 199972/600000 [03:28<08:34, 777.63it/s] 

In [6]:
#document_urls

In [8]:
from tqdm import tqdm_notebook
import networkx as nx
import operator

COLLECTION_DIRECTORY = "byweb" # directory with .out files to process

class GraphBuider(BaseDocumentProcessor):
    def __init__(self):
        """ do all initialization here """
        self.edge_list = []
    
    def process(self, document):
        """ document: Document (see first cell)
            process each document here """
        links_long = document.links
        document_url = str(document.url)[2:-1].split("?")[0].split("#")[0]
        if document_url[-1] == '/':
            document_url = document_url[:-1]
        links_short = []
        for link in links_long:
            if not link:
                continue
            short_link = link
            if not (short_link.startswith("http://") or short_link.startswith("https://")):
                if short_link[0] == '.':
                    short_link = short_link[2:]
                short_link = document_url.rsplit("/", 1)[0] + "/" + short_link
            short_link = short_link.split("?")[0].split("#")[0]
            if short_link[-1] == '/':
                    short_link = short_link[:-1]
            if not (short_link in links_short):
                links_short.append(short_link)
        for link in links_short:
            if link in document_urls:
                self.edge_list.append((document_url, link))
        
    def result(self):
        print("!!")
        graph = nx.DiGraph(self.edge_list)
        top_300_page_rank = sorted(nx.pagerank(graph).items(), key=operator.itemgetter(1), reverse=True)[:300]
        top_300_page_rank = [x[0] for x in top_300_page_rank]
        print(top_300_page_rank)
        with open("graph4.csv", "wt") as fout:
            for inp, outp in tqdm_notebook(graph.edges()):
                if inp in top_300_page_rank and outp in top_300_page_rank:
                    fout.write(inp)
                    fout.write(";")
                    fout.write(outp)
                    fout.write('\n')    
        
processor = GraphBuider()        
process_collection(COLLECTION_DIRECTORY, processor)
processor.result()

 67%|██████▋   | 400044/600000 [10:28<22:06:31,  2.51it/s]

processing byweb/byweb.5.out


 70%|███████   | 420031/600000 [11:03<04:45, 629.63it/s]  

processing byweb/byweb.4.out


 73%|███████▎  | 440047/600000 [11:39<04:23, 606.58it/s]

processing byweb/byweb.6.out


 77%|███████▋  | 460038/600000 [12:15<03:42, 629.89it/s]

processing byweb/byweb.7.out


 80%|████████  | 480098/600000 [12:50<03:48, 524.94it/s]

processing byweb/byweb.3.out


 83%|████████▎ | 500102/600000 [13:25<02:40, 622.15it/s]

processing byweb/byweb.2.out


 87%|████████▋ | 520127/600000 [13:55<01:54, 698.98it/s]

processing byweb/byweb.0.out


 90%|█████████ | 540114/600000 [14:27<01:19, 755.70it/s]

processing byweb/byweb.1.out


 93%|█████████▎| 560048/600000 [14:58<00:58, 687.40it/s]

processing byweb/byweb.9.out


 97%|█████████▋| 580075/600000 [15:27<00:31, 639.93it/s] 

processing byweb/byweb.8.out


600009it [16:00, 677.90it/s]                            

!!


600030it [16:10, 677.90it/s]

['http://profile.tut.by', 'http://www.tut.by', 'http://catalog.tut.by', 'http://mail.tut.by/faq.html', 'http://np.by', 'http://www.rodina.by/info/prize/index/index.phtml', 'http://list.np.by', 'http://avto.np.by', 'http://news.np.by', 'http://chat.np.by', 'http://media.telecom.by/forum/index.php', 'http://pritchi.castle.by', 'http://news.br.by', 'http://cards.br.by', 'http://media.telecom.by/forum/search.php', 'http://media.telecom.by/forum/viewforum.php', 'http://oz.by/piter', 'http://profile.tut.by/newtut.html', 'http://bs.by', 'http://oz.by/auctions/topic.phtml', 'http://profile.tut.by/deluser.html', 'http://forum.everyday.by/index.php', 'http://kino.br.by/tv', 'http://www.au78.by', 'http://shop.n1.by', 'http://www2.kopeyka.by', 'http://www2.kopeyka.by/prodavzam.html', 'http://www2.kopeyka.by/bug-report.html', 'http://mail.tut.by', 'http://www.velcom.by/ru/services/gprs', 'http://content.tut.by', 'http://news.sportpanorama.by', 'http://www.newsite.by', 'http://www.my.mts.by/forum.ht

HBox(children=(IntProgress(value=0, max=364160), HTML(value='')))


