In [None]:
class Document:
    """One page of the collection together with the data extracted from it.

    The two size fields describe the page as it was received (markup still
    present); ``words`` and ``links`` start empty and are filled in by the
    code that strips the markup.
    """

    def __init__(self, doc_url, doc_id, sz_bytes, sz_words):
        # Identity of the page.
        self.url = doc_url  # document url
        self.id = doc_id  # unique document id (str)

        # Measurements taken before html markup is deleted.
        self.sz_bytes = sz_bytes  # document size in bytes
        self.sz_words = sz_words  # number of whitespace-separated words

        # Extracted content; every instance gets its own fresh lists.
        self.words = list()  # words of the document after deleting html markup
        self.links = list()  # links found in the document

In [None]:
import base64
from bs4 import BeautifulSoup
import pickle
import os

# Line prefixes that preprocess_file looks for in the collection's .xml files.
DOCUMENT_TAG = "<document>"  # line holding the base64-encoded page content
DOC_URL_TAG = "<docURL>"     # line holding the base64-encoded page url
DOC_ID_TAG = "<docID>"       # line holding the document id; seeing it triggers processing of the record

def preprocess_doc(doc, doc_url, doc_id):
    """Strip html markup from one page and collect its words and links.

    Parameters:
        doc: raw page content; its length and whitespace-separated word count
            are recorded as the "before markup removal" sizes.
        doc_url: url of the page, stored on the result unchanged.
        doc_id: unique id of the page, stored on the result unchanged.

    Returns:
        A Document. If the page cannot be parsed, the Document is returned
        with empty ``words`` and ``links`` and a message is printed.
    """
    document = Document(doc_url, doc_id, len(doc), len(doc.split()))
    try:
        soup = BeautifulSoup(doc, "html.parser")
    except Exception:
        # Best effort: one unparsable page must not abort the whole run.
        # `Exception` (not a bare except) so Ctrl-C / SystemExit still work.
        print("incorrect document:", doc_url, doc_id)
        return document
    # Script and style bodies are not page text; remove them before get_text().
    for tag in soup(["script", "style"]):
        tag.extract()
    for link in soup.find_all('a', href=True):
        document.links.append(link["href"])
    document.words = soup.get_text(separator=" ").split()
    return document
    
def preprocess_file(d, f):
    """Convert one collection .xml file into a stream of pickled Documents.

    Parameters:
        d: directory containing the file.
        f: file name inside ``d``; the output is written next to it under the
           same name with ".xml" replaced by ".out".

    The input is read line by line as cp1251 text. Content and url lines are
    remembered; a document-id line completes the record, which is then
    preprocessed and appended to the output with pickle.dump.
    """
    source_path = os.path.join(d, f)
    target_path = os.path.join(d, f.replace(".xml", ".out"))
    print("preprocessing", source_path)
    with open(source_path, "rt", encoding="cp1251") as fin, open(target_path, "wb") as fout:
        for line in fin:
            # The slices below cut fixed-length wrappers off both ends of the line.
            # NOTE(review): the offsets presumably match the collection's exact
            # tag/attribute layout - confirm against the data format.
            if line.startswith(DOCUMENT_TAG):
                doc = line[37:-11]
                continue
            if line.startswith(DOC_URL_TAG):
                doc_url = line[8:-10]
                continue
            if not line.startswith(DOC_ID_TAG):
                continue
            # The id line closes a record; content and url lines must already
            # have been seen, otherwise `doc` / `doc_url` are unbound here.
            doc_id = line[7:-20]
            raw_page = base64.b64decode(doc)
            raw_url = base64.b64decode(doc_url)
            pickle.dump(preprocess_doc(raw_page, raw_url, doc_id), fout)

def preprocess_collection(directory):
    """Run preprocess_file on every entry of ``directory`` whose name ends in ".xml"."""
    xml_names = [name for name in os.listdir(directory) if name.endswith(".xml")]
    for name in xml_names:
        preprocess_file(directory, name)

In [None]:
from tqdm import tqdm

class BaseDocumentProcessor:
    """Interface for objects that consume a collection one Document at a time.

    Both methods are no-ops here; subclasses override them.
    """

    def process(self, document):
        """Consume a single document. The base implementation does nothing."""
        return None

    def result(self):
        """Summarize whatever was gathered. The base implementation does nothing."""
        return None

def process_file(d, f, processor, pbar):
    """Feed every pickled object stored in one file to ``processor``.

    Parameters:
        d: directory containing the file.
        f: file name inside ``d``.
        processor: object with a ``process(document)`` method.
        pbar: progress bar with an ``update(n)`` method; advanced by one for
            every object that was loaded and processed.

    Reading stops at end of file. A corrupted record is reported and ends the
    file early; any other error (for example a class missing at unpickling
    time, or an error raised by the processor) propagates to the caller
    instead of being silently treated as end of file.
    """
    path = os.path.join(d, f)
    print("processing", path)
    with open(path, "rb") as fin:
        while True:
            try:
                # NOTE: pickle.load can execute arbitrary code - only use it on
                # files produced by a trusted preprocessing run.
                document = pickle.load(fin)
            except EOFError:
                # Normal termination: no more pickled objects in the file.
                break
            except pickle.UnpicklingError as err:
                # Best effort: keep what was read so far, but say why we stopped.
                print("corrupted record in", path, "-", err)
                break
            processor.process(document)
            # Count only documents that were actually processed.
            pbar.update(1)

def process_collection(directory, processor, total=200000):
    """Feed every ".out" file found in ``directory`` to ``processor``.

    Parameters:
        directory: directory to scan (not recursive).
        processor: object with a ``process(document)`` method.
        total: expected number of documents, used only to scale the progress
            bar. Defaults to the previously hard-coded 200000.
    """
    # The context manager closes the bar even if processing raises, so a
    # failed run does not leave a dangling progress bar in the notebook.
    with tqdm(total=total) as pbar:
        for file in os.listdir(directory):
            if file.endswith(".out"):
                process_file(directory, file, processor, pbar)

In [None]:
import matplotlib.pyplot as plt
from statistics import mean

COLLECTION_DIRECTORY = "byweb" # directory with .out files to process

class PrimaryStatsGetter(BaseDocumentProcessor):
    """Collects per-document size figures and reports averages and histograms.

    For every document it records the size and word count before markup
    removal (``Document.sz_bytes`` / ``Document.sz_words``) and the text
    length and word count after markup removal (from ``Document.words``).
    """

    def __init__(self):
        """ do all initialization here """
        self._docs_count = 0
        self._sz_bytes_html = []  # Document.sz_bytes of every document
        self._sz_words_html = []  # Document.sz_words of every document
        self._sz_bytes = []       # total length of the extracted words
        self._sz_words = []       # number of extracted words

    def process(self, document):
        """ document: Document (see first cell)
            process each document here """
        self._docs_count += 1
        self._sz_bytes_html.append(document.sz_bytes)
        self._sz_words_html.append(document.sz_words)
        # Sum of word lengths, whitespace excluded. Same value as
        # len("".join(words)) without building the joined string.
        # NOTE(review): this is len() of the extracted words, so if they are
        # str it counts characters, not bytes - confirm that is intended.
        self._sz_bytes.append(sum(len(word) for word in document.words))
        self._sz_words.append(len(document.words))

    @staticmethod
    def _plot_hist(values, label, value_range=None, figsize=(8, 5)):
        """Draw one 100-bin histogram in its own figure, labelled via the legend."""
        plt.figure(figsize=figsize)
        plt.hist(values, bins=100, range=value_range)
        plt.legend(labels=(label,), fontsize="x-large")

    def result(self):
        """ summarize and output all data """
        print("total documents:", self._docs_count)
        if not self._docs_count:
            # mean() raises on empty data; there is nothing to report or plot.
            return

        # Documents with a zero raw size are left out of the ratio: dividing by
        # zero would abort the whole report.
        size_ratio = [
            text_size / raw_size
            for text_size, raw_size in zip(self._sz_bytes, self._sz_bytes_html)
            if raw_size
        ]
        print("average doc size in bytes:", round(mean(self._sz_bytes_html), 0))
        print("average text size in bytes:", round(mean(self._sz_bytes), 0))
        print("average word count including html:", round(mean(self._sz_words_html), 0))
        print("average word count:", round(mean(self._sz_words), 0))
        print("average size ratio:", mean(size_ratio) if size_ratio else float("nan"))

        self._plot_hist(self._sz_bytes_html, "Размер документа с разметкой (в байтах)", (0, 200000))
        self._plot_hist(self._sz_bytes, "Размер документа без разметки (в байтах)", (0, 20000))
        self._plot_hist(self._sz_words_html, "Количество слов в документе с разметкой", (0, 10000))
        self._plot_hist(self._sz_words, "Количество слов в документе без разметки", (0, 4000))
        self._plot_hist(size_ratio, "Отношение объема текста к объему исходного документа", figsize=(16, 5))

        plt.show()
        
# Run the size statistics over every .out file of the collection,
# then print the averages and show the histograms.
processor = PrimaryStatsGetter()        
process_collection(COLLECTION_DIRECTORY, processor)
processor.result()

In [None]:
COLLECTION_DIRECTORY = "byweb" # directory with .out files to process
# Normalized url of every document in the collection.
# Used as a set: GetDocUrls only ever stores True as the value.
document_urls = {}

class GetDocUrls(BaseDocumentProcessor):
    """Records the normalized url of every processed document in ``document_urls``.

    Normalization drops the query string, the fragment and one trailing slash.
    """

    def __init__(self):
        """ do all initialization here """

    def process(self, document):
        """ document: Document (see first cell)
            process each document here """
        # [2:-1] strips the b'...' wrapper that str() puts around a bytes url.
        # NOTE(review): this relies on document.url being bytes, as produced by
        # the preprocessing cell - confirm for other producers.
        document_url = str(document.url)[2:-1].split("?")[0].split("#")[0]
        # endswith() instead of [-1]: an empty url must not raise IndexError.
        if document_url.endswith('/'):
            document_url = document_url[:-1]
        document_urls[document_url] = True

    def result(self):
        """Nothing to summarize; the urls are already in ``document_urls``."""
        pass
        
# Fill document_urls with the normalized url of every document in the collection.
processor = GetDocUrls()        
process_collection(COLLECTION_DIRECTORY, processor)
processor.result()

In [None]:
from tqdm import tqdm_notebook
from pymystem3 import Mystem
import re
import os
import pickle
import math
from statistics import mean

COLLECTION_DIRECTORY = "byweb"

stop_words = [
    'и', 'в', 'во', 'не', 'что', 'он', 'на', 'я', 'с', 'со', 'как', 'а', 'то', 'все', 'она', 'так', 'его', 'но', 'да', 'ты', 'к', 'у', 'же', 'вы', 'за', 'бы', 'по', 'только', 'ее', 'мне', 'было', 'вот', 'от', 'меня', 'еще', 'нет', 'о', 'из', 'ему', 'теперь', 'когда', 'даже', 'ну', 'вдруг', 'ли', 'если', 'уже', 'или', 'ни', 'быть', 'был', 'него', 'до', 'вас', 'нибудь', 'опять', 'уж', 'вам', 'ведь', 'там', 'потом', 'себя', 'ничего', 'ей', 'может', 'они', 'тут', 'где', 'есть', 'надо', 'ней', 'для', 'мы', 'тебя', 'их', 'чем', 'была', 'сам', 'чтоб', 'без', 'будто', 'чего', 'раз', 'тоже', 'себе', 'под', 'будет', 'ж', 'тогда', 'кто', 'этот', 'того', 'потому', 'этого', 'какой', 'совсем', 'ним', 'здесь', 'этом', 'один', 'почти', 'мой', 'тем', 'чтобы', 'нее', 'сейчас', 'были', 'куда', 'зачем', 'всех', 'никогда', 'можно', 'при', 'наконец', 'два', 'об', 'другой', 'хоть', 'после', 'над', 'больше', 'тот', 'через', 'эти', 'нас', 'про', 'всего', 'них', 'какая', 'много', 'разве', 'три', 'эту', 'моя', 'впрочем', 'хорошо', 'свою', 'этой', 'перед', 'иногда', 'лучше', 'чуть', 'том', 'нельзя', 'такой', 'им', 'более', 'всегда', 'конечно', 'всю', 'между',
    'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'you\'re', 'you\'ve', 'you\'ll', 'you\'d', 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', 'she\'s', 'her', 'hers', 'herself', 'it', 'it\'s', 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', 'that\'ll', 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', 'don\'t', 'should', 'should\'ve', 'now', 'd', 'll', 'm', 'o', 're', 've', 'y', 'ain', 'aren', 'aren\'t', 'couldn', 'couldn\'t', 'didn', 'didn\'t', 'doesn', 'doesn\'t', 'hadn', 'hadn\'t', 'hasn', 'hasn\'t', 'haven', 'haven\'t', 'isn', 'isn\'t', 'ma', 'mightn', 'mightn\'t', 'mustn', 'mustn\'t', 'needn', 'needn\'t', 'shan', 'shan\'t', 'shouldn', 'shouldn\'t', 'wasn', 'wasn\'t', 'weren', 'weren\'t', 'won', 'won\'t', 'wouldn', 'wouldn\'t'
]

class MystemProcessor(BaseDocumentProcessor):
    """Lemmatizes every document and gathers vocabulary statistics.

    While processing it counts words, stop words, words starting with a latin
    letter and characters, and keeps per-lemma collection frequency and the
    set of document ids containing the lemma. ``result`` prints the summary
    and stores the vocabulary, cf and idf lists in "words.tmp".
    """

    def __init__(self):
        self.m = Mystem(grammar_info=False, disambiguation=False)
        # A lemma is kept when its first character is a latin/cyrillic letter
        # or a digit; this drops whitespace and punctuation tokens.
        reg = re.compile('[a-zа-яё0-9]')
        self.latin_reg = re.compile('[a-z]')
        self.filterFunc = lambda w : reg.match(w)
        # Set copy of the module-level list: membership is tested once per
        # token, and a list would be scanned linearly every time.
        self._stop_words = frozenset(stop_words)
        self.total_words_count = 0
        self.stop_words_count = 0
        self.total_symbol_count = 0
        self.latin_words_count = 0
        self.words_to_count = dict()     # lemma -> occurrences in the collection
        self.words_to_document = dict()  # lemma -> set of document ids containing it
        self.doc_count = 0

    def process(self, document):
        """Lemmatize one document and update all counters."""
        self.doc_count += 1
        if (self.doc_count % 10000 == 0):
            print(self.doc_count)
        text = ' '.join(document.words).lower()
        lemmas = self.m.lemmatize(text)

        for word in filter(self.filterFunc, lemmas):
            self.total_words_count += 1
            self.total_symbol_count += len(word)
            if word in self._stop_words:
                self.stop_words_count += 1
            if self.latin_reg.match(word):
                self.latin_words_count += 1
            self.words_to_count[word] = self.words_to_count.get(word, 0) + 1
            self.words_to_document.setdefault(word, set()).add(document.id)

    def result(self):
        """Print collection/vocabulary statistics and dump words, cf and idf.

        "words.tmp" receives three consecutive pickles: the list of lemmas, a
        list of (lemma, collection frequency) pairs and a list of
        (lemma, idf) pairs, where idf = log2(N / document frequency) and N is
        the number of documents processed.
        """
        if not self.total_words_count:
            # Every ratio below divides by the word count.
            print('no words were processed; nothing to report')
            return

        words = list(self.words_to_document)
        print('Доля стоп слов в коллекции', self.stop_words_count / self.total_words_count)
        print('Доля слов латиницей в коллекции', self.latin_words_count / self.total_words_count)
        print('Средняя длина слова в коллекции: ', self.total_symbol_count / self.total_words_count)

        # N is the number of documents actually seen rather than a fixed
        # 200000, so idf stays correct for collections of any size.
        idf = [ (word, math.log(self.doc_count / len(self.words_to_document[word]), 2)) for word in words ]
        cf = [(word, self.words_to_count[word] / self.total_words_count) for word in words ]

        symb = mean([len(w) for w in words])
        print('Средняя длина слова в словаре: ', symb)

        with open ("words.tmp", "wb") as fout:
            pickle.dump(words, fout)
            pickle.dump(cf, fout)
            pickle.dump(idf, fout)

# Lemmatize the whole collection, print the vocabulary statistics
# and write words.tmp for the next cell.
mystem_processor = MystemProcessor()
process_collection(COLLECTION_DIRECTORY, mystem_processor)
mystem_processor.result()

In [None]:
import math
import matplotlib.pyplot as plt

# Load the three lists written by MystemProcessor.result().
# NOTE: pickle.load can execute arbitrary code - only open a words.tmp that
# was produced by this notebook.
with open ("words.tmp", "rb") as fin:
    words = pickle.load(fin)
    cf = pickle.load(fin)
    idf = pickle.load(fin)

# Both lists hold (word, value) pairs; order them by the value.
sortkey = lambda x : x[1]

idf.sort(key = sortkey, reverse = False)
print('idf по возрастанию')
print(*idf[:20], sep="\n")

idf.sort(key = sortkey, reverse = True)
print('idf по убыванию')
print(*idf[:20], sep="\n")

cf.sort(key = sortkey, reverse = False)
print('cf по возрастанию')
print(*cf[:20], sep="\n")

# cf is left sorted by descending frequency: the plot below relies on it.
cf.sort(key = sortkey, reverse = True)
print('cf по убыванию')
print(*cf[:20], sep="\n")

# Rank/frequency plot on log-log axes.
fig = plt.figure(figsize=(10, 5))
cf_rank = fig.add_subplot(1, 1, 1)
cf_rank.set_xlabel('log(ранг слова)')
cf_rank.set_ylabel('log(частота слова)')
cf_rank.set_xscale('log')
cf_rank.set_yscale('log')
# Ranks start at 1: the default x values start at 0, which has no position on
# a logarithmic axis, so the most frequent word could not be drawn.
ranks = list(range(1, len(cf) + 1))
cf_rank.plot(ranks, [y for x, y in cf], color='blue')
plt.show()

In [None]:
COLLECTION_DIRECTORY = "byweb" # directory with .out files to process
# Normalized url of every document in the collection.
# Used as a set: GetDocUrls only ever stores True as the value.
# Re-created empty here, so the cell below must be run to fill it again.
document_urls = {}

class GetDocUrls(BaseDocumentProcessor):
    """Records the normalized url of every processed document in ``document_urls``.

    Normalization drops the query string, the fragment and one trailing slash.
    """

    def __init__(self):
        """ do all initialization here """

    def process(self, document):
        """ document: Document (see first cell)
            process each document here """
        # [2:-1] strips the b'...' wrapper that str() puts around a bytes url.
        # NOTE(review): this relies on document.url being bytes, as produced by
        # the preprocessing cell - confirm for other producers.
        document_url = str(document.url)[2:-1].split("?")[0].split("#")[0]
        # endswith() instead of [-1]: an empty url must not raise IndexError.
        if document_url.endswith('/'):
            document_url = document_url[:-1]
        document_urls[document_url] = True

    def result(self):
        """Nothing to summarize; the urls are already in ``document_urls``."""
        pass
        
# Fill document_urls with the normalized url of every document in the collection.
processor = GetDocUrls()        
process_collection(COLLECTION_DIRECTORY, processor)
processor.result()

In [None]:
from tqdm import tqdm_notebook
import networkx as nx
import operator

COLLECTION_DIRECTORY = "byweb" # directory with .out files to process

class GraphBuider(BaseDocumentProcessor):
    """Builds the link graph of the collection and exports its PageRank core.

    ``process`` adds one edge (source url, target url) for every distinct link
    of a document whose normalized target is a key of ``document_urls``.
    ``result`` computes PageRank and writes to "graph4.csv" the edges whose
    both ends are among the 300 highest-ranked urls.
    """

    def __init__(self):
        """ do all initialization here """
        self.edge_list = []

    @staticmethod
    def _strip_url(url):
        """Drop the query string, the fragment and one trailing slash."""
        url = url.split("?")[0].split("#")[0]
        # endswith() instead of [-1]: an empty url must not raise IndexError.
        return url[:-1] if url.endswith('/') else url

    def process(self, document):
        """ document: Document (see first cell)
            process each document here """
        # Imported here so the class does not depend on another cell's imports.
        from urllib.parse import urljoin

        # [2:-1] strips the b'...' wrapper that str() puts around a bytes url.
        raw_url = str(document.url)[2:-1]
        document_url = self._strip_url(raw_url)

        # dict used as an insertion-ordered set: constant-time de-duplication.
        targets = {}
        for link in document.links:
            if not link:
                continue
            if link.startswith(("http://", "https://")):
                absolute = link
            else:
                # Standard relative-reference resolution. It handles "../x",
                # "/x", ".name" and host-only base urls, all of which the
                # previous string concatenation turned into wrong addresses.
                try:
                    absolute = urljoin(raw_url, link)
                except ValueError:
                    # Malformed href (e.g. broken IPv6 literal): skip the link.
                    continue
            targets[self._strip_url(absolute)] = True

        for target in targets:
            if target in document_urls:
                self.edge_list.append((document_url, target))

    def result(self):
        """Rank the pages and write the edges among the top 300 to graph4.csv.

        Each output line is "source;target".
        """
        graph = nx.DiGraph(self.edge_list)
        ranked = sorted(nx.pagerank(graph).items(), key=operator.itemgetter(1), reverse=True)[:300]
        # A set, because membership is tested twice for every edge of the graph.
        top_300_page_rank = {url for url, _ in ranked}
        with open("graph4.csv", "wt") as fout:
            for source, target in tqdm_notebook(graph.edges()):
                if source in top_300_page_rank and target in top_300_page_rank:
                    fout.write(source)
                    fout.write(";")
                    fout.write(target)
                    fout.write('\n')
        
# Build the link graph of the collection and export the PageRank top-300 subgraph.
# document_urls must already be filled by the GetDocUrls cell.
processor = GraphBuider()        
process_collection(COLLECTION_DIRECTORY, processor)
processor.result()