In [1]:
import numpy as np
import pandas as pd
import scipy
import matplotlib.pyplot as plt
from urllib.parse import urlparse
from urllib.parse import urldefrag
from urllib.request import urlopen
from urllib.parse import urljoin
from bs4 import BeautifulSoup
from file_storage import FileStorage
import time
import datetime
%matplotlib inline

In [5]:
fs = FileStorage('storage', readonly=True)

In [6]:
cur_keys = set(fs.keys())

In [7]:
cur_keys

{'https://simple.wikipedia.org/wiki/Me_%26_My_Katamari',
 'https://simple.wikipedia.org/wiki/Soft_drinks',
 'https://simple.wikipedia.org/wiki/Movius_Line',
 'https://simple.wikipedia.org/wiki/294',
 'https://simple.wikipedia.org/wiki/Lightweight',
 'https://simple.wikipedia.org/wiki/Princess_Jasmine',
 'https://simple.wikipedia.org/wiki/A-Sides',
 'https://simple.wikipedia.org/wiki/Colorimetric_analysis',
 'https://simple.wikipedia.org/wiki/Category:People_from_Saint_Petersburg',
 'https://simple.wikipedia.org/wiki/Gy%C5%91z%C5%91_Kulcs%C3%A1r',
 'https://simple.wikipedia.org/wiki/Abdus_Salim_Khan',
 'https://simple.wikipedia.org/wiki/Category:French_television_series',
 'https://simple.wikipedia.org/wiki/Sara_Rue',
 'https://simple.wikipedia.org/wiki/Brauerei_Kaiserdom',
 'https://simple.wikipedia.org/wiki/Category:Demographics_by_country',
 'https://simple.wikipedia.org/wiki/Al-Farabi',
 'https://simple.wikipedia.org/wiki/Craniata',
 'https://simple.wikipedia.org/wiki/Category:Disea

In [8]:
len(cur_keys), fs.count()

(172262, 172262)

In [9]:
bad_parts = [
             '/wiki/Help:', '/wiki/Help_talk:',
             '/wiki/File:', '/wiki/Media:', '/wiki/MediaWiki:', '/wiki/MediaWiki_talk:',
             '/wiki/Module:', '/wiki/Talk:', '/wiki/Category:', '/wiki/Category_talk:',
             '/wiki/User:', '/wiki/User_talk:', '/wiki/Special:',
             '/wiki/Template:', '/wiki/Template_talk:', '/wiki/Wikipedia:', '/wiki/Wikipedia_talk:'
            ]

In [10]:
for k in list(cur_keys):
    bool_lsit = [p in k for p in bad_parts]
    bad = any(bool_lsit)
    if bad:
        cur_keys.remove(k)

In [11]:
len(cur_keys)

142683

In [12]:
fs.read('https://simple.wikipedia.org/wiki/EFL_Cup')

'<!DOCTYPE html>\n<html class="client-nojs" lang="en" dir="ltr">\n<head>\n<meta charset="UTF-8"/>\n<title>EFL Cup - Simple English Wikipedia, the free encyclopedia</title>\n<script>document.documentElement.className = document.documentElement.className.replace( /(^|\\s)client-nojs(\\s|$)/, "$1client-js$2" );</script>\n<script>(window.RLQ=window.RLQ||[]).push(function(){mw.config.set({"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":false,"wgNamespaceNumber":0,"wgPageName":"EFL_Cup","wgTitle":"EFL Cup","wgCurRevisionId":5608523,"wgRevisionId":5608523,"wgArticleId":65242,"wgIsArticle":true,"wgIsRedirect":false,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["Sports stubs","Football League Cup"],"wgBreakFrames":false,"wgPageContentLanguage":"en","wgPageContentModel":"wikitext","wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","January","February","March","April","May","June","July","August","Se

In [13]:
graph = {key: set() for key in cur_keys}

In [15]:
len(graph)

142683

In [16]:
from tqdm import tqdm

In [18]:
for key in tqdm(cur_keys):
    doc = fs.read(key)
    parser = BeautifulSoup(doc)
    for link in parser.findAll('a'):
        next_url = urljoin(key, link.get('href'))
        next_url = urldefrag(next_url).url
        parsed_next = urlparse(next_url)
        if (next_url in cur_keys) and (next_url not in graph[key]):
            graph[key].add(next_url)

100%|██████████| 142683/142683 [59:13<00:00, 40.16it/s] 


In [20]:
graph['https://simple.wikipedia.org/wiki/Main_Page']

{'https://simple.wikipedia.org/wiki/1910_Cuba_hurricane',
 'https://simple.wikipedia.org/wiki/Academy_Award_for_Best_Supporting_Actor',
 'https://simple.wikipedia.org/wiki/Algebra',
 'https://simple.wikipedia.org/wiki/Animation',
 'https://simple.wikipedia.org/wiki/Anthropology',
 'https://simple.wikipedia.org/wiki/Archaeology',
 'https://simple.wikipedia.org/wiki/Architecture',
 'https://simple.wikipedia.org/wiki/Art',
 'https://simple.wikipedia.org/wiki/Astronomy',
 'https://simple.wikipedia.org/wiki/Atheism',
 'https://simple.wikipedia.org/wiki/Bah%C3%A1%27%C3%AD_Faith',
 'https://simple.wikipedia.org/wiki/Bankruptcy',
 'https://simple.wikipedia.org/wiki/Basic_English',
 'https://simple.wikipedia.org/wiki/Biology',
 'https://simple.wikipedia.org/wiki/Book',
 'https://simple.wikipedia.org/wiki/Buddhism',
 'https://simple.wikipedia.org/wiki/Caribbean_Sea',
 'https://simple.wikipedia.org/wiki/Cartoonist',
 'https://simple.wikipedia.org/wiki/Chemistry',
 'https://simple.wikipedia.org/wi

In [21]:
M = np.zeros((len(cur_keys), len(cur_keys)))

MemoryError: 

In [22]:
import scipy.sparse as ss

In [23]:
M = ss.dok_matrix((len(cur_keys), len(cur_keys)), dtype=np.float32)

In [25]:
M.shape

(142683, 142683)

In [26]:
key_to_idx = {}
for i, key in enumerate(graph.keys()):
    key_to_idx[key] = i

In [27]:
key_to_idx['https://simple.wikipedia.org/wiki/Main_Page']

81100

In [29]:
idx_to_key = [''] * len(cur_keys)
for key, idx in key_to_idx.items():
    idx_to_key[idx] = key

In [30]:
for i, key in enumerate(idx_to_key):
    assert i == key_to_idx[key]

In [31]:
for key, val in tqdm(graph.items()):
    j = key_to_idx[key]
    num = 1.0 / len(val)
    for dest in val:
        i = key_to_idx[dest]
        M[i, j] = num

100%|██████████| 142683/142683 [00:49<00:00, 2907.49it/s]


In [32]:
M.getnnz()

6713602

In [74]:
PR = np.ones((len(cur_keys), 1), dtype=np.float32)
PR /= len(cur_keys)
PR = ss.dok_matrix(PR, shape=(len(cur_keys), 1), dtype=np.float32)

In [75]:
PR.shape, PR.getnnz()

((142683, 1), 142683)

In [76]:
delta = 0.85
delta_2 = (1.0 - delta) / len(cur_keys)

In [77]:
M_2 = delta * M

In [79]:
E = np.ones((len(cur_keys), 1), dtype=np.float32)
E *= delta_2
E = ss.dok_matrix(E, shape=(len(cur_keys), 1), dtype=np.float32)

In [80]:
for i in tqdm(range(1000)):
    new_PR = M_2.dot(PR) + E
    diff = (PR - new_PR).toarray()
    norm = np.sum(diff ** 2)
    PR = new_PR



  0%|          | 0/1000 [00:00<?, ?it/s][A[A

  0%|          | 1/1000 [00:01<30:53,  1.85s/it][A[A

  0%|          | 2/1000 [00:03<29:48,  1.79s/it][A[A

  0%|          | 3/1000 [00:05<29:05,  1.75s/it][A[A

  0%|          | 4/1000 [00:06<28:35,  1.72s/it][A[A

  0%|          | 5/1000 [00:08<28:14,  1.70s/it][A[A

  1%|          | 6/1000 [00:10<27:59,  1.69s/it][A[A

  1%|          | 7/1000 [00:11<27:50,  1.68s/it][A[A

  1%|          | 8/1000 [00:13<27:42,  1.68s/it][A[A

  1%|          | 9/1000 [00:15<27:37,  1.67s/it][A[A

  1%|          | 10/1000 [00:16<27:31,  1.67s/it][A[A

  1%|          | 11/1000 [00:18<27:26,  1.66s/it][A[A

  1%|          | 12/1000 [00:20<27:23,  1.66s/it][A[A

  1%|▏         | 13/1000 [00:21<27:19,  1.66s/it][A[A

  1%|▏         | 14/1000 [00:23<27:17,  1.66s/it][A[A

  2%|▏         | 15/1000 [00:25<27:15,  1.66s/it][A[A

  2%|▏         | 16/1000 [00:26<27:14,  1.66s/it][A[A

  2%|▏         | 17/1000 [00:28<27:12,  1.66s/i

 14%|█▍        | 143/1000 [03:58<23:39,  1.66s/it][A[A

 14%|█▍        | 144/1000 [03:59<23:38,  1.66s/it][A[A

 14%|█▍        | 145/1000 [04:01<23:36,  1.66s/it][A[A

 15%|█▍        | 146/1000 [04:03<23:34,  1.66s/it][A[A

 15%|█▍        | 147/1000 [04:04<23:32,  1.66s/it][A[A

 15%|█▍        | 148/1000 [04:06<23:30,  1.66s/it][A[A

 15%|█▍        | 149/1000 [04:07<23:27,  1.65s/it][A[A

 15%|█▌        | 150/1000 [04:09<23:27,  1.66s/it][A[A

 15%|█▌        | 151/1000 [04:11<23:24,  1.65s/it][A[A

 15%|█▌        | 152/1000 [04:12<23:22,  1.65s/it][A[A

 15%|█▌        | 153/1000 [04:14<23:48,  1.69s/it][A[A

 15%|█▌        | 154/1000 [04:16<23:47,  1.69s/it][A[A

 16%|█▌        | 155/1000 [04:18<23:36,  1.68s/it][A[A

 16%|█▌        | 156/1000 [04:19<23:29,  1.67s/it][A[A

 16%|█▌        | 157/1000 [04:21<23:23,  1.66s/it][A[A

 16%|█▌        | 158/1000 [04:22<23:19,  1.66s/it][A[A

 16%|█▌        | 159/1000 [04:24<23:14,  1.66s/it][A[A

 16%|█▌       

 28%|██▊       | 284/1000 [07:51<19:47,  1.66s/it][A[A

 28%|██▊       | 285/1000 [07:53<19:44,  1.66s/it][A[A

 29%|██▊       | 286/1000 [07:55<19:44,  1.66s/it][A[A

 29%|██▊       | 287/1000 [07:56<19:42,  1.66s/it][A[A

 29%|██▉       | 288/1000 [07:58<19:40,  1.66s/it][A[A

 29%|██▉       | 289/1000 [08:00<19:38,  1.66s/it][A[A

 29%|██▉       | 290/1000 [08:01<19:36,  1.66s/it][A[A

 29%|██▉       | 291/1000 [08:03<19:33,  1.65s/it][A[A

 29%|██▉       | 292/1000 [08:05<19:50,  1.68s/it][A[A

 29%|██▉       | 293/1000 [08:06<19:50,  1.68s/it][A[A

 29%|██▉       | 294/1000 [08:08<19:43,  1.68s/it][A[A

 30%|██▉       | 295/1000 [08:10<19:35,  1.67s/it][A[A

 30%|██▉       | 296/1000 [08:11<19:32,  1.67s/it][A[A

 30%|██▉       | 297/1000 [08:13<19:27,  1.66s/it][A[A

 30%|██▉       | 298/1000 [08:15<19:25,  1.66s/it][A[A

 30%|██▉       | 299/1000 [08:16<19:22,  1.66s/it][A[A

 30%|███       | 300/1000 [08:18<19:20,  1.66s/it][A[A

 30%|███      

 42%|████▎     | 425/1000 [11:47<15:53,  1.66s/it][A[A

 43%|████▎     | 426/1000 [11:49<15:49,  1.65s/it][A[A

 43%|████▎     | 427/1000 [11:51<15:47,  1.65s/it][A[A

 43%|████▎     | 428/1000 [11:52<15:48,  1.66s/it][A[A

 43%|████▎     | 429/1000 [11:54<15:48,  1.66s/it][A[A

 43%|████▎     | 430/1000 [11:56<15:50,  1.67s/it][A[A

 43%|████▎     | 431/1000 [11:57<15:45,  1.66s/it][A[A

 43%|████▎     | 432/1000 [11:59<15:43,  1.66s/it][A[A

 43%|████▎     | 433/1000 [12:01<15:39,  1.66s/it][A[A

 43%|████▎     | 434/1000 [12:02<15:37,  1.66s/it][A[A

 44%|████▎     | 435/1000 [12:04<15:34,  1.65s/it][A[A

 44%|████▎     | 436/1000 [12:06<15:33,  1.66s/it][A[A

 44%|████▎     | 437/1000 [12:07<15:32,  1.66s/it][A[A

 44%|████▍     | 438/1000 [12:09<15:30,  1.66s/it][A[A

 44%|████▍     | 439/1000 [12:11<15:28,  1.65s/it][A[A

 44%|████▍     | 440/1000 [12:12<15:27,  1.66s/it][A[A

 44%|████▍     | 441/1000 [12:14<15:24,  1.65s/it][A[A

 44%|████▍    

 57%|█████▋    | 566/1000 [15:41<11:59,  1.66s/it][A[A

 57%|█████▋    | 567/1000 [15:43<11:56,  1.66s/it][A[A

 57%|█████▋    | 568/1000 [15:45<11:58,  1.66s/it][A[A

 57%|█████▋    | 569/1000 [15:46<11:55,  1.66s/it][A[A

 57%|█████▋    | 570/1000 [15:48<11:53,  1.66s/it][A[A

 57%|█████▋    | 571/1000 [15:50<11:51,  1.66s/it][A[A

 57%|█████▋    | 572/1000 [15:51<11:49,  1.66s/it][A[A

 57%|█████▋    | 573/1000 [15:53<11:46,  1.66s/it][A[A

 57%|█████▋    | 574/1000 [15:55<11:45,  1.66s/it][A[A

 57%|█████▊    | 575/1000 [15:56<11:43,  1.66s/it][A[A

 58%|█████▊    | 576/1000 [15:58<11:43,  1.66s/it][A[A

 58%|█████▊    | 577/1000 [16:00<11:40,  1.66s/it][A[A

 58%|█████▊    | 578/1000 [16:01<11:38,  1.66s/it][A[A

 58%|█████▊    | 579/1000 [16:03<11:36,  1.65s/it][A[A

 58%|█████▊    | 580/1000 [16:05<11:35,  1.66s/it][A[A

 58%|█████▊    | 581/1000 [16:06<11:32,  1.65s/it][A[A

 58%|█████▊    | 582/1000 [16:08<11:32,  1.66s/it][A[A

 58%|█████▊   

 71%|███████   | 707/1000 [19:35<08:05,  1.66s/it][A[A

 71%|███████   | 708/1000 [19:37<08:03,  1.66s/it][A[A

 71%|███████   | 709/1000 [19:38<08:01,  1.65s/it][A[A

 71%|███████   | 710/1000 [19:40<08:00,  1.66s/it][A[A

 71%|███████   | 711/1000 [19:42<07:58,  1.65s/it][A[A

 71%|███████   | 712/1000 [19:43<07:56,  1.66s/it][A[A

 71%|███████▏  | 713/1000 [19:45<07:54,  1.65s/it][A[A

 71%|███████▏  | 714/1000 [19:47<07:53,  1.66s/it][A[A

 72%|███████▏  | 715/1000 [19:48<07:51,  1.65s/it][A[A

 72%|███████▏  | 716/1000 [19:50<07:50,  1.66s/it][A[A

 72%|███████▏  | 717/1000 [19:52<07:48,  1.65s/it][A[A

 72%|███████▏  | 718/1000 [19:53<07:46,  1.66s/it][A[A

 72%|███████▏  | 719/1000 [19:55<07:45,  1.66s/it][A[A

 72%|███████▏  | 720/1000 [19:57<07:44,  1.66s/it][A[A

 72%|███████▏  | 721/1000 [19:58<07:42,  1.66s/it][A[A

 72%|███████▏  | 722/1000 [20:00<07:40,  1.66s/it][A[A

 72%|███████▏  | 723/1000 [20:02<07:38,  1.66s/it][A[A

 72%|███████▏ 

 85%|████████▍ | 848/1000 [23:29<04:12,  1.66s/it][A[A

 85%|████████▍ | 849/1000 [23:31<04:10,  1.66s/it][A[A

 85%|████████▌ | 850/1000 [23:33<04:08,  1.66s/it][A[A

 85%|████████▌ | 851/1000 [23:34<04:06,  1.65s/it][A[A

 85%|████████▌ | 852/1000 [23:36<04:04,  1.65s/it][A[A

 85%|████████▌ | 853/1000 [23:37<04:02,  1.65s/it][A[A

 85%|████████▌ | 854/1000 [23:39<04:00,  1.65s/it][A[A

 86%|████████▌ | 855/1000 [23:41<03:58,  1.65s/it][A[A

 86%|████████▌ | 856/1000 [23:42<03:57,  1.65s/it][A[A

 86%|████████▌ | 857/1000 [23:44<03:55,  1.64s/it][A[A

 86%|████████▌ | 858/1000 [23:46<03:53,  1.64s/it][A[A

 86%|████████▌ | 859/1000 [23:47<03:51,  1.64s/it][A[A

 86%|████████▌ | 860/1000 [23:49<03:50,  1.64s/it][A[A

 86%|████████▌ | 861/1000 [23:51<03:48,  1.64s/it][A[A

 86%|████████▌ | 862/1000 [23:52<03:47,  1.65s/it][A[A

 86%|████████▋ | 863/1000 [23:54<03:45,  1.65s/it][A[A

 86%|████████▋ | 864/1000 [23:56<03:44,  1.65s/it][A[A

 86%|████████▋

 99%|█████████▉| 989/1000 [27:23<00:18,  1.66s/it][A[A

 99%|█████████▉| 990/1000 [27:24<00:16,  1.66s/it][A[A

 99%|█████████▉| 991/1000 [27:26<00:14,  1.66s/it][A[A

 99%|█████████▉| 992/1000 [27:28<00:13,  1.66s/it][A[A

 99%|█████████▉| 993/1000 [27:29<00:11,  1.66s/it][A[A

 99%|█████████▉| 994/1000 [27:31<00:09,  1.66s/it][A[A

100%|█████████▉| 995/1000 [27:33<00:08,  1.66s/it][A[A

100%|█████████▉| 996/1000 [27:34<00:06,  1.66s/it][A[A

100%|█████████▉| 997/1000 [27:36<00:04,  1.66s/it][A[A

100%|█████████▉| 998/1000 [27:38<00:03,  1.66s/it][A[A

100%|█████████▉| 999/1000 [27:39<00:01,  1.66s/it][A[A

100%|██████████| 1000/1000 [27:41<00:00,  1.66s/it][A[A

[A[A

In [81]:
norm

0.0

Можно сказать, что вероятности сошлись

In [82]:
np_PR = PR.toarray().ravel()

In [83]:
argsort = np.argsort(np_PR)

In [84]:
argsort = argsort[-1::-1]
argsort

array([ 81100,  52118,  20124, ...,  96278, 142363,   8119])

In [85]:
sort = np.sort(np_PR)
sort = sort[-1::-1]
sort

array([4.1236065e-02, 4.5967977e-03, 3.2565314e-03, ..., 1.0541443e-06,
       1.0536571e-06, 1.0530936e-06], dtype=float32)

In [86]:
np_PR.sum()

1.0000083

In [87]:
sort = sort / sort[0]
sort

array([1.0000000e+00, 1.1147518e-01, 7.8972891e-02, ..., 2.5563648e-05,
       2.5551835e-05, 2.5538167e-05], dtype=float32)

In [88]:
with open('res.txt', 'w') as f:
    for idx, pr in zip(argsort, sort):
        f.write('{}\t{}\n'.format(idx_to_key[idx], pr))