In [687]:
import pandas as pd
import numpy as np
import textwrap
import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer, PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import requests
from bs4 import BeautifulSoup

In [688]:
# Fetch the NLTK resources this notebook imports helpers for; the calls are
# cheap no-ops when the packages are already present locally.
# 'punkt' backs word_tokenize (imported above, not called in the cells shown here).
nltk.download('punkt')
# 'stopwords' supplies the English stop-word list handed to TfidfVectorizer below.
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Viktor_Shevchuk\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Viktor_Shevchuk\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [689]:
# English "print" views of two laws published on zakon.rada.gov.ua.
URL1 = "https://zakon.rada.gov.ua/laws/show/en/1207-18/conv/print"
URL2 = "https://zakon.rada.gov.ua/laws/show/en/1700-18/print"  # not fetched in the cells shown here

# The timeout stops the cell from hanging indefinitely on a stalled connection,
# and raise_for_status() fails here on an HTTP error instead of letting the
# parsing cells below silently work on an error page.
page = requests.get(URL1, timeout=30)
page.raise_for_status()

# print(page.text) 


In [690]:
# Parse the raw response bytes with Python's built-in HTML parser.
soup = BeautifulSoup(page.content, "html.parser")
# soup

In [691]:
# Select every element with class "rvps2" that is a direct child of an element
# with class "rvts0"; these are the candidate paragraphs passed to clean().
paragraphs = soup.select('.rvts0 > .rvps2')
# paragraphs

In [692]:
# Lower-case words and phrases; isSentenceWithStopWord() drops any paragraph
# containing one of them as a substring of a lower-cased text node.
legal_stop_words = [
    "amended",
    "restated",
    "be stated",
    "supplemented",
    "section",
    "paragraph",
    "article",
    "as follows",
    "clauses",
]


In [693]:
def isSentenceWithStopWord(para, stop_words=None):
    """Report whether any text node inside ``para`` contains a stop word.

    Matching is a plain substring test against the lower-cased text of each
    individual text node, so a phrase is only found when it sits inside a
    single node.

    Args:
        para: Element exposing a BeautifulSoup-style ``find`` method.
        stop_words: Optional iterable of lower-case words/phrases to look for.
            Defaults to the notebook-level ``legal_stop_words`` list.

    Returns:
        bool: ``True`` when at least one text node contains a stop word,
        ``False`` otherwise.
    """
    if stop_words is None:
        stop_words = legal_stop_words
    stop_words = list(stop_words)  # allow one-shot iterables; reused per node

    def mentions_stop_word(node):
        lowered = node.text.lower()
        return any(word in lowered for word in stop_words)

    # A single tree walk checks every stop word per node. ``string=`` is the
    # current spelling of the filter that older code passed as ``text=``.
    return para.find(string=mentions_stop_word) is not None


In [694]:
def isProvisions(para):
    """Find the text node in ``para`` mentioning "final and transitional provisions".

    The comparison is case-insensitive and applied to each text node separately.

    Args:
        para: Element exposing a BeautifulSoup-style ``find`` method.

    Returns:
        The first matching text node, or ``None`` when there is none. Callers
        in this notebook only use the result's truthiness.
    """
    # ``string=`` is the current name of the filter formerly spelled ``text=``;
    # the old spelling is deprecated in recent Beautiful Soup releases.
    return para.find(
        string=lambda node: "final and transitional provisions" in node.text.lower()
    )

In [695]:
def isArticleEnsuring(para):
    """Look up the first ``<span class="rvts37">`` inside ``para``.

    NOTE(review): what the "rvts37" class marks on the source site is not
    visible from this notebook; confirm against the page markup.

    Returns:
        The matching span, or ``None`` when ``para`` has no such span.
    """
    marker_span = para.find("span", class_="rvts37")
    return marker_span


In [696]:
def isArticleTitle(para):
    """Look up the first ``<span class="rvts9">`` inside ``para``.

    ``clean`` skips every paragraph for which this returns a span.
    NOTE(review): presumably "rvts9" styles article headings on the source
    site; confirm against the page markup.

    Returns:
        The matching span, or ``None`` when ``para`` has no such span.
    """
    title_span = para.find("span", class_="rvts9")
    return title_span


In [697]:
def isClosingChar(letter):
    """Return True when ``letter`` is ``"."`` or ``")"``.

    These are the characters that may terminate an ordered-list number such
    as ``1.`` or ``2)``.
    """
    return letter in (".", ")")


In [698]:
def discardOrderedListNumbers(text):
    """Strip a leading ordered-list marker such as ``1.``, ``12)`` or ``123.``.

    A marker is one to three digits immediately followed by ``.`` or ``)``
    (the same closers ``isClosingChar`` accepts). When a marker is found, the
    remainder is returned with surrounding whitespace stripped; otherwise
    ``text`` is returned unchanged.

    Args:
        text: Paragraph text; may be empty or shorter than a marker.

    Returns:
        str: ``text`` without its leading list marker.
    """
    # Slices never raise on short strings, unlike indexing text[0..3] directly.
    for width in (1, 2, 3):
        number = text[:width]
        closer = text[width:width + 1]  # "" when text ends right after the digits
        if number.isdigit() and closer in (".", ")"):
            return text[width + 1:].strip()

    return text


In [699]:
def clean(paragraphs):
    """Collect the plain text of the paragraphs worth summarising.

    Walks ``paragraphs`` in order and:

    * stops at the first paragraph matched by ``isArticleEnsuring`` or
      ``isProvisions``;
    * skips paragraphs matched by ``isSentenceWithStopWord`` or
      ``isArticleTitle``;
    * skips paragraphs whose text is empty after stripping, since an empty
      string carries no terms to score;
    * otherwise appends the stripped text with any leading ordered-list
      number removed.

    Args:
        paragraphs: Iterable of BeautifulSoup elements.

    Returns:
        list[str]: Cleaned paragraph texts, in document order.
    """
    cleaned = []

    for para in paragraphs:
        # NOTE(review): the previous version kept an `articleEnsuring` flag that
        # was only ever set immediately before a `break`, so the branches that
        # read it could never run. The effective behaviour (stop at the first
        # such paragraph) is kept here; if the intent was to skip only that
        # article and resume at the next title, this needs a `continue`-based
        # rewrite instead.
        if isArticleEnsuring(para) or isProvisions(para):
            break

        if isSentenceWithStopWord(para) or isArticleTitle(para):
            continue

        text = para.text.strip()
        if not text:
            continue

        cleaned.append(discardOrderedListNumbers(text))

    return cleaned


In [700]:
# Plain-text paragraphs that survive the filters in clean(); every later cell
# indexes into this list, so its order must match the rows of X.
cleaned_paragraphs = clean(paragraphs)
# cleaned_paragraphs

In [701]:
# TF-IDF vectoriser that ignores NLTK's English stop words.
# (norm='l1' is deliberately left disabled, so the vectoriser's default norm applies.)
featurizer = TfidfVectorizer(stop_words=stopwords.words("english"))
# featurizer


In [702]:
# Fit the vocabulary on the cleaned paragraphs and vectorise them: one row per paragraph.
X = featurizer.fit_transform(cleaned_paragraphs)
# X

In [703]:
def get_sentence_score(tfidf_row):
    """Return the mean of the non-zero TF-IDF weights in one row.

    Args:
        tfidf_row: A single TF-IDF row, either a SciPy sparse matrix (such as
            ``X[i, :]``) or anything ``np.asarray`` accepts.

    Returns:
        float: Mean of the non-zero entries, or ``0.0`` when the row has none.
        Returning 0.0 instead of NaN keeps such rows at the bottom of a
        descending ranking without triggering an empty-mean warning.
    """
    # Densify sparse rows so the boolean mask below works for both input kinds.
    if hasattr(tfidf_row, "toarray"):
        weights = tfidf_row.toarray()
    else:
        weights = np.asarray(tfidf_row)

    nonzero = weights[weights != 0]
    if nonzero.size == 0:
        return 0.0

    return float(nonzero.mean())

In [704]:
# Score each cleaned paragraph from its TF-IDF row; scores[k] belongs to cleaned_paragraphs[k].
scores = np.array(
    [get_sentence_score(X[row, :]) for row in range(len(cleaned_paragraphs))],
    dtype=float,
)


In [705]:
# Paragraph indices ordered from highest to lowest score (negation turns
# argsort's ascending order into descending).
sort_idx = np.argsort(-scores)
# sort_idx

In [706]:
# Show the ten best-scoring paragraphs as "<score>: <text>".
for i in sort_idx[:10]:
    print("%.2f: %s" % (scores[i], cleaned_paragraphs[i]))

0.38: The date of the beginning of the temporary occupation is 20 February 2014.
0.36: For the purposes of this Law, the temporarily occupied territory is defined as:
0.30: The temporarily occupied territory of Ukraine (hereinafter referred to as the temporarily occupied territory) is an integral part of the territory of Ukraine, which is subject to the Constitution and laws of Ukraine.
0.29: land territory of the Autonomous Republic of Crimea and the city of Sevastopol, the internal waters of these territories of Ukraine;
0.28: Property rights within the temporarily occupied territory shall be protected under the legislation of Ukraine.
0.26: The legal regime of the temporarily occupied territory may be established, changed or abolished exclusively by the laws of Ukraine.
0.26: The state of Ukraine shall by all possible means ensure compensation for pecuniary and non-pecuniary damage by the Russian Federation.
0.26: Foreigners and stateless persons shall be allowed into and out of the

In [707]:
# Text of the first <span class="rvts23"> on the page. find() returns None when
# no such span exists, in which case the .text access raises AttributeError.
title = soup.find("span", class_="rvts23").text 
title


'On Ensuring Civil Rights and Freedoms, and the Legal Regime on the Temporarily Occupied Territory of Ukraine'

In [708]:
def tfIDFSummarize(text, top_n=10):
    """Print the ``top_n`` entries of ``text`` with the highest TF-IDF score.

    Each entry is scored with ``get_sentence_score`` on its TF-IDF row and the
    best ones are printed as ``"<score>: <entry>"``, highest first.

    Note: this refits the notebook-level ``featurizer`` on ``text``.

    Args:
        text: Sequence of strings (sentences or paragraphs) to rank.
        top_n: How many entries to print. Defaults to 10.

    Returns:
        None. The ranking is printed.
    """
    X = featurizer.fit_transform(text)
    scores = np.array(
        [get_sentence_score(X[row, :]) for row in range(len(text))],
        dtype=float,
    )

    # Negating gives a descending order from argsort.
    ranking = np.argsort(-scores)

    for idx in ranking[:top_n]:
        print("%.2f: %s" % (scores[idx], text[idx]))


In [709]:
# tfIDFSummarize(cleaned_paragraphs)


In [710]:
# Pairwise cosine similarity between the TF-IDF rows: S[a, b] compares
# cleaned_paragraphs[a] with cleaned_paragraphs[b].
S = cosine_similarity(X)
# S

In [711]:
# Sanity check: S should be square, with one row/column per cleaned paragraph.
S.shape

(46, 46)

In [712]:
# Should equal both dimensions of S.
len(cleaned_paragraphs)


46

In [713]:
# Normalize the similarity matrix in place so every row sums to 1, turning S
# into a row-stochastic transition matrix.
# NOTE(review): a row that sums to 0 would divide by zero here.
S /= S.sum(axis=1, keepdims=True)


In [714]:
# Sanity check: the first row should now sum to 1.
S[0].sum()

1.0

In [715]:
# Uniform transition matrix: same shape as S, every entry equal to 1 / len(S),
# so each row sums to 1.
U = np.ones_like(S) / len(S)


In [716]:
# Sanity check: a row of U sums to 1 (up to floating-point rounding).
U[0].sum()

0.9999999999999999

In [717]:
# Smoothed similarity matrix: blend S with the uniform matrix U, giving weight
# `factor` to U and (1 - factor) to S. Rows still sum to 1.
# Caution: S appears on both sides, so re-running this cell smooths S again.
factor = 0.15 
S = (1 - factor) * S + factor * U

In [718]:
# Sanity check: rows still sum to 1 after smoothing (up to rounding).
S[0].sum()

0.9999999999999999

In [719]:
# Find the limiting / stationary distribution: eigen-decompose the transpose so
# the columns of `eigenvecs` are left eigenvectors of S (v S = lambda v).
# np.linalg.eig does not return eigenvalues in any guaranteed order.
eigenvals, eigenvecs = np.linalg.eig(S.T)


In [720]:
# Inspect the eigenvalues; the stationary distribution belongs to the one equal to 1.
eigenvals

array([1.        , 0.68713955, 0.42094194, 0.3902211 , 0.36261406,
       0.33983174, 0.30824458, 0.29818267, 0.28105248, 0.26026103,
       0.2509966 , 0.23497632, 0.21903648, 0.21127277, 0.2053398 ,
       0.20304684, 0.0426177 , 0.04853621, 0.05312734, 0.18979891,
       0.06089023, 0.06295134, 0.18126754, 0.06906333, 0.17444816,
       0.07529121, 0.07869511, 0.08382169, 0.08666179, 0.16919783,
       0.16465437, 0.15737268, 0.093671  , 0.10032396, 0.10265545,
       0.10356505, 0.10638905, 0.11199114, 0.14941862, 0.14243006,
       0.14174633, 0.13896901, 0.11960622, 0.12389636, 0.12711658,
       0.12741678])

In [721]:
# Eigenvector paired with eigenvals[0] (not yet normalized to sum to 1).
eigenvecs[:, 0]


array([0.15847942, 0.13767847, 0.16954673, 0.08885199, 0.15719089,
       0.12470143, 0.1083043 , 0.11710738, 0.15380697, 0.17897055,
       0.1264791 , 0.20154232, 0.1333414 , 0.18827908, 0.16113784,
       0.14354468, 0.14429043, 0.150828  , 0.10553467, 0.14741334,
       0.16941742, 0.13308572, 0.10533194, 0.14358128, 0.15855451,
       0.14620688, 0.16648069, 0.13244102, 0.15683581, 0.13814949,
       0.17838724, 0.09965675, 0.10409286, 0.10630752, 0.16467182,
       0.12708417, 0.16618928, 0.13852189, 0.1305057 , 0.16851976,
       0.18262022, 0.16155457, 0.15653402, 0.17898771, 0.12744973,
       0.14084469])

In [722]:
# Check stationarity: for eigenvalue 1, multiplying the vector by S must give the same vector back.
eigenvecs[:, 0].dot(S)


array([0.15847942, 0.13767847, 0.16954673, 0.08885199, 0.15719089,
       0.12470143, 0.1083043 , 0.11710738, 0.15380697, 0.17897055,
       0.1264791 , 0.20154232, 0.1333414 , 0.18827908, 0.16113784,
       0.14354468, 0.14429043, 0.150828  , 0.10553467, 0.14741334,
       0.16941742, 0.13308572, 0.10533194, 0.14358128, 0.15855451,
       0.14620688, 0.16648069, 0.13244102, 0.15683581, 0.13814949,
       0.17838724, 0.09965675, 0.10409286, 0.10630752, 0.16467182,
       0.12708417, 0.16618928, 0.13852189, 0.1305057 , 0.16851976,
       0.18262022, 0.16155457, 0.15653402, 0.17898771, 0.12744973,
       0.14084469])

In [723]:
# np.linalg.eig returns eigenvalues in no guaranteed order and may use a complex
# dtype, so select the eigenvector of the largest (real) eigenvalue explicitly
# instead of assuming it sits in column 0, and keep only its real part.
dominant = np.argmax(eigenvals.real)
stationary = eigenvecs[:, dominant].real

# Rescale so the scores sum to 1 (this also fixes the eigenvector's arbitrary sign).
scores = stationary / stationary.sum()
# Paragraph indices from highest to lowest score.
sort_idx = np.argsort(-scores)


In [724]:
# Show the ten highest-ranked paragraphs as "<score>: <text>".
for i in sort_idx[:10]:
    print("%.2f: %s" % (scores[i], cleaned_paragraphs[i]))


0.03: Ukraine shall take all necessary measures to guarantee human and civil rights and freedoms provided for by the Constitution and laws of Ukraine, international treaties, to all citizens of Ukraine residing within the temporarily occupied territory.
0.03: Responsibility for violating human and civil rights and freedoms defined by the Constitution and laws of Ukraine within the temporarily occupied territory shall be assigned to the Russian Federation as an occupying state under the norms and principles of international law.
0.03: Property rights within the temporarily occupied territory shall be protected under the legislation of Ukraine.
0.03: Acquisition and termination of ownership right to real estate located within the temporarily occupied territory shall be carried out following the provisions of the legislation of Ukraine outside the temporarily occupied territory. If it is impossible for the state registrar to exercise the powers of state registration of real rights to real

In [725]:
def textRankSummarize(text, factor=0.15, top_n=10):
    """Print the ``top_n`` entries of ``text`` ranked by a TextRank-style score.

    Steps: TF-IDF vectorise ``text``, build the cosine-similarity matrix,
    row-normalise it into a transition matrix, blend it with a uniform matrix
    (weight ``factor``), and score each entry by its share of the stationary
    distribution of the result.

    Note: this refits the notebook-level ``featurizer`` on ``text``.

    Args:
        text: Sequence of strings (sentences or paragraphs) to rank.
        factor: Weight given to the uniform matrix when smoothing. Defaults to 0.15.
        top_n: How many entries to print. Defaults to 10.

    Returns:
        None. The ranking is printed as ``"<score>: <entry>"``, highest first.
    """
    tfidf = featurizer.fit_transform(text)
    similarity = cosine_similarity(tfidf)
    n = len(similarity)

    # Row-normalise. A row that sums to 0 (an entry with no scored terms) would
    # divide by zero, so such rows fall back to a uniform distribution.
    row_sums = similarity.sum(axis=1, keepdims=True)
    transition = np.divide(
        similarity,
        row_sums,
        out=np.full_like(similarity, 1.0 / n),
        where=row_sums != 0,
    )

    uniform = np.ones_like(transition) / n
    transition = (1 - factor) * transition + factor * uniform

    # Left eigenvectors of the transition matrix are the columns of eig(T.T).
    # eig() gives no ordering guarantee and may return complex dtype, so pick
    # the eigenvector of the largest real eigenvalue and keep its real part.
    eigenvals, eigenvecs = np.linalg.eig(transition.T)
    dominant = np.argmax(eigenvals.real)
    stationary = eigenvecs[:, dominant].real

    scores = stationary / stationary.sum()  # sums to 1; also fixes the sign
    ranking = np.argsort(-scores)

    for idx in ranking[:top_n]:
        print("%.2f: %s" % (scores[idx], text[idx]))


In [726]:
# Run the packaged TextRank pipeline; the printed ranking should match the step-by-step cells above.
textRankSummarize(cleaned_paragraphs)


0.03: Ukraine shall take all necessary measures to guarantee human and civil rights and freedoms provided for by the Constitution and laws of Ukraine, international treaties, to all citizens of Ukraine residing within the temporarily occupied territory.
0.03: Responsibility for violating human and civil rights and freedoms defined by the Constitution and laws of Ukraine within the temporarily occupied territory shall be assigned to the Russian Federation as an occupying state under the norms and principles of international law.
0.03: Property rights within the temporarily occupied territory shall be protected under the legislation of Ukraine.
0.03: Acquisition and termination of ownership right to real estate located within the temporarily occupied territory shall be carried out following the provisions of the legislation of Ukraine outside the temporarily occupied territory. If it is impossible for the state registrar to exercise the powers of state registration of real rights to real