### Download PDFs

In [1]:
import os
from pprint import pprint

# Create the directory that holds the downloaded PDFs.
# exist_ok=True makes this idempotent and avoids the check-then-create race
# of testing os.path.exists() before calling makedirs().
os.makedirs('output/stf-adi', exist_ok=True)

In [2]:
import zipfile
import json
import random

# Unpack the zipped dataset next to the archive and load it as JSON.
# The context manager guarantees the archive handle is released even when
# extraction raises part-way through.
with zipfile.ZipFile('input/stf_adi-2017.zip', 'r') as zipRef:
    zipInfos = zipRef.infolist()
    for zipInfo in zipInfos:
        # Every member is renamed to the same fixed path before extraction,
        # so if the archive holds more than one member the last one wins.
        zipInfo.filename = 'input/stf_adi-2017.json'
        zipRef.extract(zipInfo)

with open('input/stf_adi-2017.json', encoding='utf-8') as f:
    data = json.load(f)

print("Loaded " + str(len(data)) + " ADIs")

# random.choice() raises IndexError on an empty sequence; only show a sample
# when there is something to sample from.
example = random.choice(data) if data else None
if example is not None:
    print("Random example: " + str(example["classeProcesso"]) + " " + str(example["numeroProcesso"]) + " - " + str(example["relatorAtual"]) + " / " + str(example["assunto"]))

Loaded 177 ADIs
Random example: ADI 5787 - MIN. LUIZ FUX / ['DIREITO ADMINISTRATIVO E OUTRAS MATÉRIAS DE DIREITO PÚBLICO', 'Política fundiária e da reforma agrária']


In [3]:
import copy

# Keep only the fields needed from each case: its events ("andamentos") and
# its case number.
processos = [
    {"andamentos": registro["andamentos"], "numeroProcesso": registro["numeroProcesso"]}
    for registro in data
]

# Flatten to one list of events, each a shallow copy tagged with the number
# of the case it belongs to (the source records are left untouched).
andamentos = [
    {**evento, "numeroProcesso": caso["numeroProcesso"]}
    for caso in processos
    for evento in caso["andamentos"]
]

# Retain only the events whose document carries a truthy link.
documentos = [
    {"numeroProcesso": evento["numeroProcesso"], "url": evento["documento"]["link"]}
    for evento in andamentos
    if evento["documento"]["link"]
]

print("Recuperados todos os PDFs com inteiro teor das decisões")
pprint(documentos[:3])


Recuperados todos os PDFs com inteiro teor das decisões
[{'numeroProcesso': '5634',
  'url': 'http://www.stf.jus.br/portal/processo/verProcessoPeca.asp?id=311057090&tipoApp=.pdf'},
 {'numeroProcesso': '5635',
  'url': 'http://www.stf.jus.br/portal/processo/verProcessoPeca.asp?id=311738408&tipoApp=.pdf'},
 {'numeroProcesso': '5635',
  'url': 'http://www.stf.jus.br/portal/processo/verProcessoPeca.asp?id=311650892&tipoApp=.pdf'}]


In [4]:
import sys
import modules.downloadutils as downloadutils

# Download every collected document as a PDF.
# Files go INSIDE output/stf-adi/ -- the directory created at the top of the
# notebook and the one the listing cell reads from. The previous target,
# 'output/stf-adi-<n>.pdf', placed them one level up where nothing looked.
# NOTE(review): the file name depends only on numeroProcesso, so a case with
# several documents keeps just the last one downloaded -- confirm this is intended.
for doc in documentos:
    destino = 'output/stf-adi/stf-adi-' + str(doc["numeroProcesso"]) + '.pdf'
    try:
        # Third argument is presumably the expected content type -- see modules.downloadutils.
        downloadutils.download_file(doc["url"], destino, 'application/pdf')
        sys.stdout.write('.')
    except Exception as e:
        # Best-effort: report the failure and continue with the next document.
        # print() instead of pprint(): pprint on a str emits it as wrapped,
        # quoted fragments, which made the error log hard to read.
        print(str(doc) + ' => ' + str(e))
    

.........................................................................................................................................("{'url': "
 "'http://www.stf.jus.br/portal/processo/verProcessoPeca.asp?id=311827077&tipoApp=.pdf', "
 "'numeroProcesso': '5684'} => 500 Server Error: Internal Server Error for "
 'url: '
 'http://www.stf.jus.br/portal/processo/verProcessoPeca.asp?id=311827077&tipoApp=.pdf')
("{'url': "
 "'http://www.stf.jus.br/portal/processo/verProcessoPeca.asp?id=311670004&tipoApp=.pdf', "
 "'numeroProcesso': '5684'} => 500 Server Error: Internal Server Error for "
 'url: '
 'http://www.stf.jus.br/portal/processo/verProcessoPeca.asp?id=311670004&tipoApp=.pdf')
.....................("{'url': "
 "'http://www.stf.jus.br/portal/processo/verProcessoPeca.asp?id=312097064&tipoApp=.pdf', "
 "'numeroProcesso': '5697'} => 500 Server Error: Internal Server Error for "
 'url: '
 'http://www.stf.jus.br/portal/processo/verProcessoPeca.asp?id=312097064&tipoApp=.pdf')
...........

In [5]:
from os import listdir
from os.path import isfile, join

pdfs_path = 'output/stf-adi'

# os.listdir() returns entries in arbitrary order. The position of each file
# in pdf_files is reused later as its row/column in the similarity matrix,
# so sort to make the index <=> PDF mapping stable across runs and machines.
pdf_files = sorted(
    f for f in listdir(pdfs_path)
    if isfile(join(pdfs_path, f)) and f.lower().endswith('.pdf')
)

print('index <=> PDF')
for idx, pf in enumerate(pdf_files):
    print(str(idx) + '<=>' + pf)

print('Found ' + str(len(pdf_files)) + ' pdf files (decisoes)')

index <=> PDF
0<=>stf-adi-5606.pdf
1<=>stf-adi-5625.pdf
2<=>stf-adi-5627.pdf
3<=>stf-adi-5628.pdf
4<=>stf-adi-5630.pdf
5<=>stf-adi-5631.pdf
6<=>stf-adi-5632.pdf
7<=>stf-adi-5633.pdf
8<=>stf-adi-5634.pdf
9<=>stf-adi-5635.pdf
10<=>stf-adi-5636.pdf
11<=>stf-adi-5637.pdf
12<=>stf-adi-5638.pdf
13<=>stf-adi-5639.pdf
14<=>stf-adi-5640.pdf
15<=>stf-adi-5641.pdf
16<=>stf-adi-5642.pdf
17<=>stf-adi-5643.pdf
18<=>stf-adi-5644.pdf
19<=>stf-adi-5645.pdf
20<=>stf-adi-5646.pdf
21<=>stf-adi-5647.pdf
22<=>stf-adi-5648.pdf
23<=>stf-adi-5649.pdf
24<=>stf-adi-5650.pdf
25<=>stf-adi-5651.pdf
26<=>stf-adi-5652.pdf
27<=>stf-adi-5653.pdf
28<=>stf-adi-5654.pdf
29<=>stf-adi-5655.pdf
30<=>stf-adi-5656.pdf
31<=>stf-adi-5657.pdf
32<=>stf-adi-5658.pdf
33<=>stf-adi-5659.pdf
34<=>stf-adi-5660.pdf
35<=>stf-adi-5661.pdf
36<=>stf-adi-5662.pdf
37<=>stf-adi-5663.pdf
38<=>stf-adi-5664.pdf
39<=>stf-adi-5665.pdf
40<=>stf-adi-5666.pdf
41<=>stf-adi-5667.pdf
42<=>stf-adi-5668.pdf
43<=>stf-adi-5669.pdf
44<=>stf-adi-5670.pdf
45<=>s

In [6]:
import modules.nlputils as nlputils
import sys
import slate3k as slate

def _extract_pdf_text(pdf_location):
    """Return the text of the PDF at ``pdf_location`` as a single string.

    The file is opened in binary mode and handed to ``slate.PDF``; the
    strings it yields (presumably one per page) are joined with newlines.
    """
    with open(pdf_location, 'rb') as handle:
        return "\n".join(slate.PDF(handle))


print('Loading text from pdfs')

# decisoes[i] holds the extracted text of pdf_files[i]; one dot is written
# per processed file as a progress indicator.
decisoes = []
for pdf_name in pdf_files:
    decisoes.append(_extract_pdf_text(pdfs_path + '/' + pdf_name))
    sys.stdout.write('.')

print('\nSAMPLE TEXT EXTRACTED FROM PDF')
print(decisoes[3])

Loading text from pdfs
................................................................



...................................................



................................................
SAMPLE TEXT EXTRACTED FROM PDF
Documento assinado digitalmente conforme MP n° 2.200-2/2001 de 24/08/2001, que institui a Infraestrutura de Chaves Públicas Brasileira - ICP-Brasil. O
documento pode ser acessado no endereço eletrônico http://www.stf.jus.br/portal/autenticacao/ sob o número 12142542.

AÇÃO DIRETA DE INCONSTITUCIONALIDADE 5.628 DISTRITO FEDERALRELATOR:MIN. TEORI ZAVASCKIREQTE.(S):GOVERNADOR DO ESTADO DO ACRE PROC.(A/S)(ES):PROCURADOR-GERAL DO ESTADO DO ACRE INTDO.(A/S):PRESIDENTE DA REPÚBLICA ADV.(A/S):ADVOGADO-GERAL DA UNIÃO INTDO.(A/S):CONGRESSO NACIONAL ADV.(A/S):ADVOGADO-GERAL DA UNIÃO DESPACHO: Trata-se de ação direta de inconstitucionalidade com pedido de medida cautelar, ajuizada pelo Governador do Acre, em que questiona a constitucionalidade do art. 1-A da Lei federal 10.336/2001, com redação dada pela Lei 10.866/2004, que determina: “A União entregará aos Estados e ao Distrito Federal, para ser aplicado, obrigatoria

In [7]:
# import os.path
# import PyPDF2

# with open("output/stf-adi/stf-adi-5632.pdf",'rb') as pf:
#     pdfReader = PyPDF2.PdfFileReader(pf)
#     num_pages = pdfReader.numPages
#     count = 0
#     text = ''
#     while count < num_pages:
#         pageObj = pdfReader.getPage(count)
#         count +=1
#         text += pageObj.extractText()
#     print(text)

In [8]:
# import os.path
# import slate3k as slate

# with open("output/stf-adi/stf-adi-5632.pdf",'rb') as pf:
#     extracted_text = slate.PDF(pf)
# #     print(extracted_text)     


### Find similar

In [26]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

# Load the stop-word list, one word per line. The context manager closes the
# file (the previous bare open() never did); blank lines are skipped so the
# empty string does not end up as a stop word.
with open('stopwords.txt', 'r', encoding='utf-8') as stopwords_file:
    stopwords = set(line.strip() for line in stopwords_file if line.strip())

# stop_words is documented as 'english', a list, or None, and recent
# scikit-learn releases reject a set, so pass a (sorted, deterministic) list.
# min_df/max_df drop terms found in under 10% or over 50% of the documents.
vec = TfidfVectorizer(min_df=0.1, max_df=0.5, analyzer='word', ngram_range=(1, 2), stop_words=sorted(stopwords))

def cos_similarity(textlist):
    """Return the pairwise similarity matrix of the given texts.

    Parameters
    ----------
    textlist : iterable of str
        Documents to compare.

    Returns
    -------
    numpy.ndarray
        Dense square array; entry [i][j] is the dot product of the TF-IDF
        vectors of documents i and j. With TfidfVectorizer's default L2
        normalisation this equals their cosine similarity.
    """
    tfidf = vec.fit_transform(textlist)
    # '@' is always a matrix product; '*' is element-wise on SciPy sparse arrays.
    return (tfidf @ tfidf.T).toarray()

similarity_matrix = cos_similarity(decisoes)
print(similarity_matrix)

[[1.         0.55881383 0.66126765 ... 0.07087145 0.08406423 0.14998463]
 [0.55881383 1.         0.66909825 ... 0.07699421 0.10938629 0.04400694]
 [0.66126765 0.66909825 1.         ... 0.04367822 0.01271179 0.03118266]
 ...
 [0.07087145 0.07699421 0.04367822 ... 1.         0.50404712 0.4923654 ]
 [0.08406423 0.10938629 0.01271179 ... 0.50404712 1.         0.52767272]
 [0.14998463 0.04400694 0.03118266 ... 0.4923654  0.52767272 1.        ]]


In [27]:
# List every ordered pair of distinct PDFs whose similarity exceeds 0.9.
# Both (i, j) and (j, i) are reported, so each match appears twice.
for row_idx, row in enumerate(similarity_matrix):
    for col_idx, score in enumerate(row):
        if row_idx == col_idx:
            continue
        if score > 0.9:
            print(' ~= '.join([pdf_files[row_idx], pdf_files[col_idx]]) + ': ' + str(score))

stf-adi-5633.pdf ~= stf-adi-5680.pdf: 0.9458844842616504
stf-adi-5650.pdf ~= stf-adi-5761.pdf: 0.9451656726516124
stf-adi-5654.pdf ~= stf-adi-5662.pdf: 0.9814235665105178
stf-adi-5656.pdf ~= stf-adi-5673.pdf: 0.9733944679911054
stf-adi-5659.pdf ~= stf-adi-5660.pdf: 0.9208477506800684
stf-adi-5660.pdf ~= stf-adi-5659.pdf: 0.9208477506800684
stf-adi-5662.pdf ~= stf-adi-5654.pdf: 0.9814235665105178
stf-adi-5662.pdf ~= stf-adi-5667.pdf: 0.9085058481856567
stf-adi-5663.pdf ~= stf-adi-5747.pdf: 0.9124152649137226
stf-adi-5667.pdf ~= stf-adi-5662.pdf: 0.9085058481856567
stf-adi-5673.pdf ~= stf-adi-5656.pdf: 0.9733944679911054
stf-adi-5680.pdf ~= stf-adi-5633.pdf: 0.9458844842616504
stf-adi-5686.pdf ~= stf-adi-5687.pdf: 0.9527235540395138
stf-adi-5686.pdf ~= stf-adi-5695.pdf: 0.9517357667247778
stf-adi-5687.pdf ~= stf-adi-5686.pdf: 0.9527235540395138
stf-adi-5687.pdf ~= stf-adi-5695.pdf: 0.9519888220769862
stf-adi-5695.pdf ~= stf-adi-5686.pdf: 0.9517357667247778
stf-adi-5695.pdf ~= stf-adi-568