<a href="https://colab.research.google.com/github/zuckberj/jurisprudence-data-analysis/blob/master/jurisprudencia_scraper.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
!pip install aiohttp tqdm

from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm
import requests
import aiohttp
import asyncio
import json

from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
async def request_url(session, url, params):
    #proxies = {"http": "http://127.0.0.1:8080", "https": "http://127.0.0.1:8080"}
    #async with session.get(url+"/"+path,proxy="http://127.0.0.1:8080",ssl=ssl.SSLContext()) as response:
    try:
        headers = {'accept': 'application/json'}
        async with session.post(url, data=params, headers=headers) as response:
            if response.status != 200:
                return None
            return await response.text()
    except:
        return None

def filter_fields():
  return ["acordao_ata^3",
          "documental_acordao_mesmo_sentido_lista_texto",
          "documental_doutrina_texto",
          "documental_indexacao_texto",
          "documental_jurisprudencia_citada_texto",
          "documental_legislacao_citada_texto",
          "documental_observacao_texto",
          "documental_publicacao_lista_texto",
          "documental_tese_tema_texto^3",
          "documental_tese_texto^3",
          "ementa_texto^3",
          "ministro_facet",
          "orgao_julgador",
          "partes_lista_texto",
          "procedencia_geografica_completo",
          "processo_classe_processual_unificada_extenso",
          "titulo^6",
          "colac_numero",
          "colac_pagina",
          "decisao_texto^2",
          "documental_decisao_mesmo_sentido_lista_texto",
          "processo_precedente_texto",
          "sumula_texto^3",
          "conteudo_texto"]

def query_builder(query_term, size, page):
  query = {
    "query":{
        "function_score":{
            "query":{
                "bool":{
                    "filter":[
                        {
                            "query_string":{
                                "default_operator":"AND",
                                "fields": filter_fields(),
                                "query": query_term,
                                "quote_analyzer":"standard_analyzer",
                                "quote_field_suffix":".standard",
                                "type":"cross_fields",
                                "fuzziness":"AUTO:4,7"
                            }
                        }
                    ]
                }
            }
        }
    },
    "_source":[
        "inteiro_teor_url"
    ],
    "aggs":{
        "base_agg":{
            "filters":{
                "filters":{
                    "acordaos":{
                        "match":{"base":"acordaos"}
                    },
                    "sumulas":{
                        "match":{"base":"sumulas"}
                    },
                    "decisoes":{
                        "match":{"base":"decisoes"}
                    },
                    "informativos":{
                        "match":{"base":"informativos"}
                    }
                }
            }
        }
    },
    "size": size,
    "from": page
  }
  return json.dump(query)

async def scrap_single(session, search_query, size, max_page=-1):
  url = 'https://jurisprudencia.stf.jus.br/api/search/search'
  index = 0
  eof = False
  pdf_urls = []
  while not eof and index != max_page*size:
    params = query_builder()
    response = await request_url(session, url, params)
    results = json.loads(response)['result']['hits']['hits']
    pdf_urls = pdf_urls + [result['_source']['inteiro_teor_url'] for result in results if result['_source']!={}]
    if not results:
      eof = True
    index += size
    return pdf_urls
  
def download(url, file_name=''):
  if file_name == '':
    file_name = url.split('=')[1]+'.pdf'
  print(file_name)
  try:
    response = requests.get(url, stream=True)
    with open(file_name, "wb") as handle:
      for data in tqdm(response.iter_content()):
        handle.write(data)
    return True
  except Exception as e:
    print("Error while downloading. Trying Again. "+str(e))
    return False

def download_list(urls_):
  urls = set(urls_)
  for url in urls:
    result = download(url)
    print(result)

In [5]:
async def request_test_url():
    url_test = 'https://jurisprudencia.stf.jus.br/api/search/search'
    search_query = '*'
    page = 0
    size = 10
    params = query_builder(search_query, size, page)
    async with aiohttp.ClientSession() as session:
        response = await request_url(session, url_test, params)
        print(response)

loop = asyncio.get_event_loop()
loop.create_task(request_test_url())

<Task pending coro=<request_test_url() running at <ipython-input-5-8ae3ee472b9b>:1>>

In [6]:
async def scrap_single_test():
  search_query = 'Lula'
  size = 150
  max_page = -1
  async with aiohttp.ClientSession() as session:
    result = await scrap_single(session, search_query, size, max_page)
    print(len(result))
    print(result)

loop = asyncio.get_event_loop()
loop.create_task(scrap_single_test())

<Task pending coro=<scrap_single_test() running at <ipython-input-6-61055f8e4f1a>:1>>

In [7]:
async def download_test():
  urlTest='http://www.stf.jus.br/portal/inteiroTeor/obterInteiroTeor.asp?idDocumento=3442694'
  response=Download(urlTest)
  print(response)

loop = asyncio.get_event_loop()
loop.create_task(download_test())

<Task pending coro=<download_test() running at <ipython-input-7-036663147e8f>:1>>

In [8]:
def download_list_test():
  urls = ['http://www.stf.jus.br/portal/inteiroTeor/obterInteiroTeor.asp?idDocumento=11480004',
          'http://www.stf.jus.br/portal/inteiroTeor/obterInteiroTeor.asp?idDocumento=11480004',
          'http://www.stf.jus.br/portal/inteiroTeor/obterInteiroTeor.asp?idDocumento=3442694']
  result = download_list(urls)
    
download_list_test()

3442694.pdf


360141it [00:04, 84961.05it/s] 


True
11480004.pdf


394772it [00:04, 93653.31it/s] 

True



