## Scraping federal judges' rulings

In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import numpy as np
import re
# Show up to 500 columns when a DataFrame is displayed.
pd.set_option('display.max_columns', 500)
# Never truncate cell contents (the result URLs are very long).
# `None` means "no limit"; the former sentinel -1 is deprecated since
# pandas 1.0 and rejected by pandas 2.x, so it is only used as a fallback
# for old versions whose validator does not accept None.
try:
    pd.set_option('display.max_colwidth', None)
except ValueError:
    pd.set_option('display.max_colwidth', -1)
import datetime
import time
from random import randint
import os

In [2]:
# Headers passed to every `requests.get` call so the scraper presents a
# browser-like user agent.
# NOTE(review): "method" is sent as a literal header named `method`; the HTTP
# verb itself is chosen by `requests.get` - presumably copied from browser
# dev tools, confirm whether the server needs it.
request_headers = {
    "method": "GET",
    "user-agent": (
        "Mozilla/5.0 (Windows NT 6.1; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/61.0.3163.100 Safari/537.36"
    ),
}

In [3]:
# Example of a search-results URL:

#https://www.bger.ch/ext/eurospider/live/fr/php/aza/http/index.php?lang=fr&type=simple_query&query_words=&lang=fr&top_subcollection_aza=all&from_date=01.01.2000&to_date=31.12.2000&x=15&y=6



### Get the list of URLs for decisions from 01.01.2000 to 01.04.2018

In [4]:
# Delete \n in strings

def clean_string(string):
    """Return ``string`` with its line breaks removed.

    Two passes are applied in order: first every newline that is directly
    followed by a space is removed together with that space, then every
    remaining newline is removed.
    """
    # Order matters: the "newline + space" pairs must be consumed before the
    # bare newlines, otherwise the trailing space would be left behind.
    for newline_pattern in ("\n ", "\n"):
        string = re.sub(newline_pattern, "", string)
    return string

In [5]:
# Extract date format from date + id string

def get_date(string):
    """Extract the first ``dd.mm.yyyy`` date found in ``string``.

    Parameters
    ----------
    string : str
        Text containing a date, e.g. ``"29.12.2017 6B 930/2017"``.

    Returns
    -------
    datetime.date
        The parsed date.

    Raises
    ------
    ValueError
        If ``string`` contains no ``dd.mm.yyyy`` pattern, or if the matched
        digits do not form a valid calendar date.
    """
    # Raw string: '\d' inside a normal literal is an invalid escape sequence.
    match = re.search(r'\d\d\.\d\d\.\d{4}', string)
    if match is None:
        # Fail with an explicit message instead of an AttributeError on None.
        raise ValueError("no dd.mm.yyyy date found in: {!r}".format(string))
    return datetime.datetime.strptime(match.group(), '%d.%m.%Y').date()

In [6]:
# Create a dataframe with all url, their date, ID and court information (while avoiding a bunch of scraping errors)


def _div_text(parent, css_class):
    """Return the cleaned text of the first ``<div class=css_class>`` under ``parent``, or NaN if absent."""
    div = parent.find('div', {'class': css_class})
    if div is None:
        return np.nan
    return clean_string(div.text)


def get_url(text):
    """Build a DataFrame describing every ruling listed in a results page.

    Parameters
    ----------
    text : parsed HTML element
        The element holding the result list; every ``<li>`` found under it
        is treated as one ruling. Must not be ``None``.

    Returns
    -------
    pandas.DataFrame
        One row per ``<li>`` with the columns ``date``, ``date_id``,
        ``id_code``, ``url``, ``court``, ``subject`` and ``object``.
        Any piece of information that cannot be extracted is stored as NaN
        and a message is printed, so that one malformed entry does not
        abort the whole page.
    """
    rulings = []

    for li in text.find_all('li'):
        ruling = {}

        # --- Date & ID code -------------------------------------------------
        rank_title = li.find('span', {'class': 'rank_title'})
        if rank_title is None:
            print("Date/ID code error found")
            ruling['date'] = np.nan
            ruling['date_id'] = np.nan
            ruling['id_code'] = np.nan
        else:
            # String containing date + ID, e.g. "29.12.2017 6B 930/2017"
            ruling['date_id'] = clean_string(rank_title.text)
            try:
                ruling['date'] = get_date(ruling['date_id'])
            except (AttributeError, ValueError):
                # get_date raises when no dd.mm.yyyy date can be parsed
                # (failed regex match or invalid calendar date).
                print("Date/ID code format error found in: ", ruling['date_id'])
                ruling['date'] = np.nan
                ruling['id_code'] = np.nan
            else:
                # ID code alone: strip the leading date and its trailing space
                ruling['id_code'] = re.sub(r'\d\d\.\d\d\.\d{4} ', '', ruling['date_id'])

        # --- URL ------------------------------------------------------------
        link = li.find('a')
        # An entry may lack the anchor altogether, not only its href.
        href = link.get('href') if link is not None else None
        if href is None:
            print("URL error found in:", ruling['date_id'])
            ruling['url'] = np.nan
        else:
            ruling['url'] = href

        # --- Court, subject and object --------------------------------------
        rank_data = li.find('div', {'class': 'rank_data'})
        if rank_data is None:
            print("Court/Subject/Object error found")
            ruling['court'] = np.nan
            ruling['subject'] = np.nan
            ruling['object'] = np.nan
        else:
            ruling['court'] = _div_text(rank_data, 'court small normal')
            ruling['subject'] = _div_text(rank_data, 'subject small normal')
            ruling['object'] = _div_text(rank_data, 'object small normal')

        rulings.append(ruling)

    df_rulings = pd.DataFrame(rulings)

    return df_rulings

In [8]:
# Execute main function: collect the result URLs year by year and export one
# CSV file per year.

# Search-results URL; {page} and {year} are filled in for every request.
SEARCH_URL_TEMPLATE = (
    'https://www.bger.ch/ext/eurospider/live/fr/php/aza/http/index.php'
    '?lang=fr&type=simple_query&page={page}'
    '&from_date=01.01.{year}&to_date=31.12.{year}'
    '&sort=relevance&insertion_date=&top_subcollection_aza=all&query_words='
)
URLS_DIR = "urls"
REQUEST_TIMEOUT = 30  # seconds; avoids hanging forever on a stalled connection

# DataFrame.to_csv does not create missing directories.
os.makedirs(URLS_DIR, exist_ok=True)

for year in range(2000, 2018):

    year = str(year)
    print("Year: ", year)

    # One DataFrame per results page. Reset for every year so that a failure
    # on the first page can never export the previous year's rows under this
    # year's file name.
    year_pages = []

    # Loop through the search results pages of this year
    i = 1
    while True:
        url = SEARCH_URL_TEMPLATE.format(page=i, year=year)
        try:
            html = requests.get(url, headers=request_headers, timeout=REQUEST_TIMEOUT)
            html.raise_for_status()
            soup = BeautifulSoup(html.text, "html5lib")
            rank_soup = soup.find('div', {'class': 'ranklist_content'})
            if rank_soup is None:
                # No result list on the page: we are past the last page.
                break
            page_urls = get_url(rank_soup)
        except Exception as error:
            # Best effort: keep what was collected so far for this year, but
            # report the failure instead of swallowing it silently.
            # (KeyboardInterrupt is not an Exception and still stops the run.)
            print("Stopping year", year, "at page", i, "-", repr(error))
            break

        if page_urls.empty:
            # A result list without entries also marks the end of the results.
            break

        year_pages.append(page_urls)

        # Random pause between requests to avoid hammering the server
        time.sleep(randint(1, 5))

        print("page no: ", i)
        i += 1

    if not year_pages:
        print("No results collected for year", year)
        continue

    # Concatenate once per year instead of once per page
    urls = pd.concat(year_pages)

    # Add year in dataframe
    urls['year'] = year

    #Export year urls
    urls.to_csv(os.path.join(URLS_DIR, "urls" + year + ".csv"))
    
    

Year:  2000
page no:  2
page no:  3
page no:  4
Year:  2001
page no:  2
Year:  2002


In [398]:
# Display the combined table of ruling URLs.
# NOTE(review): `all_years_urls` is not defined in any cell visible above -
# presumably built elsewhere from the per-year CSV files; confirm that the
# defining cell exists and runs before this one on a fresh kernel.
all_years_urls

Unnamed: 0,court,date,date_id,id_code,object,subject,url,year
0,Cour de droit pénal,2017-12-29,29.12.2017 6B 930/2017,6B 930/2017,"Ordonnance de non-entrée en matière (dénonciation calomnieuse, induction de la justice en erreur), qualité pour recourir au Tribunal fédéral",Procédure pénale,https://www.bger.ch/ext/eurospider/live/fr/php/aza/http/index.php?lang=fr&type=highlight_simple_query&page=1&from_date=01.01.2017&to_date=31.12.2017&sort=relevance&insertion_date=&top_subcollection_aza=all&query_words=&rank=1&azaclir=aza&highlight_docid=aza%3A%2F%2F29-12-2017-6B_930-2017&number_of_ranks=7451,2017
1,Strafrechtliche Abteilung,2017-12-29,29.12.2017 6B 910/2017,6B 910/2017,"Widerhandlung gegen das Tierschutz- und Hundegesetz; Willkür, rechtliches Gehör",Straftaten,https://www.bger.ch/ext/eurospider/live/fr/php/aza/http/index.php?lang=fr&type=highlight_simple_query&page=1&from_date=01.01.2017&to_date=31.12.2017&sort=relevance&insertion_date=&top_subcollection_aza=all&query_words=&rank=2&azaclir=aza&highlight_docid=aza%3A%2F%2F29-12-2017-6B_910-2017&number_of_ranks=7451,2017
2,Cour de droit pénal,2017-12-29,29.12.2017 6B 1340/2016,6B 1340/2016,"Violation grave qualifiée des règles de la circulation routière, droit d'être entendu",Infractions,https://www.bger.ch/ext/eurospider/live/fr/php/aza/http/index.php?lang=fr&type=highlight_simple_query&page=1&from_date=01.01.2017&to_date=31.12.2017&sort=relevance&insertion_date=&top_subcollection_aza=all&query_words=&rank=3&azaclir=aza&highlight_docid=aza%3A%2F%2F29-12-2017-6B_1340-2016&number_of_ranks=7451,2017
3,II. zivilrechtliche Abteilung,2017-12-29,29.12.2017 5A 1013/2017,5A 1013/2017,Rechtsverweigerung (Rechtskraftbescheinigung betreffend Ehescheidung),Familienrecht,https://www.bger.ch/ext/eurospider/live/fr/php/aza/http/index.php?lang=fr&type=highlight_simple_query&page=1&from_date=01.01.2017&to_date=31.12.2017&sort=relevance&insertion_date=&top_subcollection_aza=all&query_words=&rank=4&azaclir=aza&highlight_docid=aza%3A%2F%2F29-12-2017-5A_1013-2017&number_of_ranks=7451,2017
4,II. zivilrechtliche Abteilung,2017-12-29,29.12.2017 5D 264/2017,5D 264/2017,Definitive Rechtsöffnung,Schuldbetreibungs- und Konkursrecht,https://www.bger.ch/ext/eurospider/live/fr/php/aza/http/index.php?lang=fr&type=highlight_simple_query&page=1&from_date=01.01.2017&to_date=31.12.2017&sort=relevance&insertion_date=&top_subcollection_aza=all&query_words=&rank=5&azaclir=aza&highlight_docid=aza%3A%2F%2F29-12-2017-5D_264-2017&number_of_ranks=7451,2017
5,II. zivilrechtliche Abteilung,2017-12-29,29.12.2017 5A 393/2017,5A 393/2017,Medizinische Nachbetreuung,Familienrecht,https://www.bger.ch/ext/eurospider/live/fr/php/aza/http/index.php?lang=fr&type=highlight_simple_query&page=1&from_date=01.01.2017&to_date=31.12.2017&sort=relevance&insertion_date=&top_subcollection_aza=all&query_words=&rank=6&azaclir=aza&highlight_docid=aza%3A%2F%2F29-12-2017-5A_393-2017&number_of_ranks=7451,2017
6,Cour de droit pénal,2017-12-29,29.12.2017 6B 941/2017,6B 941/2017,Ordonnance de non-entrée en matière (abus de pouvoir); demande de récusation; motivation du recours en matière pénale au Tribunal fédéral,Procédure pénale,https://www.bger.ch/ext/eurospider/live/fr/php/aza/http/index.php?lang=fr&type=highlight_simple_query&page=1&from_date=01.01.2017&to_date=31.12.2017&sort=relevance&insertion_date=&top_subcollection_aza=all&query_words=&rank=7&azaclir=aza&highlight_docid=aza%3A%2F%2F29-12-2017-6B_941-2017&number_of_ranks=7451,2017
7,Cour de droit pénal,2017-12-29,29.12.2017 6B 940/2017,6B 940/2017,"Ordonnance de non-entrée en matière (abus de pouvoir, falsification ou suppression d'informations); demande de récusation; qualité pour recourir au Tribunal fédéral",Procédure pénale,https://www.bger.ch/ext/eurospider/live/fr/php/aza/http/index.php?lang=fr&type=highlight_simple_query&page=1&from_date=01.01.2017&to_date=31.12.2017&sort=relevance&insertion_date=&top_subcollection_aza=all&query_words=&rank=8&azaclir=aza&highlight_docid=aza%3A%2F%2F29-12-2017-6B_940-2017&number_of_ranks=7451,2017
8,Cour de droit pénal,2017-12-29,29.12.2017 6B 673/2016,6B 673/2016,Diffamation,Infractions,https://www.bger.ch/ext/eurospider/live/fr/php/aza/http/index.php?lang=fr&type=highlight_simple_query&page=1&from_date=01.01.2017&to_date=31.12.2017&sort=relevance&insertion_date=&top_subcollection_aza=all&query_words=&rank=9&azaclir=aza&highlight_docid=aza%3A%2F%2F29-12-2017-6B_673-2016&number_of_ranks=7451,2017
9,II. zivilrechtliche Abteilung,2017-12-29,29.12.2017 5A 1028/2017,5A 1028/2017,Nichtigkeit einer Betreibung,Schuldbetreibungs- und Konkursrecht,https://www.bger.ch/ext/eurospider/live/fr/php/aza/http/index.php?lang=fr&type=highlight_simple_query&page=1&from_date=01.01.2017&to_date=31.12.2017&sort=relevance&insertion_date=&top_subcollection_aza=all&query_words=&rank=10&azaclir=aza&highlight_docid=aza%3A%2F%2F29-12-2017-5A_1028-2017&number_of_ranks=7451,2017
