## Short cron version


In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import numpy as np
import re
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_colwidth', -1)
import datetime
import time
from random import randint
import os
import glob
import smtplib
from calendar import monthrange

### Functions

In [2]:
request_headers = {
    "method": "GET",
    "user-agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36"
}

In [3]:
# Delete \n in strings

def clean_string(string):
    string = re.sub("\n ","", string)
    string = re.sub("\n","", string)
    return string

In [4]:
# Extract date format from date + id string

def get_date(string):
    match = re.search('\d\d\.\d\d\.\d{4}', string)
    #date = datetime.datetime.strptime(match.group(), '%d.%m.%Y').date()
    date = pd.to_datetime(match.group(), format = "%d.%m.%Y")

    return date

In [5]:
# Create a dataframe with all url, their date, ID and court information (while avoiding a bunch of scraping errors)


def get_url(text):

    rulings = []
    
    # Si on trouve des urls
    if len(text.findAll('li')) > 0:

        # On passe a travers et on les enregistre
        for li in text.findAll('li'):
            ruling = {}

            #Get date & ID code

            if li.find('span', {'class': 'rank_title'}) is None:
                print("Date/ID code error found")
                ruling['date'] = np.nan
                ruling['date_id'] = np.nan
                ruling['date'] = np.nan
                ruling['id_code'] = np.nan

            else:
                rank_title = li.find('span', {'class': 'rank_title'})
                # Get the string containing date+ID
                ruling['date_id'] = clean_string(rank_title.text)
                # Get date alone
                ruling['date'] = get_date(ruling['date_id'])
                ruling['year'] = pd.to_datetime(ruling['date']).year


                if ruling['date_id'].find('\d\d\.\d\d\.\d{4}') is None:
                    print("Date/ID code format error found in: ", ruling['date_id'])
                    ruling['id_code'] = np.nan
                else:
                    #Get ID code alone
                    ruling['id_code'] = re.sub('\d\d\.\d\d\.\d{4} ', '', ruling['date_id'])

            # Get URL

            if li.find('a').get('href') is None:
                print("URL error found in:", ruling['date_id'])
                ruling['url'] = np.nan

            else: 
                ruling['url'] = li.find('a').get('href')


            # Get court, subject and object information

            if li.find('div', {'class': 'rank_data'}) is None:
                print("Court/Subject/Object error found")
                ruling['court'] = np.nan
                ruling['subject'] = np.nan
                ruling['object'] = np.nan

            else: 

                rank_data = li.find('div', {'class': 'rank_data'})

                if rank_data.find('div', {'class': 'court small normal'}) is None:
                    ruling['court'] = np.nan 
                else:
                    ruling['court'] = clean_string(rank_data.find('div', {'class': 'court small normal'}).text)

                if rank_data.find('div', {'class': 'subject small normal'}) is None:
                    ruling['subject'] = np.nan
                else:
                    ruling['subject'] = clean_string(rank_data.find('div', {'class': 'subject small normal'}).text)

                if rank_data.find('div', {'class': 'object small normal'}) is None:
                        ruling['object'] = np.nan
                else:
                    ruling['object'] = clean_string(rank_data.find('div', {'class': 'object small normal'}).text)


            rulings.append(ruling)
            df_rulings = pd.DataFrame(rulings)

    # Si on ne trouve pas d'urls
    else:
        columns = ['court', 'date', 'date_id', 'id_code', 'object', 'subject', 'url', 'year']
        
        df_rulings = pd.DataFrame(columns=columns)


    return df_rulings

### Save evolution of rulings over time

In [32]:

# Find number of result pages today

search_url = "https://www.bger.ch/ext/eurospider/live/fr/php/aza/http/index.php?lang=fr&type=simple_query&query_words=&lang=fr&top_subcollection_aza=all&from_date=&to_date=&x=29&y=14"
html = requests.get(search_url, headers=request_headers)

connection_attempts = 0
while(True):
    if html.status_code == 200:
        break
    else:
        connection_attempts +=1

    if connection_attempts > 10:
        print("CONNECTION ERROR")
        break
soup = BeautifulSoup(html.text, "html5lib")
page_header = soup.find('div', {'class': 'ranklist_header center'})

total_pages = re.search('\d{6}', page_header.text).group(0)
print(total_pages)


117989


In [None]:
# Initiate log file with scrape date and number of rulings on site 
# ! Run once then comment this cell ! 

#page_increase = pd.DataFrame([{
#    'scrape_date': datetime.datetime.today().strftime('%Y-%m-%d-%H-%M'), 
#    'page_total': total_pages,
#    'id': scrape_id
#}])  

#page_increase.to_csv("urls/logs/page_increase_doc.csv", index=False)

In [None]:
page_increase_doc = pd.read_csv("urls/logs/page_increase_doc.csv")
scrape_id = len(page_increase_doc)

page_increase = pd.DataFrame([{
    'scrape_date': datetime.datetime.today().strftime('%Y-%m-%d-%H-%M'), 
    'page_total': total_pages,
    'id': scrape_id
}])                

# Open and update document
update_page_increase_doc = page_increase_doc.append(page_increase)
update_page_increase_doc.to_csv('urls/logs/page_increase_doc.csv', index=False)

### 1. Scrape and save each month in a csv

In [33]:
# Create today's folder to store scraped files (run once only)

todays_folder = datetime.datetime.today().strftime('%Y-%m-%d')
os.makedirs('urls/'+ todays_folder)

In [35]:
# Loop through search results page for each year
for year in range(2000, int(datetime.datetime.today().strftime('%Y'))+1):
    
    year_int = year
    year_str = str(year)
    
    # Loop through search results page for each month
    for month in range(1,13):
        
        print("MONTH:", month, "/",year)
                
        # monthrange(2011, 2) returns weekday of the first day of the month and number of days in the month
        end_month_str = str(monthrange(year, month)[1])
        
        # Make sure all months numbers have two digits
        formatted_month_str = str('%02d' % month)
        
        # Loop through search results page
        page_number = 1
        while(True):
            try:
                print('page_number_start', str(page_number))
                url = 'https://www.bger.ch/ext/eurospider/live/fr/php/aza/http/index.php?lang=fr&type=simple_query&page='+str(page_number)+'&from_date=01.'+formatted_month_str+'.'+year_str+'&to_date='+end_month_str+'.'+formatted_month_str+'.'+year_str+'&sort=relevance&insertion_date=&top_subcollection_aza=all&query_words='
                        
                connection_attempts = 0
                # Loop until connection works
                
                while(True):
                    
                    html = requests.get(url, headers=request_headers)
                    if html.status_code == 200:
                        break
                    else:
                        connection_attempts +=1
                        if connection_attempts > 10:
                            print("CONNECTION ERROR")
                        
                            # Email notification
                            server = smtplib.SMTP('smtp.gmail.com', 587)
                            server.starttls()
                            server.login("fanny.giroud@24heures.ch", "6v7pl7S8Fr")

                            msg = "ERROR #1 while scraping: connection for request failed after 10 attempts"
                            server.sendmail("fanny.giroud@24heures.ch", "fanny.giroud@24heures.ch", msg)
                            server.quit()
                        
                            break
                
                # Recupere la div qui contient les urls
                soup = BeautifulSoup(html.text, "html5lib")
                rank_soup = soup.find('div', {'class': 'ranklist_content'})
            
                # Si on est sur la premiere page il faut creer une df
                if page_number == 1:
                    urls = get_url(rank_soup)
                    
                # Sinon, la concaténer avec les nouveaux résultats
                else:
                    urls = pd.concat([urls, get_url(rank_soup)])

                # Add year in dataframe    
                urls['year'] = year
                urls['month'] = month
           
                
                # Si on ne trouve plus rien sur la page, on a fini, on peut sauver le mois
                if len(rank_soup.findAll('li')) == 0 :
                    
                    print("Pas d'elements dans la page, on sauve le mois")
                    print("Taille du df", len(urls))
                    
                    # Parsed all the pages, getting an empty one and saving
                    urls.to_csv("urls/"+ todays_folder + '/' + datetime.datetime.today().strftime('%Y-%m-%d')+ "_scraping_" + str(month) + "_" + str(year) + ".csv", index = False)
                    
                    # On sort de la boucle du mois
                    break
                
                # Si on trouve encore quelque chose sur la page
                else:
                    # On fait une pause avant de charger la page suivante
                    time.sleep(randint(1, 4))

                    page_number += 1

    
            except Exception as e:
                print(e)
                print("-------UNEXPECTED ERROR------------")
                break
        
    

MONTH: 5 / 2018
page_number_start 1
page_number_start 2
Pas d'elements dans la page, on sauve le mois
Taille du df 20
MONTH: 6 / 2018
page_number_start 1
page_number_start 2
Pas d'elements dans la page, on sauve le mois
Taille du df 20
MONTH: 7 / 2018
page_number_start 1
Pas d'elements dans la page, on sauve le mois
Taille du df 0


### Concat scraped urls

In [37]:
filenames = glob.glob('urls/'+todays_folder + "/*.csv")
list_of_dfs = [pd.read_csv(filename) for filename in filenames]

combined_df = pd.concat(list_of_dfs)

In [39]:
combined_df.reset_index(drop=True).to_csv('urls/concats/' + todays_folder +'.csv', index = False)
print('save concat ok')

save concat ok


In [48]:
# Email notifications

server = smtplib.SMTP('smtp.gmail.com', 587)
server.starttls()
server.login("fanny.giroud@24heures.ch", "6v7pl7S8Fr")

msg = "Scrape of the day ok. URLs list is updated"
server.sendmail("fanny.giroud@24heures.ch", "fanny.giroud@24heures.ch", msg)
server.quit()

(221, b'2.0.0 closing connection d18-v6sm2322550eds.40 - gsmtp')