## Short cron version


In [8]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import numpy as np
import re
# Notebook display options: show up to 500 columns and never truncate cell text.
pd.set_option('display.max_columns', 500)
try:
    # Current pandas expects None for "no limit" on column width.
    pd.set_option('display.max_colwidth', None)
except ValueError:
    # Older pandas releases only accept an integer here; -1 meant "no limit".
    pd.set_option('display.max_colwidth', -1)
import datetime
import time
from random import randint
import os
import glob
import smtplib
from calendar import monthrange


### Functions

In [9]:
# Headers sent with every search request (passed to requests.get in the scraping loop).
request_headers = {
    "method": "GET",
    "user-agent": (
        "Mozilla/5.0 (Windows NT 6.1; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/61.0.3163.100 Safari/537.36"
    ),
}

In [10]:
# Delete \n in strings

def clean_string(string):
    """Remove every newline (and the single space directly after it) from ``string``.

    Returns the cleaned string. If the substitution fails (for example because
    ``string`` is not a str), the error is printed, a line is appended to
    today's ``errors_log.txt`` and ``None`` is returned.

    The error path reads the notebook globals ``todays_folder``, ``year_str``,
    ``formatted_month_str`` and ``page_number``.
    """
    try:
        # One pattern covers both the "\n " and the bare "\n" case.
        return re.sub(r"\n ?", "", string)
    except Exception as e:
        # Bind the exception: a bare "except:" left ``e`` undefined, so the
        # handler itself crashed with a NameError.
        print(e)
        print('\n Error during clean_string function')
        # Save log info
        with open("urls/" + todays_folder + "/errors_log.txt", "a") as log_file:
            log_file.write("Error during clean_string function   " + " year: "+ year_str + " month: "+ formatted_month_str + " last page number: " + str(page_number) + "\n")
        return None

In [4]:
# Extract date format from date + id string

def get_date(string):
    """Extract the first ``dd.mm.yyyy`` date found in ``string``.

    Returns the date as a pandas Timestamp. If no date is found or parsing
    fails, the error is printed, a line is appended to today's
    ``errors_log.txt`` and ``pd.NaT`` is returned.

    The error path reads the notebook globals ``todays_folder``, ``year_str``,
    ``formatted_month_str`` and ``page_number``.
    """
    try:
        # Raw string: "\d" and "\." are regex escapes, not Python string escapes.
        match = re.search(r'\d\d\.\d\d\.\d{4}', string)
        # ``match`` is None when there is no date; .group() then raises and
        # the handler below takes over.
        return pd.to_datetime(match.group(), format="%d.%m.%Y")
    except Exception as e:
        # Bind the exception: a bare "except:" left ``e`` undefined, so the
        # handler itself crashed with a NameError.
        print(e)
        print('\n Error during get_date function')
        # Save log info
        with open("urls/" + todays_folder + "/errors_log.txt", "a") as log_file:
            log_file.write("Error during get_date function   " + " year: "+ year_str + " month: "+ formatted_month_str + " last page number: " + str(page_number) + "\n")
        return pd.NaT

In [5]:
# Create a dataframe with every URL and its date, ID and court information (while avoiding a bunch of scraping errors)


def get_url(text):
    """Turn one page of search results into a DataFrame with one row per ruling.

    ``text`` is the parsed result-list element; every ``<li>`` inside it is
    read as one ruling. Each row holds:

    - ``date_id``: cleaned text of ``span.rank_title`` (date followed by ID code)
    - ``date`` / ``year``: the date parsed from ``date_id`` and its year
    - ``id_code``: ``date_id`` with the leading date removed
    - ``url``: ``href`` of the first ``<a>``
    - ``court`` / ``subject`` / ``object``: cleaned text of the matching
      ``div`` inside ``div.rank_data``

    Anything that is missing is stored as NaN. When the page has no ``<li>``
    an empty DataFrame carrying only the column headers is returned.

    On an unexpected error the exception is printed, a line is appended to
    today's ``errors_log.txt`` (this reads the notebook globals
    ``todays_folder``, ``year_str``, ``formatted_month_str`` and
    ``page_number``) and the same header-only DataFrame is returned.
    """
    date_pattern = r'\d\d\.\d\d\.\d{4}'
    columns = ['court', 'date', 'date_id', 'id_code', 'object', 'subject', 'url', 'year']

    try:
        items = text.find_all('li')

        # If no urls found, provide column headers only
        if len(items) == 0:
            return pd.DataFrame(columns=columns)

        rulings = []

        for li in items:
            ruling = {}

            # --- Date & ID code ---
            rank_title = li.find('span', {'class': 'rank_title'})
            # clean_string may hand back None when it fails; treat that like a missing title.
            date_id = clean_string(rank_title.text) if rank_title is not None else None

            if date_id is None:
                print("Date/ID code error found")
                ruling['date_id'] = np.nan
                ruling['date'] = np.nan
                ruling['year'] = np.nan
                ruling['id_code'] = np.nan

            elif re.search(date_pattern, date_id) is None:
                # str.find() never returns None, so a real regex search is
                # needed to detect a title without a dd.mm.yyyy date.
                print("Date/ID code format error found in: ", date_id)
                ruling['date_id'] = date_id
                ruling['date'] = np.nan
                ruling['year'] = np.nan
                ruling['id_code'] = np.nan

            else:
                ruling['date_id'] = date_id
                # Get date alone
                ruling['date'] = get_date(date_id)
                parsed_date = pd.to_datetime(ruling['date'])
                ruling['year'] = parsed_date.year if parsed_date is not None else np.nan
                # Get ID code alone (drop the date and the space after it)
                ruling['id_code'] = re.sub(date_pattern + ' ', '', date_id)

            # --- URL ---
            anchor = li.find('a')
            href = anchor.get('href') if anchor is not None else None

            if href is None:
                print("URL error found in:", ruling['date_id'])
                ruling['url'] = np.nan
            else:
                ruling['url'] = href

            # --- Court, subject and object information when existing ---
            rank_data = li.find('div', {'class': 'rank_data'})

            if rank_data is None:
                print("Court/Subject/Object error found")

            for field in ('court', 'subject', 'object'):
                node = None
                if rank_data is not None:
                    node = rank_data.find('div', {'class': field + ' small normal'})
                ruling[field] = clean_string(node.text) if node is not None else np.nan

            rulings.append(ruling)

        # Build the frame once, after every <li> has been read.
        return pd.DataFrame(rulings)

    except Exception as e:
        # Bind the exception: a bare "except:" left ``e`` undefined, so the
        # handler itself crashed with a NameError.
        print(e)
        print("\n Error during get_url function \n")
        # Save log info
        with open("urls/" + todays_folder + "/errors_log.txt", "a") as log_file:
            log_file.write("Error during get_url function   " + " year: "+ year_str + " month: "+ formatted_month_str + " last page number: " + str(page_number) + "\n")
        return pd.DataFrame(columns=columns)

### 1. Scrape and save each month in a csv

In [6]:
# Create today's folder to store scraped files

# Name of this run's output folder: today's date as YYYY-MM-DD.
todays_folder = "{:%Y-%m-%d}".format(datetime.datetime.today())

# Create urls/<today> only when it is not there yet.
if not os.path.exists("urls/" + todays_folder):
    os.makedirs("urls/" + todays_folder)

In [7]:
# Display search results by year, month and page

total_urls = 0
failed_urls = []
start_time = datetime.datetime.now()

# Retry settings for the scraping loop.
MAX_CONNECTION_RETRIES = 10        # a page is given up once more than this many attempts have failed
RETRY_PAUSE_SECONDS = 5            # pause between two attempts on the same page
REQUEST_TIMEOUT_SECONDS = 60       # without a timeout a stalled connection would block the run forever
MAX_CONSECUTIVE_FAILED_PAGES = 5   # stop paging through a month when the site keeps failing


def _append_log(message):
    """Append ``message`` to today's errors_log.txt."""
    with open("urls/" + todays_folder + "/errors_log.txt", "a") as log_file:
        log_file.write(message)


def _fetch_rank_soup(url, position):
    """Download ``url`` and return its ``div.ranklist_content`` element.

    ``position`` is the "<year> month: <mm> page number: <n>" text used in the
    log lines. Every failed attempt is logged; after more than
    MAX_CONNECTION_RETRIES failed attempts None is returned.
    """
    attempts = 0

    while True:
        try:
            html = requests.get(url, headers=request_headers, timeout=REQUEST_TIMEOUT_SECONDS)
            soup = BeautifulSoup(html.text, "html5lib")
            rank_soup = soup.find('div', {'class': 'ranklist_content'})
        # If url can't be reached at all. "Exception" (not a bare except) so
        # that Ctrl-C still interrupts the run.
        except Exception:
            attempts += 1
            _append_log("! Connection attempt year: " + position + " attempt: " + str(attempts) + "\n")
        else:
            # If html is not empty, we are done
            if rank_soup is not None:
                return rank_soup
            # Div is not found; try connecting again
            attempts += 1
            _append_log("! Unable to find .ranklist_content: " + position + " attempt: " + str(attempts) + "\n")

        # After too many failures, give up on this page
        if attempts > MAX_CONNECTION_RETRIES:
            print("CONNECTION ERROR")
            _append_log("!! CONNECTION ERROR" + " year: " + position + " connection attempt no: " + str(attempts) + "\n")
            print('break connection attempts loop')
            return None

        # Pause before trying again
        time.sleep(RETRY_PAUSE_SECONDS)


today = datetime.datetime.today()

for year in range(2000, today.year + 1):

    year_int = year
    year_str = str(year)

    # Months of the current year that have not started yet are not scraped.
    last_month = today.month if year == today.year else 12

    for month in range(1, last_month + 1):

        print("MONTH:", month, "/", year)

        # Find how many days are in the month:
        # monthrange(2011, 2) returns weekday of the first day of the month (0) and number of days in the month (1)
        end_month_str = str(monthrange(year, month)[1])

        # Make sure all months numbers have two digits
        formatted_month_str = str('%02d' % month)

        # Frames collected for THIS month only. Starting from an empty list
        # each month means a failed first page can never pull in the rows of
        # the previous month.
        month_frames = []
        consecutive_failed_pages = 0

        # Loop through pages
        page_number = 1
        while True:
            try:
                print('page_number_start', str(page_number))
                url = (
                    'https://www.bger.ch/ext/eurospider/live/fr/php/aza/http/index.php?lang=fr&type=simple_query&page='
                    + str(page_number)
                    + '&from_date=01.' + formatted_month_str + '.' + year_str
                    + '&to_date=' + end_month_str + '.' + formatted_month_str + '.' + year_str
                    + '&sort=relevance&insertion_date=&top_subcollection_aza=all&query_words='
                )
                position = year_str + " month: " + formatted_month_str + " page number: " + str(page_number)

                rank_soup = _fetch_rank_soup(url, position)

                if rank_soup is None:
                    # The page could not be loaded: remember it and move on.
                    failed_urls.append(url)
                    _append_log('! Skip page: ' + str(page_number) + " year: "+ year_str + " month: "+ formatted_month_str + "\n")

                    # The loop only ends on a successfully loaded empty page,
                    # so without this cap an unreachable site would make it
                    # page forever.
                    consecutive_failed_pages += 1
                    if consecutive_failed_pages >= MAX_CONSECUTIVE_FAILED_PAGES:
                        print("TOO MANY FAILED PAGES")
                        _append_log("!! Too many consecutive failed pages, month abandoned" + " year: "+ year_str + " month: "+ formatted_month_str + " last page number: " + str(page_number) + "\n")
                        break

                    page_number += 1
                    continue

                consecutive_failed_pages = 0

                #********************************************************************************************
                # Collect this page's rows
                page_urls = get_url(rank_soup)
                if page_urls is not None:
                    month_frames.append(page_urls)

                # When no results is found on page, it's the page after the last page, so we can save the df
                if len(rank_soup.find_all('li')) == 0:
                    print("Pas d'elements dans la page, on sauve le mois")

                    # Concatenate once per month instead of once per page.
                    # Header-only frames are left out of the concat; one is
                    # used on its own when the month has no rows at all so the
                    # csv still gets its column headers.
                    non_empty_frames = [frame for frame in month_frames if len(frame) > 0]
                    if non_empty_frames:
                        urls = pd.concat(non_empty_frames)
                    elif month_frames:
                        urls = month_frames[-1]
                    else:
                        urls = pd.DataFrame()

                    # Add these variables (where we are in the loops)
                    urls['year'] = year
                    urls['month'] = month

                    print("Taille du df", len(urls))

                    urls.to_csv("urls/"+ todays_folder + '/' + datetime.datetime.today().strftime('%Y-%m-%d')+ "_scraping_" + str(month) + "_" + str(year) + ".csv", index = False, encoding = 'utf-8')

                    # increment total counter
                    total_urls = total_urls + len(urls)

                    # Save this month connection ok log info
                    _append_log("Connection ok   " + " year: "+ year_str + " month: "+ formatted_month_str + " last page number: " + str(page_number) + " month urls: " + str(len(urls)) + " total urls: " + str(total_urls) + "\n")

                    # On to the next month
                    break

                # If there is stuff on the displayed page:
                # pause before loading next result page
                time.sleep(randint(1, 4))
                page_number += 1

            # If the "try" section above raises an error (url doesn't exist, or file doesn't exist, etc.):
            except Exception as e:
                print(e)
                print("-------UNEXPECTED ERROR------------")

                _append_log(str(e) + "\n UNEXPECTED ERROR" + " year: "+ year_str + " month: "+ formatted_month_str + " page number: " + str(page_number) + "\n")

                break

    # Elapsed time since the start of the whole run (not of this year alone),
    # written as a number of seconds to match the wording of the log line.
    duration = datetime.datetime.now() - start_time
    _append_log("Year " + year_str + " finished in " + str(duration.total_seconds()) + " seconds. \n")


MONTH: 1 / 2000
page_number_start 1
page_number_start 2
page_number_start 3
page_number_start 4
page_number_start 5
page_number_start 6
page_number_start 7
page_number_start 8
page_number_start 9
page_number_start 10
page_number_start 11
page_number_start 12
page_number_start 13
page_number_start 14
page_number_start 15
page_number_start 16
page_number_start 17
page_number_start 18
page_number_start 19
page_number_start 20
page_number_start 21
page_number_start 22
page_number_start 23
page_number_start 24
page_number_start 25
page_number_start 26
page_number_start 27
page_number_start 28
page_number_start 29
page_number_start 30
page_number_start 31
page_number_start 32
page_number_start 33
page_number_start 34
Pas d'elements dans la page, on sauve le mois
Taille du df 324
MONTH: 2 / 2000
page_number_start 1
page_number_start 2
page_number_start 3
page_number_start 4
page_number_start 5
page_number_start 6
page_number_start 7
page_number_start 8
page_number_start 9
page_number_start 10

page_number_start 27
page_number_start 28
page_number_start 29
page_number_start 30
page_number_start 31
page_number_start 32
page_number_start 33
page_number_start 34
page_number_start 35
page_number_start 36
page_number_start 37
Pas d'elements dans la page, on sauve le mois
Taille du df 354
MONTH: 12 / 2000
page_number_start 1
page_number_start 2
page_number_start 3
page_number_start 4
page_number_start 5
page_number_start 6
page_number_start 7
page_number_start 8
page_number_start 9
page_number_start 10
page_number_start 11
page_number_start 12
page_number_start 13
page_number_start 14
page_number_start 15
page_number_start 16
page_number_start 17
page_number_start 18
page_number_start 19
page_number_start 20
page_number_start 21
page_number_start 22
page_number_start 23
page_number_start 24
page_number_start 25
page_number_start 26
page_number_start 27
page_number_start 28
page_number_start 29
page_number_start 30
page_number_start 31
page_number_start 32
page_number_start 33
Pas d

page_number_start 7
page_number_start 8
page_number_start 9
page_number_start 10
page_number_start 11
page_number_start 12
page_number_start 13
page_number_start 14
page_number_start 15
page_number_start 16
page_number_start 17
page_number_start 18
page_number_start 19
page_number_start 20
page_number_start 21
page_number_start 22
page_number_start 23
page_number_start 24
page_number_start 25
page_number_start 26
page_number_start 27
page_number_start 28
page_number_start 29
Pas d'elements dans la page, on sauve le mois
Taille du df 278
MONTH: 10 / 2001
page_number_start 1
page_number_start 2
page_number_start 3
page_number_start 4
page_number_start 5
page_number_start 6
page_number_start 7
page_number_start 8
page_number_start 9
page_number_start 10
page_number_start 11
page_number_start 12
page_number_start 13
page_number_start 14
page_number_start 15
page_number_start 16
page_number_start 17
page_number_start 18
page_number_start 19
page_number_start 20
page_number_start 21
page_num

page_number_start 12
page_number_start 13
page_number_start 14
page_number_start 15
page_number_start 16
page_number_start 17
page_number_start 18
page_number_start 19
page_number_start 20
page_number_start 21
page_number_start 22
page_number_start 23
page_number_start 24
page_number_start 25
page_number_start 26
page_number_start 27
page_number_start 28
page_number_start 29
page_number_start 30
page_number_start 31
page_number_start 32
page_number_start 33
page_number_start 34
page_number_start 35
page_number_start 36
page_number_start 37
page_number_start 38
page_number_start 39
Pas d'elements dans la page, on sauve le mois
Taille du df 375
MONTH: 7 / 2002
page_number_start 1
page_number_start 2
page_number_start 3
page_number_start 4
page_number_start 5
page_number_start 6
page_number_start 7
page_number_start 8
page_number_start 9
page_number_start 10
page_number_start 11
page_number_start 12
page_number_start 13
page_number_start 14
page_number_start 15
page_number_start 16
page_n

page_number_start 28
page_number_start 29
page_number_start 30
page_number_start 31
page_number_start 32
page_number_start 33
page_number_start 34
page_number_start 35
page_number_start 36
page_number_start 37
page_number_start 38
page_number_start 39
page_number_start 40
page_number_start 41
page_number_start 42
page_number_start 43
page_number_start 44
page_number_start 45
page_number_start 46
page_number_start 47
Pas d'elements dans la page, on sauve le mois
Taille du df 455
MONTH: 4 / 2003
page_number_start 1
page_number_start 2
page_number_start 3
page_number_start 4
page_number_start 5
page_number_start 6
page_number_start 7
page_number_start 8
page_number_start 9
page_number_start 10
page_number_start 11
page_number_start 12
page_number_start 13
page_number_start 14
page_number_start 15
page_number_start 16
page_number_start 17
page_number_start 18
page_number_start 19
page_number_start 20
page_number_start 21
page_number_start 22
page_number_start 23
page_number_start 24
page_n

page_number_start 38
page_number_start 39
page_number_start 40
page_number_start 41
Pas d'elements dans la page, on sauve le mois
Taille du df 393
MONTH: 12 / 2003
page_number_start 1
page_number_start 2
page_number_start 3
page_number_start 4
page_number_start 5
page_number_start 6
page_number_start 7
page_number_start 8
page_number_start 9
page_number_start 10
page_number_start 11
page_number_start 12
page_number_start 13
page_number_start 14
page_number_start 15
page_number_start 16
page_number_start 17
page_number_start 18
page_number_start 19
page_number_start 20
page_number_start 21
page_number_start 22
page_number_start 23
page_number_start 24
page_number_start 25
page_number_start 26
page_number_start 27
page_number_start 28
page_number_start 29
page_number_start 30
page_number_start 31
page_number_start 32
page_number_start 33
page_number_start 34
page_number_start 35
page_number_start 36
page_number_start 37
page_number_start 38
page_number_start 39
page_number_start 40
page_

page_number_start 7
page_number_start 8
page_number_start 9
page_number_start 10
page_number_start 11
page_number_start 12
page_number_start 13
page_number_start 14
page_number_start 15
page_number_start 16
page_number_start 17
page_number_start 18
page_number_start 19
page_number_start 20
page_number_start 21
page_number_start 22
page_number_start 23
page_number_start 24
page_number_start 25
page_number_start 26
page_number_start 27
page_number_start 28
page_number_start 29
page_number_start 30
page_number_start 31
page_number_start 32
page_number_start 33
page_number_start 34
page_number_start 35
page_number_start 36
page_number_start 37
page_number_start 38
page_number_start 39
page_number_start 40
Pas d'elements dans la page, on sauve le mois
Taille du df 386
MONTH: 9 / 2004
page_number_start 1
page_number_start 2
page_number_start 3
page_number_start 4
page_number_start 5
page_number_start 6
page_number_start 7
page_number_start 8
page_number_start 9
page_number_start 10
page_numb

page_number_start 12
page_number_start 13
page_number_start 14
page_number_start 15
page_number_start 16
page_number_start 17
page_number_start 18
page_number_start 19
page_number_start 20
page_number_start 21
page_number_start 22
page_number_start 23
page_number_start 24
page_number_start 25
page_number_start 26
page_number_start 27
page_number_start 28
page_number_start 29
page_number_start 30
page_number_start 31
page_number_start 32
page_number_start 33
page_number_start 34
page_number_start 35
page_number_start 36
page_number_start 37
page_number_start 38
page_number_start 39
page_number_start 40
page_number_start 41
page_number_start 42
page_number_start 43
page_number_start 44
page_number_start 45
page_number_start 46
page_number_start 47
page_number_start 48
page_number_start 49
page_number_start 50
Pas d'elements dans la page, on sauve le mois
Taille du df 485
MONTH: 5 / 2005
page_number_start 1
page_number_start 2
page_number_start 3
page_number_start 4
page_number_start 5
pa

page_number_start 5
page_number_start 6
page_number_start 7
page_number_start 8
page_number_start 9
page_number_start 10
page_number_start 11
page_number_start 12
page_number_start 13
page_number_start 14
page_number_start 15
page_number_start 16
page_number_start 17
page_number_start 18
page_number_start 19
page_number_start 20
page_number_start 21
page_number_start 22
page_number_start 23
page_number_start 24
page_number_start 25
page_number_start 26
page_number_start 27
page_number_start 28
page_number_start 29
page_number_start 30
page_number_start 31
page_number_start 32
page_number_start 33
page_number_start 34
page_number_start 35
page_number_start 36
page_number_start 37
page_number_start 38
page_number_start 39
Pas d'elements dans la page, on sauve le mois
Taille du df 379
MONTH: 1 / 2006
page_number_start 1
page_number_start 2
page_number_start 3
page_number_start 4
page_number_start 5
page_number_start 6
page_number_start 7
page_number_start 8
page_number_start 9
page_number

page_number_start 38
page_number_start 39
page_number_start 40
page_number_start 41
page_number_start 42
page_number_start 43
page_number_start 44
Pas d'elements dans la page, on sauve le mois
Taille du df 423
MONTH: 8 / 2006
page_number_start 1
page_number_start 2
page_number_start 3
page_number_start 4
page_number_start 5
page_number_start 6
page_number_start 7
page_number_start 8
page_number_start 9
page_number_start 10
page_number_start 11
page_number_start 12
page_number_start 13
page_number_start 14
page_number_start 15
page_number_start 16
page_number_start 17
page_number_start 18
page_number_start 19
page_number_start 20
page_number_start 21
page_number_start 22
page_number_start 23
page_number_start 24
page_number_start 25
page_number_start 26
page_number_start 27
page_number_start 28
page_number_start 29
page_number_start 30
page_number_start 31
page_number_start 32
page_number_start 33
page_number_start 34
page_number_start 35
page_number_start 36
page_number_start 37
page_n

page_number_start 2
page_number_start 3
page_number_start 4
page_number_start 5
page_number_start 6
page_number_start 7
page_number_start 8
page_number_start 9
page_number_start 10
page_number_start 11
page_number_start 12
page_number_start 13
page_number_start 14
page_number_start 15
page_number_start 16
page_number_start 17
page_number_start 18
page_number_start 19
page_number_start 20
page_number_start 21
page_number_start 22
page_number_start 23
page_number_start 24
page_number_start 25
page_number_start 26
page_number_start 27
page_number_start 28
page_number_start 29
page_number_start 30
page_number_start 31
page_number_start 32
page_number_start 33
page_number_start 34
page_number_start 35
page_number_start 36
page_number_start 37
page_number_start 38
page_number_start 39
page_number_start 40
page_number_start 41
page_number_start 42
page_number_start 43
page_number_start 44
page_number_start 45
page_number_start 46
page_number_start 47
page_number_start 48
page_number_start 49


page_number_start 41
page_number_start 42
page_number_start 43
page_number_start 44
page_number_start 45
page_number_start 46
page_number_start 47
page_number_start 48
page_number_start 49
page_number_start 50
page_number_start 51
page_number_start 52
page_number_start 53
page_number_start 54
page_number_start 55
page_number_start 56
page_number_start 57
page_number_start 58
page_number_start 59
page_number_start 60
page_number_start 61
Pas d'elements dans la page, on sauve le mois
Taille du df 599
MONTH: 9 / 2007
page_number_start 1
page_number_start 2
page_number_start 3
page_number_start 4
page_number_start 5
page_number_start 6
page_number_start 7
page_number_start 8
page_number_start 9
page_number_start 10
page_number_start 11
page_number_start 12
page_number_start 13
page_number_start 14
page_number_start 15
page_number_start 16
page_number_start 17
page_number_start 18
page_number_start 19
page_number_start 20
page_number_start 21
page_number_start 22
page_number_start 23
page_n

page_number_start 31
page_number_start 32
page_number_start 33
page_number_start 34
page_number_start 35
page_number_start 36
page_number_start 37
page_number_start 38
page_number_start 39
page_number_start 40
page_number_start 41
page_number_start 42
page_number_start 43
page_number_start 44
page_number_start 45
page_number_start 46
page_number_start 47
page_number_start 48
page_number_start 49
page_number_start 50
page_number_start 51
page_number_start 52
page_number_start 53
page_number_start 54
page_number_start 55
page_number_start 56
page_number_start 57
page_number_start 58
page_number_start 59
page_number_start 60
page_number_start 61
page_number_start 62
page_number_start 63
page_number_start 64
Pas d'elements dans la page, on sauve le mois
Taille du df 624
MONTH: 3 / 2008
page_number_start 1
page_number_start 2
page_number_start 3
page_number_start 4
page_number_start 5
page_number_start 6
page_number_start 7
page_number_start 8
page_number_start 9
page_number_start 10
page_n

page_number_start 23
page_number_start 24
page_number_start 25
page_number_start 26
page_number_start 27
page_number_start 28
page_number_start 29
page_number_start 30
page_number_start 31
page_number_start 32
page_number_start 33
page_number_start 34
page_number_start 35
page_number_start 36
page_number_start 37
page_number_start 38
page_number_start 39
page_number_start 40
page_number_start 41
page_number_start 42
page_number_start 43
page_number_start 44
page_number_start 45
page_number_start 46
page_number_start 47
page_number_start 48
page_number_start 49
page_number_start 50
page_number_start 51
page_number_start 52
page_number_start 53
Pas d'elements dans la page, on sauve le mois
Taille du df 514
MONTH: 9 / 2008
page_number_start 1
page_number_start 2
page_number_start 3
page_number_start 4
page_number_start 5
page_number_start 6
page_number_start 7
page_number_start 8
page_number_start 9
page_number_start 10
page_number_start 11
page_number_start 12
page_number_start 13
page_n

page_number_start 35
page_number_start 36
page_number_start 37
page_number_start 38
page_number_start 39
page_number_start 40
page_number_start 41
page_number_start 42
page_number_start 43
page_number_start 44
page_number_start 45
page_number_start 46
page_number_start 47
page_number_start 48
page_number_start 49
page_number_start 50
page_number_start 51
page_number_start 52
page_number_start 53
page_number_start 54
page_number_start 55
page_number_start 56
page_number_start 57
page_number_start 58
page_number_start 59
Pas d'elements dans la page, on sauve le mois
Taille du df 580
MONTH: 3 / 2009
page_number_start 1
page_number_start 2
page_number_start 3
page_number_start 4
page_number_start 5
page_number_start 6
page_number_start 7
page_number_start 8
page_number_start 9
page_number_start 10
page_number_start 11
page_number_start 12
page_number_start 13
page_number_start 14
page_number_start 15
page_number_start 16
page_number_start 17
page_number_start 18
page_number_start 19
page_n

page_number_start 29
page_number_start 30
page_number_start 31
page_number_start 32
page_number_start 33
page_number_start 34
page_number_start 35
page_number_start 36
page_number_start 37
page_number_start 38
page_number_start 39
page_number_start 40
page_number_start 41
page_number_start 42
page_number_start 43
page_number_start 44
page_number_start 45
page_number_start 46
page_number_start 47
page_number_start 48
page_number_start 49
page_number_start 50
Pas d'elements dans la page, on sauve le mois
Taille du df 483
MONTH: 9 / 2009
page_number_start 1
page_number_start 2
page_number_start 3
page_number_start 4
page_number_start 5
page_number_start 6
page_number_start 7
page_number_start 8
page_number_start 9
page_number_start 10
page_number_start 11
page_number_start 12
page_number_start 13
page_number_start 14
page_number_start 15
page_number_start 16
page_number_start 17
page_number_start 18
page_number_start 19
page_number_start 20
page_number_start 21
page_number_start 22
page_n

page_number_start 54
page_number_start 55
page_number_start 56
page_number_start 57
page_number_start 58
page_number_start 59
page_number_start 60
page_number_start 61
page_number_start 62
page_number_start 63
page_number_start 64
page_number_start 65
Pas d'elements dans la page, on sauve le mois
Taille du df 632
MONTH: 3 / 2010
page_number_start 1
page_number_start 2
page_number_start 3
page_number_start 4
page_number_start 5
page_number_start 6
page_number_start 7
page_number_start 8
page_number_start 9
page_number_start 10
page_number_start 11
page_number_start 12
page_number_start 13
page_number_start 14
page_number_start 15
page_number_start 16
page_number_start 17
page_number_start 18
page_number_start 19
page_number_start 20
page_number_start 21
page_number_start 22
page_number_start 23
page_number_start 24
page_number_start 25
page_number_start 26
page_number_start 27
page_number_start 28
page_number_start 29
page_number_start 30
page_number_start 31
page_number_start 32
page_n

page_number_start 36
page_number_start 37
page_number_start 38
page_number_start 39
page_number_start 40
page_number_start 41
page_number_start 42
page_number_start 43
page_number_start 44
page_number_start 45
page_number_start 46
page_number_start 47
page_number_start 48
page_number_start 49
page_number_start 50
page_number_start 51
page_number_start 52
page_number_start 53
page_number_start 54
page_number_start 55
Pas d'elements dans la page, on sauve le mois
Taille du df 538
MONTH: 9 / 2010
page_number_start 1
page_number_start 2
page_number_start 3
page_number_start 4
page_number_start 5
page_number_start 6
page_number_start 7
page_number_start 8
page_number_start 9
page_number_start 10
page_number_start 11
page_number_start 12
page_number_start 13
page_number_start 14
page_number_start 15
page_number_start 16
page_number_start 17
page_number_start 18
page_number_start 19
page_number_start 20
page_number_start 21
page_number_start 22
page_number_start 23
page_number_start 24
page_n

page_number_start 49
page_number_start 50
page_number_start 51
page_number_start 52
page_number_start 53
page_number_start 54
page_number_start 55
page_number_start 56
page_number_start 57
Pas d'elements dans la page, on sauve le mois
Taille du df 555
MONTH: 3 / 2011
page_number_start 1
page_number_start 2
page_number_start 3
page_number_start 4
page_number_start 5
page_number_start 6
page_number_start 7
page_number_start 8
page_number_start 9
page_number_start 10
page_number_start 11
page_number_start 12
page_number_start 13
page_number_start 14
page_number_start 15
page_number_start 16
page_number_start 17
page_number_start 18
page_number_start 19
page_number_start 20
page_number_start 21
page_number_start 22
page_number_start 23
page_number_start 24
page_number_start 25
page_number_start 26
page_number_start 27
page_number_start 28
page_number_start 29
page_number_start 30
page_number_start 31
page_number_start 32
page_number_start 33
page_number_start 34
page_number_start 35
page_n

page_number_start 49
page_number_start 50
page_number_start 51
Pas d'elements dans la page, on sauve le mois
Taille du df 499
MONTH: 9 / 2011
page_number_start 1
page_number_start 2
page_number_start 3
page_number_start 4
page_number_start 5
page_number_start 6
page_number_start 7
page_number_start 8
page_number_start 9
page_number_start 10
page_number_start 11
page_number_start 12
page_number_start 13
page_number_start 14
page_number_start 15
page_number_start 16
page_number_start 17
page_number_start 18
page_number_start 19
page_number_start 20
page_number_start 21
page_number_start 22
page_number_start 23
page_number_start 24
page_number_start 25
page_number_start 26
page_number_start 27
page_number_start 28
page_number_start 29
page_number_start 30
page_number_start 31
page_number_start 32
page_number_start 33
page_number_start 34
page_number_start 35
page_number_start 36
page_number_start 37
page_number_start 38
page_number_start 39
page_number_start 40
page_number_start 41
page_n

page_number_start 57
page_number_start 58
page_number_start 59
page_number_start 60
page_number_start 61
page_number_start 62
page_number_start 63
page_number_start 64
page_number_start 65
page_number_start 66
page_number_start 67
Pas d'elements dans la page, on sauve le mois
Taille du df 659
MONTH: 3 / 2012
page_number_start 1
page_number_start 2
page_number_start 3
page_number_start 4
page_number_start 5
page_number_start 6
page_number_start 7
page_number_start 8
page_number_start 9
page_number_start 10
page_number_start 11
page_number_start 12
page_number_start 13
page_number_start 14
page_number_start 15
page_number_start 16
page_number_start 17
page_number_start 18
page_number_start 19
page_number_start 20
page_number_start 21
page_number_start 22
page_number_start 23
page_number_start 24
page_number_start 25
page_number_start 26
page_number_start 27
page_number_start 28
page_number_start 29
page_number_start 30
page_number_start 31
page_number_start 32
page_number_start 33
page_n

KeyboardInterrupt: 

### Concat scraped urls, save, notify

In [None]:
# Export the list of failed urls first, so it is kept even when combining
# the monthly files fails below.
os.makedirs('urls/failed_urls', exist_ok=True)
failed_df = pd.DataFrame(failed_urls)
failed_df.to_csv('urls/failed_urls/' + todays_folder + '.csv')

# Read every monthly csv written today; sorted so the row order of the
# combined frame does not depend on the order glob happens to return.
filenames = sorted(glob.glob('urls/'+todays_folder + "/*.csv"))
list_of_dfs = [pd.read_csv(filename) for filename in filenames]

combined_df = pd.concat(list_of_dfs)

In [None]:
# Write all of today's rulings into one csv; create the target folder when it is missing.
os.makedirs('urls/concats', exist_ok=True)
combined_df.reset_index(drop=True).to_csv('urls/concats/' + todays_folder + '.csv', index = False)
print('save concat ok')

In [None]:
# Time elapsed since the scraping cell started; kept as a timedelta in ``duration``.
duration = datetime.datetime.now() - start_time
with open("urls/" + todays_folder + "/errors_log.txt", "a") as log_file:
    # total_seconds() so the logged value really is a number of seconds.
    log_file.write("Total duration in seconds: " + str(duration.total_seconds()) + "\n")

In [None]:
# Email notifications

# The account and its password are read from the environment so that no
# credential is stored in the notebook:
#   SCRAPER_SMTP_USER      sender address / SMTP login (required)
#   SCRAPER_SMTP_PASSWORD  SMTP password (required)
#   SCRAPER_NOTIFY_TO      recipient (optional, defaults to the sender)
# NOTE(review): the password that used to be written here in clear text
# should be considered leaked and be changed.
smtp_user = os.environ["SCRAPER_SMTP_USER"]
smtp_password = os.environ["SCRAPER_SMTP_PASSWORD"]
notify_to = os.environ.get("SCRAPER_NOTIFY_TO", smtp_user)

# ``duration`` is a timedelta; total_seconds() matches the "seconds" wording.
msg = "Scrape of the day ok. URLs list is updated in " + str(duration.total_seconds()) + " seconds."

# The with-block closes the connection even when login or sending fails.
with smtplib.SMTP('smtp.gmail.com', 587) as server:
    server.starttls()
    server.login(smtp_user, smtp_password)
    server.sendmail(smtp_user, notify_to, msg)