# scraper

In [1]:
# imports
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup

## 1. Download basic data

### Resources

In [22]:
# Links to both the Arabic and the Latin versions of the data, so we can concatenate them later
LT_LINK = 'https://www.interieur.gov.dz/index.php/fr/component/annuaires/annuairecommunes.html'
AR_LINK = 'https://www.interieur.gov.dz/index.php/ar/component/annuaires/annuairecommunes.html'

### Functions

In [3]:
def get_basic_data(link, timeout=30):
    """Download the basic Commune, Daira, and Wilaya rows of a paginated listing.

    Starting at ``link``, every page is fetched and parsed, each row of the
    ``annuairecommuneList`` table body is cleaned with ``strip_data``, and the
    "next page" anchor is followed until a page no longer offers one.

    Args:
        link: URL of the first page of the listing (Latin or Arabic version).
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        A list with one entry per table row, in page order; each entry is
        whatever ``strip_data`` returns for that row.

    Raises:
        requests.RequestException: If a page cannot be fetched (network
            failure, timeout, or HTTP error status).
        ValueError: If a fetched page lacks the expected table body.
    """
    data = []

    while True:
        # Fail fast on a stalled connection or an HTTP error instead of
        # parsing an error page and crashing later on a missing element.
        response = requests.get(link, timeout=timeout)
        response.raise_for_status()
        page = BeautifulSoup(response.text, 'html.parser')

        # Collect every row of the listing table, stripped and cleaned.
        table = page.find('table', {'id': 'annuairecommuneList'})
        body = table.find('tbody') if table is not None else None
        if body is None:
            raise ValueError(
                'No annuairecommuneList table body found at {}'.format(link)
            )
        data.extend(strip_data(row) for row in body.find_all('tr'))

        # The "next" pagination item carries an anchor only while another
        # page exists; guard each lookup so a missing item ends the loop
        # cleanly rather than raising AttributeError on None.
        next_item = page.find('li', {'class': 'pagination-next'})
        next_anchor = next_item.find('a') if next_item is not None else None
        href = next_anchor.get('href') if next_anchor is not None else None
        if not href:
            break

        # Resolve against the current page so both absolute and relative
        # hrefs produce a valid URL.
        link = urljoin(link, href)

    return data


In [4]:
def strip_data(row):
    """Return the cleaned cell texts of one table row.

    The text of every ``td`` cell in ``row`` is stripped of surrounding
    whitespace, then two columns are discarded: the table id and the
    address (which the original author notes is incorrect).
    """
    cells = [cell.text.strip() for cell in row.find_all('td')]

    # Drop the table id, which is the first column.
    del cells[0]
    # Drop the address; it sits at index 1 now that the id is gone.
    del cells[1]

    return cells

In [5]:
# Scrape the basic Commune, Daira, and Wilaya rows once per language:
# the Latin listing first, then the Arabic one. Each call walks every
# page of its listing over the network, so this cell can take a while.
basic_lt_data = get_basic_data(LT_LINK)
basic_ar_data = get_basic_data(AR_LINK)

## 2. Download phone codes

### Resources

In [23]:
# French Wikipedia page listing the telephone codes of the wilayas of Algeria
PHONE_LINK = 'https://fr.wikipedia.org/wiki/Liste_des_codes_t%C3%A9l%C3%A9phoniques_des_wilayas_d%27Alg%C3%A9rie'

In [24]:
# Fetch the phone-code page. The timeout keeps a stalled connection from
# hanging the notebook, and raise_for_status() surfaces HTTP errors here
# instead of letting an error page be parsed as if it were the data.
response = requests.get(PHONE_LINK, timeout=30)
response.raise_for_status()

# `html` holds the parsed document used by the cells below.
html = BeautifulSoup(response.text, 'html.parser')

In [29]:
# Tag-name attribute access returns the first <table> in the document,
# the same element html.find('table') would give.
table = html.table
table

<table class="wikitable sortable"><tbody><tr>
<th scope="col" style="background">Code
</th>
<th scope="col" style="background">Wilaya
</th>
<th scope="col" style="background">Indicatif Téléphonique
</th></tr>
<tr>
<td>01
</td>
<td><a href="/wiki/Wilaya_d%27Adrar" title="Wilaya d'Adrar">Wilaya d'Adrar</a>
</td>
<td style="text-align:right">+213 (0)49
</td></tr>
<tr>
<td>02
</td>
<td><a href="/wiki/Wilaya_de_Chlef" title="Wilaya de Chlef">Wilaya de Chlef</a>
</td>
<td style="text-align:right">+213 (0)27
</td></tr>
<tr>
<td>03
</td>
<td><a href="/wiki/Wilaya_de_Laghouat" title="Wilaya de Laghouat">Wilaya de Laghouat</a>
</td>
<td style="text-align:right">+213 (0)29
</td></tr>
<tr>
<td>04
</td>
<td><a href="/wiki/Wilaya_d%27Oum_El_Bouaghi" title="Wilaya d'Oum El Bouaghi">Wilaya d'Oum El Bouaghi</a>
</td>
<td style="text-align:right">+213 (0)32
</td></tr>
<tr>
<td>05
</td>
<td><a href="/wiki/Wilaya_de_Batna" title="Wilaya de Batna">Wilaya de Batna</a>
</td>
<td style="text-align:right">+213