# scraper

In [1]:
# imports
import requests
import pandas as pd
from bs4 import BeautifulSoup

## 1. Download basic data

### Resources

In [22]:
# Links to both the Arabic and the Latin versions of the data, so we can concatenate them later
LT_LINK = 'https://www.interieur.gov.dz/index.php/fr/component/annuaires/annuairecommunes.html'
AR_LINK = 'https://www.interieur.gov.dz/index.php/ar/component/annuaires/annuairecommunes.html'

### Functions

In [3]:
def get_basic_data(link):
    """Download the basic Commune, Daira, and Wilaya data,
    for both Latin and Arabic.

    Starting at ``link``, every page of the paginated directory is fetched,
    each row of the ``annuairecommuneList`` table is cleaned with
    ``strip_data``, and the "next page" link is followed until none is left.

    Parameters
    ----------
    link : str
        URL of the first page to download.

    Returns
    -------
    list
        One cleaned row (as returned by ``strip_data``) per table row,
        in the order the pages were visited.

    Raises
    ------
    requests.HTTPError
        If a page answers with an HTTP error status.
    requests.RequestException
        On connection failures or when the timeout expires.
    """

    data = []

    while True:
        # Download the current page. The timeout (in seconds) keeps a stalled
        # connection from hanging the notebook, and raise_for_status() stops
        # us from parsing an error page as if it were data.
        response = requests.get(link, timeout=30)
        response.raise_for_status()
        page = BeautifulSoup(response.text, 'html.parser')

        # Get all the rows data stripped and cleaned
        table = page.find('table', {'id': 'annuairecommuneList'}).find('tbody').find_all('tr')
        for row in table:
            data.append(strip_data(row))

        # Check if the next page exists, or we've reached the end.
        # Either the whole <li> or just its <a> may be absent on the last
        # page, so both lookups are guarded.
        next_item = page.find('li', {'class': 'pagination-next'})
        suivant = next_item.find('a') if next_item is not None else None
        if suivant is None:
            break

        link = 'https://www.interieur.gov.dz' + suivant['href']

    return data


In [4]:
def strip_data(row):
    """Extract the cleaned cell texts of one table row.

    Every ``td`` of ``row`` is reduced to its whitespace-stripped text, then
    two unwanted cells are discarded: the table id and the address
    (the address is dropped because it is incorrect).
    """

    cells = [cell.text.strip() for cell in row.find_all('td')]

    # Position 0 is the table id. Once it is gone, the address sits at
    # position 1 of what remains, so it is removed second.
    for position in (0, 1):
        cells.pop(position)

    return cells

In [5]:
# The basic Commune, Daira, and Wilaya data,
# for both Latin and Arabic,
# with the following row format: baladiya, daira, wilaya.
# Each call below downloads every page of the directory, so this cell
# needs network access.
basic_lt_data = get_basic_data(LT_LINK)
basic_ar_data = get_basic_data(AR_LINK)

## 2. Download phone codes

### Resources

In [79]:
# Resource links for this section (postal codes and wilaya phone codes).
# Kept as a bare string so the cell simply echoes them.
'''
https://www.algerie-poste.net/code-postal/
https://fr.wikipedia.org/wiki/Liste_des_codes_t%C3%A9l%C3%A9phoniques_des_wilayas_d%27Alg%C3%A9rie
'''

'\nhttps://www.algerie-poste.net/code-postal/\nhttps://fr.wikipedia.org/wiki/Liste_des_codes_t%C3%A9l%C3%A9phoniques_des_wilayas_d%27Alg%C3%A9rie\n'

In [80]:
# This part had to be entered manually, since I couldn't see another way.
# Wilaya-level data (presumably what "metro" in the name refers to),
# with the following format: wilaya-code, wilaya, wilaya-phone-code.
# NOTE: ALGER carries two phone codes, so its row has four elements
# instead of three.
metro_data = [
    ["01", "ADRAR", "49"],
    ["02", "CHLEF", "27"],
    ["03", "LAGHOUAT", "29"],
    ["04", "OUM EL BOUAGHI", "32"],
    ["05", "BATNA", "33"],
    ["06", "BEJAIA", "34"],
    ["07", "BISKRA", "33"],
    ["08", "BECHAR", "49"],
    ["09", "BLIDA", "25"],
    ["10", "BOUIRA", "26"],
    ["11", "TAMANRASSET", "29"],
    ["12", "TEBESSA", "37"],
    ["13", "TLEMCEN", "43"],
    ["14", "TIARET", "46"],
    ["15", "TIZI-OUZOU", "26"],
    ["16", "ALGER", "21", "23"],
    ["17", "DJELFA", "27"],
    ["18", "JIJEL", "34"],
    ["19", "SETIF", "36"],
    ["20", "SAIDA", "48"],
    ["21", "SKIKDA", "38"],
    ["22", "SIDI BEL-ABBES", "48"],
    ["23", "ANNABA", "38"],
    ["24", "GUELMA", "37"],
    ["25", "CONSTANTINE", "31"],
    ["26", "MEDEA", "25"],
    ["27", "MOSTAGANEM", "45"],
    ["28", "M'SILA", "35"],
    ["29", "MASCARA", "45"],
    ["30", "OUARGLA", "29"],
    ["31", "ORAN", "41"],
    ["32", "EL BAYADH", "49"],
    ["33", "ILLIZI", "29"],
    ["34", "B.B.ARRERIDJ", "35"],
    ["35", "BOUMERDES", "24"],
    ["36", "EL TARF", "38"],
    ["37", "TINDOUF", "49"],
    ["38", "TISSEMSILT", "46"],
    ["39", "EL-OUED", "32"],
    ["40", "KHENCHELA", "32"],
    ["41", "SOUK AHRAS", "37"],
    ["42", "TIPAZA", "24"],
    ["43", "MILA", "31"],
    ["44", "AIN-DEFLA", "27"],
    ["45", "NAAMA", "49"],
    ["46", "AIN TEMOUCHENT", "43"],
    ["47", "GHARDAIA", "29"],
    ["48", "RELIZANE", "46"],
]