# Libraries

In [1]:
import pandas as pd
import bs4
from urllib import request

# Web scraping Wikipedia dataset: Évolution de la population de la France métropolitaine (estimations avant 1950)

In [2]:
# Download the French Wikipedia article on the demographic history of France
# and parse it with Beautiful Soup (lxml parser).
url_wiki = "https://fr.wikipedia.org/wiki/Histoire_d%C3%A9mographique_de_la_France"
# The context manager closes the HTTP response as soon as the body is read,
# instead of leaving the connection open until garbage collection.
with request.urlopen(url_wiki) as response:
    request_text = response.read()
page = bs4.BeautifulSoup(request_text, "lxml")
print(page.find("title"))
# find_all is the current Beautiful Soup method name (findAll is a legacy alias).
print("There are", len(page.find_all("table")), "elements which are <table>")

<title>Histoire démographique de la France — Wikipédia</title>
There are 17 elements which are <table>


In [3]:
# Gather every <table> that carries the "wikitable" class, then print each one
# preceded by its position in the list so the tables of interest can be located.
all_tables = page.find_all("table", attrs={"class": "wikitable"})
for position, table in enumerate(all_tables):
    print("Table: {}\n".format(position), table, sep="\n")

Table: 0

<table class="wikitable">
<tbody><tr>
<th scope="row">Date
</th>
<th scope="row">Habitants
</th></tr>
<tr>
<td>15 000 av. J.-C.</td>
<td>50 000
</td></tr>
<tr>
<td>5000 av. J.-C.</td>
<td>500 000
</td></tr>
<tr>
<td>2500 av. J.-C.</td>
<td>5 500 000
</td></tr>
<tr>
<td>An 1</td>
<td>7 000 000
</td></tr>
<tr>
<td>400</td>
<td>12 000 000
</td></tr>
<tr>
<td>800</td>
<td>8 800 000
</td></tr>
<tr>
<td>850</td>
<td>6 000 000
</td></tr>
<tr>
<td>1226</td>
<td>16 000 000
</td></tr>
<tr>
<td>1300</td>
<td>15 000 000
</td></tr>
<tr>
<td>1345</td>
<td>20 200 000
</td></tr>
<tr>
<td>1350</td>
<td>15 000 000
</td></tr>
<tr>
<td>1400</td>
<td>12 000 000
</td></tr>
<tr>
<td>1457</td>
<td>11 000 000
</td></tr>
<tr>
<td>1500</td>
<td>14 000 000
</td></tr>
<tr>
<td>1550</td>
<td>15 300 000
</td></tr>
<tr>
<td>1560</td>
<td>16 200 000
</td></tr>
<tr>
<td>1580</td>
<td>16 500 000
</td></tr>
<tr>
<td>1600</td>
<td>20 000 000
</td></tr>
<tr>
<td>1620</td>
<td>21 000 000
</td></tr>
<tr>
<td>1650</

In [4]:
# Build a Year/Population DataFrame from the first three "wikitable" tables.
tables_recensement = all_tables[:3]

# Map the first cell of every data row (the date label) to the remaining cells.
# If the same label occurs in several tables, the last occurrence wins.
dico_recensement = dict()
for tab in tables_recensement:
    table_body = tab.find("tbody")
    if table_body is None:
        # No explicit <tbody> in the markup: read the rows from the table itself.
        table_body = tab
    for row in table_body.find_all("tr"):
        cells = [cell.text.strip() for cell in row.find_all("td")]
        # Rows without any <td> (e.g. a header row made only of <th>) are skipped.
        if cells:
            dico_recensement[cells[0]] = cells[1:]

df_recensement = (
    pd.DataFrame.from_dict(dico_recensement, orient="index")
    .reset_index()
    .rename(columns={"index": "Year", 0: "Population"})
)

# Keep only rows whose label is a plain number. This removes labels such as
# "15 000 av. J.-C." or "An 1" by content rather than by hard-coded position,
# and preserves the original index labels of the remaining rows.
is_plain_year = df_recensement["Year"].str.match(r"\d+$")
df_recensement = df_recensement.loc[is_plain_year].copy()

df_recensement["Year"] = df_recensement["Year"].astype(int)
# Figures use spaces (regular or non-breaking) as thousands separators;
# strip every whitespace character before converting to int.
df_recensement["Population"] = (
    df_recensement["Population"].str.replace(r"\s+", "", regex=True).astype(int)
)
df_recensement.head()

Unnamed: 0,Year,Population
4,400,12000000
5,800,8800000
6,850,6000000
7,1226,16000000
8,1300,15000000


In [5]:
# Keep only the rows whose year lies in the closed interval 1836-1936, then show them.
year_in_range = df_recensement["Year"].between(1836, 1936)
subset_recensement = df_recensement[year_in_range]
display(subset_recensement)

Unnamed: 0,Year,Population
40,1836,34293000
41,1841,34912000
42,1846,36097000
43,1851,36472000
44,1856,36715000
45,1861,37386000
46,1872,37653000
47,1876,38783000
48,1881,39239000
49,1886,39783000


In [6]:
# The subset has no 1866 row (it jumps from 1861 to 1872), so estimate one as
# the mean of the 1861 and 1872 figures and insert it in chronological position.
# The guard keeps the cell idempotent: without it, every re-run would insert
# another 1866 row and inflate any later sum over the Population column.
if not (subset_recensement["Year"] == 1866).any():
    pop_1861 = subset_recensement.loc[subset_recensement["Year"] == 1861, "Population"].values[0]
    pop_1872 = subset_recensement.loc[subset_recensement["Year"] == 1872, "Population"].values[0]
    pop_1866 = (pop_1861 + pop_1872) / 2  # true division: Population becomes float
    df_temp = pd.DataFrame({"Year": [1866], "Population": [pop_1866]})
    subset_recensement = pd.concat([
        subset_recensement.loc[subset_recensement["Year"] < 1866],
        df_temp,
        subset_recensement.loc[subset_recensement["Year"] > 1866],
    ])
subset_recensement = subset_recensement.reset_index(drop=True)

# Rows up to 1901 are all kept; after 1901 only the years on the five-year
# grid 1906, 1911, ..., 1936 are kept. Index labels are left as they are.
years_kept_after_1901 = list(range(1906, 1937, 5))
keep_row = (subset_recensement["Year"] <= 1901) | subset_recensement["Year"].isin(years_kept_after_1901)
subset_recensement = subset_recensement[keep_row]
display(subset_recensement)

Unnamed: 0,Year,Population
0,1836,34293000.0
1,1841,34912000.0
2,1846,36097000.0
3,1851,36472000.0
4,1856,36715000.0
5,1861,37386000.0
6,1866,37519500.0
7,1872,37653000.0
8,1876,38783000.0
9,1881,39239000.0


In [7]:
# Add up the Population column over every retained row and report it as an integer.
total_population = int(subset_recensement["Population"].sum())
print("Estimation population size (1836 - 1936): {}".format(total_population))

Estimation population size (1836 - 1936): 815106500
