# Scraping Le Monde

In [1]:
from bs4 import BeautifulSoup
import requests

# Fetch the English-language homepage. The timeout keeps the cell from hanging
# forever on a stalled connection, and raise_for_status() stops us from parsing
# an HTTP error page as if it were the homepage.
response = requests.get("https://www.lemonde.fr/en", timeout=30)
response.raise_for_status()

# Name the parser explicitly: without it BeautifulSoup picks whichever parser
# happens to be installed (and warns about it), so results can differ between
# machines. Passing the raw bytes lets BeautifulSoup work out the encoding.
doc = BeautifulSoup(response.content, "html.parser")

In [33]:
# Locate the element whose id is "habillagepub" (the later cells search inside
# it for article cards) and show how many direct children it has.
body = doc.find(attrs={"id": "habillagepub"})
len(body.contents)

8

In [34]:
# Collect every element inside `body` that carries the "article" class.
articles = body.find_all(class_="article")

# Only the last expression of a cell is displayed, so report the count here.
len(articles)

25

In [35]:
# Dump the raw text of each homepage card, preceded by a divider line, to see
# which pieces of information are available.
for article in articles:
    print("------", article.text, sep="\n")

------
  Subscribers only Israel, under fire from Iranian missiles, comes to standstill            JONAS OPPERSKALSKI FOR LE MONDE  Despite the many projectiles intercepted by Israel, attacks launched by Iran on several Israeli cities have killed 16 people, injured 500, and damaged several neighborhoods and important military sites.    Subscribers only Iran, struck throughout by Israel, faces chaos    Subscribers only Israeli strikes on Iranian oil and gas sites plunge markets into uncertainty    
------
         Gerald Herbert / AP   At the G7, it's Trump against the rest, in a chaotic world   Editorial   
------
         Benoit Tessier / REUTERS   France shuts five Israeli booths at Paris Air Show for showcasing 'offensive weapons'   
------
         AHMAD GHARABLI/AFP  Subscribers only Trump's messianic allies lead the charge against Gaza  The US president has rallied Christian fundamentalists, ideologically aligned with Israeli supremacists, in support of the multifaceted campaign 

#### We want to scrape:
* title
* article URL
* subhead
* byline
* premium status (subscribers only or free to read)
* article type
* image URL

In [None]:
# Exploratory pass: print every field we want for each homepage card.
# Optional elements are checked for None explicitly instead of being wrapped in
# a bare `except:`, which would also swallow network errors, typos and
# KeyboardInterrupt.
for article in articles:
    print("------")

    # title - from homepage
    print(article.find(class_='article__title').text.strip())

    # url - from homepage
    url = article.find('a')['href']
    print(url)

    # go inside each article url to get more info
    # (timeout so one stalled page cannot hang the loop; raise_for_status so an
    # HTTP error page is never scraped as if it were the article)
    article_response = requests.get(url, timeout=30)
    article_response.raise_for_status()
    article_doc = BeautifulSoup(article_response.content, "html.parser")

    # subhead - from article page
    # NOTE(review): the printed subhead sometimes starts with a glued-on label
    # (e.g. "NewsDespite ..."), presumably a nested element - confirm and trim.
    subhead_tag = article_doc.find(class_='article__desc')
    print(subhead_tag.text.strip() if subhead_tag is not None else "No subhead")

    # byline - from article page, 'article__author-identity' sometimes annoyingly nested inside 'article__author-link' so set a priority
    byline_tag = article_doc.find(class_='article__author-identity')
    if byline_tag is None:
        byline_tag = article_doc.select_one('.article__author-link, .meta__author')
    print(byline_tag.text.strip() if byline_tag is not None else "No byline")

    # premium - from homepage
    # assumes the card's first 'sr-only' element is the "Subscribers only" label - TODO confirm
    premium_tag = article.find(class_='sr-only')
    print(premium_tag.text.strip() if premium_tag is not None else "Free to read")

    # type - from homepage, assuming anything without explicit tag is news
    type_tag = article.find(class_="article__type")
    print(type_tag.text.strip() if type_tag is not None else "News")

    # image url - from homepage ('src' preferred, 'data-src' as the fallback)
    img_tag = article.find('img')
    if img_tag is not None:
        img_url = (
            img_tag.get('src') or
            img_tag.get('data-src')
        )
        print(img_url)
    else:
        print("No image")

------
Israel, under fire from Iranian missiles, comes to standstill
https://www.lemonde.fr/en/international/article/2025/06/16/israel-under-fire-from-iranian-missiles-comes-to-standstill_6742389_4.html
NewsDespite the many projectiles intercepted by Israel, attacks launched by Iran on several Israeli cities have killed 16 people, injured 500, and damaged several neighborhoods and important military sites.
Samuel Forey
Subscribers only
News
https://img.lemde.fr/2025/06/16/0/0/6000/4000/400/266/75/0/ce2fda8_upload-1-aebraqfdupnm-jo-iran-war-013-20250614.jpg
------
At the G7, it's Trump against the rest, in a chaotic world
https://www.lemonde.fr/en/opinion/article/2025/06/16/at-the-g7-it-s-trump-against-the-rest-in-a-chaotic-world_6742384_23.html
As the leaders of the seven largest Western economies, plus the European Union, meet in Canada from Sunday, the divisions within what can hardly still be called the 'Western family' have been brought out into the open.
Le Monde
Free to read
Edit

In [60]:
def _tag_text(tag, default=None):
    """Return the tag's text with surrounding whitespace stripped, or `default` when the tag is None."""
    if tag is None:
        return default
    return tag.text.strip()


def scrape_article(article, timeout=30):
    """Collect one row of data for a single homepage article card.

    Title, URL, premium flag, article type and image URL are read from the
    homepage card; subhead and byline require fetching the article page.

    Args:
        article: BeautifulSoup tag for one card taken from `articles`.
        timeout: seconds to wait for the article page before giving up.

    Returns:
        dict with the keys 'title', 'url', 'subhead', 'byline', 'premium',
        'article type' and 'image url'. 'subhead' and 'byline' are None when
        the article page does not contain them.

    Raises:
        requests.RequestException: the article page could not be fetched or
            answered with an HTTP error status.
        AttributeError, TypeError, KeyError: the card has no title or no link,
            which means the homepage selectors need revisiting.
    """
    row = {}
    row['title'] = article.find(class_='article__title').text.strip()

    url = article.find('a')['href']
    row['url'] = url

    # Fetch the article page for the fields the homepage card does not carry.
    article_response = requests.get(url, timeout=timeout)
    article_response.raise_for_status()
    article_doc = BeautifulSoup(article_response.content, "html.parser")

    # NOTE(review): some subheads come back with a glued-on label prefix
    # (e.g. "NewsDespite ..."), presumably a nested element - confirm and trim.
    row['subhead'] = _tag_text(article_doc.find(class_='article__desc'))

    # Prefer 'article__author-identity'; fall back to the broader selectors.
    byline_tag = article_doc.find(class_='article__author-identity')
    if byline_tag is None:
        byline_tag = article_doc.select_one('.article__author-link, .meta__author')
    row['byline'] = _tag_text(byline_tag)

    # assumes the card's first 'sr-only' element is the "Subscribers only" label - TODO confirm
    row['premium'] = _tag_text(article.find(class_='sr-only'), "Free to read")

    # Anything without an explicit type tag is assumed to be news.
    row['article type'] = _tag_text(article.find(class_="article__type"), "News")

    img_tag = article.find('img')
    if img_tag is None:
        row['image url'] = "No image"
    else:
        # 'src' preferred, 'data-src' as the fallback
        row['image url'] = img_tag.get('src') or img_tag.get('data-src')

    return row


# One dict per homepage card, in homepage order.
rows = [scrape_article(article) for article in articles]

In [61]:
import pandas as pd

# Every row is a flat dict, so the list converts directly into a table with one
# column per key; preview the first five rows.
df = pd.DataFrame(rows)
df.head()

Unnamed: 0,title,url,subhead,byline,premium,article type,image url
0,"Israel, under fire from Iranian missiles, come...",https://www.lemonde.fr/en/international/articl...,NewsDespite the many projectiles intercepted b...,Samuel Forey,Subscribers only,News,https://img.lemde.fr/2025/06/16/0/0/6000/4000/...
1,"At the G7, it's Trump against the rest, in a c...",https://www.lemonde.fr/en/opinion/article/2025...,As the leaders of the seven largest Western ec...,Le Monde,Free to read,Editorial,https://img.lemde.fr/2025/06/16/4/0/3936/2624/...
2,France shuts five Israeli booths at Paris Air ...,https://www.lemonde.fr/en/economy/article/2025...,Black walls were installed around the stands o...,Le Monde with AFP,Free to read,News,https://img.lemde.fr/2025/06/16/0/0/5889/3926/...
3,Trump's messianic allies lead the charge again...,https://www.lemonde.fr/en/international/articl...,The US president has rallied Christian fundame...,Jean-Pierre Filiu,Subscribers only,News,https://img.lemde.fr/2025/06/14/0/4/4296/2864/...
4,WhatsApp boss: Meta AI doesn't see your conver...,https://www.lemonde.fr/en/pixels/article/2025/...,"Will Cathcart, head of Meta's subsidiary, anno...",By Morgane Tual and Alexandre Piquard,Subscribers only,News,https://img.lemde.fr/2025/04/25/0/0/3000/2000/...


In [62]:
# Write the scraped table to lemonde.csv in the working directory, leaving out
# the row index.
df.to_csv(path_or_buf="lemonde.csv", index=False)