# Scraping Le Monde

In [339]:
from bs4 import BeautifulSoup
import requests

response = requests.get("https://www.lemonde.fr/en")
doc = BeautifulSoup(response.text)

In [340]:
body = doc.find("main")
len(body)

11

In [341]:
articles = body.find_all(class_="article")
articles
len(articles)

25

In [342]:
for article in articles:
    print("------")
    print(article.text)

------
   Iran warns of 'devastating response' to Israel attacks            TOMER NEUBERG / REUTERS  'Leave the occupied territories because they will certainly no longer be habitable in the future," and shelters will 'not guarantee security', an Iranian spokesman for the armed forces said on Sunday. Tehran also reported that the intelligence chief of Iran's Revolutionary Guards had been killed in an Israeli strike.   Israel, Iran exchange deadly strikes for third day    Subscribers only The collapse of the 'axis of resistance'    
------
         Mads Claus Rasmussen / via REUTERS   In Greenland, Macron criticizes Trump's annexation threats: 'That's not what allies do'   
------
         BORIS SEMENIAKO  Subscribers only Was the Trump phenomenon a historical accident? Reinterpreting the history of the American right   
------
         HECTOR PASSAT  Subscribers only France's first Slow Fashion Week champions a simple, recycled approach to fashion  Designers and vintage enthusiasts gat

#### Now to scrape title, subhead, article URL, whether it's premium or not, byline, article type, image URL

In [343]:
for article in articles:
    print("------")
    # title
    print(article.find(class_='article__title').text)
    # subhead
    try:
        print(article.find(class_='article__desc').text)
    except:
        print("No subhead")
    # url
    try:
        print(article.find('a')['href'])
    except:
        print("No url")
    # premium
    try:
        print(article.find(class_='sr-only').text)
    except:
        print("Free to read")
    # byline
    try:
        print(article.select_one('.article__byline, .article__author-name').text)
    except:
        print("No byline")
    # type
    try:
        print(article.find(class_="article__type").text)
    except:
        print("No article type")
    # image url
    try:
        print(article.find('img').get('data-src', None))
    except:
        print("No image")
    


------
 Iran warns of 'devastating response' to Israel attacks  
'Leave the occupied territories because they will certainly no longer be habitable in the future," and shelters will 'not guarantee security', an Iranian spokesman for the armed forces said on Sunday. Tehran also reported that the intelligence chief of Iran's Revolutionary Guards had been killed in an Israeli strike.
https://www.lemonde.fr/en/international/article/2025/06/15/iran-warns-of-devastating-response-to-israel-attacks_6742369_4.html
Subscribers only
No byline
No article type
None
------
In Greenland, Macron criticizes Trump's annexation threats: 'That's not what allies do'
No subhead
https://www.lemonde.fr/en/international/article/2025/06/15/in-greenland-macron-criticizes-trump-s-annexation-threats-that-s-not-what-allies-do_6742364_4.html
Free to read
No byline
No article type
https://img.lemde.fr/2025/06/15/0/0/4320/2880/398/265/75/0/899bed8_ftp-import-images-1-btqov54uiqar-2025-06-15t125318z-697523675-rc203fadv

In [344]:
rows = []

for article in articles:
    row = {}
    row['title'] = article.find(class_='article__title').text
    try:
       row['subhead'] = article.find(class_='article__desc').text
    except:
        pass
    try:
        row['article_url'] = article.find('a')['href']
    except:
        pass
    try:
        row['premium'] = article.find(class_='sr-only').text
    except:
        pass
    try:
        row['byline'] = article.select_one('.article__byline, .article__author-name').text
    except:
        pass
    try:
        row['article_type'] = article.find(class_="article__type").text
    except:
       pass
    try:
        row['image_url'] = article.find('img').get('data-src').text
    except:
        pass
    rows.append(row)
    

In [345]:
import pandas as pd

df = pd.json_normalize(rows)
df.head()

Unnamed: 0,title,subhead,article_url,premium,byline,article_type
0,Iran warns of 'devastating response' to Israe...,'Leave the occupied territories because they w...,https://www.lemonde.fr/en/international/articl...,Subscribers only,,
1,"In Greenland, Macron criticizes Trump's annexa...",,https://www.lemonde.fr/en/international/articl...,,,
2,Was the Trump phenomenon a historical accident...,,https://www.lemonde.fr/en/international/articl...,Subscribers only,,
3,France's first Slow Fashion Week champions a s...,Designers and vintage enthusiasts gathered in ...,https://www.lemonde.fr/en/environment/article/...,Subscribers only,,
4,Paris Saint-Germain thrash Atlético 4-0 in Clu...,PSG largely dominated Atlético at the Rose Bow...,https://www.lemonde.fr/en/sports/article/2025/...,,,


In [346]:
df.to_csv("lemonde.csv", index=False)