In [5]:
import requests
from bs4 import BeautifulSoup
import csv
import pandas as pd
import datetime

In [6]:
def get_sundays_until_today(start_year, end_date=None):
    """Return every Sunday from 1 January of ``start_year`` through ``end_date``.

    Parameters
    ----------
    start_year : int
        First calendar year to include.
    end_date : datetime.date, optional
        Last date allowed in the result (inclusive, so it is returned when it
        is itself a Sunday). Defaults to ``datetime.date.today()``; pass a
        fixed date to make a run reproducible.

    Returns
    -------
    list of str
        The Sundays in chronological order, formatted as 'yymmdd'
        (e.g. '180107' for 7 January 2018). Empty when no Sunday falls
        between the start of ``start_year`` and ``end_date``.
    """
    if end_date is None:
        end_date = datetime.date.today()
    if start_year > end_date.year:
        return []  # nothing to enumerate when the start year lies after the end date

    new_years_day = datetime.date(start_year, 1, 1)
    # weekday() is Monday == 0 ... Sunday == 6, so this is the gap to the first Sunday
    current = new_years_day + datetime.timedelta(days=6 - new_years_day.weekday())
    one_week = datetime.timedelta(days=7)

    sundays = []
    while current <= end_date:
        sundays.append(current.strftime('%y%m%d'))
        current += one_week
    return sundays

# Build the list of edition dates: every Sunday from 2018 up to the most recent one (today included when it is a Sunday)
sundays = get_sundays_until_today(2018)


In [7]:
# Example of a concrete edition URL: "https://www.sundaytimes.lk/240218/news/" - date of the latest Sunday edition, 240218 = 18/02/2024
# URL template: the {} placeholder is filled in with an edition date through base_url.format(...)
base_url = "https://www.sundaytimes.lk/{}/news/"
header = {"User-Agent" : "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36"} # Browser User-Agent string, obtained by searching Google for "my user agent"


In [8]:
# Crawl the news index page of every Sunday edition and collect the article links.
REQUEST_TIMEOUT = 30  # seconds; without a timeout a stalled connection would block the crawl forever

urls_list = []
# A single Session reuses the underlying connection across all edition requests
with requests.Session() as session:
    for sunday in sundays:
        # Fill the URL placeholder with the current Sunday's date
        url = base_url.format(sunday)

        try:
            response = session.get(url, headers=header, timeout=REQUEST_TIMEOUT)
        except requests.RequestException as exc:
            # One network failure must not abort the whole crawl; report it and move on
            print(f"Falha ao acessar {url} - Erro: {exc}")
            continue

        # Only successful responses (HTTP 200) are parsed; error pages are skipped
        if response.status_code != 200:
            print(f"Falha ao acessar {url} - Código de status: {response.status_code}")
            continue
        print(f"Sucesso ao acessar {url}")

        soup = BeautifulSoup(response.text, 'html.parser')
        for article in soup.find_all('h2', class_='entry_title'):
            a_tag = article.find('a')  # the headline's <a> tag inside the <h2>
            # .get() returns None instead of raising when the anchor has no href attribute
            link = a_tag.get('href') if a_tag is not None else None
            if link:
                urls_list.append(link)
            else:
                print(f"Nenhum link encontrado em {article}")


Sucesso ao acessar https://www.sundaytimes.lk/180107/news/
Sucesso ao acessar https://www.sundaytimes.lk/180114/news/
Sucesso ao acessar https://www.sundaytimes.lk/180121/news/
Sucesso ao acessar https://www.sundaytimes.lk/180128/news/
Sucesso ao acessar https://www.sundaytimes.lk/180204/news/
Sucesso ao acessar https://www.sundaytimes.lk/180211/news/
Sucesso ao acessar https://www.sundaytimes.lk/180218/news/
Sucesso ao acessar https://www.sundaytimes.lk/180225/news/
Sucesso ao acessar https://www.sundaytimes.lk/180304/news/
Sucesso ao acessar https://www.sundaytimes.lk/180311/news/
Sucesso ao acessar https://www.sundaytimes.lk/180318/news/
Sucesso ao acessar https://www.sundaytimes.lk/180325/news/
Sucesso ao acessar https://www.sundaytimes.lk/180401/news/
Sucesso ao acessar https://www.sundaytimes.lk/180408/news/
Sucesso ao acessar https://www.sundaytimes.lk/180415/news/
Sucesso ao acessar https://www.sundaytimes.lk/180422/news/
Sucesso ao acessar https://www.sundaytimes.lk/180429/new

In [10]:
# Total number of article URLs collected
len(urls_list)

9593

In [14]:
# Put the collected links into a one-column table and persist it as a CSV file
df = pd.DataFrame({'url': urls_list})
df.to_csv('sundaytimes_urls.csv', sep=',', encoding='utf-8', index=False)

In [11]:
# Preview only the first few links; echoing the whole list floods the notebook output
urls_list[:10]

['http://www.sundaytimes.lk/180107/news/if-flags-can-tell-stories-see-what-these-pictures-show-276453.html',
 'http://www.sundaytimes.lk/180107/news/fertiliser-crisis-issue-with-pakistan-averted-276428.html',
 'http://www.sundaytimes.lk/180107/news/thilan-wijesinghe-tipped-to-chair-srilankan-interim-board-276440.html',
 'http://www.sundaytimes.lk/180107/news/4-star-hotel-at-bia-276438.html',
 'http://www.sundaytimes.lk/180107/news/prince-edward-chief-guest-at-70th-freedom-festival-276436.html',
 'http://www.sundaytimes.lk/180107/news/lanka-attracts-record-number-of-corporate-private-jets-276434.html',
 'http://www.sundaytimes.lk/180107/news/60-years-of-russia-sl-diplomatic-ties-sl-to-issue-stamp-2-months-after-russia-276469.html',
 'http://www.sundaytimes.lk/180107/news/navy-raid-uncovers-rs-30-million-worth-narcotics-on-vvt-beach-276464.html',
 'http://www.sundaytimes.lk/180107/news/episcopal-ordination-and-enthronement-of-6th-bishop-of-kurunegala-276491.html',
 'http://www.sundaytime