In [None]:
# Install the third-party packages this notebook imports.
# `%pip` (rather than `!pip`) installs into the environment of the running kernel.
# `random` ships with Python's standard library, so it is not installed here.
%pip install requests
%pip install beautifulsoup4
%pip install wikipedia-api
%pip install tqdm

In [None]:
import requests
from bs4 import BeautifulSoup
import wikipediaapi
import json
from tqdm import tqdm
import time
import random

In [36]:
def get_recent_articles(url, limit=2000, max_retries=5):
    """Collect (title, url) pairs by walking a paginated Wikipedia listing.

    Starting at ``url``, each page is downloaded and every
    ``<a class="mw-newpages-pagename">`` found inside the
    ``<ul class="mw-contributions-list">`` element is recorded. The
    ``<a class="mw-nextlink">`` anchor is then followed until ``limit``
    articles have been collected or there is no further page.

    Args:
        url: Address of the first listing page to fetch.
        limit: Maximum number of articles to return.
        max_retries: Number of consecutive failed requests tolerated for a
            page before giving up. At least one attempt is always made.

    Returns:
        A list of at most ``limit`` ``(article_title, article_url)`` tuples.
        If the listing cannot be fetched or parsed, whatever was collected up
        to that point is returned.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
    }

    articles = []
    page_count = 0
    failed_attempts = 0  # consecutive request failures for the current page
    while len(articles) < limit:
        try:
            response = requests.get(url, headers=headers, timeout=30)
            response.raise_for_status()
            print(f"Status code: {response.status_code}")
        except requests.RequestException as e:
            failed_attempts += 1
            print(f"Error fetching page: {e}")
            if failed_attempts >= max_retries:
                # Without this cap a permanently failing URL (e.g. a 404)
                # would be retried forever.
                print(f"Giving up after {failed_attempts} failed attempts.")
                break
            time.sleep(10)
            continue
        failed_attempts = 0

        soup = BeautifulSoup(response.content, 'html.parser')
        ul_tag = soup.find('ul', class_='mw-contributions-list')

        if ul_tag is None:
            print("Failed to find the list of articles.")
            break

        for li_tag in ul_tag.find_all('li'):
            a_tag = li_tag.find('a', class_='mw-newpages-pagename')
            if a_tag is None:
                continue
            href = a_tag.get('href')
            if not href:
                # An anchor without a target cannot be turned into a URL.
                continue
            # Fall back to the visible link text if the title attribute is absent.
            article_title = a_tag.get('title') or a_tag.get_text(strip=True)
            articles.append((article_title, 'https://en.wikipedia.org' + href))
            if len(articles) >= limit:
                break

        page_count += 1
        if page_count % 5 == 0 or len(articles) >= limit:
            print(f"Collected {len(articles)} articles so far...")

        if len(articles) >= limit:
            # Enough collected: skip the next-page lookup and the delay.
            break

        next_link = soup.find('a', class_='mw-nextlink')
        if next_link:
            url = 'https://en.wikipedia.org' + next_link['href']
        else:
            print("Couldn't find anymore pages to scrape.")
            break

        time.sleep(random.uniform(1, 3))  # Random delay between requests

    print(f"Finished collecting {len(articles)} articles.")
    return articles[:limit]

In [42]:
def scrape_wikipedia_all_pages(limit=2000, max_retries=5):
    """Collect (title, url) pairs from Wikipedia's Special:AllPages listing.

    Every anchor matched by the CSS selector ``.mw-allpages-chunk a`` on a
    listing page is recorded, then the anchor whose text starts with
    "Next page" is followed until ``limit`` articles have been collected or no
    such anchor exists.

    Args:
        limit: Maximum number of articles to return.
        max_retries: Number of consecutive failed requests tolerated for a
            page before giving up. At least one attempt is always made.

    Returns:
        A list of at most ``limit`` ``(title, article_url)`` tuples.
    """
    base_url = "https://en.wikipedia.org/wiki/Special:AllPages"
    headers = {"User-Agent": "Mozilla/5.0"}  # identical for every request
    next_page = None  # For pagination
    articles = []
    page_count = 0
    failed_attempts = 0  # consecutive request failures for the current page

    while len(articles) < limit:
        url = base_url if not next_page else f"https://en.wikipedia.org{next_page}"

        try:
            response = requests.get(url, headers=headers, timeout=10)
            response.raise_for_status()
        except requests.RequestException as e:
            failed_attempts += 1
            print(f"Error fetching {url}: {e}")
            if failed_attempts >= max_retries:
                # Without this cap a permanently failing URL would be retried forever.
                print(f"Giving up on {url} after {failed_attempts} failed attempts.")
                break
            time.sleep(5)
            continue
        failed_attempts = 0

        soup = BeautifulSoup(response.text, "html.parser")

        # Find the list of article links
        for link in soup.select(".mw-allpages-chunk a"):
            title = link.text
            article_url = f"https://en.wikipedia.org{link['href']}"
            articles.append((title, article_url))
            if len(articles) >= limit:
                break

        page_count += 1
        print(f"✅ Scraped {len(articles)} articles so far... (Page {page_count})")

        if len(articles) >= limit:
            # Enough collected: skip the next-page lookup and the delay.
            break

        # Find next page link
        next_link = soup.find("a", string=lambda text: text and text.startswith("Next page"))
        if next_link:
            next_page = next_link["href"]
        else:
            print("No more pages to scrape: Couldn't find next button.")
            break  # Stop if there's no next page

        # Random delay to avoid detection
        time.sleep(random.uniform(1, 3))

    print(f"✅ Finished scraping {len(articles)} articles.")
    return articles

In [60]:
import requests
from bs4 import BeautifulSoup
import time
import random
from urllib.parse import unquote  # Import URL decoding function

def get_creation_date(page_url, year_month='2024-08'):
    """Report whether a Wikipedia article was created in a given month.

    Despite its name, this function returns a boolean, not a date. It asks
    the MediaWiki API for the oldest revision of the page (``rvdir='newer'``
    with ``rvlimit=1``) and compares the ``YYYY-MM`` prefix of that
    revision's timestamp with ``year_month``.

    Args:
        page_url: Article URL. The title is taken from everything after the
            first ``/wiki/`` so that titles containing ``/`` (for example
            ``AC/DC``) are kept whole; URLs without ``/wiki/`` fall back to
            the last path segment.
        year_month: Month to match, formatted ``YYYY-MM``.

    Returns:
        True if the first revision's timestamp starts with ``year_month``.
        False otherwise, including when the page has no revisions in the
        response or when the API request fails (the error is printed).
    """
    # Extract the page title from the URL and decode it
    _, has_wiki_prefix, after_prefix = page_url.partition('/wiki/')
    raw_title = after_prefix if has_wiki_prefix else page_url.split('/')[-1]
    page_title = unquote(raw_title)

    # Wikipedia API URL for page revisions
    api_url = "https://en.wikipedia.org/w/api.php"
    params = {
        'action': 'query',
        'titles': page_title,
        'prop': 'revisions',
        'rvlimit': 1,  # Only need the first revision (creation)
        'rvdir': 'newer',
        'format': 'json',
    }

    try:
        # A timeout keeps one stalled connection from hanging the whole scrape.
        response = requests.get(api_url, params=params, timeout=10)
        response.raise_for_status()
        data = response.json()
    except (requests.RequestException, ValueError) as e:
        # ValueError covers a response body that is not valid JSON.
        print(f"Error fetching revisions for '{page_title}': {e}")
        return False

    # Check if the page exists and has revisions
    pages = data.get('query', {}).get('pages', {})
    for page_data in pages.values():
        if 'revisions' in page_data:
            creation_timestamp = page_data['revisions'][0]['timestamp']
            creation_date = creation_timestamp[:7]  # Format: YYYY-MM
            if creation_date == year_month:
                print(f"Article '{page_title}' created on {creation_date}")  # Debugging: Print creation date
                return True

    return False


def scrape_wikipedia_all_pages_2(limit=2000, max_retries=5, start_url=None):
    """Collect Special:AllPages entries accepted by ``get_creation_date``.

    Walks the listing page by page. For every anchor matched by the CSS
    selector ``.mw-allpages-chunk a`` it calls ``get_creation_date`` with the
    article URL and keeps the article only when that call returns a truthy
    value. Note that this issues one extra request per listed article.

    Args:
        limit: Maximum number of matching articles to return.
        max_retries: Number of consecutive failed listing requests tolerated
            for a page before giving up. At least one attempt is always made.
        start_url: Listing page to start from. When omitted, the walk starts
            at the fixed ``from=`` offset used by this notebook; pass
            ``"https://en.wikipedia.org/wiki/Special:AllPages"`` to start at
            the beginning of the listing.

    Returns:
        A list of at most ``limit`` ``(title, article_url)`` tuples.
    """
    base_url = start_url or "https://en.wikipedia.org/w/index.php?title=Special:AllPages&from=%22Ako+ang+Batas%22+-Gen.+Tomas+Karingal"
    headers = {"User-Agent": "Mozilla/5.0"}  # identical for every request
    next_page = None  # For pagination
    articles = []
    page_count = 0
    failed_attempts = 0  # consecutive request failures for the current page

    while len(articles) < limit:
        url = base_url if not next_page else f"https://en.wikipedia.org{next_page}"
        print(f"Fetching {url}")  # Debugging: Print the URL being fetched

        try:
            response = requests.get(url, headers=headers, timeout=10)
            response.raise_for_status()
        except requests.RequestException as e:
            failed_attempts += 1
            print(f"Error fetching {url}: {e}")
            if failed_attempts >= max_retries:
                # Without this cap a permanently failing URL would be retried forever.
                print(f"Giving up on {url} after {failed_attempts} failed attempts.")
                break
            time.sleep(5)
            continue
        failed_attempts = 0

        soup = BeautifulSoup(response.text, "html.parser")

        # Find the list of article links
        for link in soup.select(".mw-allpages-chunk a"):
            title = link.text
            article_url = f"https://en.wikipedia.org{link['href']}"

            # Keep only the pages accepted by the creation-date check
            if get_creation_date(article_url):
                articles.append((title, article_url))
                if len(articles) >= limit:
                    break

        page_count += 1
        print(f"✅ Scraped {len(articles)} articles so far... (Page {page_count})")

        if len(articles) >= limit:
            # Enough collected: skip the next-page lookup and the delay.
            break

        next_link = soup.find("a", string=lambda text: text and text.startswith("Next page"))
        if next_link:
            next_page = next_link["href"]
            print(f"Next page found: {next_page}")  # Debugging: Print next page link
        else:
            print("No more pages to scrape.")  # Debugging: Stop condition
            break  # Stop if there's no next page

        # Random delay to avoid detection
        time.sleep(random.uniform(1, 3))

    print(f"✅ Finished scraping {len(articles)} articles.")
    return articles


In [40]:
def scrape_wikipedia_article(page_title, wiki_wiki, max_retries=3):
    """Return the text of a page looked up through ``wiki_wiki``, or None.

    ``wiki_wiki.page(page_title)`` is attempted up to ``max_retries`` times.
    A request-level error triggers a randomized 2-5 second pause before the
    next attempt. None is returned when the page reports that it does not
    exist, when every attempt fails, or when ``max_retries`` is not positive.
    """
    attempt = 0
    while attempt < max_retries:
        try:
            wiki_page = wiki_wiki.page(page_title)
            if page_exists := wiki_page.exists():
                return wiki_page.text
            print(f"Page '{page_title}' does not exist.")
            return None
        except (requests.RequestException, requests.Timeout) as e:
            out_of_attempts = attempt >= max_retries - 1
            if out_of_attempts:
                print(f"Failed to scrape '{page_title}' after {max_retries} attempts.")
                return None
            print(f"Error scraping '{page_title}'. Retrying... (Attempt {attempt + 1}/{max_retries})")
            time.sleep(random.uniform(2, 5))
        attempt += 1
    return None

In [None]:
# NOTE: recent_articles_url is not used below; scrape_wikipedia_all_pages_2
# chooses its own starting page.
recent_articles_url = "https://en.wikipedia.org/wiki/Special:AllPages"
article_limit = 100
# Single source of truth for the output file, so the final message cannot
# drift from the file that is actually written.
output_path = 'scraped_wiki_articles_August_2024_1.json'

print(f"Starting to collect {article_limit} recent articles...")
articles = scrape_wikipedia_all_pages_2(article_limit)
print(f"Collected {len(articles)} articles.")
data = []

# NOTE(review): the user agent still carries a placeholder e-mail address;
# replace it with real contact details before running at scale.
wiki_wiki = wikipediaapi.Wikipedia(
    user_agent='WikipediaScraper (your_email@example.com)',
    language='en',
    timeout=30
)

# Download the text of every collected article; pages that cannot be
# retrieved (None or empty content) are left out of the output.
for article_title, article_url in tqdm(articles, desc="Scraping articles"):
    content = scrape_wikipedia_article(article_title, wiki_wiki)
    if content:
        data.append({
            'title': article_title,
            'url': article_url,
            'content': content
        })
    time.sleep(random.uniform(0.5, 1.5))  # Random delay between article scrapes

with open(output_path, 'w', encoding='utf-8') as f:
    json.dump(data, f, ensure_ascii=False, indent=4)

print(f"Scraped data has been saved to '{output_path}'")
