### Step 1: Find articles about the AI bubble (or similar topics) from different countries and scrape them with BeautifulSoup

In [1]:
import requests
from bs4 import BeautifulSoup
import boto3
import os
import time
from urllib.parse import urlparse

In [2]:
# --- Where the scraped text ends up -----------------------------------------
# Target bucket for every upload performed by this notebook.
S3_BUCKET_NAME = 'aruzhan-sabira-hw3'

# Key prefixes ("folders") inside the bucket, chosen per detected language.
S3_ENGLISH_PREFIX = 'raw_articles/english/'    # English articles (6 of them)
S3_OTHER_PREFIX = 'raw_articles/non-english/'  # non-English articles (6 of them)

# Local directory that receives a .txt copy of each article before upload.
LOCAL_OUTPUT_DIR = 'scraped_articles'

# --- AWS service clients, shared by the helper functions below ---------------
s3_client = boto3.client('s3')
comprehend_client = boto3.client('comprehend')

In [3]:
# Article pages to process. The main loop visits them in this order, passing
# each URL to scrape_article_text() and then filing the text under the English
# or non-English S3 prefix according to the detected language.
# NOTE(review): the prefix comments in the config cell suggest a 6 English /
# 6 non-English split; the language of each page is not verified here — confirm.
ARTICLE_URLS = [
    'https://live.handelsblatt.com/blasen-bei-ki-werten-platzen-sie-bald-oder-geht-da-noch-was/',
    'https://www.theguardian.com/technology/2025/dec/01/ai-bubble-us-economy',
    'https://www.9news.com.au/technology/are-we-in-an-ai-tech-bubble-what-happens-if-it-bursts-explainer/c11d1f63-a085-419d-bc62-030911459304',
    'https://meduza.io/feature/2025/10/13/eksperty-vse-chasche-nazyvayut-bum-iskusstvennogo-intellekta-finansovym-puzyrem-eto-priznayut-dazhe-sami-razrabotchiki-ii',
    'https://navbharattimes.indiatimes.com/business/business-news/ai-bubble-on-the-verge-of-bursting-bigger-crisis-than-2008-what-are-the-challenges-for-india-what-it-should-do/articleshow/124362358.cms',
    'https://www.japantimes.co.jp/news/2025/11/14/japan/media/japan-media-ai-threat/',
    'https://www.cnnbrasil.com.br/economia/mercado/resultados-da-oracle-sinalizam-bolha-de-ia-entenda-debate/',
    'https://www.lagazettefrance.fr/index.php/article/une-bulle-de-l-intelligence-artificielle',
    'https://www.vestbee.com/insights/articles/are-we-in-an-ai-bubble-we-asked-european-investors',
    'https://qazinform.com/news/uzbekistan-to-lay-off-over-2000-government-employees-amid-ai-integration-c7d87e',
    'https://astanatimes.com/2025/10/kazakhstan-advances-ai-digital-ecosystem-development-under-governments-digital-headquarters/',
    'https://www.bangkokbiznews.com/world/1206507'
]


In [4]:
def detect_language(text_content, client=None, max_bytes=4900):
    """Detect the dominant language of ``text_content`` with Amazon Comprehend.

    Args:
        text_content: Text to analyse. ``None`` or ``""`` short-circuits
            without calling the service.
        client: Optional object exposing
            ``detect_dominant_language(Text=...)``. When omitted, the
            module-level ``comprehend_client`` is used.
        max_bytes: Maximum UTF-8 size, in bytes, of the text sent to the
            service. Defaults to 4900 to stay under the 5000-byte ceiling
            this notebook assumes.

    Returns:
        A ``(language_code, score)`` tuple:
        * ``(None, 0.0)`` when ``text_content`` is empty,
        * ``('UNKNOWN', 0.0)`` when the response lists no languages,
        * ``('ERROR', 0.0)`` when the call raises (the error is printed).
    """
    if not text_content:
        return None, 0.0

    # The size limit is expressed in UTF-8 *bytes*, not characters. Slicing the
    # str would let non-ASCII text (Cyrillic, Devanagari, Thai, ...) exceed it
    # by 2-3x, so cut the encoded form instead. errors='ignore' on decode drops
    # a multi-byte character that the cut split in half.
    text_to_analyze = (
        text_content
        .encode('utf-8', errors='ignore')[:max_bytes]
        .decode('utf-8', errors='ignore')
    )

    try:
        if client is None:
            client = comprehend_client
        response = client.detect_dominant_language(Text=text_to_analyze)

        languages = response.get('Languages')
        if not languages:
            return 'UNKNOWN', 0.0

        # Dominant language is the first one in the list.
        dominant_lang = languages[0]
        lang_code = dominant_lang['LanguageCode']
        score = dominant_lang['Score']
        print(f"   Detected Language: {lang_code} (Score: {score:.2f})")
        return lang_code, score

    except Exception as e:
        # Best effort: a failed detection must not abort the scraping run.
        print(f"   ERROR in Comprehend language detection: {e}")
        return 'ERROR', 0.0

In [5]:
def clean_filename(url):
    """Build a local ``.txt`` filename from the last path segment of ``url``.

    Args:
        url: Article URL. Only the path component is used; query string and
            fragment are ignored.

    Returns:
        ``"<slug>_<unix-seconds>.txt"`` where ``<slug>`` is the last non-empty
        path segment (``"article"`` if the path has none), with every
        character other than letters, digits, ``-``, ``.`` and ``_`` replaced
        by ``_``, limited to 50 characters.
    """
    # Split on '/' directly: converting '/' to '_' first and then splitting on
    # '_' would also cut slugs that legitimately contain underscores.
    segments = [segment for segment in urlparse(url).path.split('/') if segment]
    slug = segments[-1] if segments else 'article'

    # Keep the name safe for both the local filesystem and the S3 key.
    safe_slug = ''.join(
        ch if (ch.isalnum() or ch in '-._') else '_'
        for ch in slug
    )

    # The timestamp suffix keeps repeated runs from overwriting earlier files.
    return f"{safe_slug[:50]}_{time.time():.0f}.txt"

In [6]:
def scrape_article_text(url):
    """Download ``url`` and return the page's text content.

    The text comes from the first of ``<article>``, ``<main>`` or ``<body>``
    that the parsed page provides. If none is available, a warning is printed
    and the text of the whole document is returned instead.

    Returns:
        The extracted text (newline-separated, stripped), or ``None`` when the
        HTTP request fails; the failure is printed rather than raised.
    """
    print(f"-> Attempting to scrape: {url}")

    # Browser-like User-Agent sent with the request.
    browser_headers = {
       'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    # Network step: any requests-level failure (including 4xx/5xx turned into
    # HTTPError by raise_for_status) ends the attempt for this URL.
    try:
        page = requests.get(url, headers=browser_headers, timeout=15)
        page.raise_for_status()
        raw_html = page.content
    except requests.exceptions.RequestException as fetch_error:
        print(f"   Error fetching {url}: {fetch_error}")
        return None

    # Parsing step.
    soup = BeautifulSoup(raw_html, 'html.parser')
    container = soup.find('article') or soup.find('main') or soup.body

    if not container:
        print(f"   Warning: Could not find main article content in {url}")
        return soup.get_text(separator='\n', strip=True)  # whole-document fallback

    # One text node per line, surrounding whitespace removed.
    return container.get_text(separator='\n', strip=True)


In [7]:
def upload_to_s3(file_path, s3_key, bucket_name):
    """Copy the local file at ``file_path`` to ``s3://bucket_name/s3_key``.

    Uses the module-level ``s3_client``. The outcome is reported on stdout;
    any exception is caught and printed, so the function always returns
    ``None``.
    """
    try:
        s3_client.upload_file(
            Filename=file_path,
            Bucket=bucket_name,
            Key=s3_key,
        )
        print(f"   SUCCESS: Uploaded {file_path} to s3://{bucket_name}/{s3_key}")
    except Exception as upload_error:
        print(f"   ERROR uploading to S3: {upload_error}")

In [8]:
if __name__ == "__main__":
    # Driver: scrape every URL, detect its language, keep a local copy and
    # upload that copy under the language-specific S3 prefix.
    os.makedirs(LOCAL_OUTPUT_DIR, exist_ok=True)
    print(f"Starting scraping process for {len(ARTICLE_URLS)} articles...")

    for url in ARTICLE_URLS:
        text_content = scrape_article_text(url)

        # Nothing usable came back (failed request or empty page): move on to
        # the next URL straight away, without the pause below.
        if not text_content:
            continue

        # 1. Detect the language of the scraped text.
        language_code, confidence = detect_language(text_content)

        # 2. English goes to the English prefix; every other outcome,
        #    including 'ERROR' and 'UNKNOWN', goes to the non-English prefix.
        s3_key_prefix = S3_ENGLISH_PREFIX if language_code == 'en' else S3_OTHER_PREFIX

        # 3. Work out the local path and the matching S3 key.
        local_file_name = clean_filename(url)
        local_file_path = os.path.join(LOCAL_OUTPUT_DIR, local_file_name)
        s3_key = f"{s3_key_prefix}{local_file_name}"

        # 4. Write a local copy (language tag header, then the text) and
        # 5. upload that file to S3.
        try:
            with open(local_file_path, 'w', encoding='utf-8') as f:
                f.write(f"[[Language Code: {language_code}]]\n\n")
                f.write(text_content)
            print(f"   Saved locally: {local_file_path}")

            upload_to_s3(local_file_path, s3_key, S3_BUCKET_NAME)
        except IOError as e:
            print(f"   ERROR saving file {local_file_path}: {e}")

        # Pause between processed articles.
        time.sleep(2)

    print("\nScraping, language detection, and structured upload finished.")

Starting scraping process for 12 articles...
-> Attempting to scrape: https://live.handelsblatt.com/blasen-bei-ki-werten-platzen-sie-bald-oder-geht-da-noch-was/
   Detected Language: de (Score: 1.00)
   Saved locally: scraped_articles\blasen-bei-ki-werten-platzen-sie-bald-oder-geht-da_1765755100.txt
   SUCCESS: Uploaded scraped_articles\blasen-bei-ki-werten-platzen-sie-bald-oder-geht-da_1765755100.txt to s3://aruzhan-sabira-hw3/raw_articles/non-english/blasen-bei-ki-werten-platzen-sie-bald-oder-geht-da_1765755100.txt
-> Attempting to scrape: https://www.theguardian.com/technology/2025/dec/01/ai-bubble-us-economy
   Detected Language: en (Score: 1.00)
   Saved locally: scraped_articles\ai-bubble-us-economy_1765755103.txt
   SUCCESS: Uploaded scraped_articles\ai-bubble-us-economy_1765755103.txt to s3://aruzhan-sabira-hw3/raw_articles/english/ai-bubble-us-economy_1765755103.txt
-> Attempting to scrape: https://www.9news.com.au/technology/are-we-in-an-ai-tech-bubble-what-happens-if-it-burs