### Step 3: Now that all articles are in English, we can analyze their sentiment using Amazon Comprehend.

In [1]:
import boto3
import json
import os
import time

In [2]:
# --- Region and bucket ---
AWS_REGION = 'eu-west-1'
S3_BUCKET_NAME = 'aruzhan-sabira-hw3'

# --- S3 layout ---
# Prefixes that together hold ALL English-language articles
S3_INPUT_PREFIXES = ['raw_articles/english/', 'translated_articles/']
# Folder that receives the sentiment results produced by this step
S3_OUTPUT_PREFIX = 'sentiment_results_sync/'

# Byte budget for the text sent to Comprehend in one synchronous request;
# kept below the 5000 byte limit that analyze_sentiment must respect.
MAX_TEXT_BYTES = 4900

# --- Service clients (both bound to the same region) ---
s3_client = boto3.client('s3', region_name=AWS_REGION)
comprehend_client = boto3.client('comprehend', region_name=AWS_REGION)

In [3]:
def analyze_sentiment(text_content, language_code='en', max_bytes=None):
    """
    Analyzes the sentiment of a text using Comprehend's synchronous API.

    Only the leading part of the text that fits into the byte budget is sent
    to Comprehend; the remainder is NOT analyzed (a warning is printed when
    that happens). The cut is always made on a UTF-8 character boundary, so a
    multi-byte character is never split.

    Args:
        text_content: The text to analyze (str).
        language_code: Language code passed to Comprehend. Defaults to 'en'.
        max_bytes: Maximum UTF-8 size of the text sent in the request.
            Defaults to the module-level MAX_TEXT_BYTES.

    Returns:
        dict: The response of `detect_sentiment` on success. For empty text,
        a NEUTRAL result with all-zero scores (no API call is made). If the
        Comprehend call fails, {'Sentiment': 'ERROR', 'SentimentScore': {}}.
    """
    if max_bytes is None:
        max_bytes = MAX_TEXT_BYTES

    # The limit applies to bytes, not characters, so measure the encoded text
    encoded_text = text_content.encode('utf-8')
    total_bytes = len(encoded_text)

    if total_bytes == 0:
        return {'Sentiment': 'NEUTRAL', 'SentimentScore': {'Mixed': 0, 'Positive': 0, 'Negative': 0, 'Neutral': 0}}

    # The bytes come from a valid str, so the only possible decode problem is a
    # multi-byte character cut in half at the end of the slice; errors='ignore'
    # drops exactly that partial character.
    string_chunk = encoded_text[:max_bytes].decode('utf-8', errors='ignore')

    if total_bytes > max_bytes:
        analyzed_bytes = len(string_chunk.encode('utf-8'))
        print(f"  [WARN] Text is {total_bytes} bytes; only the first {analyzed_bytes} bytes are analyzed.")

    try:
        return comprehend_client.detect_sentiment(
            Text=string_chunk,
            LanguageCode=language_code
        )
    except Exception as e:
        # Best effort: report the failure and hand back a placeholder so the
        # caller can continue with the next file.
        print(f"  [ERROR] Comprehend analysis failed on chunk. Error: {e}")
        return {'Sentiment': 'ERROR', 'SentimentScore': {}}

In [4]:
def process_s3_file(bucket_name, input_key):
    """
    Downloads a file, analyzes sentiment, and uploads the result.

    Args:
        bucket_name: S3 bucket that holds the input object and receives the result.
        input_key: Key of the UTF-8 text object to analyze.

    Returns:
        None. The result is written as JSON to
        S3_OUTPUT_PREFIX + <input file name with its extension replaced by '.json'>.
        Read and upload failures are printed, not raised.
    """

    print(f"\nProcessing file: {input_key}")

    # 1. Download file content from S3
    try:
        response = s3_client.get_object(Bucket=bucket_name, Key=input_key)
        # Read the file content as a string
        original_text = response['Body'].read().decode('utf-8')
    except Exception as e:
        print(f"  [ERROR] Failed to read object {input_key}. Skipping. Error: {e}")
        return

    # 2. Analyze Sentiment
    sentiment_response = analyze_sentiment(original_text)

    # 3. Create Result Object
    result_data = {
        'SourceFile': input_key,
        'Sentiment': sentiment_response['Sentiment'],
        'ConfidenceScores': sentiment_response['SentimentScore'],
        'LanguageCode': 'en',
        'AnalysisTime': time.strftime("%Y-%m-%d %H:%M:%S")
    }

    # 4. Upload result to S3
    # splitext swaps only the final extension, so '.txt' inside the name is left
    # alone and keys without a '.txt' suffix still get a '.json' result name.
    # NOTE(review): only the base name is kept, so equally named files under
    # different input prefixes write to the same output key.
    name_without_ext = os.path.splitext(os.path.basename(input_key))[0]
    output_key = S3_OUTPUT_PREFIX + name_without_ext + '.json'

    try:
        s3_client.put_object(
            Bucket=bucket_name,
            Key=output_key,
            Body=json.dumps(result_data, indent=2).encode('utf-8'),
            ContentType='application/json'
        )
    except Exception as e:
        print(f"  [ERROR] Failed to upload result object. Error: {e}")
        return

    if result_data['Sentiment'] == 'ERROR':
        # The analysis itself failed; the placeholder is still stored, but it
        # must not be reported as a successful analysis.
        print(f"  [WARN] Sentiment analysis failed; ERROR placeholder uploaded to s3://{bucket_name}/{output_key}")
    else:
        print(f"  [SUCCESS] Sentiment analyzed: {result_data['Sentiment']}. Results uploaded to s3://{bucket_name}/{output_key}")

In [5]:
if __name__ == "__main__":
    # Batch driver: run process_s3_file on every object under each input prefix.

    print(f"Starting synchronous sentiment analysis in region {AWS_REGION}...")

    # A single list_objects_v2 response is capped (1,000 keys); the paginator
    # follows the continuation tokens so larger prefixes are fully processed.
    paginator = s3_client.get_paginator('list_objects_v2')

    # Iterate over both input folders
    for prefix in S3_INPUT_PREFIXES:
        print(f"\n--- Listing files in {prefix} ---")

        found_objects = False
        for page in paginator.paginate(Bucket=S3_BUCKET_NAME, Prefix=prefix):
            # Pages for an empty prefix carry no 'Contents' entry
            for obj in page.get('Contents', []):
                found_objects = True
                input_key = obj['Key']

                # Skip the folder itself
                if input_key.endswith('/'):
                    continue

                process_s3_file(S3_BUCKET_NAME, input_key)
                time.sleep(1) # Be polite to the API rate limits

        if not found_objects:
            print("No files found.")

    # Report the configured output prefix instead of a hard-coded copy of it
    print(f"\nSynchronous sentiment analysis complete. Results are in the '{S3_OUTPUT_PREFIX}' folder.")

Starting synchronous sentiment analysis in region eu-west-1...

--- Listing files in raw_articles/english/ ---

Processing file: raw_articles/english/ai-bubble-us-economy_1765755103.txt
  [SUCCESS] Sentiment analyzed: NEUTRAL. Results uploaded to s3://aruzhan-sabira-hw3/sentiment_results_sync/ai-bubble-us-economy_1765755103.json

Processing file: raw_articles/english/are-we-in-an-ai-bubble-we-asked-european-investors_1765755129.txt
  [SUCCESS] Sentiment analyzed: NEUTRAL. Results uploaded to s3://aruzhan-sabira-hw3/sentiment_results_sync/are-we-in-an-ai-bubble-we-asked-european-investors_1765755129.json

Processing file: raw_articles/english/c11d1f63-a085-419d-bc62-030911459304_1765755107.txt
  [SUCCESS] Sentiment analyzed: NEUTRAL. Results uploaded to s3://aruzhan-sabira-hw3/sentiment_results_sync/c11d1f63-a085-419d-bc62-030911459304_1765755107.json

Processing file: raw_articles/english/japan-media-ai-threat_1765755117.txt
  [SUCCESS] Sentiment analyzed: NEUTRAL. Results uploaded to 