### Step 2: Use Amazon Translate to translate 6 articles from France, Germany, India, Russia, Brazil, and Thailand to English. 

In [1]:
import boto3
import io
import os
import time

In [2]:
# Region passed to both boto3 clients created at the bottom of this cell.
AWS_REGION = 'eu-west-1'
S3_BUCKET_NAME = 'aruzhan-sabira-hw3' 

# Key prefix that is listed for source articles, and the prefix prepended to
# each translated file's name when it is uploaded.
S3_INPUT_PREFIX = 'raw_articles/non-english/' 
S3_OUTPUT_PREFIX = 'translated_articles/' 

TARGET_LANGUAGE_CODE = 'en' # Translation target
MAX_TEXT_BYTES = 9900 # translate_text accepts at most 10,000 UTF-8 bytes of text per call; 9900 leaves a safety buffer.

# Module-level clients, created once so later cells can reuse them.
s3_client = boto3.client('s3', region_name=AWS_REGION)
translate_client = boto3.client('translate', region_name=AWS_REGION)

In [3]:
def _split_text_into_byte_chunks(text, max_bytes):
    """Split text into pieces of at most max_bytes UTF-8 bytes, preferring whitespace cuts.

    Concatenating the returned pieces reproduces ``text`` exactly. A piece never
    ends inside a multi-byte character. When the window contains a space or a
    newline, the cut is placed right after the last one so that words are not
    split; otherwise the cut falls on the nearest character boundary.
    """
    encoded = text.encode('utf-8')
    total_bytes = len(encoded)
    chunks = []
    start = 0

    while start < total_bytes:
        end = min(start + max_bytes, total_bytes)

        if end < total_bytes:
            # A continuation byte (0b10xxxxxx) at `end` means the cut would land
            # inside a character; step back to that character's lead byte.
            # With max_bytes >= 4 this always stops after `start`.
            while (encoded[end] & 0xC0) == 0x80:
                end -= 1

            # Prefer to cut just after the last space/newline inside the window.
            last_whitespace = max(
                encoded.rfind(b'\n', start, end),
                encoded.rfind(b' ', start, end),
            )
            if last_whitespace > start:
                end = last_whitespace + 1

        chunks.append(encoded[start:end].decode('utf-8'))
        start = end

    return chunks


def translate_and_upload_file(bucket_name, input_key, max_chunk_bytes=None):
    """
    Download a text object from S3, translate it chunk by chunk, and upload the result.

    The object is decoded as UTF-8 (undecodable bytes are replaced, with a
    warning, instead of being dropped). The text is split into chunks of at most
    ``max_chunk_bytes`` UTF-8 bytes, cut at whitespace where possible and never
    inside a character. Each chunk is sent to ``translate_client.translate_text``
    with automatic source-language detection. Whitespace at the edges of a chunk
    is kept out of the request and re-attached afterwards, so the translated
    pieces are joined without inserting anything at chunk boundaries.

    A chunk whose translation fails is written to the output wrapped in a
    ``[[TRANSLATION_ERROR: ...]]`` marker, and the final log line reports how
    many chunks failed. The result is uploaded under
    ``S3_OUTPUT_PREFIX + basename(input_key)``.

    Args:
        bucket_name: Bucket used for both the download and the upload.
        input_key: Key of the source object.
        max_chunk_bytes: Maximum UTF-8 size of one translation request.
            Defaults to the module-level ``MAX_TEXT_BYTES``.

    Returns:
        None. Progress and errors are reported with ``print``.

    Raises:
        ValueError: If ``max_chunk_bytes`` is smaller than 4, the longest
            UTF-8 encoding of a single character.
    """
    if max_chunk_bytes is None:
        max_chunk_bytes = MAX_TEXT_BYTES
    if max_chunk_bytes < 4:
        raise ValueError("max_chunk_bytes must be at least 4 bytes.")

    # 1. Download file content from S3 as bytes
    try:
        response = s3_client.get_object(Bucket=bucket_name, Key=input_key)
        original_bytes = response['Body'].read()
    except Exception as e:
        print(f"  [ERROR] Failed to read object {input_key}. Skipping. Error: {e}")
        return

    # 2. Decode once, up front. Falling back to replacement characters keeps the
    #    surrounding text instead of discarding a whole chunk on one bad byte.
    try:
        source_text = original_bytes.decode('utf-8')
    except UnicodeDecodeError:
        print("  [WARNING] Input is not valid UTF-8. Undecodable bytes were replaced.")
        source_text = original_bytes.decode('utf-8', errors='replace')

    # 3. Translate the text chunk by chunk
    chunks = _split_text_into_byte_chunks(source_text, max_chunk_bytes)
    translated_text_parts = []
    failed_chunks = 0

    for string_chunk in chunks:
        core_text = string_chunk.strip()
        if not core_text:
            # Whitespace only: nothing to translate, keep it verbatim.
            translated_text_parts.append(string_chunk)
            continue

        # Peel off edge whitespace so it survives regardless of how the
        # service treats leading/trailing whitespace.
        leading_len = len(string_chunk) - len(string_chunk.lstrip())
        leading_ws = string_chunk[:leading_len]
        trailing_ws = string_chunk[leading_len + len(core_text):]

        try:
            translation_response = translate_client.translate_text(
                Text=core_text,
                SourceLanguageCode='auto',
                TargetLanguageCode=TARGET_LANGUAGE_CODE
            )
            translated_chunk = translation_response['TranslatedText']
            time.sleep(0.1)  # brief pause between successive requests

        except translate_client.exceptions.UnsupportedLanguagePairException as e:
            if 'The language pair you provided is not supported' in str(e):
                print("  [Warning] Language pair unsupported (likely already English). Appending original chunk.")
                translated_chunk = core_text
            else:
                print(f"  [ERROR] Translation failed due to unsupported pair: {e}")
                translated_chunk = f"[[TRANSLATION_ERROR: {core_text}]]"
                failed_chunks += 1

        except Exception as e:
            # Best effort: keep going and leave a visible marker in the output.
            print(f"  [ERROR] General translation failure on chunk. Error: {e}")
            translated_chunk = f"[[TRANSLATION_ERROR: {core_text}]]"
            failed_chunks += 1

        translated_text_parts.append(leading_ws + translated_chunk + trailing_ws)

    # Every chunk carries its own boundary whitespace, so join with nothing.
    full_translated_text = "".join(translated_text_parts)

    # 4. Upload translated text to the new S3 folder
    original_filename = os.path.basename(input_key)
    output_key = S3_OUTPUT_PREFIX + original_filename

    try:
        s3_client.put_object(
            Bucket=bucket_name,
            Key=output_key,
            Body=full_translated_text.encode('utf-8'),
            ContentType='text/plain'
        )
    except Exception as e:
        print(f"  [ERROR] Failed to upload translated object. Error: {e}")
        return

    if failed_chunks:
        print(
            f"  [WARNING] Uploaded to s3://{bucket_name}/{output_key} with "
            f"{failed_chunks} of {len(chunks)} chunk(s) left untranslated."
        )
    else:
        print(f"  [SUCCESS] Translated and uploaded to s3://{bucket_name}/{output_key}")

In [4]:
if __name__ == "__main__":
    # Driver: translate every object found under the input prefix.
    print(f"Starting translation of files in s3://{S3_BUCKET_NAME}/{S3_INPUT_PREFIX}")

    # A single list_objects_v2 call returns at most 1,000 keys. The paginator
    # follows continuation tokens so that larger prefixes are fully processed.
    paginator = s3_client.get_paginator('list_objects_v2')
    pages = paginator.paginate(Bucket=S3_BUCKET_NAME, Prefix=S3_INPUT_PREFIX)

    found_any_objects = False
    for page in pages:
        # A page with no matching keys has no 'Contents' entry at all.
        for obj in page.get('Contents', []):
            found_any_objects = True
            input_key = obj['Key']

            # Skip the folder placeholder itself if the listing returns it
            if input_key.endswith('/'):
                continue

            print(f"\nProcessing file: {input_key}")
            translate_and_upload_file(S3_BUCKET_NAME, input_key)

    if not found_any_objects:
        print("No files found in the non-english input folder. Exiting.")

    print("\nTranslation script finished.")

Starting translation of files in s3://aruzhan-sabira-hw3/raw_articles/non-english/

Processing file: raw_articles/non-english/1206507_1765755140.txt
  [SUCCESS] Translated and uploaded to s3://aruzhan-sabira-hw3/translated_articles/1206507_1765755140.txt

Processing file: raw_articles/non-english/124362358.cms_1765755113.txt
  [SUCCESS] Translated and uploaded to s3://aruzhan-sabira-hw3/translated_articles/124362358.cms_1765755113.txt

Processing file: raw_articles/non-english/blasen-bei-ki-werten-platzen-sie-bald-oder-geht-da_1765755100.txt
  [SUCCESS] Translated and uploaded to s3://aruzhan-sabira-hw3/translated_articles/blasen-bei-ki-werten-platzen-sie-bald-oder-geht-da_1765755100.txt

Processing file: raw_articles/non-english/eksperty-vse-chasche-nazyvayut-bum-iskusstvennogo-_1765755110.txt
  [SUCCESS] Translated and uploaded to s3://aruzhan-sabira-hw3/translated_articles/eksperty-vse-chasche-nazyvayut-bum-iskusstvennogo-_1765755110.txt

Processing file: raw_articles/non-english/re