### Step 5: Now we will extract and compare key words across articles.

In [1]:
import boto3
import pandas as pd
import json
import os
import io
import time

In [2]:
AWS_REGION = 'eu-west-1' 
S3_BUCKET_NAME = 'aruzhan-sabira-hw3' 

# S3 prefixes containing ALL English-language articles (same as before)
S3_INPUT_PREFIXES = [
    'raw_articles/english/', 
    'translated_articles/'
]

# S3 output folder where the FINAL AGGREGATED TABLE will be saved
S3_OUTPUT_TABLE = 'final_analysis_table/keywords_sentiment.csv'

MAX_TEXT_BYTES = 4900 
NOISE_PATTERNS = [ # To get more meaningful key phrases
    # Navigation/Section Headings (New & Expanded)
    'news', 'insights', 'events', 'platform', 'all stories', 'nation', 'astana', 
    'culture', 'sports', 'people', 'kazakhstan regions', 'state', 'vestbee',
    'archive', 'items', 'homepage', 'mundo', 'loading', 'Mike Blake', 'Dana omirgazy',
    
    # Generic Junk & Symbols
    '[[', ']]', '.txt', '.json', 'photo', 'picture', 'logo', 'copyright', 'shutterstock', 
    'illustration', 'view image', 'by author', 'twitter', 'facebook', 'instagram', 
    'linkedin', 'telegram', 'bookmark', 'pdf', 'oct', 'nov', 'dec', 'min read',
    'language code', 'read time', 'share this', 'email', 'menu',
    
    # Specific Media Site Junk (Based on latest output)
    'the asahi', 'the nikkei', 'the yomiuri shimbun', 'vc fund managers', 
    'vc summit', 'katarzyna groszkowska', 'lisa palchynska', 'kazakh', 'uzbekistan',
    'new delhi', '5th world nomad games', 'tourism'
    ]    

s3_client = boto3.client('s3', region_name=AWS_REGION)
comprehend_client = boto3.client('comprehend', region_name=AWS_REGION)

In [3]:
COUNTRY_MAPPING = {
    'ai-bubble-us-economy': 'The Guardian (USA)',
    'are-we-in-an-ai-bubble-we-asked-european-investors': 'Poland (Vestbee)',
    'japan-media-ai-threat': 'Japan (Japan Times)',
    'kazakhstan-advances-ai-digital-ecosystem-developme': 'Kazakhstan (Astana Times)',
    'uzbekistan-to-lay-off-over-2000-government-employe': 'Uzbekistan (Qaz Inform)',
    'eksperty-vse-chasche-nazyvayut-bum-iskusstvennogo-': 'Russia (Meduza)',
    'c11d1f63-a085-419d-bc62-030911459304': 'Australia (9 News)', 
    '124362358.cms': 'India (The Times of India)', 
    'blasen-bei-ki-werten-platzen-sie-bald-oder-geht-da': 'Germany (Handelsblatt)', 
    'resultados-da-oracle-sinalizam-bolha-de-ia-entenda': 'Brazil (CNN Brazil)',
    'une-bulle-de-l-intelligence-artificielle': 'France (La Gazette)',
    '1206507': 'Thailand (Bangkok BizNews)',
}

In [4]:
def extract_key_phrases(text_content):
    # 1. Prepare byte-safe chunk for analysis
    original_bytes = text_content.encode('utf-8')
    total_bytes = len(original_bytes)
    
    if total_bytes == 0:
        return []

    # Get the first chunk (up to 4900 bytes)
    end_index = min(MAX_TEXT_BYTES, total_bytes)
    while end_index < total_bytes and (original_bytes[end_index] & 0xC0) == 0x80:
        end_index -= 1

    byte_chunk = original_bytes[:end_index]
    
    try:
        string_chunk = byte_chunk.decode('utf-8')
    except UnicodeDecodeError:
        print("  [ERROR] Unicode decode error on chunk. Cannot analyze.")
        return []

    try:
        response = comprehend_client.detect_key_phrases(
            Text=string_chunk, 
            LanguageCode='en'
        )
        
        # 2. Filter key phrases based on confidence and noise patterns
        key_phrases = []
        for phrase in response['KeyPhrases']:
            phrase_text = phrase['Text'].strip()
            score = phrase['Score']
            
            # Filter 1: Confidence Score (Must be high for clean results)
            if score < 0.90: 
                continue 
            
            # Filter 2: Simple noise patterns and length check
            is_noise = any(noise in phrase_text.lower() for noise in NOISE_PATTERNS)
            is_too_short = len(phrase_text.split()) < 2
            
            if not is_noise and not is_too_short:
                key_phrases.append((phrase_text, score))
                
        return key_phrases
        
    except Exception as e:
        print(f"  [ERROR] Comprehend key phrase extraction failed. Error: {e}")
        return [("EXTRACTION_ERROR", 0.0)]



In [5]:
def process_s3_file_for_keywords(bucket_name, input_key):
    # 1. Download file content from S3
    try:
        response = s3_client.get_object(Bucket=bucket_name, Key=input_key)
        original_text = response['Body'].read().decode('utf-8')
    except Exception as e:
        print(f"  [ERROR] Failed to read object {input_key}. Skipping. Error: {e}")
        return None

    # Determine the safe chunk for API calls (for sentiment)
    text_to_analyze = original_text[:MAX_TEXT_BYTES] 

    # 2. Extract Key Phrases
    key_phrase_data = extract_key_phrases(original_text)

    # 3. Analyze Sentiment (for context)
    try:
        sentiment_response = comprehend_client.detect_sentiment(
            Text=text_to_analyze, 
            LanguageCode='en'
        )
    except Exception as e:
        sentiment_response = {'Sentiment': 'ERROR', 'SentimentScore': {}}

    # 4. Compile and return structured data
    return {
        'SourceFile': input_key,
        'MediaSource': os.path.basename(input_key).replace('.txt', ''),
        'Sentiment': sentiment_response['Sentiment'],
        'KeyPhrases': [text for text, score in key_phrase_data],
        'TopKeyPhrases': ", ".join([text for text, score in key_phrase_data][:5]) # Get top 5 as a string
    }

In [6]:
if __name__ == "__main__":
    
    all_article_data = []

    for prefix in S3_INPUT_PREFIXES:
        file_list_response = s3_client.list_objects_v2(
            Bucket=S3_BUCKET_NAME, 
            Prefix=prefix
        )
        
        if 'Contents' not in file_list_response:
            continue
        
        for obj in file_list_response['Contents']:
            input_key = obj['Key']
            
            if input_key.endswith('/') or not input_key.endswith('.txt'):
                continue
            
            # Process the file
            # This calls the function that extracts keywords and sentiment
            data = process_s3_file_for_keywords(S3_BUCKET_NAME, input_key)
            if data:
                all_article_data.append(data)
            
            time.sleep(1) 

    if not all_article_data:
        print("\nNo articles were successfully processed.")
    else:
        final_df = pd.DataFrame(all_article_data)
        
        final_df['MediaSource_Clean'] = final_df['MediaSource'].apply(
            lambda x: x.rsplit('_', 1)[0] if isinstance(x, str) and '_' in x else x
        )

        # Apply the mapping 
        final_df['Country/Region'] = final_df['MediaSource_Clean'].map(COUNTRY_MAPPING)
        final_df['Country/Region'] = final_df['Country/Region'].fillna(final_df['MediaSource_Clean'])
        final_df['MediaSource'] = final_df['Country/Region']
        final_df = final_df.drop(columns=['MediaSource_Clean', 'Country/Region'])
        
        csv_buffer = io.StringIO()
        final_df.to_csv(csv_buffer, index=False)
        
        s3_client.put_object(
            Bucket=S3_BUCKET_NAME, 
            Key=S3_OUTPUT_TABLE, 
            Body=csv_buffer.getvalue().encode('utf-8')
        )
        print(f"\n✅ Full results saved to s3://{S3_BUCKET_NAME}/{S3_OUTPUT_TABLE}")
        
        print("\n--- Final DataFrame Head (with Renamed MediaSource) ---")
        print(final_df[['MediaSource', 'Sentiment', 'TopKeyPhrases']].head().to_markdown(index=False))


✅ Full results saved to s3://aruzhan-sabira-hw3/final_analysis_table/keywords_sentiment.csv

--- Final DataFrame Head (with Renamed MediaSource) ---
| MediaSource               | Sentiment   | TopKeyPhrases                                                                                             |
|:--------------------------|:------------|:----------------------------------------------------------------------------------------------------------|
| The Guardian (USA)        | NEUTRAL     | The question, the AI bubble, the fallout, Eduardo Porter                                                  |
|                           |             | Will, the bubble                                                                                          |
| Poland (Vestbee)          | NEUTRAL     | the biggest, invitation-only event, an AI bubble, European investors, technology coverage, this year      |
| Australia (9 News)        | NEUTRAL     | an AI tech bubble, the valuations, the company

In [7]:
display_df = final_df[['MediaSource', 'TopKeyPhrases']]
display_df

Unnamed: 0,MediaSource,TopKeyPhrases
0,The Guardian (USA),"The question, the AI bubble, the fallout, Edua..."
1,Poland (Vestbee),"the biggest, invitation-only event, an AI bubb..."
2,Australia (9 News),"an AI tech bubble, the valuations, the company..."
3,Japan (Japan Times),"a lawsuit, the Tokyo District Court, Generativ..."
4,Kazakhstan (Astana Times),"’s Presidency, ’s Presidency, Digital Ecosyste..."
5,Uzbekistan (Qaz Inform),"2,000 government employees, AI integration\r\n..."
6,Thailand (Bangkok BizNews),"DOLLAR AI BUBBLE, BIG TECH FEARS MARCH, LOANS ..."
7,India (The Times of India),"The boom, Artificial Intelligence, the form, a..."
8,Germany (Handelsblatt),"AI values, The numerous headlines warning, an ..."
9,Russia (Meduza),"the AI boom, a financial bubble, tech companie..."
