In [4]:
pip install requests pandas nltk


Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.0.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [10]:
pip install googletrans==4.0.0rc1

Collecting googletrans==4.0.0rc1
  Downloading googletrans-4.0.0rc1.tar.gz (20 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting httpx==0.13.3 (from googletrans==4.0.0rc1)
  Downloading httpx-0.13.3-py3-none-any.whl.metadata (25 kB)
Collecting hstspreload (from httpx==0.13.3->googletrans==4.0.0rc1)
  Downloading hstspreload-2025.1.1-py3-none-any.whl.metadata (2.1 kB)
Collecting chardet==3.* (from httpx==0.13.3->googletrans==4.0.0rc1)
  Downloading chardet-3.0.4-py2.py3-none-any.whl.metadata (3.2 kB)
Collecting idna==2.* (from httpx==0.13.3->googletrans==4.0.0rc1)
  Downloading idna-2.10-py2.py3-none-any.whl.metadata (9.1 kB)
Collecting rfc3986<2,>=1.3 (from httpx==0.13.3->googletrans==4.0.0rc1)
  Downloading rfc3986-1.5.0-py2.py3-none-any.whl.metadata (6.5 kB)
Collecting httpcore==0.9.* (from httpx==0.13.3->googletrans==4.0.0rc1)
  Downloading httpcore-0.9.1-py3-none-any.whl.metadata (4.6 kB)
Collecting h11<0.10,>=0.8 (

ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
aext-shared 4.1.0 requires anaconda-cloud-auth>=0.7.1, which is not installed.
jupyterlab 4.3.4 requires httpx>=0.25.0, but you have httpx 0.13.3 which is incompatible.
spyder 5.2.2 requires pyqt5<5.13, but you have pyqt5 5.15.10 which is incompatible.
spyder 5.2.2 requires pyqtwebengine<5.13, but you have pyqtwebengine 5.15.6 which is incompatible.

[notice] A new release of pip is available: 25.0.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import requests
import pandas as pd
import time
from datetime import datetime, timedelta
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Setup
nltk.download('vader_lexicon')
sia = SentimentIntensityAnalyzer()

API_KEY = 'ffe56edd243ebc59225204486a1d4c17'
BASE_URL = 'https://gnews.io/api/v4/search'

# Country codes for key Malaysia trading partners
countries = {
    'us': 'United States',
    'de': 'Germany', 
    'cn': 'China',
    'my': 'Malaysia',
    'vn': 'Vietnam',
    'kr': 'South Korea',
    'mx': 'Mexico'
}

# Focus: 2020 to 2023 only
date_list = pd.date_range(start='2020-01-01', end='2023-12-31', freq='MS')

# Define keywords and languages for each country
def get_search_params(country_code):
    """Get appropriate keywords and language for each country"""
    if country_code == 'cn':
        return {
            'keywords': ['关税 AND 电子', '贸易战 AND 电子产品', '进口税 AND 半导体'],
            'language': 'zh'
        }
    elif country_code == 'de':
        return {
            'keywords': ['Zoll AND Elektronik', 'Handelskrieg AND Elektronik', 'Einfuhrsteuer AND Halbleiter'],
            'language': 'de'
        }
    else:
        return {
            'keywords': ['tariff AND electronic', 'trade war AND electronics', 'import duty AND semiconductor'],
            'language': 'en'
        }

# Collect data
all_data = []
failed_requests = []

for code, name in countries.items():
    print(f"\n=== Collecting for {name} ({code}) ===")
    search_params = get_search_params(code)
    
    for keyword_set in search_params['keywords']:
        print(f"Using keywords: {keyword_set}")
        
        for date in date_list:
            from_date = date.strftime('%Y-%m-%d')
            to_date = (date + timedelta(days=30)).strftime('%Y-%m-%d')
            
            params = {
                'q': keyword_set,
                'lang': search_params['language'],
                'country': code,
                'from': from_date,
                'to': to_date,
                'max': 100,
                'apikey': API_KEY
            }
            
            try:
                response = requests.get(BASE_URL, params=params, timeout=30)
                
                if response.status_code == 200:
                    data = response.json()
                    articles = data.get('articles', [])
                    
                    if articles:
                        print(f"Found {len(articles)} articles for {name} on {from_date}")
                        
                        for article in articles:
                            # Handle potential None values
                            title = article.get('title') or ''
                            description = article.get('description') or ''
                            content = article.get('content') or ''
                            
                            # Combine text for sentiment analysis
                            text = f"{title}. {description}"
                            
                            # Calculate sentiment (works best on English text)
                            try:
                                sentiment_score = sia.polarity_scores(text)['compound']
                            except:
                                sentiment_score = 0.0
                            
                            all_data.append({
                                'country_code': code,
                                'country_name': name,
                                'language': search_params['language'],
                                'keywords_used': keyword_set,
                                'publishedAt': article.get('publishedAt'),
                                'title': title,
                                'description': description,
                                'content': content,
                                'url': article.get('url'),
                                'sentiment': sentiment_score,
                                'collection_date': datetime.now().strftime('%Y-%m-%d %H:%M:%S')
                            })
                    else:
                        print(f"No articles found for {name} on {from_date} with keywords: {keyword_set}")
                        
                elif response.status_code == 429:
                    print(f"Rate limit exceeded. Waiting longer...")
                    time.sleep(10)
                    continue
                    
                else:
                    error_info = {
                        'country': name,
                        'date': from_date,
                        'keywords': keyword_set,
                        'status_code': response.status_code,
                        'error': response.text
                    }
                    failed_requests.append(error_info)
                    print(f"Error {response.status_code} for {name} on {from_date}: {response.text}")
                    
            except requests.exceptions.RequestException as e:
                error_info = {
                    'country': name,
                    'date': from_date,
                    'keywords': keyword_set,
                    'status_code': 'Request Exception',
                    'error': str(e)
                }
                failed_requests.append(error_info)
                print(f"Request failed for {name} on {from_date}: {e}")
            
            # Respect API rate limits
            time.sleep(2)

# Create DataFrames
df = pd.DataFrame(all_data)
failed_df = pd.DataFrame(failed_requests)

# Save results
if not df.empty:
    df.to_csv('sentiment_tariff_electronics_enhanced.csv', index=False, encoding='utf-8')
    print(f"\n✅ Data collection completed! Collected {len(df)} articles.")
    
    # Print summary by country
    print("\n📊 Summary by country:")
    summary = df.groupby(['country_name', 'language']).size().reset_index(name='article_count')
    for _, row in summary.iterrows():
        print(f"  {row['country_name']} ({row['language']}): {row['article_count']} articles")
else:
    print("❌ No data collected.")

# Save failed requests for debugging
if failed_requests:
    failed_df.to_csv('failed_requests_log.csv', index=False)
    print(f"⚠️  {len(failed_requests)} requests failed. Check 'failed_requests_log.csv' for details.")

print("\n🎉 Process completed!")

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\razin\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!



=== Collecting for United States (us) ===
Using keywords: tariff AND electronic
Found 4 articles for United States on 2020-01-01
Found 4 articles for United States on 2020-02-01
Found 4 articles for United States on 2020-03-01
Found 4 articles for United States on 2020-04-01
Found 4 articles for United States on 2020-05-01
Found 4 articles for United States on 2020-06-01
Found 4 articles for United States on 2020-07-01
Found 4 articles for United States on 2020-08-01
Found 4 articles for United States on 2020-09-01
Found 4 articles for United States on 2020-10-01
Found 4 articles for United States on 2020-11-01
Found 4 articles for United States on 2020-12-01
Found 4 articles for United States on 2021-01-01
Found 4 articles for United States on 2021-02-01
Found 4 articles for United States on 2021-03-01
Found 4 articles for United States on 2021-04-01
Found 4 articles for United States on 2021-05-01
Found 4 articles for United States on 2021-06-01
Found 4 articles for United States on

In [3]:
# Install first if needed: !pip install deep-translator

import pandas as pd
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import time
import re

# Download NLTK data
nltk.download('vader_lexicon')
sia = SentimentIntensityAnalyzer()

# Try to import translation libraries (with fallbacks)
translator = None
translation_method = None

try:
    from deep_translator import GoogleTranslator
    translation_method = "deep_translator"
    print("Using deep-translator for translation")
except ImportError:
    try:
        from googletrans import Translator
        translator = Translator()
        translation_method = "googletrans"
        print("Using googletrans for translation")
    except ImportError:
        translation_method = "keyword_based"
        print("No translation library found, using keyword-based approach")

def translate_with_deep_translator(text, source_lang='auto', dest_lang='en'):
    """Translate using deep-translator"""
    try:
        if source_lang == 'zh':
            source_lang = 'chinese'
        elif source_lang == 'de':
            source_lang = 'german'
        
        translator = GoogleTranslator(source=source_lang, target=dest_lang)
        result = translator.translate(text[:500])  # Limit text length
        return result if result else text
    except Exception as e:
        print(f"Deep translator error: {e}")
        return text

def translate_with_googletrans(text, source_lang='auto', dest_lang='en'):
    """Translate using googletrans with better error handling"""
    try:
        result = translator.translate(text[:500], src=source_lang, dest=dest_lang)
        return result.text if result and hasattr(result, 'text') else text
    except Exception as e:
        print(f"Googletrans error: {e}")
        return text

def keyword_based_sentiment(text, language):
    """Fallback sentiment analysis based on keywords"""
    if not text or pd.isna(text):
        return 0.0
    
    # Enhanced keyword dictionaries
    positive_keywords = {
        'zh': ['增长', '上升', '改善', '积极', '良好', '成功', '合作', '发展', '机会', '优势', '利好', '推动', '促进', '繁荣'],
        'de': ['wachstum', 'verbesserung', 'positiv', 'gut', 'erfolg', 'zusammenarbeit', 'entwicklung', 
               'chance', 'vorteil', 'steigend', 'förderung', 'fortschritt', 'günstig', 'vorteilhaft'],
        'en': ['growth', 'positive', 'good', 'success', 'cooperation', 'improvement', 'opportunity', 
               'advantage', 'rising', 'beneficial', 'progress', 'favorable', 'boost', 'enhance']
    }
    
    negative_keywords = {
        'zh': ['下降', '减少', '困难', '问题', '冲突', '危机', '损失', '负面', '担忧', '挑战', '衰退', '恶化', '阻碍', '威胁'],
        'de': ['rückgang', 'reduzierung', 'schwierigkeit', 'problem', 'konflikt', 'krise', 'verlust', 
               'negativ', 'sorge', 'herausforderung', 'verschlechterung', 'hindernis', 'bedrohung', 'risiko'],
        'en': ['decline', 'reduction', 'difficulty', 'problem', 'conflict', 'crisis', 'loss', 'negative', 
               'concern', 'challenge', 'deterioration', 'obstacle', 'threat', 'risk']
    }
    
    # Add tariff-specific keywords
    tariff_negative = {
        'zh': ['关税', '贸易战', '制裁', '限制', '禁令'],
        'de': ['zoll', 'handelskrieg', 'sanktionen', 'beschränkung', 'verbot'],
        'en': ['tariff', 'trade war', 'sanctions', 'restriction', 'ban']
    }
    
    text_lower = text.lower()
    
    # Count keywords
    pos_count = sum(1 for word in positive_keywords.get(language, []) if word in text_lower)
    neg_count = sum(1 for word in negative_keywords.get(language, []) if word in text_lower)
    tariff_neg_count = sum(1 for word in tariff_negative.get(language, []) if word in text_lower)
    
    # Weight tariff-related negative terms more heavily
    total_neg = neg_count + (tariff_neg_count * 1.5)
    
    # Calculate sentiment score
    if pos_count + total_neg == 0:
        return 0.0
    
    sentiment_score = (pos_count - total_neg) / (pos_count + total_neg + 2)
    return max(-1.0, min(1.0, sentiment_score))

def robust_translate_and_analyze(text, language):
    """Robust translation and sentiment analysis with fallbacks"""
    if not text or pd.isna(text) or len(text.strip()) == 0:
        return 0.0
    
    # Clean text
    text = re.sub(r'http\S+|www\S+', '', text)  # Remove URLs
    text = text.strip()[:500]  # Limit length
    
    if language == 'en':
        # Direct analysis for English
        return sia.polarity_scores(text)['compound']
    
    # Try translation methods in order of preference
    translated_text = None
    
    if translation_method == "deep_translator":
        translated_text = translate_with_deep_translator(text, source_lang=language)
    elif translation_method == "googletrans":
        translated_text = translate_with_googletrans(text, source_lang=language)
    
    # If translation worked, analyze translated text
    if translated_text and translated_text != text and len(translated_text.strip()) > 0:
        try:
            return sia.polarity_scores(translated_text)['compound']
        except:
            pass
    
    # Fallback to keyword-based analysis
    return keyword_based_sentiment(text, language)

def fix_sentiment_robust(df):
    """Robust sentiment fixing with multiple fallbacks"""
    
    df_fixed = df.copy()
    df_fixed['sentiment_fixed'] = df_fixed['sentiment'].copy()
    
    # Find articles needing fixes
    needs_fix_mask = (df_fixed['sentiment'] == 0.0) & (df_fixed['language'].isin(['zh', 'de']))
    articles_to_fix = df_fixed[needs_fix_mask]
    
    print(f"Found {len(articles_to_fix)} articles needing sentiment fix")
    
    # Process in batches with progress tracking
    batch_size = 50
    total_processed = 0
    successful_fixes = 0
    
    for i in range(0, len(articles_to_fix), batch_size):
        batch = articles_to_fix.iloc[i:i+batch_size]
        print(f"Processing batch {i//batch_size + 1}/{(len(articles_to_fix)-1)//batch_size + 1}")
        
        for idx, row in batch.iterrows():
            try:
                # Combine title and description
                text = f"{row['title']} {row['description']}"
                
                # Analyze sentiment
                sentiment_score = robust_translate_and_analyze(text, row['language'])
                
                if sentiment_score != 0.0:
                    df_fixed.loc[idx, 'sentiment_fixed'] = sentiment_score
                    successful_fixes += 1
                
                total_processed += 1
                
                # Small delay to avoid rate limits
                if translation_method in ["deep_translator", "googletrans"]:
                    time.sleep(0.05)
                
            except Exception as e:
                print(f"Error processing article {idx}: {e}")
                # Try keyword-based as final fallback
                try:
                    text = f"{row['title']} {row['description']}"
                    sentiment_score = keyword_based_sentiment(text, row['language'])
                    df_fixed.loc[idx, 'sentiment_fixed'] = sentiment_score
                except:
                    pass
        
        print(f"  Processed {min((i+1)*batch_size, len(articles_to_fix))}/{len(articles_to_fix)} articles")
    
    print(f"\nCompleted! Successfully fixed {successful_fixes}/{total_processed} articles")
    return df_fixed

# Main execution
def main():
    # Load data
    print("Loading data...")
    df = pd.read_csv('sentiment_tariff_electronics_enhanced.csv')
    
    print(f"Loaded {len(df)} articles")
    print("\nCurrent sentiment distribution by language:")
    current_stats = df.groupby('language')['sentiment'].agg(['count', 'mean', lambda x: (x==0).sum()])
    current_stats.columns = ['Total', 'Mean_Sentiment', 'Zero_Count']
    print(current_stats.round(3))
    
    # Fix sentiment
    print(f"\nStarting sentiment analysis fix using {translation_method} method...")
    df_fixed = fix_sentiment_robust(df)
    
    # Save results
    df_fixed.to_csv('Data/sentiment_tariff_electronics_detail.csv', index=False)
    print("\nSaved: sentiment_tariff_electronics_detail.csv")
    
    # Create annual summary
    df_fixed['publishedAt'] = pd.to_datetime(df_fixed['publishedAt'])
    df_fixed['year'] = df_fixed['publishedAt'].dt.year
    
    annual_sentiment = df_fixed.groupby(['country_name', 'year'])['sentiment_fixed'].mean().reset_index()
    annual_sentiment.to_csv('Data/sentiment_tariff_electronics_annual_final.csv', index=False)
    print("Saved: sentiment_tariff_electronics_annual_final.csv")
    
    # Show results
    print("\nFixed sentiment distribution by language:")
    fixed_stats = df_fixed.groupby('language')['sentiment_fixed'].agg(['count', 'mean', lambda x: (x==0).sum()])
    fixed_stats.columns = ['Total', 'Mean_Sentiment', 'Zero_Count']
    print(fixed_stats.round(3))
    
    print("\nFixed sentiment by country:")
    country_stats = df_fixed.groupby('country_name')['sentiment_fixed'].agg(['mean', 'std', 'count'])
    print(country_stats.round(3))

if __name__ == "__main__":
    main()

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\razin\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


Using googletrans for translation
Loading data...
Loaded 6848 articles

Current sentiment distribution by language:
          Total  Mean_Sentiment  Zero_Count
language                                   
de          480          -0.308         240
en         5792          -0.131         432
zh          576           0.000         576

Starting sentiment analysis fix using googletrans method...
Found 816 articles needing sentiment fix
Processing batch 1/17
  Processed 50/816 articles
Processing batch 2/17
  Processed 816/816 articles
Processing batch 3/17
  Processed 816/816 articles
Processing batch 4/17
  Processed 816/816 articles
Processing batch 5/17
Googletrans error: invalid source language
Googletrans error: invalid source language
Googletrans error: invalid source language
Googletrans error: invalid source language
Googletrans error: invalid source language
Googletrans error: invalid source language
Googletrans error: invalid source language
Googletrans error: invalid source la