<a href="https://colab.research.google.com/github/woshixiyangyang/langchain-news-extractor/blob/main/Untitled4english.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from IPython.display import display, JSON
import ipywidgets as widgets
import json
import requests
import re

# Multi-line text box where the user pastes the article
text_area = widgets.Textarea(
    layout=widgets.Layout(height='200px', width='95%'),
    description='News Content:',
    placeholder='Please paste news content here...',
    value=''
)

# Button that starts the extraction when clicked
button = widgets.Button(
    layout=widgets.Layout(width='200px'),
    button_style='primary',
    description='Extract 5W1H'
)

# Output widget that receives everything printed by the click handler
output = widgets.Output()

# HTML widget for the JSON result panel (starts out empty)
download_link = widgets.HTML()

def extract_5w(news_text):
    """
    Build the 5W1H summary dictionary for a piece of news text.

    The work is done locally by the rule-based ``extract_*`` helpers defined
    in this file.

    Args:
        news_text: Raw article text as entered by the user.

    Returns:
        A dict with the keys "What", "Who", "When", "Where", "Why", "How"
        and "Original Text" (the stripped text, cut to 200 characters plus
        "..." when it is longer).  For blank input, or when any exception is
        raised while processing, a dict holding only an "error" message is
        returned instead.
    """
    try:
        body = news_text.strip()
        if not body:
            return {"error": "Please enter news content"}

        # One extractor per question, listed in the order the keys appear
        # in the result.
        extractors = (
            ("What", extract_what),
            ("Who", extract_who),
            ("When", extract_when),
            ("Where", extract_where),
            ("Why", extract_why),
            ("How", extract_how),
        )

        summary = {}
        for label, extractor in extractors:
            summary[label] = extractor(body)

        # Keep only a short preview of long articles.
        if len(body) > 200:
            summary["Original Text"] = body[:200] + "..."
        else:
            summary["Original Text"] = body

        return summary

    except Exception as exc:
        return {"error": f"Processing error: {str(exc)}"}

def extract_what(text):
    """Extract the event description (the first non-empty sentence).

    Args:
        text: News text to analyse.

    Returns:
        The first non-blank fragment obtained by splitting ``text`` on ".",
        "!" or "?", stripped of surrounding whitespace and terminated with
        ".".  If every fragment is blank, the placeholder
        "Unable to identify specific event" is returned.
    """
    # re.split always yields at least one element (possibly ""), so testing
    # the list itself can never detect "no sentence"; each fragment has to
    # be checked individually.
    for fragment in re.split(r'[.!?]', text):
        fragment = fragment.strip()
        if fragment:
            return fragment + "."
    return "Unable to identify specific event"

def extract_who(text):
    """Extract people and organisations mentioned in the text.

    Args:
        text: News text to analyse.

    Returns:
        Up to three distinct entities joined by ", ", in the order they were
        found (person-name matches first, then organisation matches).  If
        nothing matches, the placeholder "Relevant parties" is returned.
    """
    # Two capitalised words in a row, or a title followed by a capitalised word
    names = re.findall(r'[A-Z][a-z]+ [A-Z][a-z]+|Mr\. [A-Z][a-z]+|Ms\. [A-Z][a-z]+|Dr\. [A-Z][a-z]+|President [A-Z][a-z]+|CEO [A-Z][a-z]+|Director [A-Z][a-z]+', text)

    # A capitalised word followed by an organisation-type word
    orgs = re.findall(r'[A-Z][a-zA-Z]+ (?:Company|Corp|Corporation|Inc|University|Hospital|Department|Ministry|Agency)', text)

    # dict.fromkeys removes duplicates while keeping first-seen order, so the
    # result is deterministic; de-duplicating BEFORE slicing guarantees three
    # distinct entities whenever that many exist.
    entities = list(dict.fromkeys(names + orgs))
    if entities:
        return ", ".join(entities[:3])
    return "Relevant parties"

def extract_when(text):
    """Extract time expressions from the text.

    Args:
        text: News text to analyse.

    Returns:
        Up to three distinct time expressions joined by ", ".  Matches are
        collected pattern by pattern (in the order of ``time_patterns``) and,
        within a pattern, in order of appearance.  If nothing matches, the
        placeholder "Time to be confirmed" is returned.
    """
    # Match time expressions (all matched case-insensitively)
    time_patterns = [
        r'\d{4}[-/]\d{1,2}[-/]\d{1,2}',  # 2024-01-15 or 2024/01/15
        r'(?:January|February|March|April|May|June|July|August|September|October|November|December) \d{1,2}, \d{4}',
        r'(?:Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday)',
        r'today|yesterday|tomorrow|this morning|this afternoon|this evening|tonight',
        r'last week|next week|this week',
        r'\d{1,2}:\d{2} (?:AM|PM|am|pm)',
        r'at \d{1,2}:\d{2}',
        r'in \d{4}',
        r'on (?:Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday)'
    ]

    times = []
    for pattern in time_patterns:
        times.extend(re.findall(pattern, text, re.IGNORECASE))

    # De-duplicate in first-seen order before truncating: a set would make
    # the output order vary between runs, and slicing first could return
    # fewer than three distinct values.
    unique_times = list(dict.fromkeys(times))
    if unique_times:
        return ", ".join(unique_times[:3])
    return "Time to be confirmed"

def extract_where(text):
    """Extract location expressions from the text.

    Args:
        text: News text to analyse.

    Returns:
        Up to three distinct location matches joined by ", ".  Matches are
        collected pattern by pattern (in the order of ``place_patterns``)
        and, within a pattern, in order of appearance.  If nothing matches,
        the placeholder "Location to be confirmed" is returned.
    """
    # All patterns are applied with re.IGNORECASE.  Under that flag a plain
    # [A-Z] would also match lowercase letters, turning e.g. "in the [A-Z]..."
    # into "in the <anything>".  The scoped group (?-i:[A-Z]) switches case
    # sensitivity back on for the leading capital only.
    place_patterns = [
        r'in (?-i:[A-Z])[a-zA-Z ]+(?:City|State|Country|Province)',
        r'at (?-i:[A-Z])[a-zA-Z ]+(?:University|Hospital|School|Company|Building)',
        r'(?:New York|Los Angeles|Chicago|Houston|Phoenix|Philadelphia|San Antonio|San Diego|Dallas|San Jose|Austin|Jacksonville|Fort Worth|Columbus|Charlotte|San Francisco|Indianapolis|Seattle|Denver|Washington|Boston|El Paso|Nashville|Detroit|Oklahoma City|Portland|Las Vegas|Memphis|Louisville|Baltimore|Milwaukee|Albuquerque|Tucson|Fresno|Mesa|Sacramento|Atlanta|Kansas City|Colorado Springs|Omaha|Raleigh|Miami|Long Beach|Virginia Beach|Oakland|Minneapolis|Tulsa|Tampa|Arlington|New Orleans)',
        r'(?:United States|Canada|Mexico|Brazil|Argentina|United Kingdom|France|Germany|Italy|Spain|Russia|China|Japan|India|Australia|South Korea|Indonesia|Thailand|Vietnam|Philippines|Malaysia|Singapore)',
        r'(?-i:[A-Z])[a-zA-Z]+ (?:Street|Avenue|Road|Boulevard|Drive|Lane|Court|Place)',
        r'downtown (?-i:[A-Z])[a-zA-Z]+',
        r'in the (?-i:[A-Z])[a-zA-Z ]+',
        r'at the (?-i:[A-Z])[a-zA-Z ]+'
    ]

    places = []
    for pattern in place_patterns:
        places.extend(re.findall(pattern, text, re.IGNORECASE))

    # De-duplicate in first-seen order before truncating: a set would make
    # the output order vary between runs, and slicing first could return
    # fewer than three distinct values.
    unique_places = list(dict.fromkeys(places))
    if unique_places:
        return ", ".join(unique_places[:3])
    return "Location to be confirmed"

def extract_why(text):
    """Extract the sentence that states a reason or cause.

    Args:
        text: News text to analyse.

    Returns:
        The first sentence (split on ".", "!" or "?") containing one of the
        cause keywords as a whole word or phrase, stripped and terminated
        with ".".  Failing that, the first punctuation-terminated sentence
        matching one of the explanatory patterns.  If neither is found, the
        placeholder "Reason requires further analysis".
    """
    # Look for cause-indicating keywords
    cause_keywords = ['because', 'due to', 'owing to', 'as a result of', 'caused by', 'led to', 'resulted from', 'stemmed from', 'triggered by']
    # Word boundaries stop a keyword from firing inside unrelated words,
    # e.g. "led to" inside "called today".
    keyword_re = re.compile(
        r'\b(?:' + '|'.join(re.escape(keyword) for keyword in cause_keywords) + r')\b',
        re.IGNORECASE,
    )

    for sentence in re.split(r'[.!?]', text):
        if keyword_re.search(sentence):
            return sentence.strip() + "."

    # Also look for explanatory phrases.  The nouns only get a leading
    # boundary so inflected forms ("reasons", "caused") still match; the
    # connectives get both boundaries so "to" no longer matches inside
    # words such as "store" or "today".
    explanation_patterns = [
        r'[^.!?]*\b(?:reason|cause|motivation|purpose)[^.!?]*[.!?]',
        r'[^.!?]*\b(?:in order to|so that|to)\b[^.!?]*[.!?]'
    ]

    for pattern in explanation_patterns:
        matches = re.findall(pattern, text, re.IGNORECASE)
        if matches:
            return matches[0].strip()

    return "Reason requires further analysis"

def extract_how(text):
    """Extract the sentence that describes a method or process.

    Args:
        text: News text to analyse.

    Returns:
        The first sentence (split on ".", "!" or "?") containing a method
        keyword, stripped and terminated with ".".  Failing that, the first
        punctuation-terminated sentence matching one of the process
        patterns.  If neither is found, the placeholder
        "Method to be analyzed".
    """
    # Prepositions/phrases must match as whole words, otherwise "by" fires
    # inside "baby" or "nearby" and "via" inside "trivial".
    exact_keywords = ['through', 'by', 'via', 'using', 'with the help of', 'by means of']
    # Nouns only need a leading boundary so plurals ("methods") still match.
    noun_keywords = ['method', 'process', 'approach', 'technique']
    keyword_re = re.compile(
        r'\b(?:(?:' + '|'.join(re.escape(k) for k in exact_keywords) + r')\b'
        r'|(?:' + '|'.join(re.escape(k) for k in noun_keywords) + r'))',
        re.IGNORECASE,
    )

    for sentence in re.split(r'[.!?]', text):
        if keyword_re.search(sentence):
            return sentence.strip() + "."

    # Look for process descriptions.  The leading boundary keeps "way" from
    # matching inside "always" or "away".
    process_patterns = [
        r'[^.!?]*\b(?:process|procedure|method|way|manner|technique)[^.!?]*[.!?]',
        r'[^.!?]*\b(?:step by step|gradually|systematically)\b[^.!?]*[.!?]'
    ]

    for pattern in process_patterns:
        matches = re.findall(pattern, text, re.IGNORECASE)
        if matches:
            return matches[0].strip()

    return "Method to be analyzed"

def on_button_click(b):
    """Handle a click on the extract button.

    Reads the text area, runs ``extract_5w`` on its content, prints the
    outcome into the ``output`` widget and, on success, fills the JSON
    result panel via ``create_download_link``.

    Args:
        b: The button instance passed by the click callback (unused).
    """
    # Drop the JSON panel from any previous run first, so a failed or empty
    # attempt never leaves stale results on screen.
    download_link.value = ''

    with output:
        output.clear_output()

        news_text = text_area.value.strip()
        if not news_text:
            print("❌ Please enter news content first!")
            return

        print("🔄 Extracting 5W1H information...")
        result = extract_5w(news_text)

        # extract_5w reports failures as a dict with an "error" key
        if "error" in result:
            print(f"❌ {result['error']}")
            return

        print("✅ Extraction completed! Results:")
        print("=" * 50)

        # Display results (the text preview is only included in the JSON)
        for key, value in result.items():
            if key != "Original Text":
                print(f"📝 {key}: {value}")
                print("-" * 30)

        # Generate JSON download
        create_download_link(result)

def create_download_link(result):
    """Show the extraction result as copyable JSON in the result panel.

    Serialises ``result`` to indented JSON and stores an HTML snippet with a
    read-only textarea holding that JSON in the ``download_link`` widget.

    Args:
        result: JSON-serialisable dict produced by the extraction.
    """
    import html

    json_str = json.dumps(result, ensure_ascii=False, indent=2)

    # The JSON contains user-pasted text.  Escape it so that characters such
    # as "<" or a literal "</textarea>" cannot terminate the textarea and
    # inject markup; the browser decodes the entities back for display.
    safe_json = html.escape(json_str)

    # Create download link HTML
    download_html = f'''
    <div style="margin-top: 10px; padding: 10px; background-color: #f0f0f0; border-radius: 5px;">
        <strong>📁 Download Results:</strong><br>
        <textarea readonly style="width: 100%; height: 100px; margin-top: 5px;">{safe_json}</textarea>
        <p style="font-size: 12px; color: #666;">Tip: Copy the JSON content above and save it as a .json file</p>
    </div>
    '''
    download_link.value = download_html

# Register the click handler on the extract button
button.on_click(on_button_click)

# Print the title banner and the usage instructions, one entry per line
print(
    "📰 News 5W1H Information Extractor",
    "=" * 40,
    "Instructions:",
    "1. Paste news content in the text box",
    "2. Click 'Extract 5W1H' button",
    "3. View extraction results and download JSON file",
    "=" * 40,
    sep="\n",
)

# Render the widgets below the instructions
display(text_area, button, output, download_link)

📰 News 5W1H Information Extractor
Instructions:
1. Paste news content in the text box
2. Click 'Extract 5W1H' button
3. View extraction results and download JSON file


Textarea(value='', description='News Content:', layout=Layout(height='200px', width='95%'), placeholder='Pleas…

Button(button_style='primary', description='Extract 5W1H', layout=Layout(width='200px'), style=ButtonStyle())

Output()

HTML(value='')