# Semantic Text Analyzer Demo Notebook


## Setup and Environment

In [1]:

import os
import sys
from pathlib import Path
import asyncio
import logging
from typing import Dict, Any, Optional
from pprint import pprint

import pandas as pd

# Add project root to Python path
project_root = str(Path().resolve().parent)
if project_root not in sys.path:
    sys.path.append(project_root)
    print(f"Added {project_root} to Python path")

# Core components
from src.semantic_analyzer.analyzer import SemanticAnalyzer
from src.utils.FileUtils.file_utils import FileUtils
from src.loaders.parameter_adapter import ParameterAdapter
from src.core.config import AnalyzerConfig
from src.loaders.models import ParameterSet

from src.analyzers.keyword_analyzer import KeywordAnalyzer
from src.semantic_analyzer.parameters import ParameterManager

# Initialize FileUtils and set up logging
file_utils = FileUtils()
logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger(__name__)



Added /home/topi/data-science/repos/semantic-text-analyzer to Python path


## Environment Verification

In [2]:
def verify_environment():
    """Verify that the notebook environment is properly configured."""
    # Load environment variables
    from dotenv import load_dotenv
    env_path = Path(project_root) / ".env"
    env_loaded = load_dotenv(env_path)

    # Required environment variables
    required_env_vars = [
        'OPENAI_API_KEY',
        'ANTHROPIC_API_KEY',
    ]

    # Basic checks
    basic_checks = {
        "Project root in path": project_root in sys.path,
        "Can import src": "src" in sys.modules,
        "FileUtils initialized": hasattr(file_utils, "project_root"),
        ".env file loaded": env_loaded,
    }

    # Environment variable checks
    env_var_checks = {
        f"{var} set": os.getenv(var) is not None
        for var in required_env_vars
    }

    # Check for required paths using FileUtils
    expected_paths = {
        "Raw data": file_utils.get_data_path("raw"),
        "Processed data": file_utils.get_data_path("processed"),
        "Configuration": file_utils.get_data_path("configurations"),
        "Main config.yaml": Path(project_root) / "config.yaml"
    }
    
    path_checks = {
        f"{name} exists": path.exists()
        for name, path in expected_paths.items()
    }

    # Combine all checks
    all_checks = {
        **basic_checks,
        **env_var_checks,
        **path_checks
    }
    
    # Print results
    print("Environment Check Results:")
    print("=" * 50)
    
    def print_section(title, checks):
        print(f"\n{title}:")
        print("-" * len(title))
        for check, result in checks.items():
            status = "✓" if result else "✗"
            print(f"{status} {check}")
    
    print_section("Basic Setup", basic_checks)
    print_section("Environment Variables", env_var_checks)
    print_section("Project Structure", path_checks)
    
    # Overall status
    all_passed = all(all_checks.values())
    print("\n" + "=" * 50)
    print("Environment Status:", "Ready ✓" if all_passed else "Setup needed ✗")
    
    if not all_passed:
        print("\nSetup Instructions:")
        if not env_loaded:
            print("- Create a .env file in the project root with required API keys")
        for var in required_env_vars:
            if not os.getenv(var):
                print(f"- Add {var} to your .env file")
        for name, path in expected_paths.items():
            if not path.exists():
                print(f"- Create {name} directory at {path}")

    return all_passed

# Run verification
verify_environment()



Environment Check Results:

Basic Setup:
-----------
✓ Project root in path
✓ Can import src
✓ FileUtils initialized
✓ .env file loaded

Environment Variables:
---------------------
✓ OPENAI_API_KEY set
✓ ANTHROPIC_API_KEY set

Project Structure:
-----------------
✓ Raw data exists
✓ Processed data exists
✓ Configuration exists
✓ Main config.yaml exists

Environment Status: Ready ✓


True

In [4]:
# verify_stopwords.ipynb

import os
import sys
from pathlib import Path
import logging
from pprint import pprint

# Add project root to Python path
project_root = str(Path().resolve().parent)
if project_root not in sys.path:
    sys.path.append(project_root)

from src.core.language_processing import create_text_processor
from src.utils.FileUtils.file_utils import FileUtils

# Initialize logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

def verify_stopwords():
    """Verify stopwords for both English and Finnish."""
    
    file_utils = FileUtils()
    
    # Check stopwords files
    stop_words_dir = file_utils.get_data_path("configurations") / "stop_words"
    print("\nStopwords Directory Contents:")
    print(f"Directory: {stop_words_dir}")
    if stop_words_dir.exists():
        for file in stop_words_dir.glob("*.txt"):
            print(f"\nFile: {file.name}")
            with open(file, 'r', encoding='utf-8') as f:
                words = {line.strip() for line in f if line.strip()}
                print(f"Number of words: {len(words)}")
                print("Sample words (first 10):")
                pprint(sorted(list(words))[:10])
    else:
        print("Stopwords directory not found!")
    
    # Test processors
    print("\nTesting Language Processors:")
    for lang in ["en", "fi"]:
        print(f"\n{lang.upper()} Processor:")
        processor = create_text_processor(language=lang)
        
        # Check stopwords
        if hasattr(processor, '_stop_words'):
            print(f"Number of stopwords: {len(processor._stop_words)}")
            print("Sample stopwords (first 10):")
            pprint(sorted(list(processor._stop_words))[:10])
        
        # Test some common words
        test_words = {
            "en": ["the", "and", "is", "that", "with", "for"],
            "fi": ["ja", "on", "että", "joka", "tai", "vain"]
        }
        
        print(f"\nTesting common {lang} words:")
        for word in test_words[lang]:
            print(f"{word}: {'stop word' if processor.is_stop_word(word) else 'not stop word'}")

# Run verification
verify_stopwords()

2024-11-09 22:10:25 - src.core.language_processing.factory - INFO - Using default configuration
2024-11-09 22:10:25 - src.core.language_processing.factory - INFO - Using default configuration
2024-11-09 22:10:25 - src.core.language_processing.finnish - INFO - Loaded 747 stopwords from /home/topi/data-science/repos/semantic-text-analyzer/data/configurations/stop_words/fi.txt
2024-11-09 22:10:25 - src.core.language_processing.finnish - INFO - Successfully initialized Voikko using system libraries



Stopwords Directory Contents:
Directory: /home/topi/data-science/repos/semantic-text-analyzer/data/configurations/stop_words

File: fi.txt
Number of words: 747
Sample words (first 10):
['aika',
 'aikaa',
 'aikaan',
 'aikaisemmin',
 'aikaisin',
 'aikajen',
 'aikana',
 'aikoina',
 'aikoo',
 'aikovat']

File: en.txt
Number of words: 4
Sample words (first 10):
['a', 'an', 'and', 'the']

Testing Language Processors:

EN Processor:
Number of stopwords: 242
Sample stopwords (first 10):
["'d", "'ll", "'re", "'s", "'t", "'ve", 'a', 'about', 'above', 'actually']

Testing common en words:
the: stop word
and: stop word
is: stop word
that: stop word
with: stop word
for: stop word

FI Processor:
Number of stopwords: 774
Sample stopwords (first 10):
['aiempi',
 'aika',
 'aikaa',
 'aikaan',
 'aikaisemmin',
 'aikaisin',
 'aikajen',
 'aikana',
 'aikoina',
 'aikoo']

Testing common fi words:
ja: stop word
on: stop word
että: stop word
joka: stop word
tai: stop word
vain: stop word


In [3]:
# test_keywords.ipynb

import os
import sys
from pathlib import Path
import asyncio
import logging
from typing import Dict, Any, Optional
from pprint import pprint

# Add project root to Python path
project_root = str(Path().resolve().parent)
if project_root not in sys.path:
    sys.path.append(project_root)

# Core components
from src import KeywordAnalyzer
from src.core.language_processing import create_text_processor

# # Initialize logging
# logging.basicConfig(level=logging.INFO)
# logger = logging.getLogger(__name__)


In [5]:
# test_keywords.ipynb

import os
import sys
from pathlib import Path
import asyncio
import logging
from typing import Dict, Any, Optional
from pprint import pprint

# Add project root to Python path
project_root = str(Path().resolve().parent)
if project_root not in sys.path:
    sys.path.append(project_root)

# Core components
from src import KeywordAnalyzer
from src.core.language_processing import create_text_processor
from src.schemas import KeywordInfo

# Initialize logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Test texts
test_texts = {
    "ai_ml": """
    Machine learning is a branch of artificial intelligence that focuses on developing systems 
    that can learn from and make decisions based on data. Deep learning, a subset of machine 
    learning, uses neural networks with multiple layers to analyze various levels of abstraction.
    """,
    
    "nlp": """
    Natural Language Processing (NLP) combines linguistics and computer science to help 
    computers understand and process human language. Key tasks include sentiment analysis, 
    text classification, and machine translation.
    """,
    
    "finnish": """
    Koneoppiminen on tekoälyn osa-alue, joka keskittyy järjestelmiin, jotka oppivat datasta. 
    Syväoppiminen käyttää neuroverkkoja monimutkaisten mallien analysointiin.
    """
}

async def test_keyword_analysis(text: str, language: str = "en"):
    """Test keyword extraction for a given text."""
    
    # Create language processor
    language_processor = create_text_processor(language=language)
    
    # Initialize analyzer with language processor
    analyzer = KeywordAnalyzer(
        config={
            "max_keywords": 10,
            "min_length": 3,
            "weights": {
                "statistical": 0.4,
                "llm": 0.6
            }
        },
        language_processor=language_processor
    )
    
    print(f"\nAnalyzing Text:")
    print("=" * 50)
    print(f"Text: {text[:100]}...")
    print("-" * 50)
    
    # Run analysis
    result = await analyzer.analyze(text)
    
    # Display results
    print("\nExtracted Keywords:")
    if result.keywords:  # Check if we have keywords
        for kw in result.keywords:
            if isinstance(kw, KeywordInfo):
                # Handle KeywordInfo objects
                print(f"- {kw.keyword:<30} (score: {kw.score:.3f})")
                if kw.domain:
                    print(f"  Domain: {kw.domain}")
                if kw.compound_parts:
                    print(f"  Parts: {', '.join(kw.compound_parts)}")
            else:
                # Handle string or dict keywords
                if isinstance(kw, dict):
                    print(f"- {kw.get('keyword', 'N/A'):<30} (score: {kw.get('score', 0.0):.3f})")
                else:
                    print(f"- {str(kw)}")
    
    print("\nCompound Words:")
    if result.compound_words:
        for compound in result.compound_words:
            print(f"- {compound}")
    
    print("\nLanguage:", result.language)
    print("Success:", result.success)
    if result.error:
        print("Error:", result.error)

async def run_tests():
    """Run tests for all sample texts."""
    print("\nTesting English texts...")
    await test_keyword_analysis(test_texts["ai_ml"], "en")
    await test_keyword_analysis(test_texts["nlp"], "en")
    
    print("\nTesting Finnish text...")
    await test_keyword_analysis(test_texts["finnish"], "fi")

# Run tests
await run_tests()

2024-11-09 22:10:49 - src.core.language_processing.factory - INFO - Using default configuration



Testing English texts...

Analyzing Text:
Text: 
    Machine learning is a branch of artificial intelligence that focuses on developing systems 
   ...
--------------------------------------------------


2024-11-09 22:11:12 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-11-09 22:11:12 - src.core.language_processing.factory - INFO - Using default configuration



Extracted Keywords:
- machine
- learn
- branch
- artificial
- intelligence
- focus
- develop
- system
- make
- decision

Compound Words:
- machine learning
- artificial intelligence
- deep learning
- neural networks

Language: en
Success: True

Analyzing Text:
Text: 
    Natural Language Processing (NLP) combines linguistics and computer science to help 
    comput...
--------------------------------------------------


2024-11-09 22:11:14 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-11-09 22:11:14 - src.core.language_processing.factory - INFO - Using default configuration
2024-11-09 22:11:14 - src.core.language_processing.finnish - INFO - Loaded 747 stopwords from /home/topi/data-science/repos/semantic-text-analyzer/data/configurations/stop_words/fi.txt
2024-11-09 22:11:14 - src.core.language_processing.finnish - INFO - Successfully initialized Voikko using system libraries



Extracted Keywords:
- natural
- language
- process
- nlp
- combine
- linguistics
- computer
- science
- help
- understand

Compound Words:
- Natural Language
- machine translation

Language: en
Success: True

Testing Finnish text...

Analyzing Text:
Text: 
    Koneoppiminen on tekoälyn osa-alue, joka keskittyy järjestelmiin, jotka oppivat datasta. 
    S...
--------------------------------------------------


2024-11-09 22:11:18 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"



Extracted Keywords:
- koneoppia
- olla
- tekoäly
- osa-alue
- joka
- keskittyä
- järjestelmä
- oppia
- data
- syväoppia

Compound Words:
- koneoppiminen
- syväoppiminen
- neuroverkot

Language: fi
Success: True


## Helper Functions and Test Data Setup

In [6]:
class NotebookAnalyzer:
    """Helper class for running analyses in the notebook."""
    
    def __init__(self):
        self.file_utils = FileUtils()
        self.test_texts = self._load_test_texts()
        
        # Use parameter file from configurations directory
        self.parameter_file = self.file_utils.get_data_path("configurations") / "example_params.xlsx"
        
        # Create the parameter file if it doesn't exist
        if not self.parameter_file.exists():
            self._create_parameter_file()
        
        self.param_adapter = ParameterAdapter(self.parameter_file)
        
        logger.info(f"Initialized NotebookAnalyzer with {len(self.test_texts)} test texts")
    
    def _load_test_texts(self) -> Dict[str, str]:
        """Load or create test texts."""
        texts = {
            "technical": """
            Python is a high-level programming language known for its simplicity.
            It supports multiple programming paradigms including procedural and
            object-oriented programming.
            """,
            "business": """
            The company's Q3 results exceeded expectations with revenue growth of 15%.
            Customer acquisition costs decreased while retention rates improved.
            The board has approved a new strategic initiative focusing on expansion.
            """,
            "finnish": """
            Ohjelmistokehittäjä työskentelee asiakasprojektissa kehittäen
            verkkokauppajärjestelmää. Tekninen toteutus vaatii erityistä huomiota
            tietoturvan osalta.
            """
        }
        
        # Save texts using FileUtils
        df = pd.DataFrame([
            {"name": name, "content": content.strip()}
            for name, content in texts.items()
        ])
        
        self.file_utils.save_data_to_disk(
            data={"texts": df},
            output_type="raw",
            file_name="test_texts",
            output_filetype="xlsx",
            include_timestamp=False
        )
        
        return texts

    def _create_parameter_file(self):
        """Create initial parameter Excel file."""
        parameters_data = {
            'General Parameters': pd.DataFrame({
                'parameter': [
                    'max_kws',
                    'max_themes',
                    'focus_on',
                    'language',
                    'additional_context',
                    'min_confidence',
                    'include_compounds',
                    'column_name_to_analyze'
                ],
                'value': [
                    8,
                    3,
                    'business and financial content',
                    'en',
                    'Business performance analysis',
                    0.3,
                    True,
                    'text'
                ]
            }),
            'Excluded Keywords': pd.DataFrame({
                'keyword': [
                    'the', 'with', 'while', 'new',
                    'company', 'business', 'results',
                    'approximately', 'significantly',
                    'current', 'various', 'multiple'
                ],
                'reason': [
                    'Common word', 'Common word', 'Common word', 'Common word',
                    'Too generic', 'Too generic', 'Too generic',
                    'Modifier', 'Modifier',
                    'Vague term', 'Vague term', 'Vague term'
                ]
            }),
            'Categories': pd.DataFrame({
                'category': [
                    'business_performance',
                    'financial_metrics',
                    'strategy',
                    'operations'
                ],
                'description': [
                    'Business performance indicators and results',
                    'Financial and revenue related metrics',
                    'Strategic initiatives and planning',
                    'Operational metrics and processes'
                ],
                'keywords': [
                    'revenue,growth,performance,results,expectations',
                    'costs,revenue,profit,margin,acquisition',
                    'initiative,strategic,expansion,planning,board',
                    'operations,efficiency,retention,improvement,process'
                ],
                'threshold': [0.6, 0.6, 0.6, 0.6]
            })
        }
        
        self.file_utils.save_data_to_disk(
            data=parameters_data,
            output_type="configurations",
            file_name="example_params",
            output_filetype="xlsx",
            include_timestamp=False
        )

    async def analyze_text(self, text_key: str, **kwargs):
        """Analyze a specific text sample."""
        if text_key not in self.test_texts:
            raise ValueError(f"Unknown text key: {text_key}. Available keys: {list(self.test_texts.keys())}")
        
        # Load parameters
        params = self.param_adapter.load_and_convert()
        
        # Get excluded keywords from parameter file
        excluded_keywords = set()
        try:
            excluded_df = pd.read_excel(self.parameter_file, sheet_name='Excluded Keywords')
            excluded_keywords = set(excluded_df['keyword'].dropna())
        except Exception as e:
            logger.warning(f"Could not load excluded keywords: {e}")
        
        # Create analyzer with explicit config
        analyzer = SemanticAnalyzer(
            parameter_file=self.parameter_file,
            config={
                "excluded_keywords": excluded_keywords,
                "min_keyword_length": 3,
                **kwargs.get('config', {})
            },
            **kwargs
        )
        
        # Run analysis
        results = await analyzer.analyze(self.test_texts[text_key], **kwargs)
        await self.display_results(results)
        return results
            
    
    @staticmethod
    async def display_results(results: Dict[str, Any]) -> None:
        """Display analysis results in a formatted way."""
        for analysis_type, data in results.items():
            print(f"\n{'='*20} {analysis_type.upper()} {'='*20}")
            if isinstance(data, dict):
                for key, value in data.items():
                    if isinstance(value, (list, dict)):
                        print(f"\n{key}:")
                        print(value)
                    else:
                        print(f"{key}: {value}")
            else:
                print(data)

In [7]:
# Initialize analyzer
notebook_analyzer = NotebookAnalyzer()

2024-11-09 22:12:32 - src.utils.FileUtils.file_utils - INFO - Data saved to /home/topi/data-science/repos/semantic-text-analyzer/data/raw/test_texts.xlsx
2024-11-09 22:12:32 - src.utils.FileUtils.file_utils - INFO - Successfully loaded 3 sheets from /home/topi/data-science/repos/semantic-text-analyzer/data/configurations/example_params.xlsx
2024-11-09 22:12:32 - __main__ - INFO - Initialized NotebookAnalyzer with 3 test texts


### Verify setup


In [8]:
async def verify_setup():
    """Verify analyzer setup and available texts."""
    print("=== Setup Verification ===")
    print("\nAvailable test texts:")
    for name, text in notebook_analyzer.test_texts.items():
        print(f"\n{name}:")
        print(text[:100] + "...")  # Show first 100 chars
    
    print("\n=== Parameter File ===")
    print(f"Location: {notebook_analyzer.parameter_file}")
    print("Parameters loaded:", notebook_analyzer.param_adapter is not None)

# Run verification
await verify_setup()

=== Setup Verification ===

Available test texts:

technical:

            Python is a high-level programming language known for its simplicity.
            It su...

business:

            The company's Q3 results exceeded expectations with revenue growth of 15%.
            ...

finnish:

            Ohjelmistokehittäjä työskentelee asiakasprojektissa kehittäen
            verkkokauppaj...

=== Parameter File ===
Location: /home/topi/data-science/repos/semantic-text-analyzer/data/configurations/example_params.xlsx
Parameters loaded: True


In [9]:
# Verify parameter loading and stopword handling
async def verify_parameters():
    """Verify parameter loading and stopword handling."""
    # Load parameters
    params = notebook_analyzer.param_adapter.load_and_convert()
    
    print("=== Parameter Verification ===")
    print("\nExcluded Keywords from parameters:")
    if hasattr(params, 'excluded_keywords'):
        print(sorted(list(params.excluded_keywords)))
    else:
        print("No excluded keywords in parameters")
    
    # Create analyzer with parameters
    analyzer = SemanticAnalyzer(
        parameter_file=notebook_analyzer.parameter_file,
        language="en"
    )
    
    print("\n=== Language Processing Setup ===")
    print(f"Language: {analyzer.text_processor.language}")
    print(f"\nStopwords (NLTK + custom):")
    print(f"Total stopwords: {len(analyzer.text_processor._stop_words)}")
    print("\nSample stopwords (first 20):")
    print(sorted(list(analyzer.text_processor._stop_words))[:20])
    
    print(f"\nExcluded keywords from config:")
    print(sorted(list(analyzer.text_processor.excluded_keywords)))
    
    return analyzer

# Run verification
# analyzer = await verify_parameters()

# Test analysis with verified setup
# results = await analyzer.analyze(
#     notebook_analyzer.test_texts['business'],
#     analysis_types=["keywords", "themes", "categories"]
# )

In [10]:
async def test_keyword_extraction():
    """Test keyword extraction with full debug output."""
    text_key = "business"
    text = notebook_analyzer.test_texts[text_key]
    
    # Create analyzer with explicit config
    excluded_df = pd.read_excel(notebook_analyzer.parameter_file, sheet_name='Excluded Keywords')
    excluded_keywords = set(excluded_df['keyword'].dropna())
    
    analyzer = SemanticAnalyzer(
        parameter_file=notebook_analyzer.parameter_file,
        language="en",
        config={
            "excluded_keywords": excluded_keywords,
            "min_keyword_length": 3,
            "max_keywords": 8,
            "focus": "business metrics and performance"
        }
    )
    
    print("=== Keyword Extraction Test ===")
    print("\nOriginal text:")
    print(text)
    
    # Show processing steps
    tokens = analyzer.text_processor.tokenize(text)
    print("\nTokens after stopword removal:")
    filtered = [word for word in tokens 
               if not analyzer.text_processor.should_exclude_word(word)]
    print(filtered)
    
    # Run analysis
    results = await analyzer.analyze(
        text, 
        analysis_types=["keywords"]
    )
    
    print("\nExtracted Keywords:")
    if "keywords" in results:
        keywords = results["keywords"]
        print("\nKeywords:", keywords.get("keywords", []))
        print("\nCompound words:", keywords.get("compound_words", []))
        print("\nDomain keywords:", keywords.get("domain_keywords", {}))

# Run the test
await test_keyword_extraction()

2024-11-09 22:12:48 - src.utils.FileUtils.file_utils - INFO - Successfully loaded 3 sheets from /home/topi/data-science/repos/semantic-text-analyzer/data/configurations/example_params.xlsx
2024-11-09 22:12:48 - src.utils.FileUtils.file_utils - INFO - Successfully loaded 3 sheets from /home/topi/data-science/repos/semantic-text-analyzer/data/configurations/example_params.xlsx
2024-11-09 22:12:48 - src.core.language_processing.factory - INFO - Using default configuration
2024-11-09 22:12:48 - src.semantic_analyzer.analyzer - INFO - Initialized SemanticAnalyzer with language: en, using parameter file: /home/topi/data-science/repos/semantic-text-analyzer/data/configurations/example_params.xlsx
2024-11-09 22:12:48 - src.semantic_analyzer.analyzer - INFO - Initialized SemanticAnalyzer with language: en, using parameter file: /home/topi/data-science/repos/semantic-text-analyzer/data/configurations/example_params.xlsx


=== Keyword Extraction Test ===

Original text:

            The company's Q3 results exceeded expectations with revenue growth of 15%.
            Customer acquisition costs decreased while retention rates improved.
            The board has approved a new strategic initiative focusing on expansion.
            

Tokens after stopword removal:


AttributeError: 'EnglishTextProcessor' object has no attribute 'should_exclude_word'

In [None]:
# Analysis with explicit stopword settings
params = notebook_analyzer.param_adapter.load_and_convert()

# Add some business-specific words to exclude
additional_config = {
    "excluded_keywords": ["the", "with", "while", "new"],  # Common words we want to exclude
}

# Run analysis
results = await notebook_analyzer.analyze_text(
    "business",
    analysis_types=["keywords", "themes", "categories"],
    language="en",
    parameter_file=notebook_analyzer.parameter_file,
    config=additional_config  # Add our additional config
)



## Basic Analysis Example

In [None]:
# Load parameters and analyze business text
params = notebook_analyzer.param_adapter.load_and_convert()

# Analyze with additional settings
results = await notebook_analyzer.analyze_text(
    "business",
    analysis_types=["keywords", "themes", "categories"],
    language="en",
    parameter_file=notebook_analyzer.parameter_file,
    min_confidence=0.3,  # Ensure we get broader results
    include_compounds=True,  # Explicitly enable compound words
    focus_on="business and financial content"  # Set specific focus
)

In [None]:
# Display results in a more readable format
async def display_formatted_results(results):
    """Display results with better formatting."""
    for analysis_type, data in results.items():
        print(f"\n{'='*20} {analysis_type.upper()} {'='*20}")
        if isinstance(data, dict):
            for key, value in data.items():
                if isinstance(value, list):
                    print(f"\n{key}:")
                    for item in value:
                        print(f"  - {item}")
                elif isinstance(value, dict):
                    print(f"\n{key}:")
                    for k, v in value.items():
                        print(f"  {k}: {v}")
                else:
                    print(f"{key}: {value}")

await display_formatted_results(results)



## Analysis with Parameters

In [None]:
# Load parameters and analyze business text
params = notebook_analyzer.param_adapter.load_and_convert()

results = await notebook_analyzer.analyze_text(
    "business",
    analysis_types=["keywords", "themes", "categories"],
    language="en",
    parameter_file=params
)



## Finnish Text Analysis

In [None]:
# Analyze Finnish text
results = await notebook_analyzer.analyze_text(
    "finnish",
    analysis_types=["keywords", "categories"],
    language="fi"
)

## Batch Analysis

In [None]:
# Analyze all texts
all_results = await notebook_analyzer.analyze_all(
    analysis_types=["keywords", "themes"]
)

## Saving Results

In [None]:
# Save results
saved_path = notebook_analyzer.save_results(results, "analysis_results")
print(f"Results saved to: {saved_path}")

## Custom Categories Analysis

In [None]:
# Cell 19 - Code
# Define custom categories
categories = {
    "technical": {
        "description": "Technical content",
        "keywords": ["programming", "software", "technology"],
        "threshold": 0.7
    },
    "business": {
        "description": "Business content",
        "keywords": ["revenue", "growth", "financial"],
        "threshold": 0.6
    }
}

# Analyze with custom categories
results = await notebook_analyzer.analyze_text(
    "technical",
    analysis_types=["categories"],
    categories=categories
)