In [1]:
import os
import sys
from pathlib import Path
import asyncio
import logging
from typing import Dict, Any, List, Tuple
from pprint import pprint

import pandas as pd

# Add project root to Python path
project_root = str(Path().resolve().parent)
if project_root not in sys.path:
    sys.path.append(project_root)
    print(f"Added {project_root} to Python path")

# Core components
from src.semantic_analyzer.analyzer import SemanticAnalyzer
from src.utils.FileUtils.file_utils import FileUtils
from src.analyzers.keyword_analyzer import KeywordAnalyzer
from src.core.language_processing import create_text_processor
from src.loaders.parameter_adapter import ParameterAdapter

# Initialize FileUtils and set up logging
file_utils = FileUtils()
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)



Added C:\Users\tja\OneDrive - Rastor-instituutti ry\Tiedostot\Rastor-instituutti\kehittäminen\analytiikka\repos\semantic-text-analyzer to Python path


In [2]:
def verify_environment():
    """Verify that the notebook environment is properly configured."""
    # Load environment variables
    from dotenv import load_dotenv
    env_path = Path(project_root) / ".env"
    env_loaded = load_dotenv(env_path)

    # Required environment variables
    required_env_vars = [
        'OPENAI_API_KEY',
        'ANTHROPIC_API_KEY',
    ]

    # Basic checks
    basic_checks = {
        "Project root in path": project_root in sys.path,
        "Can import src": "src" in sys.modules,
        "FileUtils initialized": hasattr(file_utils, "project_root"),
        ".env file loaded": env_loaded,
    }

    # Environment variable checks
    env_var_checks = {
        f"{var} set": os.getenv(var) is not None
        for var in required_env_vars
    }

    # Check for required paths using FileUtils
    expected_paths = {
        "Raw data": file_utils.get_data_path("raw"),
        "Processed data": file_utils.get_data_path("processed"),
        "Configuration": file_utils.get_data_path("configurations"),
        "Main config.yaml": Path(project_root) / "config.yaml"
    }
    
    path_checks = {
        f"{name} exists": path.exists()
        for name, path in expected_paths.items()
    }

    # Combine all checks
    all_checks = {
        **basic_checks,
        **env_var_checks,
        **path_checks
    }
    
    # Print results
    print("Environment Check Results:")
    print("=" * 50)
    
    def print_section(title, checks):
        print(f"\n{title}:")
        print("-" * len(title))
        for check, result in checks.items():
            status = "✓" if result else "✗"
            print(f"{status} {check}")
    
    print_section("Basic Setup", basic_checks)
    print_section("Environment Variables", env_var_checks)
    print_section("Project Structure", path_checks)
    
    # Overall status
    all_passed = all(all_checks.values())
    print("\n" + "=" * 50)
    print("Environment Status:", "Ready ✓" if all_passed else "Setup needed ✗")
    
    if not all_passed:
        print("\nSetup Instructions:")
        if not env_loaded:
            print("- Create a .env file in the project root with required API keys")
        for var in required_env_vars:
            if not os.getenv(var):
                print(f"- Add {var} to your .env file")
        for name, path in expected_paths.items():
            if not path.exists():
                print(f"- Create {name} directory at {path}")

    return all_passed

# Run verification
verify_environment()



Environment Check Results:

Basic Setup:
-----------
✓ Project root in path
✓ Can import src
✓ FileUtils initialized
✓ .env file loaded

Environment Variables:
---------------------
✓ OPENAI_API_KEY set
✓ ANTHROPIC_API_KEY set

Project Structure:
-----------------
✓ Raw data exists
✓ Processed data exists
✓ Configuration exists
✓ Main config.yaml exists

Environment Status: Ready ✓


True

## Create Tester classes

In [3]:
class KeywordTester:
    """Helper class for testing keyword analysis components."""
    
    def __init__(self):
        self.file_utils = FileUtils()
        self.test_texts = self._load_test_texts()
        
    def _load_test_texts(self) -> Dict[str, str]:
        """Load or create test texts."""
        texts = {
            "technical": """
                Python is a high-level programming language known for its simplicity.
                It supports multiple programming paradigms including procedural and
                object-oriented programming.
            """,
            "business": """
                The company's Q3 results exceeded expectations with revenue growth of 15%.
                Customer acquisition costs decreased while retention rates improved.
                The board has approved a new strategic initiative focusing on expansion.
            """,
            "finnish": """
                Ohjelmistokehittäjä työskentelee asiakasprojektissa kehittäen uusia 
                ominaisuuksia verkkokauppajärjestelmään. Tekninen toteutus vaatii
                erityistä huomiota tietoturvan osalta.
            """
        }
        
        # Save test texts using FileUtils
        df = pd.DataFrame([
            {"name": name, "content": content.strip()}
            for name, content in texts.items()
        ])
        
        self.file_utils.save_data_to_disk(
            data={"texts": df},
            output_type="raw",
            file_name="test_texts",
            output_filetype="xlsx",
            include_timestamp=False
        )
        
        return texts

    async def test_statistical_analysis(self, text: str, language: str = None):
        """Test statistical keyword extraction."""
        if language is None:
            # Try to detect language or use default
            from langdetect import detect
            try:
                language = detect(text)
            except:
                language = "en"
        
        # Create processor and analyzer
        processor = create_text_processor(language=language)
        analyzer = KeywordAnalyzer(
            config={"weights": {"statistical": 1.0, "llm": 0.0}},  # Statistical only
            language_processor=processor
        )
        
        results = await analyzer.analyze(text)
        return results

    async def test_llm_analysis(self, text: str, language: str = None):
        """Test LLM-based keyword extraction."""
        if language is None:
            # Try to detect language or use default
            from langdetect import detect
            try:
                language = detect(text)
            except:
                language = "en"
        
        analyzer = KeywordAnalyzer(
            config={"weights": {"statistical": 0.0, "llm": 1.0}},  # LLM only
            language_processor=create_text_processor(language=language)
        )
        
        results = await analyzer.analyze(text)
        return results

    async def test_combined_analysis(self, text: str, language: str = None):
        """Test combined statistical and LLM analysis."""
        if language is None:
            # Try to detect language or use default
            from langdetect import detect
            try:
                language = detect(text)
            except:
                language = "en"
        
        analyzer = KeywordAnalyzer(
            config={
                "weights": {"statistical": 0.4, "llm": 0.6},
                "max_keywords": 10,
                "min_confidence": 0.3
            },
            language_processor=create_text_processor(language=language)
        )
        
        results = await analyzer.analyze(text)
        return results

    # async def test_combined_analysis(self, text_key: str):
    #     """Test combined statistical and LLM analysis."""
    #     text = self.test_texts[text_key]
    #     language = "fi" if text_key == "finnish" else "en"
        
    #     analyzer = KeywordAnalyzer(
    #         config={
    #             "weights": {"statistical": 0.4, "llm": 0.6},
    #             "max_keywords": 10,
    #             "min_confidence": 0.3
    #         },
    #         language_processor=create_text_processor(language=language)
    #     )
        
    #     print(f"\nTesting Combined Analysis for {text_key}:")
    #     print("=" * 50)
    #     print(f"Text: {text[:100]}...")
        
    #     results = await analyzer.analyze(text)
        
    #     print("\nCombined Keywords:")
    #     if hasattr(results, "keywords"):
    #         for kw in results.keywords:
    #             print(f"- {kw}")
        
    #     return results



In [4]:
class ContentTester:
    """Helper class for testing with different content types."""
    
    def __init__(self):
        self.file_utils = FileUtils()
        self.test_texts = self._load_test_content()
        self.keyword_tester = KeywordTester()
        
    def _load_test_content(self) -> Dict[str, Dict[str, List[str]]]:
        """Load test content from files."""
        content = {}
        
        for lang in ["en", "fi"]:
            try:
                # Load content using FileUtils
                df = self.file_utils.load_single_file(
                    f"test_content_{lang}.xlsx",
                    input_type="raw"
                )
                
                # Group by content type
                content[lang] = {}
                for content_type, group in df.groupby("type"):
                    content[lang][content_type] = group["content"].tolist()
                
            except Exception as e:
                logger.warning(f"Could not load test content for {lang}: {e}")
                content[lang] = {}
        
        return content

    async def test_content_type(
        self, 
        language: str, 
        content_type: str, 
        analyzer: KeywordAnalyzer,
        show_comparison: bool = True
    ) -> List[Any]:
        """Test analysis for specific content type with optional comparison."""
        if not self.test_texts.get(language, {}).get(content_type):
            logger.warning(f"No {content_type} content available for {language}")
            return []
            
        results = []
        comparison_results = []
        texts = self.test_texts[language][content_type]
        
        logger.info(f"Processing {len(texts)} texts for {language} {content_type}")
        
        for i, text in enumerate(texts, 1):
            try:
                logger.debug(f"Processing text {i}/{len(texts)}")
                
                if show_comparison:
                    print(f"\nText {i}:")
                    print("-" * 50)
                    print(f"Content: {text[:100]}...")
                    
                    # Run comparison analysis
                    stat_results = await self.keyword_tester.test_statistical_analysis(
                        text, language=language
                    )
                    llm_results = await self.keyword_tester.test_llm_analysis(
                        text, language=language
                    )
                    combined_results = await analyzer.analyze(text)
                    
                    # Print comparison
                    print("\nResults Comparison:")
                    print("-" * 30)
                    print("Statistical:", stat_results.keywords if hasattr(stat_results, "keywords") else [])
                    print("LLM:", llm_results.keywords if hasattr(llm_results, "keywords") else [])
                    print("Combined:", combined_results.keywords if hasattr(combined_results, "keywords") else [])
                    
                    results.append(combined_results)
                    comparison_results.append((stat_results, llm_results, combined_results))
                else:
                    # Just run normal analysis
                    result = await analyzer.analyze(text)
                    results.append(result)
                    
                    print(f"\nText {i}:")
                    print("Keywords:", result.keywords if hasattr(result, "keywords") else [])
                    print("Domain Keywords:", result.domain_keywords if hasattr(result, "domain_keywords") else {})
                
            except Exception as e:
                logger.error(f"Error processing text {i}: {e}")
                results.append(None)
                if show_comparison:
                    comparison_results.append((None, None, None))
            
        return comparison_results if show_comparison else results
    
    async def analyze_text_with_comparison(
        self,
        text: str,
        language: str = "en"
    ) -> Tuple[Any, Any, Any]:
        """Analyze a single text with comparison of different methods."""
        try:
            print("\nRunning Analysis with Comparison:")
            print("=" * 50)
            print(f"Text: {text[:100]}...")
            
            # Run all analysis types
            stat_results = await self.keyword_tester.test_statistical_analysis(
                text, language=language
            )
            llm_results = await self.keyword_tester.test_llm_analysis(
                text, language=language
            )
            combined_results = await self.keyword_tester.test_combined_analysis(
                text, language=language
            )
            
            # Print comparison
            print("\nResults Comparison:")
            print("-" * 30)
            print("Statistical:", stat_results.keywords if hasattr(stat_results, "keywords") else [])
            print("LLM:", llm_results.keywords if hasattr(llm_results, "keywords") else [])
            print("Combined:", combined_results.keywords if hasattr(combined_results, "keywords") else [])
            
            return stat_results, llm_results, combined_results
            
        except Exception as e:
            logger.error(f"Error analyzing text: {e}")
            return None, None, None

    def get_content_types(self, language: str) -> List[str]:
        """Get available content types for a language."""
        return list(self.test_texts.get(language, {}).keys())

    def get_text_count(self, language: str, content_type: str) -> int:
        """Get number of texts for a language and content type."""
        return len(self.test_texts.get(language, {}).get(content_type, []))

    async def analyze_single_text(
        self,
        text: str,
        language: str,
        analyzer: KeywordAnalyzer
    ) -> Any:
        """Analyze a single text and display results."""
        try:
            result = await analyzer.analyze(text)
            
            print("\nAnalysis Results:")
            print("Keywords:", result.keywords if hasattr(result, "keywords") else [])
            print("Domain Keywords:", result.domain_keywords if hasattr(result, "domain_keywords") else {})
            
            return result
            
        except Exception as e:
            logger.error(f"Error analyzing text: {e}")
            return None

async def run_content_tests(show_comparison: bool = True):
    """Run tests for all content types.
    
    Args:
        show_comparison: If True, shows comparison between statistical, LLM, and combined results
    """
    from src.loaders.parameter_adapter import ParameterAdapter
    
    tester = ContentTester()
    file_utils = FileUtils()
    
    # Load main config from project root
    config_path = Path(file_utils.project_root) / "config.yaml"
    try:
        logger.info(f"Loading config from: {config_path}")
        main_config = file_utils.load_yaml(config_path)
        lang_configs = main_config.get("languages", {})
    except Exception as e:
        logger.warning(f"Could not load main config from {config_path}: {e}")
        lang_configs = {}
    
    # Load parameters using ParameterAdapter
    en_params = ParameterAdapter(
        file_utils.get_data_path("configurations") / "parameters_en.xlsx"
    ).parameters
    
    fi_params = ParameterAdapter(
        file_utils.get_data_path("configurations") / "parameters_fi.xlsx"
    ).parameters
    
    # Create analyzers with parameters
    en_analyzer = KeywordAnalyzer(
        config={
            **en_params.general.model_dump(),  # Convert to dict
            "weights": {"statistical": 0.4, "llm": 0.6},
            "max_keywords": 8
        },
        language_processor=create_text_processor(
            language="en",
            config=lang_configs.get("en", {})
        )
    )
    
    fi_analyzer = KeywordAnalyzer(
        config={
            **fi_params.general.model_dump(),  # Convert to dict
            "weights": {"statistical": 0.4, "llm": 0.6},
            "max_keywords": 8
        },
        language_processor=create_text_processor(
            language="fi",
            config=lang_configs.get("fi", {})
        )
    )
    
    results = {}
    
    # Test English content
    print("Testing English content:")
    print("=" * 50)
    for content_type in tester.get_content_types("en"):
        print(f"\nTesting {content_type} content:")
        results[f"en_{content_type}"] = await tester.test_content_type(
            "en", 
            content_type, 
            en_analyzer,
            show_comparison=show_comparison
        )
    
    # Test Finnish content
    print("\nTesting Finnish content:")
    print("=" * 50)
    for content_type in tester.get_content_types("fi"):
        print(f"\nTesting {content_type} content:")
        results[f"fi_{content_type}"] = await tester.test_content_type(
            "fi", 
            content_type, 
            fi_analyzer,
            show_comparison=show_comparison
        )
    
    return results

# Helper function for single text analysis
async def analyze_single_text(text: str, language: str = "en", show_comparison: bool = True):
    """Analyze a single text with optional comparison.
    
    Args:
        text: Text to analyze
        language: Language code ('en' or 'fi')
        show_comparison: If True, shows comparison between different analysis methods
    """
    tester = ContentTester()
    
    if show_comparison:
        return await tester.analyze_text_with_comparison(text, language)
    else:
        return await tester.analyze_single_text(text, language)



# Run in notebook:
# All tests:
# results = await run_content_tests()

# Single text analysis:
# result = await analyze_text("Your text here", language="en")

## Run tests

In [None]:

async def test_position_aware_keywords():
    """Test position-aware keyword extraction."""
    
    # Text with clearly positioned keywords
    text = """Machine Learning Applications
    
    Artificial intelligence and deep learning models are transforming businesses.
    These neural networks help companies analyze large datasets efficiently.
    Machine learning solutions provide valuable insights.
    
    In conclusion, AI technology continues to evolve rapidly."""
    
    # Initialize components
    file_utils = FileUtils()
    processor = create_text_processor(language="en")
    
    # Create LLM instance
    from src.core.llm.factory import create_llm
    llm = create_llm()
    
    analyzer = KeywordAnalyzer(
        llm=llm,  # Pass the LLM instance
        config={
            "weights": {"statistical": 0.4, "llm": 0.6},
            "max_keywords": 8,
            "position_weights": {
                "title": 1.5,
                "first_para": 1.3,
                "last_para": 1.2,
                "body": 1.0
            }
        },
        language_processor=processor
    )
    
    # Analyze text
    results = await analyzer.analyze(text)
    
    print("\nAnalysis Results:")
    print("=" * 50)
    
    if results.success:
        print("\nKeywords found:")
        for kw in results.keywords:
            print(f"- {kw.keyword:<20} (score: {kw.score:.2f})")
            
        if hasattr(results, "domain_keywords") and results.domain_keywords:
            print("\nDomain grouping:")
            for domain, words in results.domain_keywords.items():
                print(f"{domain}: {', '.join(words)}")
    else:
        print(f"Analysis failed: {results.error}")
        
    return results

# Run the test
results = await test_position_aware_keywords()

2024-11-11 16:50:32 - src.core.language_processing.factory - INFO - Using default configuration
2024-11-11 16:50:32 - src.core.language_processing.english - INFO - Initialized English processor with 831 stopwords
2024-11-11 16:50:32 - src.core.language_processing.english - INFO - Initialized English processor with 831 stopwords
2024-11-11 16:50:44 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"



Analysis Results:

Keywords found:
- Machine Learning     (score: 0.68)
- Artificial Intelligence (score: 0.67)
- Deep Learning        (score: 0.66)
- Neural Networks      (score: 0.65)
- Data Analysis        (score: 0.63)
- Business Transformation (score: 0.63)
- AI Technology        (score: 0.61)
- Large Datasets       (score: 0.60)


In [6]:
async def test_business_keywords():
    """Test keyword extraction with business text."""
    
    text = """Q3 Financial Performance Review
    
    Revenue growth exceeded expectations with 15% year-over-year increase.
    Operating margins improved due to cost optimization initiatives.
    Customer acquisition metrics show positive trends.
    
    Looking ahead, market expansion remains our strategic priority."""
    
    from src.core.llm.factory import create_llm
    processor = create_text_processor(language="en")
    llm = create_llm()
    
    analyzer = KeywordAnalyzer(
        llm=llm,
        config={
            "weights": {"statistical": 0.4, "llm": 0.6},
            "max_keywords": 8,
            "domain_keywords": {
                "financial": ["revenue", "margins", "cost", "growth"],
                "business": ["strategic", "market", "customer", "acquisition"],
                "metrics": ["performance", "trends", "optimization"]
            },
            "position_weights": {
                "title": 1.5,
                "first_para": 1.3,
                "last_para": 1.2,
                "body": 1.0
            }
        },
        language_processor=processor
    )
    
    results = await analyzer.analyze(text)
    
    print("\nBusiness Text Analysis:")
    print("=" * 50)
    
    if results.success:
        print("\nKeywords by domain:")
        domains = {}
        for kw in results.keywords:
            domain = kw.domain or "general"
            if domain not in domains:
                domains[domain] = []
            domains[domain].append(f"{kw.keyword} ({kw.score:.2f})")
            
        for domain, keywords in domains.items():
            print(f"\n{domain.title()}:")
            for kw in keywords:
                print(f"- {kw}")
    else:
        print(f"Analysis failed: {results.error}")
    
    return results

# Run business test
business_results = await test_business_keywords()

2024-11-11 16:50:45 - src.core.language_processing.factory - INFO - Using default configuration
2024-11-11 16:50:45 - src.core.language_processing.english - INFO - Initialized English processor with 831 stopwords
2024-11-11 16:50:45 - src.core.language_processing.english - INFO - Initialized English processor with 831 stopwords
2024-11-11 16:50:50 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"



Business Text Analysis:

Keywords by domain:

Financial:
- financial (1.00)
- review (1.00)
- revenue (1.00)
- growth (1.00)
- market expansion (0.64)
- operating margins (0.63)
- cost optimization (0.63)

Metrics:
- performance (1.00)


In [7]:
async def test_keyword_improvements():
    """Test improved keyword extraction with clustering."""
    
    text = """AI and Machine Learning Applications
    
    Artificial Intelligence (AI) and ML models are transforming business.
    These machine learning algorithms help analyze big data effectively.
    The ROI on AI investments has been significant.
    
    Key performance indicators (KPIs) show positive results."""
    
    from src.core.llm.factory import create_llm
    processor = create_text_processor(language="en")
    llm = create_llm()
    
    analyzer = KeywordAnalyzer(
        llm=llm,
        config={
            "weights": {"statistical": 0.4, "llm": 0.6},
            "max_keywords": 8,
            "domain_keywords": {
                "technical": ["ai", "artificial intelligence", "machine learning", "ml", "algorithm"],
                "business": ["roi", "return on investment", "kpi", "performance indicator"]
            },
            "clustering": {
                "similarity_threshold": 0.85,
                "max_cluster_size": 3,
                "boost_factor": 1.2
            }
        },
        language_processor=processor
    )
    
    results = await analyzer.analyze(text)
    
    print("\nImproved Keyword Analysis:")
    print("=" * 50)
    
    if results.success:
        print("\nClustered Keywords:")
        current_domain = None
        for kw in results.keywords:
            if kw.domain != current_domain:
                current_domain = kw.domain
                print(f"\n{current_domain or 'General'}:")
            print(f"- {kw.keyword:<25} (score: {kw.score:.2f})")
    else:
        print(f"Analysis failed: {results.error}")
    
    return results

# Run test
results = await test_keyword_improvements()

2024-11-11 16:50:50 - src.core.language_processing.factory - INFO - Using default configuration
2024-11-11 16:50:50 - src.core.language_processing.english - INFO - Initialized English processor with 831 stopwords
2024-11-11 16:50:50 - src.core.language_processing.english - INFO - Initialized English processor with 831 stopwords
2024-11-11 16:50:55 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"



Improved Keyword Analysis:

Clustered Keywords:

technical:
- Machine Learning          (score: 0.68)
- Artificial Intelligence   (score: 0.68)
- big data                  (score: 0.65)

business:
- ROI                       (score: 0.61)
- performance indicators    (score: 0.58)

technical:
- algorithms                (score: 0.54)

business:
- business transformation   (score: 0.50)

General:
- learn                     (score: 0.48)


In [None]:
async def test_keyword_improvements():
    """Test improved keyword extraction with clustering."""
    
    text = """AI and Machine Learning Applications
    
    Artificial Intelligence (AI) and ML models are transforming business.
    These machine learning algorithms help analyze big data effectively.
    The ROI on AI investments has been significant.
    
    Key performance indicators (KPIs) show positive results."""
    
    from src.core.llm.factory import create_llm
    processor = create_text_processor(language="en")
    llm = create_llm()
    
    analyzer = KeywordAnalyzer(
        llm=llm,
        config={
            "weights": {"statistical": 0.4, "llm": 0.6},
            "max_keywords": 8,
            "domain_keywords": {
                "technical": ["ai", "artificial intelligence", "machine learning", "ml", "algorithm"],
                "business": ["roi", "return on investment", "kpi", "performance indicator"]
            },
            "clustering": {
                "similarity_threshold": 0.85,
                "max_cluster_size": 3,
                "boost_factor": 1.2
            }
        },
        language_processor=processor
    )
    
    results = await analyzer.analyze(text)
    
    print("\nImproved Keyword Analysis:")
    print("=" * 50)
    
    if results.success:
        print("\nClustered Keywords:")
        current_domain = None
        for kw in results.keywords:
            if kw.domain != current_domain:
                current_domain = kw.domain
                print(f"\n{current_domain or 'General'}:")
            print(f"- {kw.keyword:<25} (score: {kw.score:.2f})")
    else:
        print(f"Analysis failed: {results.error}")
    
    return results

# Run test
results = await test_keyword_improvements()

In [8]:
async def test_finnish_keywords():
    """Test improved keyword extraction with Finnish text."""
    
    text = """Tekoälyn Hyödyntäminen Liiketoiminnassa
    
    Ohjelmistokehittäjät hyödyntävät koneoppimismalleja asiakasprojekteissa.
    Verkkokauppajärjestelmän tehokkuus on parantunut automaation avulla.
    Tietoturvaratkaisut ovat keskeinen osa teknistä toteutusta.
    
    Liiketoiminnan mittarit osoittavat asiakastyytyväisyyden parantuneen merkittävästi."""
    
    from src.core.llm.factory import create_llm
    processor = create_text_processor(language="fi")
    llm = create_llm()
    
    analyzer = KeywordAnalyzer(
        llm=llm,
        config={
            "weights": {"statistical": 0.4, "llm": 0.6},
            "max_keywords": 8,
            "domain_keywords": {
                "tekninen": ["tekoäly", "koneoppiminen", "ohjelmisto", "tietoturva", "automaatio"],
                "liiketoiminta": ["asiakastyytyväisyys", "liiketoiminta", "mittarit", "verkkokauppa"]
            },
            "clustering": {
                "similarity_threshold": 0.85,
                "max_cluster_size": 3,
                "boost_factor": 1.2
            }
        },
        language_processor=processor
    )
    
    results = await analyzer.analyze(text)
    
    print("\nSuomenkielinen avainsana-analyysi:")
    print("=" * 50)
    
    if results.success:
        print("\nRyhmitellyt avainsanat:")
        current_domain = None
        for kw in results.keywords:
            if kw.domain != current_domain:
                current_domain = kw.domain
                print(f"\n{current_domain or 'Yleinen'}:")
            print(f"- {kw.keyword:<30} (tulos: {kw.score:.2f})")
            
        if hasattr(results, "domain_keywords") and results.domain_keywords:
            print("\nAvainsanat toimialueittain:")
            for domain, words in results.domain_keywords.items():
                print(f"\n{domain}:")
                print(", ".join(words))
    else:
        print(f"Analyysi epäonnistui: {results.error}")
    
    return results

# Run Finnish test
results = await test_finnish_keywords()

2024-11-11 16:58:00 - src.core.language_processing.factory - INFO - Using default configuration
2024-11-11 16:58:00 - src.core.language_processing.finnish - INFO - Loaded 747 stopwords from c:\Users\tja\OneDrive - Rastor-instituutti ry\Tiedostot\Rastor-instituutti\kehittäminen\analytiikka\repos\semantic-text-analyzer\data\configurations\stop_words\fi.txt
2024-11-11 16:58:00 - src.core.language_processing.finnish - INFO - Detected platform: win32
Exception ignored in: <function Voikko.__del__ at 0x0000025DFAA439D0>
Traceback (most recent call last):
  File "c:\Users\tja\AppData\Local\miniconda3\envs\semantic-analyzer\lib\site-packages\libvoikko.py", line 446, in __del__
    self.terminate()
  File "c:\Users\tja\AppData\Local\miniconda3\envs\semantic-analyzer\lib\site-packages\libvoikko.py", line 476, in terminate
    if self.__handle:
AttributeError: 'Voikko' object has no attribute '_Voikko__handle'
2024-11-11 16:58:00 - src.core.language_processing.finnish - INFO - Added C:\scripts\Vo


Suomenkielinen avainsana-analyysi:

Ryhmitellyt avainsanat:

liiketoiminta:
- liiketoiminta                  (tulos: 1.00)

tekninen:
- tekoäly                        (tulos: 1.00)

technical:
- verkkokauppajärjestelmä        (tulos: 1.00)
- koneoppimismallit              (tulos: 0.65)

business:
- asiakastyytyväisyys            (tulos: 0.63)

technical:
- automaation                    (tulos: 0.61)
- tietoturvaratkaisut            (tulos: 0.60)
- ohjelmistokehittäjät           (tulos: 0.60)


### Display helpers

In [None]:
def analyze_keyword_comparison(stat_keywords: List[str], llm_keywords: List[str], combined_keywords: List[str]) -> Dict:
    """Analyze and compare keywords from different methods."""
    # Find overlaps and unique keywords
    all_keywords = set(stat_keywords) | set(llm_keywords) | set(combined_keywords)
    
    analysis = {
        "all_methods": set(stat_keywords) & set(llm_keywords) & set(combined_keywords),
        "stat_llm_only": set(stat_keywords) & set(llm_keywords) - set(combined_keywords),
        "stat_combined_only": set(stat_keywords) & set(combined_keywords) - set(llm_keywords),
        "llm_combined_only": set(llm_keywords) & set(combined_keywords) - set(stat_keywords),
        "stat_only": set(stat_keywords) - set(llm_keywords) - set(combined_keywords),
        "llm_only": set(llm_keywords) - set(stat_keywords) - set(combined_keywords),
        "combined_only": set(combined_keywords) - set(stat_keywords) - set(llm_keywords)
    }
    
    return analysis

def display_comparison_results(text: str, stat_results: Any, llm_results: Any, combined_results: Any):
    """Display enhanced comparison of analysis results."""
    print("\nOriginal Text:")
    print("-" * 50)
    print(text.strip())
    
    print("\nResults Comparison:")
    print("-" * 50)
    
    # Get keywords from each method
    stat_kw = stat_results.keywords if hasattr(stat_results, "keywords") else []
    llm_kw = llm_results.keywords if hasattr(llm_results, "keywords") else []
    combined_kw = combined_results.keywords if hasattr(combined_results, "keywords") else []
    
    # Analyze overlaps
    analysis = analyze_keyword_comparison(stat_kw, llm_kw, combined_kw)
    
    # Display basic results
    print("\nKeywords by Method:")
    print("Statistical:", stat_kw)
    print("LLM:", llm_kw)
    print("Combined:", combined_kw)
    
    # Display analysis
    print("\nKeyword Analysis:")
    print("Found by all methods:", sorted(analysis["all_methods"]))
    print("Statistical & LLM only:", sorted(analysis["stat_llm_only"]))
    print("Statistical & Combined only:", sorted(analysis["stat_combined_only"]))
    print("LLM & Combined only:", sorted(analysis["llm_combined_only"]))
    print("Statistical only:", sorted(analysis["stat_only"]))
    print("LLM only:", sorted(analysis["llm_only"]))
    print("Combined only:", sorted(analysis["combined_only"]))
    
    # Display domain-specific insights
    print("\nDomain Analysis:")
    if hasattr(combined_results, "domain_keywords"):
        for domain, keywords in combined_results.domain_keywords.items():
            print(f"{domain}:", keywords)
    
    # Provide insights
    print("\nInsights:")
    print("- Statistical method found", len(stat_kw), "keywords")
    print("- LLM method found", len(llm_kw), "keywords")
    print("- Combined method found", len(combined_kw), "keywords")
    print("- Agreement between all methods:", len(analysis["all_methods"]), "keywords")
    
    # Calculate Jaccard similarity
    def jaccard_similarity(set1, set2):
        if not set1 or not set2:
            return 0
        return len(set1 & set2) / len(set1 | set2)
    
    print("\nSimilarity Analysis:")
    print("Statistical vs LLM:", f"{jaccard_similarity(set(stat_kw), set(llm_kw)):.2f}")
    print("Statistical vs Combined:", f"{jaccard_similarity(set(stat_kw), set(combined_kw)):.2f}")
    print("LLM vs Combined:", f"{jaccard_similarity(set(llm_kw), set(combined_kw)):.2f}")

## Run tests

- uncomment as needed

In [None]:
# Run all tests with comparison (default)
# results = await run_content_tests()

In [None]:
# Run all tests without comparison
# results = await run_content_tests(show_comparison=False)



In [None]:
# Analyze a single text without comparison
# results = await analyze_single_text(
#     """Your text here...""",
#     language="en",
#     show_comparison=False
# )

In [None]:
# results = await run_content_tests()

In [None]:
# Analyze a single text with comparison
# results = await analyze_single_text(
#     """Strategic partnerships drive innovation and market penetration. Investment in R&amp;D resulted in three new product launches. Sales performance exceeded targets in key market segments. """,
#     language="en",
#     show_comparison=True
# )



In [None]:
# Or analyze a single text with comparison

text = """Strategic partnerships drive innovation and market penetration. 
Investment in R&D resulted in three new product launches. 
Sales performance exceeded targets in key market segments."""

tester = ContentTester()
stat_resuls, llm_resuls, combined_resuls = await tester.analyze_text_with_comparison(
    text,
    language="en"
)

In [None]:
# Example usage


# display_comparison_results(text, stat_resuls, llm_results, combined_results)

In [None]:
# results = await run_content_tests()

In [None]:
# Test technical text
# await test_text("technical")



In [None]:
# Test business text
# await test_text("business")
    


In [None]:
# Test Finnish text
# await test_text("finnish")