In [1]:
import os
import sys
from pathlib import Path
import asyncio
import logging
from typing import Dict, Any, List, Tuple
from pprint import pprint

import pandas as pd

# Add project root to Python path
# The project root is taken to be the parent of the current working directory
# (assumes the notebook is launched from a first-level subfolder of the
# repository — TODO confirm). This has to run before the `src.*` imports
# below so that they can be resolved.
project_root = str(Path().resolve().parent)
if project_root not in sys.path:
    sys.path.append(project_root)
    print(f"Added {project_root} to Python path")

# Core components
from src.semantic_analyzer.analyzer import SemanticAnalyzer
from src.utils.FileUtils.file_utils import FileUtils
from src.analyzers.keyword_analyzer import KeywordAnalyzer
from src.core.language_processing import create_text_processor
from src.loaders.parameter_adapter import ParameterAdapter

# Initialize FileUtils and set up logging
# `file_utils` and `logger` are notebook-level globals: later cells read
# `file_utils` for path lookups and use `logger` for warnings and errors.
file_utils = FileUtils()
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)



Added C:\Users\tja\OneDrive - Rastor-instituutti ry\Tiedostot\Rastor-instituutti\kehittäminen\analytiikka\repos\semantic-text-analyzer to Python path


In [2]:
def verify_environment():
    """Verify that the notebook environment is properly configured.

    Loads ``.env`` from ``project_root`` and then runs three groups of checks:
    basic notebook setup, required API-key environment variables, and the
    expected data/config paths. A report is printed for every group, followed
    by setup instructions for each item that failed.

    Returns:
        bool: True when every check passed, otherwise False.
    """
    # Load environment variables
    from dotenv import load_dotenv
    env_path = Path(project_root) / ".env"
    env_loaded = load_dotenv(env_path)

    # Required environment variables
    required_env_vars = [
        'OPENAI_API_KEY',
        'ANTHROPIC_API_KEY',
    ]

    # Basic checks
    basic_checks = {
        "Project root in path": project_root in sys.path,
        "Can import src": "src" in sys.modules,
        "FileUtils initialized": hasattr(file_utils, "project_root"),
        ".env file loaded": env_loaded,
    }

    # Environment variable checks.
    # A variable that exists but is empty is as unusable as a missing one, so
    # both count as "not set". The same result drives the report and the
    # setup instructions, which keeps the two from disagreeing.
    env_var_set = {var: bool(os.getenv(var)) for var in required_env_vars}
    env_var_checks = {
        f"{var} set": is_set
        for var, is_set in env_var_set.items()
    }

    # Check for required paths using FileUtils
    expected_paths = {
        "Raw data": file_utils.get_data_path("raw"),
        "Processed data": file_utils.get_data_path("processed"),
        "Configuration": file_utils.get_data_path("configurations"),
        "Main config.yaml": Path(project_root) / "config.yaml"
    }

    # Probe the filesystem once per path and reuse the answer below.
    path_exists = {name: path.exists() for name, path in expected_paths.items()}
    path_checks = {
        f"{name} exists": exists
        for name, exists in path_exists.items()
    }

    # Combine all checks
    all_checks = {
        **basic_checks,
        **env_var_checks,
        **path_checks
    }

    # Print results
    print("Environment Check Results:")
    print("=" * 50)

    def print_section(title, checks):
        """Print one titled group of check results with a pass/fail mark."""
        print(f"\n{title}:")
        print("-" * len(title))
        for check, result in checks.items():
            status = "✓" if result else "✗"
            print(f"{status} {check}")

    print_section("Basic Setup", basic_checks)
    print_section("Environment Variables", env_var_checks)
    print_section("Project Structure", path_checks)

    # Overall status
    all_passed = all(all_checks.values())
    print("\n" + "=" * 50)
    print("Environment Status:", "Ready ✓" if all_passed else "Setup needed ✗")

    if not all_passed:
        print("\nSetup Instructions:")
        if not env_loaded:
            print("- Create a .env file in the project root with required API keys")
        for var, is_set in env_var_set.items():
            if not is_set:
                print(f"- Add {var} to your .env file")
        for name, path in expected_paths.items():
            if not path_exists[name]:
                # config.yaml is a file, not a directory; pick the right noun.
                kind = "file" if Path(path).suffix else "directory"
                print(f"- Create {name} {kind} at {path}")

    return all_passed

# Run verification
verify_environment()



Environment Check Results:

Basic Setup:
-----------
✓ Project root in path
✓ Can import src
✓ FileUtils initialized
✓ .env file loaded

Environment Variables:
---------------------
✓ OPENAI_API_KEY set
✓ ANTHROPIC_API_KEY set

Project Structure:
-----------------
✓ Raw data exists
✓ Processed data exists
✓ Configuration exists
✓ Main config.yaml exists

Environment Status: Ready ✓


True

## Create Tester classes

In [12]:
class KeywordTester:
    """Helper class for testing keyword analysis components.

    Keeps three built-in sample texts ("technical", "business", "finnish") in
    ``self.test_texts`` and offers coroutines that run ``KeywordAnalyzer`` in
    statistical-only, LLM-only, or combined mode.

    Note: constructing an instance also saves the sample texts through
    ``FileUtils.save_data_to_disk`` (see ``_load_test_texts``).
    """

    def __init__(self):
        self.file_utils = FileUtils()
        self.test_texts = self._load_test_texts()

    def _load_test_texts(self) -> Dict[str, str]:
        """Build the sample texts, save them via FileUtils, and return them.

        Returns:
            Mapping of sample name to the raw (unstripped) sample text.
        """
        texts = {
            "technical": """
                Python is a high-level programming language known for its simplicity.
                It supports multiple programming paradigms including procedural and
                object-oriented programming.
            """,
            "business": """
                The company's Q3 results exceeded expectations with revenue growth of 15%.
                Customer acquisition costs decreased while retention rates improved.
                The board has approved a new strategic initiative focusing on expansion.
            """,
            "finnish": """
                Ohjelmistokehittäjä työskentelee asiakasprojektissa kehittäen uusia 
                ominaisuuksia verkkokauppajärjestelmään. Tekninen toteutus vaatii
                erityistä huomiota tietoturvan osalta.
            """
        }

        # Save test texts using FileUtils (the saved copy is stripped of the
        # surrounding whitespace; the returned dict is not).
        df = pd.DataFrame([
            {"name": name, "content": content.strip()}
            for name, content in texts.items()
        ])

        self.file_utils.save_data_to_disk(
            data={"texts": df},
            output_type="raw",
            file_name="test_texts",
            output_filetype="xlsx",
            include_timestamp=False
        )

        return texts

    @staticmethod
    def _resolve_language(text: str, language: str = None) -> str:
        """Return ``language`` when given, otherwise detect it from ``text``.

        Detection is best-effort: if ``langdetect`` cannot classify the text,
        the result falls back to ``"en"``. Only ``Exception`` subclasses are
        caught, so Ctrl-C / ``SystemExit`` still propagate.
        """
        if language is not None:
            return language

        # Imported lazily so the dependency is only needed for auto-detection.
        from langdetect import detect
        try:
            return detect(text)
        except Exception:
            return "en"

    async def _analyze(self, text: str, language: str, config: Dict[str, Any]):
        """Run ``KeywordAnalyzer`` on ``text`` with ``config`` and return its result."""
        processor = create_text_processor(
            language=self._resolve_language(text, language)
        )
        analyzer = KeywordAnalyzer(config=config, language_processor=processor)
        return await analyzer.analyze(text)

    async def test_statistical_analysis(self, text: str, language: str = None):
        """Test statistical keyword extraction.

        Args:
            text: Text to analyze.
            language: Language code; detected from ``text`` when None.

        Returns:
            Whatever ``KeywordAnalyzer.analyze`` returns for ``text``.
        """
        return await self._analyze(
            text,
            language,
            {"weights": {"statistical": 1.0, "llm": 0.0}},  # Statistical only
        )

    async def test_llm_analysis(self, text: str, language: str = None):
        """Test LLM-based keyword extraction.

        Args:
            text: Text to analyze.
            language: Language code; detected from ``text`` when None.

        Returns:
            Whatever ``KeywordAnalyzer.analyze`` returns for ``text``.
        """
        return await self._analyze(
            text,
            language,
            {"weights": {"statistical": 0.0, "llm": 1.0}},  # LLM only
        )

    async def test_combined_analysis(self, text: str, language: str = None):
        """Test combined statistical and LLM analysis.

        Args:
            text: Text to analyze.
            language: Language code; detected from ``text`` when None.

        Returns:
            Whatever ``KeywordAnalyzer.analyze`` returns for ``text``.
        """
        return await self._analyze(
            text,
            language,
            {
                "weights": {"statistical": 0.4, "llm": 0.6},
                "max_keywords": 10,
                "min_confidence": 0.3
            },
        )



In [None]:
class ContentTester:
    """Helper class for testing with different content types."""
    
    def __init__(self):
        self.file_utils = FileUtils()
        self.test_texts = self._load_test_content()
        self.keyword_tester = KeywordTester()
        
    def _load_test_content(self) -> Dict[str, Dict[str, List[str]]]:
        """Load test content from files."""
        content = {}
        
        for lang in ["en", "fi"]:
            try:
                # Load content using FileUtils
                df = self.file_utils.load_single_file(
                    f"test_content_{lang}.xlsx",
                    input_type="raw"
                )
                
                # Group by content type
                content[lang] = {}
                for content_type, group in df.groupby("type"):
                    content[lang][content_type] = group["content"].tolist()
                
            except Exception as e:
                logger.warning(f"Could not load test content for {lang}: {e}")
                content[lang] = {}
        
        return content

    async def test_content_type(
        self, 
        language: str, 
        content_type: str, 
        analyzer: KeywordAnalyzer,
        show_comparison: bool = True
    ) -> List[Any]:
        """Test analysis for specific content type with optional comparison."""
        if not self.test_texts.get(language, {}).get(content_type):
            logger.warning(f"No {content_type} content available for {language}")
            return []
            
        results = []
        comparison_results = []
        texts = self.test_texts[language][content_type]
        
        logger.info(f"Processing {len(texts)} texts for {language} {content_type}")
        
        for i, text in enumerate(texts, 1):
            try:
                logger.debug(f"Processing text {i}/{len(texts)}")
                
                if show_comparison:
                    print(f"\nText {i}:")
                    print("-" * 50)
                    print(f"Content: {text[:100]}...")
                    
                    # Run comparison analysis
                    stat_results = await self.keyword_tester.test_statistical_analysis(
                        text, language=language
                    )
                    llm_results = await self.keyword_tester.test_llm_analysis(
                        text, language=language
                    )
                    combined_results = await analyzer.analyze(text)
                    
                    # Print comparison
                    print("\nResults Comparison:")
                    print("-" * 30)
                    print("Statistical:", stat_results.keywords if hasattr(stat_results, "keywords") else [])
                    print("LLM:", llm_results.keywords if hasattr(llm_results, "keywords") else [])
                    print("Combined:", combined_results.keywords if hasattr(combined_results, "keywords") else [])
                    
                    results.append(combined_results)
                    comparison_results.append((stat_results, llm_results, combined_results))
                else:
                    # Just run normal analysis
                    result = await analyzer.analyze(text)
                    results.append(result)
                    
                    print(f"\nText {i}:")
                    print("Keywords:", result.keywords if hasattr(result, "keywords") else [])
                    print("Domain Keywords:", result.domain_keywords if hasattr(result, "domain_keywords") else {})
                
            except Exception as e:
                logger.error(f"Error processing text {i}: {e}")
                results.append(None)
                if show_comparison:
                    comparison_results.append((None, None, None))
            
        return comparison_results if show_comparison else results
    
    async def analyze_text_with_comparison(
        self,
        text: str,
        language: str = "en"
    ) -> Tuple[Any, Any, Any]:
        """Analyze a single text with comparison of different methods."""
        try:
            print("\nRunning Analysis with Comparison:")
            print("=" * 50)
            print(f"Text: {text[:100]}...")
            
            # Run all analysis types
            stat_results = await self.keyword_tester.test_statistical_analysis(
                text, language=language
            )
            llm_results = await self.keyword_tester.test_llm_analysis(
                text, language=language
            )
            combined_results = await self.keyword_tester.test_combined_analysis(
                text, language=language
            )
            
            # Print comparison
            print("\nResults Comparison:")
            print("-" * 30)
            print("Statistical:", stat_results.keywords if hasattr(stat_results, "keywords") else [])
            print("LLM:", llm_results.keywords if hasattr(llm_results, "keywords") else [])
            print("Combined:", combined_results.keywords if hasattr(combined_results, "keywords") else [])
            
            return stat_results, llm_results, combined_results
            
        except Exception as e:
            logger.error(f"Error analyzing text: {e}")
            return None, None, None

    def get_content_types(self, language: str) -> List[str]:
        """Get available content types for a language."""
        return list(self.test_texts.get(language, {}).keys())

    def get_text_count(self, language: str, content_type: str) -> int:
        """Get number of texts for a language and content type."""
        return len(self.test_texts.get(language, {}).get(content_type, []))

    async def analyze_single_text(
        self,
        text: str,
        language: str,
        analyzer: KeywordAnalyzer
    ) -> Any:
        """Analyze a single text and display results."""
        try:
            result = await analyzer.analyze(text)
            
            print("\nAnalysis Results:")
            print("Keywords:", result.keywords if hasattr(result, "keywords") else [])
            print("Domain Keywords:", result.domain_keywords if hasattr(result, "domain_keywords") else {})
            
            return result
            
        except Exception as e:
            logger.error(f"Error analyzing text: {e}")
            return None

async def run_content_tests(show_comparison: bool = True):
    """Run tests for all content types.
    
    Args:
        show_comparison: If True, shows comparison between statistical, LLM, and combined results
    """
    from src.loaders.parameter_adapter import ParameterAdapter
    
    tester = ContentTester()
    file_utils = FileUtils()
    
    # Load main config from project root
    config_path = Path(file_utils.project_root) / "config.yaml"
    try:
        logger.info(f"Loading config from: {config_path}")
        main_config = file_utils.load_yaml(config_path)
        lang_configs = main_config.get("languages", {})
    except Exception as e:
        logger.warning(f"Could not load main config from {config_path}: {e}")
        lang_configs = {}
    
    # Load parameters using ParameterAdapter
    en_params = ParameterAdapter(
        file_utils.get_data_path("configurations") / "parameters_en.xlsx"
    ).parameters
    
    fi_params = ParameterAdapter(
        file_utils.get_data_path("configurations") / "parameters_fi.xlsx"
    ).parameters
    
    # Create analyzers with parameters
    en_analyzer = KeywordAnalyzer(
        config={
            **en_params.general.model_dump(),  # Convert to dict
            "weights": {"statistical": 0.4, "llm": 0.6},
            "max_keywords": 8
        },
        language_processor=create_text_processor(
            language="en",
            config=lang_configs.get("en", {})
        )
    )
    
    fi_analyzer = KeywordAnalyzer(
        config={
            **fi_params.general.model_dump(),  # Convert to dict
            "weights": {"statistical": 0.4, "llm": 0.6},
            "max_keywords": 8
        },
        language_processor=create_text_processor(
            language="fi",
            config=lang_configs.get("fi", {})
        )
    )
    
    results = {}
    
    # Test English content
    print("Testing English content:")
    print("=" * 50)
    for content_type in tester.get_content_types("en"):
        print(f"\nTesting {content_type} content:")
        results[f"en_{content_type}"] = await tester.test_content_type(
            "en", 
            content_type, 
            en_analyzer,
            show_comparison=show_comparison
        )
    
    # Test Finnish content
    print("\nTesting Finnish content:")
    print("=" * 50)
    for content_type in tester.get_content_types("fi"):
        print(f"\nTesting {content_type} content:")
        results[f"fi_{content_type}"] = await tester.test_content_type(
            "fi", 
            content_type, 
            fi_analyzer,
            show_comparison=show_comparison
        )
    
    return results

# Helper function for single text analysis
async def analyze_single_text(text: str, language: str = "en", show_comparison: bool = True):
    """Analyze a single text with optional comparison.

    Args:
        text: Text to analyze
        language: Language code ('en' or 'fi')
        show_comparison: If True, shows comparison between different analysis methods

    Returns:
        With ``show_comparison``: the ``(statistical, llm, combined)`` tuple
        from ``ContentTester.analyze_text_with_comparison``. Otherwise the
        single result from ``ContentTester.analyze_single_text``.
    """
    tester = ContentTester()

    if show_comparison:
        return await tester.analyze_text_with_comparison(text, language)

    # ContentTester.analyze_single_text requires an analyzer; calling it with
    # only (text, language) raised TypeError. Use the same combined settings
    # as KeywordTester.test_combined_analysis so both paths are comparable.
    analyzer = KeywordAnalyzer(
        config={
            "weights": {"statistical": 0.4, "llm": 0.6},
            "max_keywords": 10,
            "min_confidence": 0.3
        },
        language_processor=create_text_processor(language=language)
    )
    return await tester.analyze_single_text(text, language, analyzer)



# Run in notebook:
# All tests:
# results = await run_content_tests()

# Single text analysis:
# result = await analyze_single_text("Your text here", language="en")

### Display helpers

In [14]:
def analyze_keyword_comparison(stat_keywords: List[str], llm_keywords: List[str], combined_keywords: List[str]) -> Dict:
    """Partition keywords by which extraction methods produced them.

    Returns a dict of sets with one entry per region of the three-way overlap:
    shared by all methods, shared by exactly two, or unique to one.
    """
    stat = set(stat_keywords)
    llm = set(llm_keywords)
    combined = set(combined_keywords)

    return {
        "all_methods": stat & llm & combined,
        "stat_llm_only": (stat & llm) - combined,
        "stat_combined_only": (stat & combined) - llm,
        "llm_combined_only": (llm & combined) - stat,
        "stat_only": stat - llm - combined,
        "llm_only": llm - stat - combined,
        "combined_only": combined - stat - llm,
    }

def display_comparison_results(text: str, stat_results: Any, llm_results: Any, combined_results: Any):
    """Print a report comparing the keywords found by the three methods.

    The report shows the stripped text, each method's keyword list, the
    overlap breakdown from ``analyze_keyword_comparison``, the combined
    result's domain keywords (when present), keyword counts, and pairwise
    Jaccard similarities. A result without a ``keywords`` attribute is
    treated as having no keywords.
    """
    def jaccard_similarity(set1, set2):
        # Reported as 0 whenever either side is empty.
        if set1 and set2:
            return len(set1 & set2) / len(set1 | set2)
        return 0

    rule = "-" * 50

    print("\nOriginal Text:")
    print(rule)
    print(text.strip())

    print("\nResults Comparison:")
    print(rule)

    # Keyword lists per method
    stat_kw = getattr(stat_results, "keywords", [])
    llm_kw = getattr(llm_results, "keywords", [])
    combined_kw = getattr(combined_results, "keywords", [])

    analysis = analyze_keyword_comparison(stat_kw, llm_kw, combined_kw)

    print("\nKeywords by Method:")
    for label, found in (
        ("Statistical:", stat_kw),
        ("LLM:", llm_kw),
        ("Combined:", combined_kw),
    ):
        print(label, found)

    print("\nKeyword Analysis:")
    for label, key in (
        ("Found by all methods:", "all_methods"),
        ("Statistical & LLM only:", "stat_llm_only"),
        ("Statistical & Combined only:", "stat_combined_only"),
        ("LLM & Combined only:", "llm_combined_only"),
        ("Statistical only:", "stat_only"),
        ("LLM only:", "llm_only"),
        ("Combined only:", "combined_only"),
    ):
        print(label, sorted(analysis[key]))

    print("\nDomain Analysis:")
    for domain, keywords in getattr(combined_results, "domain_keywords", {}).items():
        print(f"{domain}:", keywords)

    print("\nInsights:")
    for method, found in (
        ("Statistical", stat_kw),
        ("LLM", llm_kw),
        ("Combined", combined_kw),
    ):
        print(f"- {method} method found", len(found), "keywords")
    print("- Agreement between all methods:", len(analysis["all_methods"]), "keywords")

    print("\nSimilarity Analysis:")
    for label, left, right in (
        ("Statistical vs LLM:", stat_kw, llm_kw),
        ("Statistical vs Combined:", stat_kw, combined_kw),
        ("LLM vs Combined:", llm_kw, combined_kw),
    ):
        print(label, f"{jaccard_similarity(set(left), set(right)):.2f}")

## Run tests

In [17]:
# Run all tests with comparison (default)
# NOTE: top-level `await` relies on the notebook kernel's async support. The
# recorded output of this cell shows requests to api.openai.com, so running
# it presumably needs network access and a valid OPENAI_API_KEY — confirm
# before re-running.
results = await run_content_tests()



2024-11-11 13:27:15 - src.utils.FileUtils.file_utils - INFO - Attempting to load file: c:\Users\tja\OneDrive - Rastor-instituutti ry\Tiedostot\Rastor-instituutti\kehittäminen\analytiikka\repos\semantic-text-analyzer\data\raw\test_content_en.xlsx
2024-11-11 13:27:15 - src.utils.FileUtils.file_utils - INFO - Successfully loaded Excel file: c:\Users\tja\OneDrive - Rastor-instituutti ry\Tiedostot\Rastor-instituutti\kehittäminen\analytiikka\repos\semantic-text-analyzer\data\raw\test_content_en.xlsx
2024-11-11 13:27:15 - src.utils.FileUtils.file_utils - INFO - Attempting to load file: c:\Users\tja\OneDrive - Rastor-instituutti ry\Tiedostot\Rastor-instituutti\kehittäminen\analytiikka\repos\semantic-text-analyzer\data\raw\test_content_fi.xlsx
2024-11-11 13:27:15 - src.utils.FileUtils.file_utils - INFO - Successfully loaded Excel file: c:\Users\tja\OneDrive - Rastor-instituutti ry\Tiedostot\Rastor-instituutti\kehittäminen\analytiikka\repos\semantic-text-analyzer\data\raw\test_content_fi.xlsx
20

Testing English content:

Testing business content:

Text 1:
--------------------------------------------------
Content: Q3 financial results show 15% revenue growth and improved profit margins. Customer acquisition costs...


2024-11-11 13:27:24 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-11-11 13:27:24 - src.core.language_processing.factory - INFO - Using default configuration
2024-11-11 13:27:24 - src.core.language_processing.english - INFO - Initialized English processor with 831 stopwords
2024-11-11 13:27:24 - src.core.language_processing.english - INFO - Initialized English processor with 831 stopwords
2024-11-11 13:27:29 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-11-11 13:27:34 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-11-11 13:27:34 - src.core.language_processing.factory - INFO - Using default configuration
2024-11-11 13:27:34 - src.core.language_processing.english - INFO - Initialized English processor with 831 stopwords
2024-11-11 13:27:34 - src.core.language_processing.english - INFO - Initialized English processor with 831


Results Comparison:
------------------------------
Statistical: ['financial', 'revenue', 'improve', 'result', 'growth']
LLM: ['financial', 'revenue', 'growth', 'profit', 'customer']
Combined: ['financial', 'revenue', 'growth', 'profit', 'acquisition']

Text 2:
--------------------------------------------------
Content: Strategic partnerships drive innovation and market penetration. Investment in R&D resulted in three ...


2024-11-11 13:27:38 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-11-11 13:27:38 - src.core.language_processing.factory - INFO - Using default configuration
2024-11-11 13:27:38 - src.core.language_processing.english - INFO - Initialized English processor with 831 stopwords
2024-11-11 13:27:38 - src.core.language_processing.english - INFO - Initialized English processor with 831 stopwords
2024-11-11 13:27:42 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-11-11 13:27:45 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-11-11 13:27:45 - src.core.language_processing.factory - INFO - Using default configuration
2024-11-11 13:27:45 - src.core.language_processing.english - INFO - Initialized English processor with 831 stopwords
2024-11-11 13:27:45 - src.core.language_processing.english - INFO - Initialized English processor with 831


Results Comparison:
------------------------------
Statistical: ['partnership', 'penetration', 'innovation', 'strategic', 'drive']
LLM: ['strategic', 'innovation', 'partnerships', 'market', 'penetration']
Combined: ['innovation', 'strategic', 'penetration', 'market', 'investment']

Text 3:
--------------------------------------------------
Content: Operational efficiency improved through process automation. Customer satisfaction metrics show posit...


2024-11-11 13:27:49 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-11-11 13:27:49 - src.core.language_processing.factory - INFO - Using default configuration
2024-11-11 13:27:49 - src.core.language_processing.english - INFO - Initialized English processor with 831 stopwords
2024-11-11 13:27:49 - src.core.language_processing.english - INFO - Initialized English processor with 831 stopwords
2024-11-11 13:27:53 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-11-11 13:27:56 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-11-11 13:27:56 - __main__ - INFO - Processing 2 texts for en general
2024-11-11 13:27:56 - src.core.language_processing.factory - INFO - Using default configuration
2024-11-11 13:27:56 - src.core.language_processing.english - INFO - Initialized English processor with 831 stopwords
2024-11-11 13:27:56 - src.core.l


Results Comparison:
------------------------------
Statistical: ['operational', 'efficiency', 'automation', 'improve', 'process']
LLM: ['operational', 'efficiency', 'automation', 'customer', 'satisfaction']
Combined: ['operational', 'efficiency', 'automation', 'customer', 'satisfaction']

Testing general content:

Text 1:
--------------------------------------------------
Content: Team collaboration improved with new communication tools. Project timeline adjustments accommodate a...


2024-11-11 13:28:00 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-11-11 13:28:00 - src.core.language_processing.factory - INFO - Using default configuration
2024-11-11 13:28:00 - src.core.language_processing.english - INFO - Initialized English processor with 831 stopwords
2024-11-11 13:28:00 - src.core.language_processing.english - INFO - Initialized English processor with 831 stopwords
2024-11-11 13:28:04 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-11-11 13:28:07 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-11-11 13:28:07 - src.core.language_processing.factory - INFO - Using default configuration
2024-11-11 13:28:07 - src.core.language_processing.english - INFO - Initialized English processor with 831 stopwords
2024-11-11 13:28:07 - src.core.language_processing.english - INFO - Initialized English processor with 831


Results Comparison:
------------------------------
Statistical: ['collaboration', 'communication', 'improve', 'team', 'tool']
LLM: ['collaboration', 'communication', 'improved', 'team', 'tools']
Combined: ['collaboration', 'communication', 'team', 'project', 'timeline']

Text 2:
--------------------------------------------------
Content: Knowledge sharing sessions enhance team capabilities. Regular updates maintain stakeholder engagemen...


2024-11-11 13:28:11 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-11-11 13:28:11 - src.core.language_processing.factory - INFO - Using default configuration
2024-11-11 13:28:11 - src.core.language_processing.english - INFO - Initialized English processor with 831 stopwords
2024-11-11 13:28:11 - src.core.language_processing.english - INFO - Initialized English processor with 831 stopwords
2024-11-11 13:28:16 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-11-11 13:28:19 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-11-11 13:28:19 - __main__ - INFO - Processing 3 texts for en technical
2024-11-11 13:28:19 - src.core.language_processing.factory - INFO - Using default configuration
2024-11-11 13:28:19 - src.core.language_processing.english - INFO - Initialized English processor with 831 stopwords
2024-11-11 13:28:19 - src.core


Results Comparison:
------------------------------
Statistical: ['capability', 'knowledge', 'session', 'enhance', 'share']
LLM: ['knowledge', 'enhance', 'sharing', 'sessions', 'capabilities']
Combined: ['knowledge', 'enhance', 'team', 'stakeholder', 'engagement']

Testing technical content:

Text 1:
--------------------------------------------------
Content: Machine learning models are trained using large datasets to recognize patterns. The neural network a...


2024-11-11 13:28:22 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-11-11 13:28:22 - src.core.language_processing.factory - INFO - Using default configuration
2024-11-11 13:28:22 - src.core.language_processing.english - INFO - Initialized English processor with 831 stopwords
2024-11-11 13:28:22 - src.core.language_processing.english - INFO - Initialized English processor with 831 stopwords
2024-11-11 13:28:26 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-11-11 13:28:29 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-11-11 13:28:29 - src.core.language_processing.factory - INFO - Using default configuration
2024-11-11 13:28:29 - src.core.language_processing.english - INFO - Initialized English processor with 831 stopwords
2024-11-11 13:28:29 - src.core.language_processing.english - INFO - Initialized English processor with 831


Results Comparison:
------------------------------
Statistical: ['recognize', 'datasets', 'machine', 'pattern', 'learn']
LLM: ['machine learning', 'neural network', 'feature extraction', 'data preprocessing', 'feature engineering']
Combined: ['machine learning', 'neural network', 'feature extraction', 'data preprocessing', 'feature engineering']

Text 2:
--------------------------------------------------
Content: Cloud computing services provide scalable infrastructure for deployments. Microservices architecture...


2024-11-11 13:28:34 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-11-11 13:28:34 - src.core.language_processing.factory - INFO - Using default configuration
2024-11-11 13:28:34 - src.core.language_processing.english - INFO - Initialized English processor with 831 stopwords
2024-11-11 13:28:34 - src.core.language_processing.english - INFO - Initialized English processor with 831 stopwords
2024-11-11 13:28:38 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-11-11 13:28:41 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-11-11 13:28:41 - src.core.language_processing.factory - INFO - Using default configuration
2024-11-11 13:28:41 - src.core.language_processing.english - INFO - Initialized English processor with 831 stopwords
2024-11-11 13:28:41 - src.core.language_processing.english - INFO - Initialized English processor with 831


Results Comparison:
------------------------------
Statistical: ['infrastructure', 'deployment', 'scalable', 'compute', 'service']
LLM: ['cloud computing', 'scalable infrastructure', 'microservices architecture', 'system design', 'API endpoints']
Combined: ['authentication', 'cloud computing', 'scalable infrastructure', 'microservices architecture', 'system design']

Text 3:
--------------------------------------------------
Content: Version control systems track changes in source code repositories. Continuous integration ensures co...


2024-11-11 13:28:46 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-11-11 13:28:46 - src.core.language_processing.factory - INFO - Using default configuration
2024-11-11 13:28:46 - src.core.language_processing.english - INFO - Initialized English processor with 831 stopwords
2024-11-11 13:28:46 - src.core.language_processing.english - INFO - Initialized English processor with 831 stopwords
2024-11-11 13:28:50 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-11-11 13:28:55 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-11-11 13:28:55 - __main__ - INFO - Processing 3 texts for fi business
2024-11-11 13:28:55 - src.core.language_processing.factory - INFO - Using default configuration
2024-11-11 13:28:55 - src.core.language_processing.finnish - INFO - Loaded 747 stopwords from c:\Users\tja\OneDrive - Rastor-instituutti ry\Tiedosto


Results Comparison:
------------------------------
Statistical: ['repository', 'version', 'control', 'change', 'source']
LLM: ['version control', 'source code', 'continuous integration', 'code quality', 'automated testing']
Combined: ['version control', 'source code', 'continuous integration', 'code quality', 'automated testing']

Testing Finnish content:

Testing business content:

Text 1:
--------------------------------------------------
Content: Q3 taloudelliset tulokset osoittavat 15% liikevaihdon kasvun ja parantuneet katteet. Asiakashankinna...


2024-11-11 13:29:01 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-11-11 13:29:01 - src.core.language_processing.factory - INFO - Using default configuration
2024-11-11 13:29:01 - src.core.language_processing.finnish - INFO - Loaded 747 stopwords from c:\Users\tja\OneDrive - Rastor-instituutti ry\Tiedostot\Rastor-instituutti\kehittäminen\analytiikka\repos\semantic-text-analyzer\data\configurations\stop_words\fi.txt
2024-11-11 13:29:01 - src.core.language_processing.finnish - INFO - Detected platform: win32
2024-11-11 13:29:01 - src.core.language_processing.finnish - INFO - Added C:\scripts\Voikko to DLL search path
2024-11-11 13:29:01 - src.core.language_processing.finnish - INFO - Verifying Voikko installation...
2024-11-11 13:29:01 - src.core.language_processing.finnish - INFO - DLL exists: True (C:\scripts\Voikko\libvoikko-1.dll)
2024-11-11 13:29:01 - src.core.language_processing.finnish - INFO - Found dictionary version 5 at: C


Results Comparison:
------------------------------
Statistical: ['taloudellinen', 'liikevaihto', 'parantunut', 'osoittaa', 'tulos']
LLM: ['taloudelliset', 'liikevaihdon', 'kasvun', 'asiakashankinnan', 'asiakaspysyvyys']
Combined: ['asiakaspysyvyys', 'taloudelliset', 'liikevaihdon', 'kasvun', 'asiakashankinnan']

Text 2:
--------------------------------------------------
Content: Strategiset kumppanuudet edistävät innovaatiota ja markkinapenetraatiota. T&K-investoinnit johtivat ...


2024-11-11 13:29:15 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-11-11 13:29:15 - src.core.language_processing.factory - INFO - Using default configuration
2024-11-11 13:29:15 - src.core.language_processing.finnish - INFO - Loaded 747 stopwords from c:\Users\tja\OneDrive - Rastor-instituutti ry\Tiedostot\Rastor-instituutti\kehittäminen\analytiikka\repos\semantic-text-analyzer\data\configurations\stop_words\fi.txt
2024-11-11 13:29:15 - src.core.language_processing.finnish - INFO - Detected platform: win32
2024-11-11 13:29:15 - src.core.language_processing.finnish - INFO - Added C:\scripts\Voikko to DLL search path
2024-11-11 13:29:15 - src.core.language_processing.finnish - INFO - Verifying Voikko installation...
2024-11-11 13:29:15 - src.core.language_processing.finnish - INFO - DLL exists: True (C:\scripts\Voikko\libvoikko-1.dll)
2024-11-11 13:29:15 - src.core.language_processing.finnish - INFO - Found dictionary version 5 at: C


Results Comparison:
------------------------------
Statistical: ['markkinapenetraatio', 'strateginen', 'kumppanuus', 'innovaatio', 'edistää']
LLM: ['markkinapenetraatiota', 'kumppanuudet', 'innovaatiota', 'T&K-investoinnit', 'tuotelanseeraukseen']
Combined: ['markkinapenetraatiota', 'myyntitulos', 'kumppanuudet', 'innovaatiota', 'T&K-investoinnit']

Text 3:
--------------------------------------------------
Content: Toiminnan tehokkuus parani prosessiautomaation avulla. Asiakastyytyväisyysmittarit osoittavat positi...


2024-11-11 13:29:31 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-11-11 13:29:31 - src.core.language_processing.factory - INFO - Using default configuration
2024-11-11 13:29:31 - src.core.language_processing.finnish - INFO - Loaded 747 stopwords from c:\Users\tja\OneDrive - Rastor-instituutti ry\Tiedostot\Rastor-instituutti\kehittäminen\analytiikka\repos\semantic-text-analyzer\data\configurations\stop_words\fi.txt
2024-11-11 13:29:31 - src.core.language_processing.finnish - INFO - Detected platform: win32
2024-11-11 13:29:31 - src.core.language_processing.finnish - INFO - Added C:\scripts\Voikko to DLL search path
2024-11-11 13:29:31 - src.core.language_processing.finnish - INFO - Verifying Voikko installation...
2024-11-11 13:29:31 - src.core.language_processing.finnish - INFO - DLL exists: True (C:\scripts\Voikko\libvoikko-1.dll)
2024-11-11 13:29:31 - src.core.language_processing.finnish - INFO - Found dictionary version 5 at: C


Results Comparison:
------------------------------
Statistical: ['prosessiautomaatio', 'tehokkuus', 'toiminta', 'para', 'asiakastyytyväisyysmittari']
LLM: ['prosessiautomaatio', 'tehokkuus', 'asiakastyytyväisyysmittarit', 'kustannusoptimointi', 'säästöt']
Combined: ['prosessiautomaatio', 'tehokkuus', 'toiminta', 'kustannusoptimointi', 'asiakastyytyväisyysmittarit']

Testing general content:

Text 1:
--------------------------------------------------
Content: Tiimin yhteistyö parani uusien viestintätyökalujen myötä. Projektiaikataulua mukautettiin lisävaatim...


2024-11-11 13:29:44 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-11-11 13:29:44 - src.core.language_processing.factory - INFO - Using default configuration
2024-11-11 13:29:44 - src.core.language_processing.finnish - INFO - Loaded 747 stopwords from c:\Users\tja\OneDrive - Rastor-instituutti ry\Tiedostot\Rastor-instituutti\kehittäminen\analytiikka\repos\semantic-text-analyzer\data\configurations\stop_words\fi.txt
2024-11-11 13:29:44 - src.core.language_processing.finnish - INFO - Detected platform: win32
2024-11-11 13:29:44 - src.core.language_processing.finnish - INFO - Added C:\scripts\Voikko to DLL search path
2024-11-11 13:29:44 - src.core.language_processing.finnish - INFO - Verifying Voikko installation...
2024-11-11 13:29:44 - src.core.language_processing.finnish - INFO - DLL exists: True (C:\scripts\Voikko\libvoikko-1.dll)
2024-11-11 13:29:44 - src.core.language_processing.finnish - INFO - Found dictionary version 5 at: C


Results Comparison:
------------------------------
Statistical: ['viestintätyökalu', 'yhteistyö', 'tiimi', 'para', 'projektiaikataulu']
LLM: ['yhteistyö', 'viestintätyökalujen', 'projektiaikataulu', 'resurssien', 'tehtäväjaon']
Combined: ['yhteistyö', 'projektiaikataulu', 'viestintätyökalujen', 'varmistaa', 'resurssien']

Text 2:
--------------------------------------------------
Content: Tiedonjakotilaisuudet kehittävät tiimin osaamista. Säännölliset päivitykset ylläpitävät sidosryhmien...


2024-11-11 13:30:10 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-11-11 13:30:10 - src.core.language_processing.factory - INFO - Using default configuration
2024-11-11 13:30:10 - src.core.language_processing.finnish - INFO - Loaded 747 stopwords from c:\Users\tja\OneDrive - Rastor-instituutti ry\Tiedostot\Rastor-instituutti\kehittäminen\analytiikka\repos\semantic-text-analyzer\data\configurations\stop_words\fi.txt
2024-11-11 13:30:10 - src.core.language_processing.finnish - INFO - Detected platform: win32
2024-11-11 13:30:10 - src.core.language_processing.finnish - INFO - Added C:\scripts\Voikko to DLL search path
2024-11-11 13:30:10 - src.core.language_processing.finnish - INFO - Verifying Voikko installation...
2024-11-11 13:30:10 - src.core.language_processing.finnish - INFO - DLL exists: True (C:\scripts\Voikko\libvoikko-1.dll)
2024-11-11 13:30:10 - src.core.language_processing.finnish - INFO - Found dictionary version 5 at: C


Results Comparison:
------------------------------
Statistical: ['tiedonjakotilaisuus', 'kehittävä', 'tiimi', 'osata', 'laadunvarmistuskäytäntö']
LLM: ['tiedonjakotilaisuus', 'laadunvarmistuskäytäntö', 'kehittävä', 'tiimi', 'osaaminen']
Combined: ['tiedonjakotilaisuus', 'kehittävä', 'tiimi', 'laadunvarmistuskäytäntö', 'osaaminen']

Testing technical content:

Text 1:
--------------------------------------------------
Content: Koneoppimismalleja koulutetaan suurilla datajoukolla tunnistamaan kaavoja. Neuroverkon arkkitehtuuri...


2024-11-11 13:30:23 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-11-11 13:30:23 - src.core.language_processing.factory - INFO - Using default configuration
2024-11-11 13:30:23 - src.core.language_processing.finnish - INFO - Loaded 747 stopwords from c:\Users\tja\OneDrive - Rastor-instituutti ry\Tiedostot\Rastor-instituutti\kehittäminen\analytiikka\repos\semantic-text-analyzer\data\configurations\stop_words\fi.txt
2024-11-11 13:30:23 - src.core.language_processing.finnish - INFO - Detected platform: win32
2024-11-11 13:30:23 - src.core.language_processing.finnish - INFO - Added C:\scripts\Voikko to DLL search path
2024-11-11 13:30:23 - src.core.language_processing.finnish - INFO - Verifying Voikko installation...
2024-11-11 13:30:23 - src.core.language_processing.finnish - INFO - DLL exists: True (C:\scripts\Voikko\libvoikko-1.dll)
2024-11-11 13:30:23 - src.core.language_processing.finnish - INFO - Found dictionary version 5 at: C


Results Comparison:
------------------------------
Statistical: ['koneoppimismalli', 'datajoukko', 'kouluttaa', 'tunnistaa', 'kaava']
LLM: ['koneoppimismalli', 'datajoukko', 'kouluttaa', 'tunnistaa', 'kaava']
Combined: ['koneoppimismalli', 'datajoukko', 'kouluttaa', 'tunnistaa', 'kaava']

Text 2:
--------------------------------------------------
Content: Pilvipalvelut tarjoavat skaalautuvan infrastruktuurin käyttöönottoon. Mikropalveluarkkitehtuuri mahd...


2024-11-11 13:30:38 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-11-11 13:30:38 - src.core.language_processing.factory - INFO - Using default configuration
2024-11-11 13:30:38 - src.core.language_processing.finnish - INFO - Loaded 747 stopwords from c:\Users\tja\OneDrive - Rastor-instituutti ry\Tiedostot\Rastor-instituutti\kehittäminen\analytiikka\repos\semantic-text-analyzer\data\configurations\stop_words\fi.txt
2024-11-11 13:30:38 - src.core.language_processing.finnish - INFO - Detected platform: win32
2024-11-11 13:30:38 - src.core.language_processing.finnish - INFO - Added C:\scripts\Voikko to DLL search path
2024-11-11 13:30:38 - src.core.language_processing.finnish - INFO - Verifying Voikko installation...
2024-11-11 13:30:38 - src.core.language_processing.finnish - INFO - DLL exists: True (C:\scripts\Voikko\libvoikko-1.dll)
2024-11-11 13:30:38 - src.core.language_processing.finnish - INFO - Found dictionary version 5 at: C


Results Comparison:
------------------------------
Statistical: ['infrastruktuuri', 'pilvipalvelu', 'käyttöönotto', 'skaalautuva', 'tarjota']
LLM: ['pilvipalvelut', 'skaalautuvan', 'infrastruktuurin', 'käyttöönottoon', 'mikropalveluarkkitehtuuri']
Combined: ['mikropalveluarkkitehtuuri', 'pilvipalvelut', 'skaalautuvan', 'autentikoinnin', 'infrastruktuurin']

Text 3:
--------------------------------------------------
Content: Versionhallintajärjestelmät seuraavat lähdekoodin muutoksia. Jatkuva integraatio varmistaa koodin la...


2024-11-11 13:30:54 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-11-11 13:30:54 - src.core.language_processing.factory - INFO - Using default configuration
2024-11-11 13:30:54 - src.core.language_processing.finnish - INFO - Loaded 747 stopwords from c:\Users\tja\OneDrive - Rastor-instituutti ry\Tiedostot\Rastor-instituutti\kehittäminen\analytiikka\repos\semantic-text-analyzer\data\configurations\stop_words\fi.txt
2024-11-11 13:30:54 - src.core.language_processing.finnish - INFO - Detected platform: win32
2024-11-11 13:30:54 - src.core.language_processing.finnish - INFO - Added C:\scripts\Voikko to DLL search path
2024-11-11 13:30:54 - src.core.language_processing.finnish - INFO - Verifying Voikko installation...
2024-11-11 13:30:54 - src.core.language_processing.finnish - INFO - DLL exists: True (C:\scripts\Voikko\libvoikko-1.dll)
2024-11-11 13:30:54 - src.core.language_processing.finnish - INFO - Found dictionary version 5 at: C


Results Comparison:
------------------------------
Statistical: ['versionhallintajärjestelmä', 'lähdekoodi', 'muutos', 'automaattitestaus', 'järjestelmäarkkitehtuuri']
LLM: ['versionhallintajärjestelmä', 'lähdekoodi', 'muutos', 'automaattitestaus', 'järjestelmäarkkitehtuuri']
Combined: ['versionhallintajärjestelmä', 'lähdekoodi', 'muutos', 'automaattitestaus', 'järjestelmäarkkitehtuuri']


In [None]:
# Run all content tests with the comparison display switched off
# (show_comparison=False) and keep the returned value in `results`.
results = await run_content_tests(show_comparison=False)



In [None]:
# Analyze a single text without comparison
# results = await analyze_single_text(
#     """Your text here...""",
#     language="en",
#     show_comparison=False
# )

In [20]:
# Run all content tests with run_content_tests' default arguments and keep
# the returned value in `results`.
results = await run_content_tests()

2024-11-11 12:38:19 - src.utils.FileUtils.file_utils - INFO - Attempting to load file: c:\Users\tja\OneDrive - Rastor-instituutti ry\Tiedostot\Rastor-instituutti\kehittäminen\analytiikka\repos\semantic-text-analyzer\data\raw\test_content_en.xlsx
2024-11-11 12:38:19 - src.utils.FileUtils.file_utils - INFO - Successfully loaded Excel file: c:\Users\tja\OneDrive - Rastor-instituutti ry\Tiedostot\Rastor-instituutti\kehittäminen\analytiikka\repos\semantic-text-analyzer\data\raw\test_content_en.xlsx
2024-11-11 12:38:19 - src.utils.FileUtils.file_utils - INFO - Attempting to load file: c:\Users\tja\OneDrive - Rastor-instituutti ry\Tiedostot\Rastor-instituutti\kehittäminen\analytiikka\repos\semantic-text-analyzer\data\raw\test_content_fi.xlsx
2024-11-11 12:38:19 - src.utils.FileUtils.file_utils - INFO - Successfully loaded Excel file: c:\Users\tja\OneDrive - Rastor-instituutti ry\Tiedostot\Rastor-instituutti\kehittäminen\analytiikka\repos\semantic-text-analyzer\data\raw\test_content_fi.xlsx
20

Testing English content:

Testing technical content:


2024-11-11 12:38:33 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"



Text 1:
Keywords: ['machine learning', 'neural network', 'feature extraction', 'data preprocessing', 'feature engineering']
Domain Keywords: {'machine learning': ['recognize', 'datasets', 'learn', 'pattern']}


2024-11-11 12:38:36 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"



Text 2:
Keywords: ['authentication', 'cloud computing', 'scalable infrastructure', 'microservices architecture', 'system design']
Domain Keywords: {'cloud': ['cloud computing', 'scalable infrastructure', 'API endpoints'], 'architecture': ['microservices architecture', 'system design'], 'security': ['authentication', 'data validation']}


2024-11-11 12:38:39 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-11-11 12:38:39 - __main__ - INFO - Processing 3 texts for en business



Text 3:
Keywords: ['version control', 'source code', 'continuous integration', 'code quality', 'automated testing']
Domain Keywords: {'version_control': ['repository', 'version', 'control', 'change', 'source']}

Text 1:
Keywords: ['machine learning', 'neural network', 'feature extraction', 'data preprocessing', 'feature engineering']
Domain Keywords: {'machine learning': ['recognize', 'datasets', 'learn', 'pattern']}

Text 2:
Keywords: ['authentication', 'cloud computing', 'scalable infrastructure', 'microservices architecture', 'system design']
Domain Keywords: {'cloud': ['cloud computing', 'scalable infrastructure', 'API endpoints'], 'architecture': ['microservices architecture', 'system design'], 'security': ['authentication', 'data validation']}

Text 3:
Keywords: ['version control', 'source code', 'continuous integration', 'code quality', 'automated testing']
Domain Keywords: {'version_control': ['repository', 'version', 'control', 'change', 'source']}

Testing business content:


2024-11-11 12:38:42 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"



Text 1:
Keywords: ['financial', 'revenue', 'growth', 'profit', 'acquisition']
Domain Keywords: {'finance': ['financial', 'revenue', 'profit', 'margins'], 'business': ['growth', 'acquisition', 'retention', 'strategy']}


2024-11-11 12:38:44 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"



Text 2:
Keywords: ['innovation', 'strategic', 'penetration', 'market', 'investment']
Domain Keywords: {'business': ['partnership', 'innovation', 'market', 'penetration', 'investment', 'performance', 'sales']}


2024-11-11 12:38:47 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-11-11 12:38:47 - __main__ - INFO - Processing 2 texts for en general



Text 3:
Keywords: ['operational', 'efficiency', 'automation', 'customer', 'satisfaction']
Domain Keywords: {'business': ['operational', 'efficiency', 'automation', 'cost', 'optimization'], 'metrics': ['customer', 'satisfaction', 'year-over-year']}

Text 1:
Keywords: ['financial', 'revenue', 'growth', 'profit', 'acquisition']
Domain Keywords: {'finance': ['financial', 'revenue', 'profit', 'margins'], 'business': ['growth', 'acquisition', 'retention', 'strategy']}

Text 2:
Keywords: ['innovation', 'strategic', 'penetration', 'market', 'investment']
Domain Keywords: {'business': ['partnership', 'innovation', 'market', 'penetration', 'investment', 'performance', 'sales']}

Text 3:
Keywords: ['operational', 'efficiency', 'automation', 'customer', 'satisfaction']
Domain Keywords: {'business': ['operational', 'efficiency', 'automation', 'cost', 'optimization'], 'metrics': ['customer', 'satisfaction', 'year-over-year']}

Testing general content:


2024-11-11 12:38:51 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"



Text 1:
Keywords: ['collaboration', 'communication', 'team', 'project', 'timeline']
Domain Keywords: {'project_management': ['collaboration', 'communication', 'resource allocation', 'task distribution']}


2024-11-11 12:38:54 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-11-11 12:38:54 - __main__ - INFO - Processing 3 texts for fi technical



Text 2:
Keywords: ['knowledge', 'enhance', 'team', 'stakeholder', 'engagement']
Domain Keywords: {'general': ['knowledge', 'capability', 'session', 'enhance', 'share']}

Text 1:
Keywords: ['collaboration', 'communication', 'team', 'project', 'timeline']
Domain Keywords: {'project_management': ['collaboration', 'communication', 'resource allocation', 'task distribution']}

Text 2:
Keywords: ['knowledge', 'enhance', 'team', 'stakeholder', 'engagement']
Domain Keywords: {'general': ['knowledge', 'capability', 'session', 'enhance', 'share']}

Testing Finnish content:

Testing technical content:


2024-11-11 12:38:57 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"



Text 1:
Keywords: ['koneoppimismalli', 'datajoukko', 'kouluttaa', 'tunnistaa', 'kaava']
Domain Keywords: {'machine_learning': ['koneoppimismalli', 'datajoukko', 'kouluttaa', 'tunnistaa', 'kaava']}


2024-11-11 12:39:01 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"



Text 2:
Keywords: ['mikropalveluarkkitehtuuri', 'pilvipalvelut', 'skaalautuvan', 'autentikoinnin', 'infrastruktuurin']
Domain Keywords: {'cloud_services': ['pilvipalvelut', 'infrastruktuuri', 'käyttöönotto'], 'architecture': ['mikropalveluarkkitehtuuri', 'modulaarisen'], 'authentication': ['autentikoinnin']}


2024-11-11 12:39:06 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-11-11 12:39:06 - __main__ - INFO - Processing 3 texts for fi business



Text 3:
Keywords: ['versionhallintajärjestelmä', 'lähdekoodi', 'muutos', 'automaattitestaus', 'järjestelmäarkkitehtuuri']
Domain Keywords: {'software_development': ['versionhallintajärjestelmä', 'lähdekoodi', 'automaattitestaus', 'järjestelmäarkkitehtuuri']}

Text 1:
Keywords: ['koneoppimismalli', 'datajoukko', 'kouluttaa', 'tunnistaa', 'kaava']
Domain Keywords: {'machine_learning': ['koneoppimismalli', 'datajoukko', 'kouluttaa', 'tunnistaa', 'kaava']}

Text 2:
Keywords: ['mikropalveluarkkitehtuuri', 'pilvipalvelut', 'skaalautuvan', 'autentikoinnin', 'infrastruktuurin']
Domain Keywords: {'cloud_services': ['pilvipalvelut', 'infrastruktuuri', 'käyttöönotto'], 'architecture': ['mikropalveluarkkitehtuuri', 'modulaarisen'], 'authentication': ['autentikoinnin']}

Text 3:
Keywords: ['versionhallintajärjestelmä', 'lähdekoodi', 'muutos', 'automaattitestaus', 'järjestelmäarkkitehtuuri']
Domain Keywords: {'software_development': ['versionhallintajärjestelmä', 'lähdekoodi', 'automaattitestaus', 

2024-11-11 12:39:10 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"



Text 1:
Keywords: ['asiakaspysyvyys', 'taloudelliset', 'liikevaihdon', 'kasvun', 'katteet']
Domain Keywords: {'talous': ['taloudelliset', 'liikevaihto', 'tulos'], 'markkinointi': ['asiakashankinnan', 'asiakaspysyvyys']}


2024-11-11 12:39:13 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"



Text 2:
Keywords: ['markkinapenetraatiota', 'myyntitulos', 'kumppanuudet', 'innovaatiota', 'T&K-investoinnit']
Domain Keywords: {'business': ['kumppanuudet', 'markkinapenetraatio', 'innovaatiota', 'strateginen', 'edistää']}


2024-11-11 12:39:16 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-11-11 12:39:16 - __main__ - INFO - Processing 2 texts for fi general



Text 3:
Keywords: ['prosessiautomaatio', 'tehokkuus', 'toiminta', 'kustannusoptimointi', 'asiakastyytyväisyysmittarit']
Domain Keywords: {'business': ['tehokkuus', 'prosessiautomaatio', 'asiakastyytyväisyysmittarit', 'kustannusoptimointi']}

Text 1:
Keywords: ['asiakaspysyvyys', 'taloudelliset', 'liikevaihdon', 'kasvun', 'katteet']
Domain Keywords: {'talous': ['taloudelliset', 'liikevaihto', 'tulos'], 'markkinointi': ['asiakashankinnan', 'asiakaspysyvyys']}

Text 2:
Keywords: ['markkinapenetraatiota', 'myyntitulos', 'kumppanuudet', 'innovaatiota', 'T&K-investoinnit']
Domain Keywords: {'business': ['kumppanuudet', 'markkinapenetraatio', 'innovaatiota', 'strateginen', 'edistää']}

Text 3:
Keywords: ['prosessiautomaatio', 'tehokkuus', 'toiminta', 'kustannusoptimointi', 'asiakastyytyväisyysmittarit']
Domain Keywords: {'business': ['tehokkuus', 'prosessiautomaatio', 'asiakastyytyväisyysmittarit', 'kustannusoptimointi']}

Testing general content:


2024-11-11 12:39:21 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"



Text 1:
Keywords: ['yhteistyö', 'projektiaikataulu', 'viestintätyökalujen', 'varmistaa', 'resurssien']
Domain Keywords: {'project_management': ['yhteistyö', 'projektiaikataulu', 'resurssien']}


2024-11-11 12:39:26 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"



Text 2:
Keywords: ['tiedonjakotilaisuus', 'kehittävä', 'tiimi', 'laadunvarmistuskäytäntö', 'osaaminen']
Domain Keywords: {'team_development': ['tiedonjakotilaisuus', 'kehittävä', 'tiimi', 'osaaminen'], 'quality_assurance': ['laadunvarmistuskäytäntö', 'säännölliset']}

Text 1:
Keywords: ['yhteistyö', 'projektiaikataulu', 'viestintätyökalujen', 'varmistaa', 'resurssien']
Domain Keywords: {'project_management': ['yhteistyö', 'projektiaikataulu', 'resurssien']}

Text 2:
Keywords: ['tiedonjakotilaisuus', 'kehittävä', 'tiimi', 'laadunvarmistuskäytäntö', 'osaaminen']
Domain Keywords: {'team_development': ['tiedonjakotilaisuus', 'kehittävä', 'tiimi', 'osaaminen'], 'quality_assurance': ['laadunvarmistuskäytäntö', 'säännölliset']}


In [18]:
# Analyze a single text with comparison
# results = await analyze_single_text(
#     """Strategic partnerships drive innovation and market penetration. Investment in R&D resulted in three new product launches. Sales performance exceeded targets in key market segments. """,
#     language="en",
#     show_comparison=True
# )



In [15]:
# Or analyze a single text with comparison

text = """Strategic partnerships drive innovation and market penetration. 
Investment in R&D resulted in three new product launches. 
Sales performance exceeded targets in key market segments."""

tester = ContentTester()
stat_resuls, llm_resuls, combined_resuls = await tester.analyze_text_with_comparison(
    text,
    language="en"
)

2024-11-11 13:24:30 - src.utils.FileUtils.file_utils - INFO - Attempting to load file: c:\Users\tja\OneDrive - Rastor-instituutti ry\Tiedostot\Rastor-instituutti\kehittäminen\analytiikka\repos\semantic-text-analyzer\data\raw\test_content_en.xlsx
2024-11-11 13:24:30 - src.utils.FileUtils.file_utils - INFO - Successfully loaded Excel file: c:\Users\tja\OneDrive - Rastor-instituutti ry\Tiedostot\Rastor-instituutti\kehittäminen\analytiikka\repos\semantic-text-analyzer\data\raw\test_content_en.xlsx
2024-11-11 13:24:30 - src.utils.FileUtils.file_utils - INFO - Attempting to load file: c:\Users\tja\OneDrive - Rastor-instituutti ry\Tiedostot\Rastor-instituutti\kehittäminen\analytiikka\repos\semantic-text-analyzer\data\raw\test_content_fi.xlsx
2024-11-11 13:24:30 - src.utils.FileUtils.file_utils - INFO - Successfully loaded Excel file: c:\Users\tja\OneDrive - Rastor-instituutti ry\Tiedostot\Rastor-instituutti\kehittäminen\analytiikka\repos\semantic-text-analyzer\data\raw\test_content_fi.xlsx
20


Running Analysis with Comparison:
Text: Strategic partnerships drive innovation and market penetration. 
Investment in R&D resulted in three...


2024-11-11 13:24:35 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-11-11 13:24:35 - src.core.language_processing.factory - INFO - Using default configuration
2024-11-11 13:24:35 - src.core.language_processing.english - INFO - Initialized English processor with 831 stopwords
2024-11-11 13:24:35 - src.core.language_processing.english - INFO - Initialized English processor with 831 stopwords
2024-11-11 13:24:41 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-11-11 13:24:41 - src.core.language_processing.factory - INFO - Using default configuration
2024-11-11 13:24:41 - src.core.language_processing.english - INFO - Initialized English processor with 831 stopwords
2024-11-11 13:24:41 - src.core.language_processing.english - INFO - Initialized English processor with 831 stopwords
2024-11-11 13:24:47 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1


Results Comparison:
------------------------------
Statistical: ['partnership', 'penetration', 'innovation', 'strategic', 'drive']
LLM: ['strategic', 'innovation', 'penetration', 'partnerships', 'market']
Combined: ['innovation', 'strategic', 'partnership', 'penetration', 'drive']


In [None]:
# Example usage: show the comparison report for the sample text analysed in
# the analyze_text_with_comparison cell.
#
# The arguments must be the names that cell actually assigns, which are
# spelled stat_resuls / llm_resuls / combined_resuls. The earlier call passed
# llm_results and combined_results, which that cell never defines, so on a
# fresh kernel it would raise NameError (or pick up stale values left in the
# kernel by some other run).
display_comparison_results(text, stat_resuls, llm_resuls, combined_resuls)


Original Text:
--------------------------------------------------
Strategic partnerships drive innovation and market penetration. 
Investment in R&D resulted in three new product launches. 
Sales performance exceeded targets in key market segments.

Results Comparison:
--------------------------------------------------

Keywords by Method:
Statistical: ['partnership', 'penetration', 'innovation', 'strategic', 'drive']
LLM: ['strategic', 'innovation', 'penetration', 'partnerships', 'market']
Combined: ['innovation', 'strategic', 'partnership', 'penetration', 'drive']

Keyword Analysis:
Found by all methods: ['innovation', 'penetration', 'strategic']
Statistical & LLM only: []
Statistical & Combined only: ['drive', 'partnership']
LLM & Combined only: []
Statistical only: []
LLM only: ['market', 'partnerships']
Combined only: []

Domain Analysis:
business: ['partnership', 'innovation', 'market', 'sales', 'investment']

Insights:
- Statistical method found 5 keywords
- LLM method found 5 

In [None]:
# results = await run_content_tests()

In [None]:
# Test technical text
# await test_text("technical")



In [None]:
# Test business text
# await test_text("business")
    


In [None]:
# Test Finnish text
# await test_text("finnish")