In [1]:
# Import required modules
import sys
from pathlib import Path
from typing import List, Dict, Any, Tuple, Union
import logging
import asyncio

# Add project root to Python path if needed
project_root = str(Path().resolve().parent)
if project_root not in sys.path:
    sys.path.append(project_root)

# Import necessary components
from src.nb_helpers.environment import setup_notebook_env, verify_environment
from src.semantic_analyzer import SemanticAnalyzer
from src.utils.FileUtils.file_utils import FileUtils
from src.core.language_processing import create_text_processor
from src.core.llm.factory import create_llm
from src.loaders.parameter_handler import ParameterHandler
from src.analyzers.keyword_analyzer import KeywordAnalyzer
from src.analyzers.theme_analyzer import ThemeAnalyzer
from src.analyzers.category_analyzer import CategoryAnalyzer



In [2]:
# Set up environment and logging
setup_notebook_env(log_level="DEBUG")
verify_environment()



2024-11-25 19:33:22,909 - src.utils.FileUtils.file_utils - DEBUG [file_utils.py:192] - Initialized FileUtils with log level: INFO
2024-11-25 19:33:22,910 - src.utils.FileUtils.file_utils - DEBUG [file_utils.py:198] - Project root: c:\Users\tja\OneDrive - Rastor-instituutti ry\Tiedostot\Rastor-instituutti\kehittäminen\analytiikka\repos\semantic-text-analyzer


Environment Check Results:

Basic Setup:
-----------
✓ Project root in path
✓ FileUtils initialized
✓ .env file loaded

Environment Variables:
---------------------
✓ OPENAI_API_KEY set
✓ ANTHROPIC_API_KEY set

Project Structure:
-----------------
✓ Raw data exists
✓ Processed data exists
✓ Configuration exists
✓ Main config.yaml exists

Environment Status: Ready ✓


True

In [3]:
# Test data to use
test_texts = {
    "en": {
        "technical": """Machine learning models are trained using large datasets to recognize patterns. 
                     The neural network architecture includes multiple layers for feature extraction. 
                     Data preprocessing and feature engineering are crucial steps.""",
        "business": """Q3 financial results show 15% revenue growth and improved profit margins. 
                    Customer acquisition costs decreased while retention rates increased. 
                    Market expansion strategy focuses on emerging technology sectors."""
    },
    "fi": {
        "technical": """Koneoppimismalleja koulutetaan suurilla datajoukolla tunnistamaan kaavoja. 
                     Neuroverkon arkkitehtuuri sisältää useita kerroksia piirteiden erottamiseen. 
                     Datan esikäsittely ja piirteiden suunnittelu ovat keskeisiä vaiheita.""",
        "business": """Q3 taloudelliset tulokset osoittavat 15% liikevaihdon kasvun ja parantuneet katteet. 
                    Asiakashankinnan kustannukset laskivat ja asiakaspysyvyys parani. 
                    Markkinalaajennusstrategia keskittyy nouseviin teknologiasektoreihin."""
    }
}



In [4]:
async def test_individual_analyzer(
    analyzer: Union[KeywordAnalyzer, ThemeAnalyzer, CategoryAnalyzer], 
    text: str, 
    analyzer_type: str
):
    """Test individual analyzer component."""
    print(f"\nTesting {analyzer_type} Analysis")
    print("=" * 50)
    print("\nInput text:")
    print(text[:200] + "..." if len(text) > 200 else text)
    
    try:
        results = await analyzer.analyze(text)
        
        print("\nResults:")
        print("-" * 20)
        
        if isinstance(analyzer, KeywordAnalyzer):
            if results.keywords:
                print("\nKeywords:")
                for kw in results.keywords[:10]:  # Show top 10
                    print(f"• {kw.keyword:<20} ({kw.score:.2f})")
                    if kw.domain:
                        print(f"  Domain: {kw.domain}")
                
                if results.compound_words:
                    print("\nCompound Words:")
                    print(", ".join(results.compound_words))
                    
                if results.domain_keywords:
                    print("\nKeywords by Domain:")
                    for domain, kws in results.domain_keywords.items():
                        print(f"\n{domain}:")
                        print(", ".join(kws))
                        
        elif isinstance(analyzer, ThemeAnalyzer):
            if results.themes:
                print("\nThemes:")
                for theme in results.themes:
                    print(f"\n• {theme.name}")
                    print(f"  Confidence: {theme.confidence:.2f}")
                    print(f"  Description: {theme.description}")
                    if theme.keywords:
                        print(f"  Keywords: {', '.join(theme.keywords)}")
                
                if results.theme_hierarchy:
                    print("\nTheme Hierarchy:")
                    for parent, children in results.theme_hierarchy.items():
                        print(f"{parent} -> {', '.join(children)}")
                        
        elif isinstance(analyzer, CategoryAnalyzer):
            if results.categories:
                print("\nCategories:")
                for cat in results.categories:
                    print(f"\n• {cat.name}")
                    print(f"  Confidence: {cat.confidence:.2f}")
                    if cat.description:
                        print(f"  Description: {cat.description}")
                    if cat.evidence:
                        print("\n  Evidence:")
                        for ev in cat.evidence:
                            print(f"  - {ev.text} (relevance: {ev.relevance:.2f})")
                            
        if hasattr(results, 'error') and results.error:
            print(f"\nErrors occurred: {results.error}")
            
        return results
        
    except Exception as e:
        print(f"\nError in analysis: {e}")
        return None

In [5]:
# Example 1: Test individual keyword analyzer
async def test_keyword_analyzer():
    """Test keyword analyzer with different languages."""
    print("Testing Keyword Analyzer")
    print("=" * 50)
    
    # Initialize components
    parameter_handler = ParameterHandler("parameters_fi.xlsx")
    llm = create_llm()
    
    # Test English
    print("\nTesting English Technical Content:")
    en_processor = create_text_processor(language="en")
    keyword_analyzer_en = KeywordAnalyzer(
        llm=llm,
        config=parameter_handler.parameters.general.model_dump(),
        language_processor=en_processor
    )
    await test_individual_analyzer(keyword_analyzer_en, test_texts["en"]["technical"], "Keyword")
    
    # Test Finnish
    print("\nTesting Finnish Technical Content:")
    fi_processor = create_text_processor(language="fi")
    keyword_analyzer_fi = KeywordAnalyzer(
        llm=llm,
        config=parameter_handler.parameters.general.model_dump(),
        language_processor=fi_processor
    )
    await test_individual_analyzer(keyword_analyzer_fi, test_texts["fi"]["technical"], "Keyword")



In [6]:
# Example 2: Test all components
async def test_components_for_language(language: str):
    """Test all components for a specific language."""
    print(f"\nTesting All Components for {language.upper()}")
    print("=" * 50)
    
    # Initialize components
    parameter_handler = ParameterHandler(f"parameters_{language}.xlsx")
    llm = create_llm()
    language_processor = create_text_processor(language=language)
    
    # Create analyzers
    keyword_analyzer = KeywordAnalyzer(
        llm=llm,
        config=parameter_handler.parameters.general.model_dump(),
        language_processor=language_processor
    )
    
    theme_analyzer = ThemeAnalyzer(
        llm=llm,
        config=parameter_handler.parameters.general.model_dump(),
        language_processor=language_processor
    )
    
    category_analyzer = CategoryAnalyzer(
        categories=parameter_handler.parameters.categories,
        llm=llm,
        config=parameter_handler.parameters.general.model_dump(),
        language_processor=language_processor
    )
    
    # Test technical content
    print(f"\nTesting {language.upper()} Technical Content:")
    await test_individual_analyzer(keyword_analyzer, test_texts[language]["technical"], "Keyword")
    await test_individual_analyzer(theme_analyzer, test_texts[language]["technical"], "Theme")
    await test_individual_analyzer(category_analyzer, test_texts[language]["technical"], "Category")
    
    # Test business content
    print(f"\nTesting {language.upper()} Business Content:")
    await test_individual_analyzer(keyword_analyzer, test_texts[language]["business"], "Keyword")
    await test_individual_analyzer(theme_analyzer, test_texts[language]["business"], "Theme")
    await test_individual_analyzer(category_analyzer, test_texts[language]["business"], "Category")



In [7]:
# Example 3: Quick test of full pipeline
async def test_pipeline():
    """Test full pipeline with both languages."""
    print("Testing Full Pipeline")
    print("=" * 50)
    
    # Test English pipeline
    print("\nEnglish Pipeline:")
    en_analyzer = SemanticAnalyzer(parameter_file="parameters_en.xlsx")
    result = await en_analyzer.analyze(test_texts["en"]["technical"])
    print(f"Success: {result.success}")
    print(f"Keywords found: {len(result.keywords.keywords)}")
    print(f"Themes found: {len(result.themes.themes)}")
    print(f"Categories found: {len(result.categories.matches)}")
    
    # Test Finnish pipeline
    print("\nFinnish Pipeline:")
    fi_analyzer = SemanticAnalyzer(parameter_file="parameters_fi.xlsx")
    result = await fi_analyzer.analyze(test_texts["fi"]["technical"])
    print(f"Success: {result.success}")
    print(f"Keywords found: {len(result.keywords.keywords)}")
    print(f"Themes found: {len(result.themes.themes)}")
    print(f"Categories found: {len(result.categories.matches)}")

# Run the tests
async def run_all_tests():
    """Run all tests."""
    # Test individual component
    await test_keyword_analyzer()
    
    # Test all components by language
    await test_components_for_language("en")
    await test_components_for_language("fi")
    
    # Test full pipeline
    await test_pipeline()



In [8]:
# Run in notebook
# await run_all_tests()

# Or run individual tests:
# await test_keyword_analyzer()
await test_components_for_language("fi")
# await test_pipeline()


Testing All Components for FI

Testing FI Technical Content:

Testing Keyword Analysis

Input text:
Koneoppimismalleja koulutetaan suurilla datajoukolla tunnistamaan kaavoja. 
                     Neuroverkon arkkitehtuuri sisältää useita kerroksia piirteiden erottamiseen. 
                     Data...

Results:
--------------------

Keywords:
• koneoppimismalli     (0.95)
  Domain: technical
• datajoukko           (0.95)
  Domain: technical
• neuroverkon arkkitehtuuri (0.95)
  Domain: technical
• datan esikäsittely   (0.95)
  Domain: technical
• piirteiden suunnittelu (0.95)
  Domain: technical
• piirre               (0.90)
  Domain: technical
• kerroksia            (0.90)
  Domain: technical

Compound Words:
neuroverkon arkkitehtuuri, datan esikäsittely, piirteiden suunnittelu

Testing Theme Analysis

Input text:
Koneoppimismalleja koulutetaan suurilla datajoukolla tunnistamaan kaavoja. 
                     Neuroverkon arkkitehtuuri sisältää useita kerroksia piirteiden erottamiseen