In [5]:
# Test Notebook: Semantic Analysis Pipeline
import sys
from pathlib import Path
import logging

# Put the parent of the current working directory on sys.path (once),
# ahead of the `src.*` imports that follow.
working_dir = Path().resolve()
project_root = str(working_dir.parent)
if sys.path.count(project_root) == 0:
    sys.path.append(project_root)

from typing import List, Dict, Any, Tuple, Union

# Import necessary components
from src.nb_helpers.environment import setup_notebook_env, verify_environment
from src.semantic_analyzer import SemanticAnalyzer
from src.utils.FileUtils.file_utils import FileUtils
# from src.loaders.parameter_handler import get_parameter_file_path
# from src.analyzers import KeywordAnalyzer, ThemeAnalyzer, CategoryAnalyzer
from src.analyzers.category_analyzer import CategoryAnalyzer
from src.analyzers.keyword_analyzer import KeywordAnalyzer
from src.analyzers.theme_analyzer import ThemeAnalyzer

# from src.core.config import AnalyzerConfig
from src.core.language_processing import create_text_processor
from src.core.llm.factory import create_llm
from src.loaders.parameter_handler import (
    ParameterHandler,
    # get_parameter_file_path,
)

In [6]:
from src.nb_helpers.logging import configure_logging, verify_logging_setup_with_hierarchy, reset_debug_logging


In [7]:
def setup_debug_logging():
    """Send DEBUG-and-above records through a single fresh stream handler.

    The root logger is switched to DEBUG, every handler currently attached
    to it is detached, and one ``StreamHandler`` with a timestamped format is
    installed instead. Three project loggers are additionally set to DEBUG
    with propagation enabled.
    """
    root_logger = logging.getLogger()
    root_logger.setLevel(logging.DEBUG)

    # Iterate over a copy: removeHandler mutates the handler list
    for stale in list(root_logger.handlers):
        root_logger.removeHandler(stale)

    # One console handler carrying the timestamped format
    console = logging.StreamHandler()
    console.setLevel(logging.DEBUG)
    console.setFormatter(
        logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    )
    root_logger.addHandler(console)

    # Project loggers that are explicitly opened up to DEBUG
    for logger_name in (
        "src.core.language_processing.finnish",
        "src.analyzers.keyword_analyzer",
        "src.semantic_analyzer.analyzer",
    ):
        target = logging.getLogger(logger_name)
        target.propagate = True
        target.setLevel(logging.DEBUG)



In [8]:
# Run this cell before testing. Set `debug` to True for verbose DEBUG output.
debug = False
if debug:
    setup_debug_logging()

# HTTP / LLM client loggers are always pinned to INFO
for name in ("httpx", "httpcore", "openai", "anthropic"):
    http_logger = logging.getLogger(name)
    http_logger.setLevel(logging.INFO)

verify_logging_setup_with_hierarchy()


Logging Configuration:
--------------------------------------------------

Logger: root
Propagates to root: True
No handlers (uses root handlers)

Logger: src.nb_helpers.analyzers
Hierarchy:
  src: NOTSET
  src.nb_helpers: NOTSET
  src.nb_helpers.analyzers: NOTSET
Set Level: NOTSET
Propagates to root: True
No handlers (uses root handlers)

Logger: src.analyzers.keyword_analyzer
Hierarchy:
  src: NOTSET
  src.analyzers: NOTSET
  src.analyzers.keyword_analyzer: NOTSET
Set Level: NOTSET
Propagates to root: True
No handlers (uses root handlers)

Logger: src.analyzers.theme_analyzer
Hierarchy:
  src: NOTSET
  src.analyzers: NOTSET
  src.analyzers.theme_analyzer: NOTSET
Set Level: NOTSET
Propagates to root: True
No handlers (uses root handlers)

Logger: src.analyzers.category_analyzer
Hierarchy:
  src: NOTSET
  src.analyzers: NOTSET
  src.analyzers.category_analyzer: NOTSET
Set Level: NOTSET
Propagates to root: True
No handlers (uses root handlers)

Logger: src.utils.FileUtils.file_utils
Hi

In [9]:
# Optional: uncomment the next line to configure the notebook environment with DEBUG-level logs
# setup_notebook_env(log_level="DEBUG")
# Check the environment; the call's return value is shown as the cell result
verify_environment()


Environment Check Results:

Basic Setup:
-----------
✓ Project root in path
✓ FileUtils initialized
✓ .env file loaded

Environment Variables:
---------------------
✓ OPENAI_API_KEY set
✓ ANTHROPIC_API_KEY set

Project Structure:
-----------------
✓ Raw data exists
✓ Processed data exists
✓ Configuration exists
✓ Main config.yaml exists

Environment Status: Ready ✓


True

In [6]:
# verify_logging_setup_with_hierarchy()


In [10]:
# verify_logging_setup_with_hierarchy()

# Module-level FileUtils instance; load_test_content() reads this global to load the sample workbooks
file_utils = FileUtils()

In [11]:
# Load test content
def load_test_content(language: str) -> dict:
    """Read ``test_content_<language>.xlsx`` and index its rows by type and id.

    Args:
        language: Suffix inserted into the workbook file name
            (this notebook calls it with "en" and "fi").

    Returns:
        Mapping from ``"<type>_<id>"`` to the row's ``content`` value. Rows
        producing the same key overwrite earlier ones.
    """
    # Uses the module-level `file_utils` instance
    frame = file_utils.load_single_file(
        f"test_content_{language}.xlsx", input_type="raw"
    )
    return {
        f"{record['type']}_{record['id']}": record['content']
        for _, record in frame.iterrows()
    }



In [None]:
async def test_individual_analyzer(
    analyzer: Union[KeywordAnalyzer, ThemeAnalyzer, CategoryAnalyzer], 
    text: str, 
    analyzer_type: str
):
    """Test individual analyzer component."""
    print(f"\nTesting {analyzer_type} Analysis")
    print("=" * 50)
    print("\nInput text:")
    print(text[:200] + "..." if len(text) > 200 else text)
    
    try:
        results = await analyzer.analyze(text)
        
        print("\nResults:")
        print("-" * 20)
        
        if isinstance(analyzer, KeywordAnalyzer):
            if results.keywords:
                print("\nKeywords:")
                for kw in results.keywords[:10]:  # Show top 10
                    print(f"• {kw.keyword:<20} ({kw.score:.2f})")
                    if kw.domain:
                        print(f"  Domain: {kw.domain}")
                
                if results.compound_words:
                    print("\nCompound Words:")
                    print(", ".join(results.compound_words))
                    
                if results.domain_keywords:
                    print("\nKeywords by Domain:")
                    for domain, kws in results.domain_keywords.items():
                        print(f"\n{domain}:")
                        print(", ".join(kws))
                        
        elif isinstance(analyzer, ThemeAnalyzer):
            if results.themes:
                print("\nThemes:")
                for theme in results.themes:
                    print(f"\n• {theme.name}")
                    print(f"  Confidence: {theme.confidence:.2f}")
                    print(f"  Description: {theme.description}")
                    if theme.keywords:
                        print(f"  Keywords: {', '.join(theme.keywords)}")
                
                if results.theme_hierarchy:
                    print("\nTheme Hierarchy:")
                    for parent, children in results.theme_hierarchy.items():
                        print(f"{parent} -> {', '.join(children)}")
                        
        elif isinstance(analyzer, CategoryAnalyzer):
            if results.categories:
                print("\nCategories:")
                for cat in results.categories:
                    print(f"\n• {cat.name}")
                    print(f"  Confidence: {cat.confidence:.2f}")
                    if cat.description:
                        print(f"  Description: {cat.description}")
                    if cat.evidence:
                        print("\n  Evidence:")
                        for ev in cat.evidence:
                            print(f"  - {ev.text} (relevance: {ev.relevance:.2f})")
                            
        if hasattr(results, 'error') and results.error:
            print(f"\nErrors occurred: {results.error}")
            
        return results
        
    except Exception as e:
        print(f"\nError in analysis: {e}")
        return None

async def test_all_components():
    """Exercise the keyword, theme and category analyzers on the Finnish samples.

    Builds the three analyzers from ``parameters_fi.xlsx`` and runs each of
    them over the technical and then the business sample text through
    ``test_individual_analyzer``. The full-pipeline step only prints its
    heading; the actual call is left commented out.
    """
    print("Starting Component Tests")
    print("=" * 50)

    # Shared building blocks for the Finnish configuration
    file_utils = FileUtils()  # local instance; not referenced again in this function
    parameter_handler = ParameterHandler("parameters_fi.xlsx")
    llm = create_llm()
    language_processor = create_text_processor(language="fi")

    def shared_kwargs():
        # Each analyzer receives its own freshly dumped config dict
        return {
            "llm": llm,
            "config": parameter_handler.parameters.general.model_dump(),
            "language_processor": language_processor,
        }

    keyword_analyzer = KeywordAnalyzer(**shared_kwargs())
    theme_analyzer = ThemeAnalyzer(**shared_kwargs())
    category_analyzer = CategoryAnalyzer(
        categories=parameter_handler.parameters.categories,
        **shared_kwargs()
    )

    # Both sample texts are looked up before any analyzer runs
    samples = load_test_content("fi")
    tech_text = samples["technical_technical_1"]
    business_text = samples["business_business_1"]

    lineup = (
        ("Keyword", keyword_analyzer),
        ("Theme", theme_analyzer),
        ("Category", category_analyzer),
    )
    for heading, sample in (
        ("\nTesting Technical Content", tech_text),
        ("\nTesting Business Content", business_text),
    ):
        print(heading)
        print("-" * 30)
        for label, analyzer in lineup:
            await test_individual_analyzer(analyzer, sample, label)

    # Test full pipeline
    print("\nTesting Full Pipeline")
    print("-" * 30)
    # await run_pipeline_tests()



In [None]:
# Example usage in notebook
# if __name__ == "__main__":
# Test individual components
# await test_all_components()



In [13]:
# Two-sentence Finnish sample (cloud migration / DevOps practices) for the single-analyzer check that follows
text = """
        Pilvipalveluihin siirtyminen paransi järjestelmän skaalautuvuutta ja vähensi kustannuksia.
        Uudet DevOps-käytännöt tehostivat merkittävästi käyttöönottoprosessia.
    """

In [14]:
# Or test specific analyzer
analyzer = KeywordAnalyzer(language_processor=create_text_processor(language="fi"))
await test_individual_analyzer(analyzer, text, "Keyword")

AttributeError: 'NoneType' object has no attribute 'get'

In [None]:
# Test function for single text analysis
async def test_analyze_text(analyzer: SemanticAnalyzer, text: str, language: str):
    """Run analysis and display results."""
    print(f"\nAnalyzing {language} text:")
    print("=" * 50)
    print("\nInput text:")
    print(text[:200] + "..." if len(text) > 200 else text)
    
    try:
        results = await analyzer.analyze(text)
        
        print("\nResults:")
        print("-" * 20)
        
        # Keywords
        if results.keywords and results.keywords.keywords:
            print("\nKeywords:")
            for kw in results.keywords.keywords[:10]:  # Show top 10
                print(f"• {kw.keyword:<20} ({kw.score:.2f})")
                if kw.domain:
                    print(f"  Domain: {kw.domain}")
            
            if results.keywords.compound_words:
                print("\nCompound Words:")
                print(", ".join(results.keywords.compound_words))
                
            if results.keywords.domain_keywords:
                print("\nKeywords by Domain:")
                for domain, kws in results.keywords.domain_keywords.items():
                    print(f"\n{domain}:")
                    print(", ".join(kws))
        else:
            print("\nNo keywords found")
            if results.keywords.error:
                print("Error:", results.keywords.error)
        
        # Process other results similarly...
        print(f"\nProcessing time: {results.processing_time:.2f}s")
        
        return results

    except Exception as e:
        print(f"\nError running analysis: {e}")
        return None
    
# Main test routine
async def run_pipeline_tests():
    """Run the SemanticAnalyzer pipeline on the English, then the Finnish samples.

    For each language: print a heading, build a ``SemanticAnalyzer`` from that
    language's parameter workbook, load the sample texts, and pass the
    technical and then the business sample to ``test_analyze_text``.
    """
    print("Starting Semantic Analysis Pipeline Tests")
    print("=" * 50)

    # (heading, parameter workbook, content language code, display label)
    pipelines = (
        ("\nEnglish Pipeline Test", "parameters_en.xlsx", "en", "English"),
        ("\nFinnish Pipeline Test", "parameters_fi.xlsx", "fi", "Finnish"),
    )
    # Technical content first, business content second
    sample_keys = ("technical_technical_1", "business_business_1")

    for heading, parameter_file, language_code, language_label in pipelines:
        print(heading)
        print("-" * 30)

        pipeline = SemanticAnalyzer(parameter_file=parameter_file)
        samples = load_test_content(language_code)

        for key in sample_keys:
            await test_analyze_text(pipeline, samples[key], language_label)

In [9]:
# Run the English and Finnish pipeline tests defined by run_pipeline_tests()
await run_pipeline_tests()

Starting Semantic Analysis Pipeline Tests

English Pipeline Test
------------------------------

Analyzing English text:

Input text:
Machine learning models are trained using large datasets to recognize patterns. The neural network architecture includes multiple layers for feature extraction. Data preprocessing and feature engineer...

Results:
--------------------

Keywords:
• machine learning     (0.95)
  Domain: technical
• neural network       (0.95)
  Domain: technical
• feature extraction   (0.90)
  Domain: technical
• data preprocessing   (0.90)
  Domain: technical
• feature engineering  (0.90)
  Domain: technical
• pipeline             (0.85)
  Domain: technical
• architecture         (0.85)
  Domain: technical
• dataset              (0.80)
  Domain: technical

Compound Words:
machine learning, neural network, feature extraction, data preprocessing, feature engineering, pipeline, architecture, dataset

Keywords by Domain:

technical:
machine learning, neural network, feature e