In [2]:
import os
import sys
from pathlib import Path
import asyncio
import json
import logging
from typing import Dict, Any, List, Tuple, Optional
from pprint import pprint

import pandas as pd

# Add project root to Python path
project_root = str(Path().resolve().parent)
if project_root not in sys.path:
    sys.path.append(project_root)
    print(f"Added {project_root} to Python path")

# Core components
from src.semantic_analyzer.analyzer import SemanticAnalyzer
from src.utils.FileUtils.file_utils import FileUtils
from src.analyzers.keyword_analyzer import KeywordAnalyzer
from src.analyzers.theme_analyzer import ThemeAnalyzer
from src.core.language_processing import create_text_processor
from src.loaders.parameter_adapter import ParameterAdapter

# Initialize FileUtils and set up logging
file_utils = FileUtils()
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

In [3]:
class BaseTester:
    """Base class for analysis testing."""
    
    def __init__(self):
        from src.core.llm.factory import create_llm
        self.file_utils = FileUtils()
        self.llm = create_llm()  # Create LLM instance
        self.test_texts = self._load_test_texts()
        
    def _load_test_texts(self) -> Dict[str, str]:
        """Load test texts from files."""
        try:
            # Try to load from existing files
            texts = {}
            for lang in ["en", "fi"]:
                df = self.file_utils.load_single_file(
                    f"test_content_{lang}.xlsx",
                    input_type="raw"
                )
                if df is not None:
                    for _, row in df.iterrows():
                        key = f"{lang}_{row['type']}"
                        texts[key] = row['content']
            return texts
                        
        except Exception as e:
            logger.warning(f"Could not load test texts: {e}. Using defaults.")
            return self._create_default_texts()
    
    def _create_default_texts(self) -> Dict[str, str]:
        """Create default test texts."""
        return {
            "en_technical": """
                Machine learning models are trained using large datasets.
                Neural networks extract features through multiple layers.
                Data preprocessing improves model performance.
            """,
            "en_business": """
                Q3 financial results show 15% revenue growth.
                Customer acquisition costs decreased while retention improved.
                Market expansion strategy targets emerging sectors.
            """,
            "fi_technical": """
                Ohjelmistokehittäjä työskentelee asiakasprojektissa.
                Tekninen toteutus vaatii erityistä huomiota.
                Tietoturva on keskeinen osa kehitystä.
            """
        }

    def save_test_texts(self) -> None:
        """Save test texts using FileUtils."""
        df = pd.DataFrame([
            {
                "language": key.split("_")[0],
                "type": key.split("_")[1],
                "content": content.strip()
            }
            for key, content in self.test_texts.items()
        ])
        
        self.file_utils.save_data_to_disk(
            data={"texts": df},
            output_type="raw",
            file_name="test_texts",
            output_filetype="xlsx",
            include_timestamp=False
        )

    async def analyze_text(self, text: str, language: str, analyzer: Any) -> Dict[str, Any]:
        """Base method for text analysis."""
        try:
            return await analyzer.analyze(text)
        except Exception as e:
            logger.error(f"Analysis error: {e}")
            return {"error": str(e)}



In [4]:
class KeywordTester(BaseTester):
    """Helper class for testing keyword analysis."""
    
    async def test_statistical_analysis(self, text: str, language: str = None) -> Dict[str, Any]:
        """Test statistical keyword extraction."""
        if language is None:
            from langdetect import detect
            try:
                language = detect(text)
            except:
                language = "en"
        
        # Create processor and analyzer
        processor = create_text_processor(language=language)
        analyzer = KeywordAnalyzer(
            llm=self.llm,  # Pass LLM instance
            config={"weights": {"statistical": 1.0, "llm": 0.0}},  # Statistical only
            language_processor=processor
        )
        
        return await self.analyze_text(text, language, analyzer)

    async def test_llm_analysis(self, text: str, language: str = None) -> Dict[str, Any]:
        """Test LLM-based keyword extraction."""
        if language is None:
            from langdetect import detect
            try:
                language = detect(text)
            except:
                language = "en"
        
        analyzer = KeywordAnalyzer(
            llm=self.llm,  # Pass LLM instance
            config={"weights": {"statistical": 0.0, "llm": 1.0}},  # LLM only
            language_processor=create_text_processor(language=language)
        )
        
        return await self.analyze_text(text, language, analyzer)

    async def test_combined_analysis(self, text: str, language: str = None) -> Dict[str, Any]:
        """Test combined statistical and LLM analysis."""
        if language is None:
            from langdetect import detect
            try:
                language = detect(text)
            except:
                language = "en"
        
        analyzer = KeywordAnalyzer(
            llm=self.llm,  # Pass LLM instance
            config={
                "weights": {"statistical": 0.4, "llm": 0.6},
                "max_keywords": 8,
                "min_confidence": 0.3
            },
            language_processor=create_text_processor(language=language)
        )
        
        return await self.analyze_text(text, language, analyzer)

    def display_keyword_results(self, results: Dict[str, Any]) -> None:
        """Display keyword analysis results."""
        print("\nKeyword Analysis Results:")
        print("-" * 50)
        
        if "error" in results:
            print(f"Error: {results['error']}")
            return
            
        if "keywords" in results:
            print("\nKeywords:", results["keywords"])
            
        if "domain_keywords" in results:
            print("\nDomain Keywords:")
            for domain, keywords in results["domain_keywords"].items():
                print(f"{domain}: {keywords}")

In [5]:
class ThemeTester(BaseTester):
    """Helper class for testing theme analysis."""
    
    async def test_theme_analysis(self, text: str, language: str = None) -> Dict[str, Any]:
        """Test theme analysis on text."""
        if language is None:
            from langdetect import detect
            try:
                language = detect(text)
            except:
                language = "en"
        
        analyzer = ThemeAnalyzer(
            llm=self.llm,
            config={
                "max_themes": 3,
                "min_confidence": 0.3,
                "focus_areas": "business,technical"
            }
        )
        
        return await self.analyze_text(text, language, analyzer)
        
    def display_theme_results(self, results: Any) -> None:
        """Display theme analysis results.
        
        Args:
            results: Either a dict or ThemeOutput model
        """
        print("\nTheme Analysis Results:")
        print("-" * 50)
        
        # Convert to dict if it's a pydantic model
        if hasattr(results, "model_dump"):
            results = results.model_dump()
        elif hasattr(results, "dict"):
            results = results.dict()
            
        # Handle error case
        if isinstance(results, dict) and "error" in results:
            print(f"Error: {results['error']}")
            return

        # Access theme data
        themes_data = results.get("themes", {})
        if isinstance(themes_data, dict):
            themes = themes_data.get("themes", [])
            descriptions = themes_data.get("theme_descriptions", {})
            confidence = themes_data.get("theme_confidence", {})
            keywords = themes_data.get("related_keywords", {})
        else:
            themes = []
            descriptions = {}
            confidence = {}
            keywords = {}
            
        # Display themes
        if not themes:
            print("No themes found.")
            return
            
        for theme in themes:
            print(f"\nTheme: {theme}")
            print(f"Description: {descriptions.get(theme, 'No description available')}")
            print(f"Confidence: {confidence.get(theme, 0):.2f}")
            theme_keywords = keywords.get(theme, [])
            if theme_keywords:
                print(f"Keywords: {', '.join(theme_keywords)}")



In [6]:
class AnalysisPipeline:
    """Complete analysis pipeline for testing multiple analyzers."""
    
    def __init__(self):
        self.file_utils = FileUtils()
        self.keyword_tester = KeywordTester()
        self.theme_tester = ThemeTester()
        
    async def analyze_text(self, text: str, language: str = None) -> Dict[str, Any]:
        """Run complete analysis pipeline on text."""
        if language is None:
            from langdetect import detect
            try:
                language = detect(text)
            except:
                language = "en"
        
        # Run analyses
        keyword_results = await self.keyword_tester.test_combined_analysis(
            text, language=language
        )
        theme_results = await self.theme_tester.test_theme_analysis(
            text, language=language
        )
        
        return {
            "keywords": keyword_results,
            "themes": theme_results,
            "language": language
        }
    
    def display_results(self, results: Dict[str, Any]) -> None:
        """Display complete analysis results."""
        print("\nComplete Analysis Results")
        print("=" * 50)
        
        # Display keyword results
        print("\nKeyword Analysis:")
        print("-" * 20)
        keyword_results = results.get("keywords", {})
        if hasattr(keyword_results, "model_dump"):
            keyword_results = keyword_results.model_dump()
        elif hasattr(keyword_results, "dict"):
            keyword_results = keyword_results.dict()
            
        if isinstance(keyword_results, dict):
            if "keywords" in keyword_results:
                print("Keywords:", keyword_results["keywords"])
                print("Domain Keywords:", keyword_results.get("domain_keywords", {}))
            elif "error" in keyword_results:
                print(f"Error: {keyword_results['error']}")
        
        # Display theme results
        print("\nTheme Analysis:")
        print("-" * 20)
        self.theme_tester.display_theme_results(results.get("themes", {}))



In [7]:
async def run_analysis_examples():
    """Run example analyses on different content types."""
    
    # Example texts
    example_texts = {
        "Business Analysis": """
            Q3 revenue increased by 15% with strong growth in enterprise sales.
            Customer retention improved while acquisition costs decreased.
            New market expansion initiatives are showing positive early results.
        """,
        
        "Technical Content": """
            The application uses microservices architecture with containerized deployments.
            Data processing pipeline incorporates machine learning models for prediction.
            System monitoring ensures high availability and performance metrics.
        """,
        
        "Mixed Content": """
            The IT department's cloud migration project reduced infrastructure costs by 25%.
            DevOps implementation improved deployment frequency while maintaining quality.
            Monthly recurring revenue from SaaS products grew steadily.
        """
    }
    
    pipeline = AnalysisPipeline()
    
    for title, text in example_texts.items():
        print(f"\nAnalyzing {title}")
        print("=" * 50)
        results = await pipeline.analyze_text(text)
        pipeline.display_results(results)
        print("\n" + "-" * 80 + "\n")

In [8]:
async def debug_theme_analysis(text: str):
    """Debug theme analysis with detailed output."""
    print("\nDebug Theme Analysis")
    print("=" * 50)
    
    print("\nInput Text:")
    print("-" * 20)
    print(text.strip())
    
    # Configure logging
    logger = logging.getLogger("src.analyzers.theme_analyzer")
    logger.setLevel(logging.DEBUG)
    
    # Add handler if not already present
    if not logger.handlers:
        handler = logging.StreamHandler()
        formatter = logging.Formatter(
            '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
        )
        handler.setFormatter(formatter)
        logger.addHandler(handler)
    
    print("\nRunning Analysis...")
    print("-" * 20)
    
    # Run analysis
    theme_tester = ThemeTester()
    results = await theme_tester.test_theme_analysis(text)
    
    # Display results
    theme_tester.display_theme_results(results)
    
    if logger.isEnabledFor(logging.DEBUG):
        print("\nDebug Information:")
        print("-" * 20)
        if hasattr(results, "model_dump"):
            print(json.dumps(results.model_dump(), indent=2))
        else:
            print(json.dumps(results, indent=2))
    
    return results



In [9]:
def verify_environment() -> bool:
    """Verify notebook environment setup."""
    from dotenv import load_dotenv
    
    # Load environment variables
    env_path = Path(project_root) / ".env"
    env_loaded = load_dotenv(env_path)
    
    # Required variables
    required_env_vars = [
        'OPENAI_API_KEY',
        'ANTHROPIC_API_KEY',
    ]
    
    # Basic checks
    basic_checks = {
        "Project root in path": project_root in sys.path,
        "Can import src": "src" in sys.modules,
        "FileUtils initialized": hasattr(file_utils, "project_root"),
        ".env file loaded": env_loaded,
    }
    
    # Environment variable checks
    env_var_checks = {
        f"{var} set": os.getenv(var) is not None
        for var in required_env_vars
    }
    
    # Path checks
    expected_paths = {
        "Raw data": file_utils.get_data_path("raw"),
        "Processed data": file_utils.get_data_path("processed"),
        "Configuration": file_utils.get_data_path("configurations"),
        "Main config.yaml": Path(project_root) / "config.yaml"
    }
    
    path_checks = {
        f"{name} exists": path.exists()
        for name, path in expected_paths.items()
    }
    
    # Combine all checks
    all_checks = {
        **basic_checks,
        **env_var_checks,
        **path_checks
    }
    
    print("Environment Check Results:")
    print("=" * 50)
    print()
    
    # Print Basic Setup section
    print("Basic Setup:")
    print("-" * 11)
    for check, result in basic_checks.items():
        status = "✓" if result else "✗"
        print(f"{status} {check}")
    
    # Print Environment Variables section
    print("\nEnvironment Variables:")
    print("-" * 21)
    for check, result in env_var_checks.items():
        status = "✓" if result else "✗"
        print(f"{status} {check}")
    
    # Print Project Structure section
    print("\nProject Structure:")
    print("-" * 17)
    for check, result in path_checks.items():
        status = "✓" if result else "✗"
        print(f"{status} {check}")
    
    # Overall status
    all_passed = all(all_checks.values())
    print("\n" + "=" * 50)
    print("Environment Status:", "Ready ✓" if all_passed else "Setup needed ✗")
    
    # Print setup instructions if needed
    if not all_passed:
        print("\nSetup Instructions:")
        if not env_loaded:
            print("- Create a .env file in the project root with required API keys")
        for var in required_env_vars:
            if not os.getenv(var):
                print(f"- Add {var} to your .env file")
        for name, path in expected_paths.items():
            if not path.exists():
                print(f"- Create {name} directory at {path}")
    
    return all_passed

In [10]:


# First cell: Verify environment
verify_environment()



Environment Check Results:

Basic Setup:
-----------
✓ Project root in path
✓ Can import src
✓ FileUtils initialized
✓ .env file loaded

Environment Variables:
---------------------
✓ OPENAI_API_KEY set
✓ ANTHROPIC_API_KEY set

Project Structure:
-----------------
✓ Raw data exists
✓ Processed data exists
✓ Configuration exists
✓ Main config.yaml exists

Environment Status: Ready ✓


True

In [11]:
# Run the analysis
text = """
Q3 revenue increased by 15% with strong growth in enterprise sales.
Customer retention improved while acquisition costs decreased.
New market expansion initiatives are showing positive early results.
"""

await debug_theme_analysis(text)


Debug Theme Analysis

Input Text:
--------------------
Q3 revenue increased by 15% with strong growth in enterprise sales.
Customer retention improved while acquisition costs decreased.
New market expansion initiatives are showing positive early results.

Running Analysis...
--------------------


2024-11-12 15:16:02 - src.utils.FileUtils.file_utils - INFO - Attempting to load file: c:\Users\tja\OneDrive - Rastor-instituutti ry\Tiedostot\Rastor-instituutti\kehittäminen\analytiikka\repos\semantic-text-analyzer\data\raw\test_content_en.xlsx
2024-11-12 15:16:02 - src.utils.FileUtils.file_utils - INFO - Successfully loaded Excel file: c:\Users\tja\OneDrive - Rastor-instituutti ry\Tiedostot\Rastor-instituutti\kehittäminen\analytiikka\repos\semantic-text-analyzer\data\raw\test_content_en.xlsx
2024-11-12 15:16:02 - src.utils.FileUtils.file_utils - INFO - Attempting to load file: c:\Users\tja\OneDrive - Rastor-instituutti ry\Tiedostot\Rastor-instituutti\kehittäminen\analytiikka\repos\semantic-text-analyzer\data\raw\test_content_fi.xlsx
2024-11-12 15:16:03 - src.utils.FileUtils.file_utils - INFO - Successfully loaded Excel file: c:\Users\tja\OneDrive - Rastor-instituutti ry\Tiedostot\Rastor-instituutti\kehittäminen\analytiikka\repos\semantic-text-analyzer\data\raw\test_content_fi.xlsx
20


Theme Analysis Results:
--------------------------------------------------
Error: None

Debug Information:
--------------------
{
  "language": "en",
  "error": null,
  "success": true,
  "themes": [
    {
      "name": "Revenue Growth",
      "description": "The increase in revenue by 15% indicates strong performance, particularly in enterprise sales.",
      "confidence": 0.9,
      "keywords": [
        "enterprise sales",
        "revenue increase"
      ]
    },
    {
      "name": "Customer Retention",
      "description": "Improvement in customer retention alongside a decrease in acquisition costs suggests effective customer relationship management.",
      "confidence": 0.8,
      "keywords": [
        "customer retention",
        "acquisition costs"
      ]
    },
    {
      "name": "Market Expansion",
      "description": "Positive early results from new market expansion initiatives highlight the company's strategic growth efforts.",
      "confidence": 0.75,
      "keywor

ThemeOutput(language='en', error=None, success=True, themes=[ThemeInfo(name='Revenue Growth', description='The increase in revenue by 15% indicates strong performance, particularly in enterprise sales.', confidence=0.9, keywords=['enterprise sales', 'revenue increase']), ThemeInfo(name='Customer Retention', description='Improvement in customer retention alongside a decrease in acquisition costs suggests effective customer relationship management.', confidence=0.8, keywords=['customer retention', 'acquisition costs']), ThemeInfo(name='Market Expansion', description="Positive early results from new market expansion initiatives highlight the company's strategic growth efforts.", confidence=0.75, keywords=['market expansion', 'early results'])], theme_descriptions={'Revenue Growth': 'The increase in revenue by 15% indicates strong performance, particularly in enterprise sales.', 'Customer Retention': 'Improvement in customer retention alongside a decrease in acquisition costs suggests effe

In [None]:
# Second cell: Run analysis tests
async def run_analysis_examples():
    """Run example analyses on different content types."""
    
    # Example texts
    example_texts = {
        "Business Analysis": """
            Q3 revenue increased by 15% with strong growth in enterprise sales.
            Customer retention improved while acquisition costs decreased.
            New market expansion initiatives are showing positive early results.
        """,
        
        "Technical Content": """
            The application uses microservices architecture with containerized deployments.
            Data processing pipeline incorporates machine learning models for prediction.
            System monitoring ensures high availability and performance metrics.
        """,
        
        "Mixed Content": """
            The IT department's cloud migration project reduced infrastructure costs by 25%.
            DevOps implementation improved deployment frequency while maintaining quality.
            Monthly recurring revenue from SaaS products grew steadily.
        """
    }
    
    pipeline = AnalysisPipeline()
    
    for title, text in example_texts.items():
        print(f"\nAnalyzing {title}")
        print("=" * 50)
        results = await pipeline.analyze_text(text)
        pipeline.display_results(results)
        print("\n" + "-" * 80 + "\n")



In [None]:
# # Third cell: Interactive analysis
# async def analyze_custom_text(text: str, language: str = None):
#     """Analyze custom text with complete pipeline."""
#     return await analyze_text_pipeline(text, language)



In [None]:
# Run complete analysis examples
await run_analysis_examples()



In [None]:
# Or analyze specific text
text = """Your text here..."""
await analyze_custom_text(text)

In [None]:
# Example usage:
await run_analysis_examples()



In [None]:
# Or analyze specific text:
# text = """Your text here..."""
# await analyze_custom_text(text)