# Cross-Environment Semantic Analysis Demo

See also the separate [documentation sheet](../docs/ANALYSIS_DEMO_DOC.md).


## 1. Environment Setup


#### Import dependencies

In [1]:
import asyncio
import logging
from pathlib import Path
import sys
from typing import Optional


# Expose the repository root (the parent of the current working directory)
# to the import system so the `src.*` packages resolve in a local checkout.
repo_root_dir = Path.cwd().resolve().parent
project_root = str(repo_root_dir)
if sys.path.count(project_root) == 0:
    sys.path.append(project_root)


In [2]:
# import main interface to analyzers
from src.semantic_analyzer import SemanticAnalyzer

# import formatting
from src.utils.formatting_config import OutputDetail, ExcelOutputConfig


In [3]:
# Import environment setup
from src.core.managers import EnvironmentManager, EnvironmentConfig

### Set up environment

In [4]:
# Deployment target: "local" by default; change to "azure" when running in
# Azure ML and you want persistent blob storage.
ENV_TYPE = "local"

# Bundle the environment settings. The log level is passed as "INFO" here;
# use config.yaml or .env for now to change the logging level.
env_config = EnvironmentConfig(
    env_type=ENV_TYPE, project_root=Path().resolve().parent, log_level="INFO"
)
environment = EnvironmentManager(env_config)

# Fetch the initialized components and keep a handle on the file utilities
components = environment.get_components()
file_utils = components["file_utils"]

# Quiet the HTTP client loggers: only WARNING and above get through
for http_logger_name in ("httpx", "httpcore"):
    logging.getLogger(http_logger_name).setLevel(logging.WARNING)

2025-02-20 21:54:20,453 - FileUtils.core.file_utils - INFO - Project root: /Users/topi/data-science/repos/text-analyzer


2025-02-20 21:54:20,453 - FileUtils.core.file_utils - INFO - Project root: /Users/topi/data-science/repos/text-analyzer


2025-02-20 21:54:20,454 - FileUtils.core.file_utils - INFO - FileUtils initialized with local storage


2025-02-20 21:54:20,454 - FileUtils.core.file_utils - INFO - FileUtils initialized with local storage
2025-02-20 21:54:20,471 - src.core.managers.environment_manager - INFO - Environment initialized successfully


### User-defined setup
- Select the parameter file (how to analyze) and the content file (what to analyze).


In [5]:
# Available parameter/content file pairs (edit the assignment below to switch):
#   "parameters_en.xlsx"           + "test_content_en.xlsx"
#   "support_parameters_fi.xlsx"   + "support_test_content_fi.xlsx"   <- active
#   "business_parameters_en.xlsx"  + "business_test_content_en.xlsx"
parameter_file, content_file = (
    "support_parameters_fi.xlsx",
    "support_test_content_fi.xlsx",
)

# Set to True if you want to use the Azure OpenAI API and it is not already
# defined in config.yaml
azure = False

## 2. Initialize Analyzer

- Initialize the analyzer with the selected parameter file (the output formatting config is passed later, in the Excel processing step)
- Parameter file paths are handled automatically by FileUtils


In [6]:
# Sample inputs keyed by language code. Further samples that can be enabled:
#   "en": "Machine learning models analyze data efficiently."
#   "fi": "Koneoppimismallit analysoivat dataa tehokkaasti."
texts = dict(
    fi=(
        "Onko Python-ohjelmoinnin yrityskoulutuspaketteihin saatavilla ryhmäalennuksia? "
        "Meillä on 10 hengen kehittäjätiimi."
    ),
)


# Build the shared analyzer from the selected parameter file and the
# file utilities component.
analyzer = SemanticAnalyzer(parameter_file=parameter_file, file_utils=file_utils)

async def analyze_text(text: str, language: str) -> None:
    """Analyze one text with the shared ``analyzer`` and print the results.

    Switches the module-level ``analyzer`` to ``language``, requests keyword,
    theme and category analysis, and prints one section per analysis type.
    When ``result.success`` is falsy, prints ``result.error`` instead.

    Args:
        text: The text to analyze.
        language: Language identifier passed to ``analyzer.set_language``.

    Returns:
        None. All output goes to stdout.
    """
    # Set the language first
    analyzer.set_language(language)

    result = await analyzer.analyze(
        text=text,
        analysis_types=["keywords", "themes", "categories"],
    )

    if not result.success:
        print(f"Error: {result.error}")
        return

    print(f"\nAnalysis results for {language}:")

    # The categories section was already guarded against a missing sub-result;
    # apply the same guard to keywords and themes so a missing sub-result is
    # shown as an empty section instead of raising AttributeError.
    keyword_items = result.keywords.keywords if result.keywords else []
    print("Keywords:")
    for kw in keyword_items or []:
        print(f"• {kw.keyword} (score: {kw.score:.2f})")

    theme_items = result.themes.themes if result.themes else []
    print("\nThemes:")
    for theme in theme_items or []:
        print(f"• {theme.name} ({theme.confidence:.2f})")

    # Categories are printed only when there is at least one match
    if result.categories and result.categories.matches:
        print("\nCategories:")
        for cat in result.categories.matches:
            print(f"• {cat.name} ({cat.confidence:.2f})")

2025-02-20 21:54:21,057 - src.core.language_processing.finnish - INFO - Loaded 747 stopwords from /Users/topi/data-science/repos/text-analyzer/data/config/stop_words/fi.txt
2025-02-20 21:54:21,058 - src.core.language_processing.finnish.VoikkoHandler - DEBUG - Initialized with config: {'min_keyword_length': 3, 'include_compounds': True}
2025-02-20 21:54:21,058 - src.core.language_processing.finnish.VoikkoHandler - DEBUG - Trying library paths: ['/opt/homebrew/lib/libvoikko.dylib', '/usr/local/lib/libvoikko.dylib']
2025-02-20 21:54:21,058 - src.core.language_processing.finnish.VoikkoHandler - DEBUG - Trying dictionary paths: ['/opt/homebrew/lib/voikko', '/usr/local/lib/voikko', '/usr/local/share/voikko']
2025-02-20 21:54:21,066 - src.core.language_processing.finnish.VoikkoHandler - INFO - Successfully initialized Voikko with path: /opt/homebrew/lib/voikko
2025-02-20 21:54:21,197 - src.semantic_analyzer.analyzer - INFO - Verifying analyzer configuration:
2025-02-20 21:54:21,198 - src.sema

#### Single text analysis


In [7]:
print("\n=== Single Text Analysis ===")
for lang, text in texts.items():
    await analyze_text(text, lang)

2025-02-20 21:54:21,215 - src.core.language_processing.finnish - INFO - Loaded 747 stopwords from /Users/topi/data-science/repos/text-analyzer/data/config/stop_words/fi.txt
2025-02-20 21:54:21,216 - src.core.language_processing.finnish.VoikkoHandler - DEBUG - Initialized with config: {'min_keyword_length': 3, 'include_compounds': True}
2025-02-20 21:54:21,216 - src.core.language_processing.finnish.VoikkoHandler - DEBUG - Trying library paths: ['/opt/homebrew/lib/libvoikko.dylib', '/usr/local/lib/libvoikko.dylib']
2025-02-20 21:54:21,216 - src.core.language_processing.finnish.VoikkoHandler - DEBUG - Trying dictionary paths: ['/opt/homebrew/lib/voikko', '/usr/local/lib/voikko', '/usr/local/share/voikko']
2025-02-20 21:54:21,218 - src.core.language_processing.finnish.VoikkoHandler - INFO - Successfully initialized Voikko with path: /opt/homebrew/lib/voikko
2025-02-20 21:54:21,220 - src.core.language_processing.finnish - INFO - Loaded 747 stopwords from /Users/topi/data-science/repos/text-


=== Single Text Analysis ===

Analysis results for fi:
Keywords:
• Python-ohjelmointi (score: 0.90)
• yrityskoulutuspaketti (score: 0.85)
• ryhmäalennus (score: 0.80)
• kehittäjätiimi (score: 0.75)
• henki (score: 0.70)

Themes:
• Ryhmät ja alennukset (0.90)
• Koulutustarpeet (0.85)

Categories:
• yrityskoulutus (0.85)
• koulutus (0.75)


#### Excel processing


In [8]:
# Configure output formatting
output_config = ExcelOutputConfig(
    output_detail=OutputDetail.MINIMAL,
    include_metadata=True,
    include_confidence_scores=True
)

# Analyze Excel file
result_df = await analyzer.analyze_excel(
    content_file=content_file,
    analysis_types=["keywords", "themes", "categories"],
    save_results=True,
    output_file="results.xlsx",
    output_config=output_config
)

print("\nExcel analysis completed successfully")
print(f"Results saved to: results.xlsx")
print("\nAnalysis Results:")
print(result_df)

2025-02-20 21:54:28,071 - src.semantic_analyzer.analyzer - INFO - Running analysis types: ['themes', 'categories', 'keywords']
Processing rows:   0%|          | 0/4 [00:00<?, ?it/s]


Processing row 1


Processing rows:  25%|██▌       | 1/4 [00:13<00:40, 13.49s/it]

✓ Row completed

Processing row 2


Processing rows:  50%|█████     | 2/4 [00:25<00:24, 12.36s/it]

✓ Row completed

Processing row 3


Processing rows:  75%|███████▌  | 3/4 [00:35<00:11, 11.65s/it]

✓ Row completed

Processing row 4


Processing rows: 100%|██████████| 4/4 [00:44<00:00, 11.23s/it]
2025-02-20 21:55:13,048 - LocalStorage - INFO - Saved Excel file with sheets: ['Analysis Results', 'Summary']
2025-02-20 21:55:13,048 - LocalStorage - INFO - Saved Excel file with sheets: ['Analysis Results', 'Summary']


✓ Row completed
2025-02-20 21:55:13,049 - FileUtils.core.file_utils - INFO - Data saved successfully: {'results_20250220_215513': '/Users/topi/data-science/repos/text-analyzer/data/processed/results_20250220_215513.xlsx'}


2025-02-20 21:55:13,049 - FileUtils.core.file_utils - INFO - Data saved successfully: {'results_20250220_215513': '/Users/topi/data-science/repos/text-analyzer/data/processed/results_20250220_215513.xlsx'}



Excel analysis completed successfully
Results saved to: results.xlsx

Analysis Results:
                                              themes  \
0  Tekninen ongelma (0.90): Tekstissä kuvataan on...   
1  Tekninen ongelma (0.90): Raporttien vientiomin...   
2  Virhekoodin ongelmat (0.90): Tekstissä mainita...   
3  Access Issues (0.90): The text highlights a pr...   

                                          categories  \
0  kirjautumisongelmat (0.90): Tunnistautumis- ja...   
1  järjestelmävirheet (0.90): Tekniset virheet ja...   
2  järjestelmävirheet (0.90): Tekniset virheet ja...   
3  järjestelmävirheet (0.90): Tekniset virheet ja...   

                                            keywords  \
0  ongelma (0.90) [teknologia]; kirjautuminen (0....   
1  raportti (0.80) [ohjelmointi]; vientiominaisuu...   
2  virhekoodi (0.90) [teknologia]; E1234 (0.80) [...   
3  API-dokumentaatio (0.90) [teknologia]; kehittä...   

                                             content  \
0  Minulla o