# Cross-Environment Semantic Analysis Demo

See also the separate [documentation sheet](../docs/ANALYSIS_DEMO_DOC.md).


## 1. Environment Setup


#### Import dependencies

In [1]:
import asyncio
import logging
from pathlib import Path
import sys
from typing import Optional


# Local runs: expose the parent of the current working directory (the project root)
# to the import system so the `src` package can be imported
project_root = str(Path.cwd().resolve().parent)
if sys.path.count(project_root) == 0:
    sys.path.append(project_root)


In [2]:
# import main interface to analyzers
from src.semantic_analyzer import SemanticAnalyzer

# import formatting
from src.utils.formatting_config import OutputDetail, ExcelOutputConfig


In [3]:
# Import environment setup
from src.core.managers import EnvironmentManager, EnvironmentConfig

### Set up environment

In [4]:
# Where the notebook runs: "local", or "azure" when running in Azure ML
# and you want persistent blob storage
ENV_TYPE = "local"

# Describe the environment; the project root is the parent of the current working directory
env_config = EnvironmentConfig(
    env_type=ENV_TYPE,
    project_root=Path().resolve().parent,
    log_level="INFO",  # for now, change the logging level through config.yaml or .env
)
environment = EnvironmentManager(env_config)

# Pull the initialized helpers out of the environment manager
components = environment.get_components()
file_utils = components["file_utils"]

# Keep the HTTP client libraries quiet: warnings and above only
for _http_logger_name in ("httpx", "httpcore"):
    logging.getLogger(_http_logger_name).setLevel(logging.WARNING)

2025-02-16 17:55:16,829 - FileUtils.core.file_utils - INFO - Project root: /Users/topi/data-science/repos/text-analyzer


2025-02-16 17:55:16,829 - FileUtils.core.file_utils - INFO - Project root: /Users/topi/data-science/repos/text-analyzer


2025-02-16 17:55:16,829 - FileUtils.core.file_utils - INFO - FileUtils initialized with local storage


2025-02-16 17:55:16,829 - FileUtils.core.file_utils - INFO - FileUtils initialized with local storage
2025-02-16 17:55:16,848 - src.core.managers.environment_manager - INFO - Environment initialized successfully


### User defined setup
- Choose the parameter file (how to analyze) and the content file (what to analyze).


In [5]:
# Active parameter/content pair.
# Alternative pairs (swap the two names below to use one of them):
#   "parameters_en.xlsx"          with "test_content_en.xlsx"
#   "business_parameters_en.xlsx" with "business_test_content_en.xlsx"
parameter_file, content_file = (
    "support_parameters_fi.xlsx",
    "support_test_content_fi.xlsx",
)

# Set to True to use the Azure OpenAI API, unless config.yaml already defines it
azure = False

## 2. Initialize Analyzer

- Initialize the analyzer with the parameter file and `file_utils` (the formatting config is applied later, in the Excel processing step)
- Parameter file paths are handled automatically by FileUtils


In [6]:
# One sample sentence per language code
texts = dict(
    en="Machine learning models analyze data efficiently.",
    fi="Koneoppimismallit analysoivat dataa tehokkaasti.",
)

# Build the analyzer from the selected parameter file and the shared file_utils component
analyzer = SemanticAnalyzer(file_utils=file_utils, parameter_file=parameter_file)

async def analyze_text(text: str, language: str) -> None:
    """Analyze one text in the given language and print the results.

    Switches the module-level ``analyzer`` to ``language``, requests keyword,
    theme and category analysis for ``text`` and prints each result section.
    When the result reports failure, only the error is printed.

    Args:
        text: The text to analyze.
        language: Language identifier passed to ``analyzer.set_language``
            (the demo uses "en" and "fi").

    Returns:
        None. All output is written to stdout.
    """
    # The analyzer object is shared by the whole notebook, so select the
    # language before every call
    analyzer.set_language(language)

    result = await analyzer.analyze(
        text=text,
        analysis_types=["keywords", "themes", "categories"],
    )

    if not result.success:
        print(f"Error: {result.error}")
        return

    print(f"\nAnalysis results for {language}:")

    # Categories were already treated as optional; keywords and themes get the
    # same tolerance so a missing section prints nothing instead of raising
    # AttributeError. Headers are still printed to keep the output layout stable.
    print("Keywords:")
    if result.keywords is not None:
        for kw in result.keywords.keywords or []:
            print(f"• {kw.keyword} (score: {kw.score:.2f})")

    print("\nThemes:")
    if result.themes is not None:
        for theme in result.themes.themes or []:
            print(f"• {theme.name} ({theme.confidence:.2f})")

    # The category section is printed only when there is at least one match
    if result.categories and result.categories.matches:
        print("\nCategories:")
        for cat in result.categories.matches:
            print(f"• {cat.name} ({cat.confidence:.2f})")

2025-02-16 17:55:17,035 - src.core.language_processing.finnish - INFO - Loaded 747 stopwords from /Users/topi/data-science/repos/text-analyzer/data/config/stop_words/fi.txt
2025-02-16 17:55:17,035 - src.core.language_processing.finnish.VoikkoHandler - DEBUG - Initialized with config: {'min_keyword_length': 3, 'include_compounds': True}
2025-02-16 17:55:17,035 - src.core.language_processing.finnish.VoikkoHandler - DEBUG - Trying library paths: ['/opt/homebrew/lib/libvoikko.dylib', '/usr/local/lib/libvoikko.dylib']
2025-02-16 17:55:17,035 - src.core.language_processing.finnish.VoikkoHandler - DEBUG - Trying dictionary paths: ['/opt/homebrew/lib/voikko', '/usr/local/lib/voikko', '/usr/local/share/voikko']
2025-02-16 17:55:17,042 - src.core.language_processing.finnish.VoikkoHandler - INFO - Successfully initialized Voikko with path: /opt/homebrew/lib/voikko
2025-02-16 17:55:17,174 - src.semantic_analyzer.analyzer - INFO - Verifying analyzer configuration:
2025-02-16 17:55:17,174 - src.sema

#### Single text analysis


In [7]:
print("\n=== Single Text Analysis ===")
for lang, text in texts.items():
    await analyze_text(text, lang)


=== Single Text Analysis ===


2025-02-16 17:55:18,809 - src.semantic_analyzer.analyzer - INFO - Verifying analyzer configuration:
2025-02-16 17:55:18,809 - src.semantic_analyzer.analyzer - INFO - Language: en
2025-02-16 17:55:18,809 - src.semantic_analyzer.analyzer - INFO - Categories loaded: 3
2025-02-16 17:55:18,810 - src.semantic_analyzer.analyzer - INFO -   - kirjautumisongelmat: 4 keywords, threshold: 0.6
2025-02-16 17:55:18,810 - src.semantic_analyzer.analyzer - INFO -   - järjestelmävirheet: 4 keywords, threshold: 0.6
2025-02-16 17:55:18,810 - src.semantic_analyzer.analyzer - INFO -   - dokumentaatio-ongelmat: 4 keywords, threshold: 0.6
2025-02-16 17:55:18,810 - src.semantic_analyzer.analyzer - INFO - Language processor: EnglishTextProcessor
2025-02-16 17:55:18,811 - src.semantic_analyzer.analyzer - INFO - All analyzers initialized for language: en
2025-02-16 17:55:18,811 - src.semantic_analyzer.analyzer - INFO - Language switched to en
2025-02-16 17:55:22,008 - src.core.language_processing.finnish - INFO - 


Analysis results for en:
Keywords:
• machine learning (score: 0.90)
• model (score: 0.70)
• analyze (score: 0.60)
• data (score: 0.80)
• efficiently (score: 0.50)

Themes:
• Efficiency of Machine Learning (0.90)

Categories:
• Machine Learning (0.90)
• Data Analysis (0.85)

Analysis results for fi:
Keywords:
• koneoppimismalli (score: 0.90)
• data (score: 0.80)
• analysoida (score: 0.70)
• tehokas (score: 0.60)

Themes:
• Koneoppimismallit (0.90)
• Datan analysointi (0.85)
• Tehokkuus (0.80)

Categories:
• järjestelmävirheet (0.70)
• dokumentaatio-ongelmat (0.50)


#### Excel processing


In [8]:
# Configure output formatting
output_config = ExcelOutputConfig(
    output_detail=OutputDetail.MINIMAL,
    include_metadata=True,
    include_confidence_scores=True
)

# Analyze Excel file
result_df = await analyzer.analyze_excel(
    content_file=content_file,
    analysis_types=["keywords", "themes", "categories"],
    save_results=True,
    output_file="results.xlsx",
    output_config=output_config
)

print("\nExcel analysis completed successfully")
print(f"Results saved to: results.xlsx")
print("\nAnalysis Results:")
print(result_df)

2025-02-16 17:55:25,950 - src.semantic_analyzer.analyzer - INFO - Running analysis types: ['categories', 'keywords', 'themes']
Processing rows:   0%|          | 0/4 [00:00<?, ?it/s]


Processing row 1


Processing rows:  25%|██▌       | 1/4 [00:09<00:29,  9.79s/it]

✓ Row completed

Processing row 2


Processing rows:  50%|█████     | 2/4 [00:22<00:22, 11.37s/it]

✓ Row completed

Processing row 3


Processing rows:  75%|███████▌  | 3/4 [00:31<00:10, 10.49s/it]

✓ Row completed

Processing row 4


Processing rows: 100%|██████████| 4/4 [00:38<00:00,  9.60s/it]
2025-02-16 17:56:04,408 - LocalStorage - INFO - Saved Excel file with sheets: ['Analysis Results', 'Summary']
2025-02-16 17:56:04,408 - LocalStorage - INFO - Saved Excel file with sheets: ['Analysis Results', 'Summary']


✓ Row completed
2025-02-16 17:56:04,408 - FileUtils.core.file_utils - INFO - Data saved successfully: {'results_20250216_175604': '/Users/topi/data-science/repos/text-analyzer/data/processed/results_20250216_175604.xlsx'}


2025-02-16 17:56:04,408 - FileUtils.core.file_utils - INFO - Data saved successfully: {'results_20250216_175604': '/Users/topi/data-science/repos/text-analyzer/data/processed/results_20250216_175604.xlsx'}



Excel analysis completed successfully
Results saved to: results.xlsx

Analysis Results:
                                          categories  \
0  kirjautumisongelmat (0.90): Tunnistautumis- ja...   
1  järjestelmävirheet (0.90): Tekniset virheet ja...   
2  järjestelmävirheet (0.90): Tekniset virheet ja...   
3  järjestelmävirheet (0.90): Tekniset virheet ja...   

                                            keywords  \
0  ongelma (0.90) [tekninen tuki]; kirjautua (0.8...   
1  raporttien vientiominaisuus (0.90) [teknologia...   
2  virhekoodin e1234 (0.90) [teknologia]; synkron...   
3  api-dokumentaatioon (0.90) [teknologia]; kehit...   

                                              themes  \
0  Tekninen ongelma (0.90): Tekstissä kuvataan on...   
1  Tekninen ongelma (0.90): Raporttien vientiomin...   
2  Virhekoodin ongelmat (0.90): Tekstissä mainita...   
3  Access Issues (0.95): The text highlights a pr...   

                                             content  \
0  Minulla o