# Semantic Analysis Pipeline

This notebook demonstrates the semantic text analysis capabilities using our custom analyzers.

## Setup
Import required packages and configure the environment:



In [1]:
# At start of notebook
import sys
from pathlib import Path
import logging
import os

# Make the parent of the current working directory importable (once only)
project_root = str(Path().resolve().parent)
if sys.path.count(project_root) == 0:
    sys.path.append(project_root)

In [2]:
# Prepare the notebook environment; both helpers are given the same DEBUG level.
from src.nb_helpers.environment import setup_notebook_env, verify_environment

setup_notebook_env(log_level="DEBUG")

# Last expression of the cell: the value returned by the check is displayed below it.
verify_environment(log_level="DEBUG")

2024-11-20 14:03:30,306 - src.utils.FileUtils.file_utils - DEBUG [file_utils.py:192] - Initialized FileUtils with log level: INFO
2024-11-20 14:03:30,310 - src.utils.FileUtils.file_utils - DEBUG [file_utils.py:198] - Project root: c:\Users\tja\OneDrive - Rastor-instituutti ry\Tiedostot\Rastor-instituutti\kehittäminen\analytiikka\repos\semantic-text-analyzer
2024-11-20 14:03:30,320 - src.utils.FileUtils.file_utils - DEBUG [file_utils.py:192] - Initialized FileUtils with log level: INFO
2024-11-20 14:03:30,324 - src.utils.FileUtils.file_utils - DEBUG [file_utils.py:198] - Project root: c:\Users\tja\OneDrive - Rastor-instituutti ry\Tiedostot\Rastor-instituutti\kehittäminen\analytiikka\repos\semantic-text-analyzer


Environment Check Results:

Basic Setup:
-----------
✓ Project root in path
✓ FileUtils initialized
✓ .env file loaded

Environment Variables:
---------------------
✓ OPENAI_API_KEY set
✓ ANTHROPIC_API_KEY set

Project Structure:
-----------------
✓ Raw data exists
✓ Processed data exists
✓ Configuration exists
✓ Main config.yaml exists

Environment Status: Ready ✓


True

In [3]:
# Import necessary components
from src.loaders.parameter_handler import ParameterHandler
from src.nb_helpers.analyzers import (
    analyze_keywords,
    analyze_themes,
    analyze_categories,
    analyze_text,
    AnalysisOptions
)

from scripts.migrate_parameters import create_example_parameters
from src.nb_helpers.logging import configure_logging, verify_logging_setup_with_hierarchy, reset_debug_logging
from src.loaders.parameter_handler import ParameterHandler, get_parameter_file_path, verify_parameter_file



In [4]:
# Start with DEBUG-level logging
configure_logging(level="DEBUG")

# The HTTP / LLM client loggers stay at INFO
for name in ("httpx", "httpcore", "openai", "anthropic"):
    logging.getLogger(name).setLevel(logging.INFO)

# Print the resulting logger configuration
verify_logging_setup_with_hierarchy()

2024-11-20 14:03:30,508 - src.nb_helpers.logging - DEBUG - Logging configured at DEBUG level



Logging Configuration:
--------------------------------------------------

Logger: root
Set Level: DEBUG
Effective Level: DEBUG
Propagates to root: True
Handlers:
  Handler 1 level: DEBUG

Logger: src.nb_helpers.analyzers
Hierarchy:
  src: NOTSET
  src.nb_helpers: NOTSET
  src.nb_helpers.analyzers: DEBUG
Set Level: DEBUG
Effective Level: DEBUG
Propagates to root: True
No handlers (uses root handlers)

Logger: src.analyzers.keyword_analyzer
Hierarchy:
  src: NOTSET
  src.analyzers: NOTSET
  src.analyzers.keyword_analyzer: DEBUG
Set Level: DEBUG
Effective Level: DEBUG
Propagates to root: True
No handlers (uses root handlers)

Logger: src.analyzers.theme_analyzer
Hierarchy:
  src: NOTSET
  src.analyzers: NOTSET
  src.analyzers.theme_analyzer: DEBUG
Set Level: DEBUG
Effective Level: DEBUG
Propagates to root: True
No handlers (uses root handlers)

Logger: src.analyzers.category_analyzer
Hierarchy:
  src: NOTSET
  src.analyzers: NOTSET
  src.analyzers.category_analyzer: DEBUG
Set Level: DEB

In [5]:
# detailed_logging_info = True
# if detailed_logging_info:
#     from src.nb_helpers.logging import verify_logging_setup_with_hierarchy
#     # Configure logging
#     # configure_logging(level="DEBUG")
#     # Verify with detailed information
#     verify_logging_setup_with_hierarchy()


In [6]:
# Example texts in different languages.
# Keys follow a "<Language> <Domain>" pattern: one technical and one business
# passage in English and in Finnish. The triple-quoted values keep their leading
# newline and indentation exactly as written.
example_texts = {
    "English Technical": """
        The cloud migration project improved system scalability while reducing costs.
        New DevOps practices streamlined the deployment pipeline significantly.
    """,
    
    "Finnish Technical": """
        Pilvipalveluihin siirtyminen paransi järjestelmän skaalautuvuutta ja vähensi kustannuksia.
        Uudet DevOps-käytännöt tehostivat merkittävästi käyttöönottoprosessia.
    """,
    
    "English Business": """
        Q3 financial results show 15% revenue growth and improved profit margins.
        Customer acquisition costs decreased while retention rates increased.
    """,
    
    "Finnish Business": """
        Q3 taloudelliset tulokset osoittavat 15% liikevaihdon kasvun ja parantuneet katteet.
        Asiakashankinnan kustannukset laskivat ja asiakaspysyvyys parani.
    """
}



In [7]:
# Resolve the parameter workbook, generating an example one if it is missing
params_file_name = "parameters_en.xlsx"
params_file = get_parameter_file_path(params_file_name)

if params_file.exists():
    print(f"Using existing parameter file at: {params_file}")
else:
    params_file = create_example_parameters(params_file_name)
    print(f"Created parameter file at: {params_file}")

# Run the verification helper on the resolved path
verify_parameter_file(params_file)

# The handler is constructed from the bare file name, not the resolved path
handler = ParameterHandler(params_file_name)
params = handler.get_parameters()



2024-11-20 14:03:30,640 - src.utils.FileUtils.file_utils - DEBUG [file_utils.py:192] - Initialized FileUtils with log level: INFO
2024-11-20 14:03:30,640 - src.utils.FileUtils.file_utils - DEBUG - Initialized FileUtils with log level: INFO
2024-11-20 14:03:30,642 - src.utils.FileUtils.file_utils - DEBUG [file_utils.py:198] - Project root: c:\Users\tja\OneDrive - Rastor-instituutti ry\Tiedostot\Rastor-instituutti\kehittäminen\analytiikka\repos\semantic-text-analyzer
2024-11-20 14:03:30,642 - src.utils.FileUtils.file_utils - DEBUG - Project root: c:\Users\tja\OneDrive - Rastor-instituutti ry\Tiedostot\Rastor-instituutti\kehittäminen\analytiikka\repos\semantic-text-analyzer


Using existing parameter file at: c:\Users\tja\OneDrive - Rastor-instituutti ry\Tiedostot\Rastor-instituutti\kehittäminen\analytiikka\repos\semantic-text-analyzer\data\parameters\parameters_en.xlsx

Parameter File Verification:
Absolute path: C:\Users\tja\OneDrive - Rastor-instituutti ry\Tiedostot\Rastor-instituutti\kehittäminen\analytiikka\repos\semantic-text-analyzer\data\parameters\parameters_en.xlsx
File exists: True


2024-11-20 14:03:32,636 - src.utils.FileUtils.file_utils - DEBUG [file_utils.py:192] - Initialized FileUtils with log level: INFO



Found sheets:

General Parameters:
                parameter                           value  \
0            max_keywords                              10   
1                focus_on  technical and business content   
2  column_name_to_analyze                         content   
3      min_keyword_length                               3   
4       include_compounds                            True   

                          description  
0  Maximum keywords to extract (1-20)  
1                 Analysis focus area  
2          Name of the content column  
3              Minimum keyword length  
4              Include compound words  

Categories:
              category                                 description  \
0    technical_content  Technical and software development content   
1     business_content              Business and financial content   
2  educational_content            Educational and training content   

                                            keywords  threshold

2024-11-20 14:03:32,636 - src.utils.FileUtils.file_utils - DEBUG - Initialized FileUtils with log level: INFO
2024-11-20 14:03:32,639 - src.utils.FileUtils.file_utils - DEBUG [file_utils.py:198] - Project root: c:\Users\tja\OneDrive - Rastor-instituutti ry\Tiedostot\Rastor-instituutti\kehittäminen\analytiikka\repos\semantic-text-analyzer
2024-11-20 14:03:32,639 - src.utils.FileUtils.file_utils - DEBUG - Project root: c:\Users\tja\OneDrive - Rastor-instituutti ry\Tiedostot\Rastor-instituutti\kehittäminen\analytiikka\repos\semantic-text-analyzer
2024-11-20 14:03:32,639 - src.utils.FileUtils.file_utils - DEBUG [file_utils.py:198] - Project root: c:\Users\tja\OneDrive - Rastor-instituutti ry\Tiedostot\Rastor-instituutti\kehittäminen\analytiikka\repos\semantic-text-analyzer
2024-11-20 14:03:32,639 - src.utils.FileUtils.file_utils - DEBUG - Project root: c:\Users\tja\OneDrive - Rastor-instituutti ry\Tiedostot\Rastor-instituutti\kehittäminen\analytiikka\repos\semantic-text-analyzer
2024-11-20

In [8]:
# Show the loaded parameter set via its string representation
# (params.print() is the alternative noted by the original author)
print("\nLoaded parameters:", params, sep="\n")


Loaded parameters:
ParameterSet(
  general=max_keywords=10 min_keyword_length=3 language='en' focus_on='technical and business content' include_compounds=True max_themes=3 min_confidence=0.3 column_name_to_analyze='content',
  categories=3 items,
  predefined_keywords=3 items,
  excluded_keywords=3 items,
  analysis_settings=theme_analysis=ThemeAnalysisSettings(enabled=True, min_confidence=0.5) weights=AnalysisWeights(statistical=0.4, llm=0.6),
  domain_context=0 items
)


In [9]:
# Optionally generate the example parameter workbooks for English and Finnish.
# Set the flag to True to create them; with False the cell only prints a notice.
create_new_example_files = False

if create_new_example_files:
    en_params = create_example_parameters("parameters_en.xlsx", "en")
    fi_params = create_example_parameters("parameters_fi.xlsx", "fi")
    # Report the created files once
    print(f"Created parameter files:\n- {en_params}\n- {fi_params}")
else:
    print("Not creating new example files.")

Not creating new example files.


In [10]:
# Validate the loaded parameters and report warnings / errors
print("Parameter Validation:", "-" * 50, sep="\n")
is_valid, warnings, errors = handler.validate()

# Warnings are listed regardless of the overall verdict
if warnings:
    print("\nWarnings:")
    for warning in warnings:
        print(f"- {warning}")

if is_valid:
    print("\nParameters validated successfully!")
else:
    print("\nErrors:")
    for error in errors:
        print(f"- {error}")

Parameter Validation:
--------------------------------------------------

Parameters validated successfully!


In [11]:
# Test cases.
# test_texts: English passages keyed by a descriptive scenario label.
test_texts = {
    "Technical Compound Terms": """
        The cloud migration project improved system scalability.
        DevOps practices streamlined the deployment pipeline.
        Our microservices architecture enables API integrations.
    """,
    
    "Mixed Domain Content": """
        The IT department's infrastructure costs decreased by 25%
        after implementing cloud-native solutions. Monthly recurring
        revenue from SaaS products grew steadily while deployment
        frequency improved.
    """,
    
    "Business Focus": """
        Market analysis shows 15% revenue growth in Q3.
        Customer acquisition costs decreased while retention rates
        increased. Strategic partnerships drove innovation.
    """,
    
    "Multiple Compounds": """
        Machine learning models process real-time data streams.
        The CI/CD pipeline integrates automated testing workflows.
        Cloud-based infrastructure supports multi-region deployments.
    """
}

# finnish_texts: single-line Finnish passages; keys are "<domain>_fi_<n>" with
# three technical and three business samples.
finnish_texts = {
    "technical_fi_1":"Pilvipalveluiden käyttöönotto tehosti järjestelmän skaalautuvuutta merkittävästi. DevOps-prosessit nopeuttivat julkaisusykliä ja automatisoivat laadunvarmistusta. Kuukausittainen tilaustuotto SaaS-ratkaisuista kasvoi 25%.",
    "technical_fi_2":"Mikropalveluarkkitehtuuri mahdollisti järjestelmän modulaarisen kehityksen. Konttiteknologian avulla saavutettiin parempi resurssien käyttöaste ja joustavampi ylläpito. Rajapintojen dokumentointi helpotti integraatioiden toteuttamista.",
    "technical_fi_3":"Tekoälypohjaiset ennusteet auttoivat optimoimaan kuormantasausta. Pilvinatiivi lähestymistapa vähensi infrastruktuurikustannuksia ja paransi vikasietoisuutta. Monitorointi tarjosi reaaliaikaista näkyvyyttä suorituskykyyn.",
    "business_fi_1":"Liikevaihdon kasvu vahvistui kolmannella vuosineljänneksellä 15 prosenttiin. Asiakashankinnan kustannukset laskivat samalla kun asiakaspysyvyys parani. Markkinaosuus kasvoi erityisesti pilvipalveluiden segmentissä.",
    "business_fi_2":"Analytiikkatyökalut paljastivat uusia käyttäytymismalleja asiakasrajapinnassa. Toistuvaislaskutuksen osuus kokonaistuotoista nousi 75 prosenttiin. Automaattinen raportointi tehosti päätöksentekoa.",
    "business_fi_3":"Uudet tuotelanseeraukset vahvistivat kilpailuasemaa. Strategiset kumppanuudet mahdollistivat laajentumisen uusille markkina-alueille. Resurssien kohdentaminen tuotekehitykseen tuotti merkittävää kasvua."
}



In [None]:
from src import KeywordAnalyzer, ThemeAnalyzer, CategoryAnalyzer, TextAnalyzer
from src.core.language_processing import create_text_processor


### Theme analysis

In [None]:
# Create language processor
processor = create_text_processor(language="fi")

# Initialize analyzer
analyzer = ThemeAnalyzer(
    language_processor=processor,
    config={
        "max_themes": 3,
        "min_confidence": 0.3,
        "focus_on": "technical"
    }
)

# Analyze text
results = await analyzer.analyze(text)

# Display results
analyzer.display_themes(results)

In [12]:
# Helper for exercising the keyword analyzer from notebook cells
async def test_keyword_analyzer(text: str, show_debug: bool = True, language: str = "en"):
    """Run keyword analysis on ``text`` with all display flags switched on.

    Args:
        text: The text to analyze.
        show_debug: Passed to the options as ``show_raw_data``.
        language: Passed to the options as ``language``.

    Returns:
        The awaited result of ``analyze_keywords`` for ``text`` and the options
        built here.
    """
    option_values = {
        "show_confidence": True,
        "show_evidence": True,
        "show_keywords": True,
        "show_raw_data": show_debug,
        "debug_mode": True,
        "language": language,
    }
    return await analyze_keywords(text, AnalysisOptions(**option_values))



In [None]:
# from src.nb_helpers.testers import analyze_problematic_words, KeywordTester
# Method 1: Use directly with a processor
# processor = create_text_processor(language="fi")
# problematic_words = ["para", "parani", "parantua", "kasvu", "kasvaa"]
# analyze_problematic_words(processor, problematic_words)

# # Method 2: Use through KeywordTester
# tester = KeywordTester(language_processor=create_text_processor(language="fi"))
# tester.analyze_words(["para", "parani", "parantua", "kasvu", "kasvaa"])

2024-11-20 14:03:35,710 - src.core.language_processing.factory - DEBUG - Using default configuration
2024-11-20 14:03:35,712 - src.core.language_processing.factory - DEBUG - Creating fi processor
2024-11-20 14:03:35,716 - src.utils.FileUtils.file_utils - DEBUG [file_utils.py:192] - Initialized FileUtils with log level: INFO
2024-11-20 14:03:35,716 - src.utils.FileUtils.file_utils - DEBUG - Initialized FileUtils with log level: INFO
2024-11-20 14:03:35,719 - src.utils.FileUtils.file_utils - DEBUG [file_utils.py:198] - Project root: c:\Users\tja\OneDrive - Rastor-instituutti ry\Tiedostot\Rastor-instituutti\kehittäminen\analytiikka\repos\semantic-text-analyzer
2024-11-20 14:03:35,719 - src.utils.FileUtils.file_utils - DEBUG - Project root: c:\Users\tja\OneDrive - Rastor-instituutti ry\Tiedostot\Rastor-instituutti\kehittäminen\analytiikka\repos\semantic-text-analyzer
2024-11-20 14:03:35,746 - src.core.language_processing.finnish - INFO - Loaded 747 stopwords from c:\Users\tja\OneDrive - Ra


Analyzing word: 'para'
Base form: parantua
Is verb: True
Should keep: False

Voikko analysis:
  BASEFORM: para
  CLASS: nimisana
  FSTOUTPUT: [Ln][Xp]para[X]par[Sn][Ny]a
  NUMBER: singular
  SIJAMUOTO: nimento
  STRUCTURE: =pppp
  WORDBASES: +para(para)
--------------------------------------------------

Analyzing word: 'parani'
Base form: None
Is verb: True
Should keep: False

Voikko analysis:
  BASEFORM: para
  CLASS: nimisana
  FSTOUTPUT: [Ln][Xp]para[X]par[Sn][Ny]a[O1y]ni
  NUMBER: singular
  POSSESSIVE: 1s
  SIJAMUOTO: nimento
  STRUCTURE: =pppppp
  WORDBASES: +para(para)
--------------------------------------------------

Analyzing word: 'parantua'
Base form: None
Is verb: True
Should keep: False

Voikko analysis:
  BASEFORM: parantua
  CLASS: teonsana
  FSTOUTPUT: [Lt][Xp]parata[X]paran[Xj]tua[X]tu[Tn1][Eb]a
  MOOD: A-infinitive
  NEGATIVE: both
  STRUCTURE: =pppppppp
  WORDBASES: +paran(parata)+tua(+tua)
--------------------------------------------------

Analyzing word: 'kasv

2024-11-20 14:03:38,629 - src.utils.FileUtils.file_utils - DEBUG [file_utils.py:192] - Initialized FileUtils with log level: INFO
2024-11-20 14:03:38,629 - src.utils.FileUtils.file_utils - DEBUG - Initialized FileUtils with log level: INFO
2024-11-20 14:03:38,633 - src.utils.FileUtils.file_utils - DEBUG [file_utils.py:198] - Project root: c:\Users\tja\OneDrive - Rastor-instituutti ry\Tiedostot\Rastor-instituutti\kehittäminen\analytiikka\repos\semantic-text-analyzer
2024-11-20 14:03:38,633 - src.utils.FileUtils.file_utils - DEBUG - Project root: c:\Users\tja\OneDrive - Rastor-instituutti ry\Tiedostot\Rastor-instituutti\kehittäminen\analytiikka\repos\semantic-text-analyzer
2024-11-20 14:03:38,629 - src.utils.FileUtils.file_utils - DEBUG - Initialized FileUtils with log level: INFO
2024-11-20 14:03:38,633 - src.utils.FileUtils.file_utils - DEBUG [file_utils.py:198] - Project root: c:\Users\tja\OneDrive - Rastor-instituutti ry\Tiedostot\Rastor-instituutti\kehittäminen\analytiikka\repos\sem

In [None]:
from src.core.language_processing.finnish import FinnishTextProcessor,analyze_problematic_words
# Run the word-level analysis helper on a few Finnish word forms, using the
# `processor` created in an earlier cell.
problematic_words = [
    "para",
    "parani",
    "parantua",
    "kasvu",
    "kasvaa",
]
analyze_problematic_words(processor, problematic_words)

In [15]:
# test_case = "technical_fi_1"
test_case = "business_fi_1"

results = await test_keyword_analyzer(finnish_texts[test_case], language="fi")

2024-11-20 14:04:37,736 - src.nb_helpers.analyzers - DEBUG - Starting keyword analysis
2024-11-20 14:04:37,740 - src.utils.FileUtils.file_utils - DEBUG [file_utils.py:192] - Initialized FileUtils with log level: INFO
2024-11-20 14:04:37,740 - src.utils.FileUtils.file_utils - DEBUG - Initialized FileUtils with log level: INFO
2024-11-20 14:04:37,743 - src.utils.FileUtils.file_utils - DEBUG [file_utils.py:198] - Project root: c:\Users\tja\OneDrive - Rastor-instituutti ry\Tiedostot\Rastor-instituutti\kehittäminen\analytiikka\repos\semantic-text-analyzer
2024-11-20 14:04:37,743 - src.utils.FileUtils.file_utils - DEBUG - Project root: c:\Users\tja\OneDrive - Rastor-instituutti ry\Tiedostot\Rastor-instituutti\kehittäminen\analytiikka\repos\semantic-text-analyzer
2024-11-20 14:04:37,750 - src.nb_helpers.analyzers - DEBUG - Initialized TextAnalyzer with options: AnalysisOptions(show_confidence=True, show_evidence=True, show_keywords=True, show_raw_data=True, debug_mode=True, language='fi', par


Keywords Found:
  • markkinaosuus        [████████████████████] (1.00)
  • liikevaihto          [████████████████████] (1.00)
  • pilvipalvelut        [████████████████████] (1.00)
  • asiakas              [███████████████░░░░░] (0.75)
  • kasvu                [███████████████░░░░░] (0.75)
  • vuosineljännes       [████████░░░░░░░░░░░░] (0.43)
  • segmentti            [█████░░░░░░░░░░░░░░░] (0.26)

Debug Information:
--------------------
{
  "keywords": [
    {
      "keyword": "markkinaosuus",
      "score": 1.0,
      "domain": "business",
      "compound_parts": [
        "markkina",
        "osuus"
      ]
    },
    {
      "keyword": "liikevaihto",
      "score": 1.0,
      "domain": "business",
      "compound_parts": [
        "liike",
        "vaihto"
      ]
    },
    {
      "keyword": "pilvipalvelut",
      "score": 1.0,
      "domain": "technical",
      "compound_parts": [
        "pilvi",
        "palvella"
      ]
    },
    {
      "keyword": "asiakas",
      "score"

In [None]:
test_case = "Mixed Domain Content"
results = await test_keyword_analyzer(test_texts[test_case])


In [None]:
# Run tests
for case_name, text in test_texts.items():
    print(f"\nTesting: {case_name}")
    print("=" * 50)
    results = await test_keyword_analyzer(text)

Previous examples:

In [None]:
# NOTE(review): `STOP` is not defined anywhere in the visible notebook, so this
# cell raises NameError when run — presumably a deliberate guard that halts
# "Run All" before the older cells below. Confirm intent.
STOP

In [None]:
# # 2. Test single language analysis
# print("\nSingle Language Analysis:")
# print("-" * 50)

# options_en = AnalysisOptions(
#     show_confidence=True,
#     show_evidence=True,
#     show_keywords=True,
#     show_raw_data=True,
#     debug_mode=True,
#     language="en"  # Explicitly set language
# )

# # Analyze English text
# text = example_texts["English Technical"]
# results_en = await analyze_keywords(text, options_en)



In [None]:
# # Test Finnish keyword analysis
# print("\nTest Finnish keyword analysis:")
# print("-" * 50)

# options_fi = AnalysisOptions(
#     show_confidence=True,
#     show_evidence=True,
#     debug_mode=True,
#     language="fi"  # Explicitly set language
# )

# # Analyze Finnish text with auto-detection
# text = example_texts["Finnish Technical"]
# # results_fi = await analyze_text(text, options_fi)
# keywords_fi = await analyze_keywords(text, options_fi)


# # # 4. Test batch analysis with mixed languages
# # print("\nBatch Analysis with Mixed Languages:")
# # print("-" * 50)

# # batch_results = {}
# # for name, text in example_texts.items():
# #     print(f"\nAnalyzing {name}:")
# #     results = await analyze_text(text, options_auto)
# #     batch_results[name] = results

# # # 5. Test Excel file processing
# # from src.nb_helpers.analyzers import analyze_excel_content

# # # Create a test DataFrame
# # import pandas as pd
# # df = pd.DataFrame({
# #     "content": example_texts.values(),
# #     "type": [name.split()[0] for name in example_texts.keys()],  # "English" or "Finnish"
# # })

# # # Save to temporary Excel file
# # temp_excel = "temp_test_content.xlsx"
# # df.to_excel(temp_excel, index=False)

# # print("\nExcel File Analysis:")
# # print("-" * 50)

# # await analyze_excel_content(
# #     input_file=temp_excel,
# #     output_file="analysis_results",
# #     content_column="content",
# #     parameter_file="parameters_en.xlsx",  # Use our parameter file
# #     language_column="type"  # Use type column for language
# # )

# # # Clean up temporary file
# # os.remove(temp_excel)

# # # 6. Compare analysis results
# # print("\nAnalysis Results Comparison:")
# # print("-" * 50)

# # def print_analysis_summary(results: dict, name: str):
# #     print(f"\n{name}:")
# #     if "keywords" in results:
# #         keywords = results["keywords"].get("keywords", [])
# #         print(f"Keywords found: {len(keywords)}")
# #         for kw in keywords[:3]:  # Show top 3 keywords
# #             print(f"- {kw.keyword}: {kw.score:.2f}")
    
# #     if "themes" in results:
# #         themes = results["themes"].get("themes", [])
# #         print(f"Themes found: {len(themes)}")
# #         for theme in themes[:2]:  # Show top 2 themes
# #             print(f"- {theme.name}: {theme.confidence:.2f}")
            
# #     if "categories" in results:
# #         categories = results["categories"].get("categories", [])
# #         print(f"Categories found: {len(categories)}")
# #         for cat in categories[:2]:  # Show top 2 categories
# #             print(f"- {cat.name}: {cat.confidence:.2f}")

# # for name, results in batch_results.items():
# #     print_analysis_summary(results, name)


In [None]:
# Run environment verification
# from src.nb_helpers.environment import verify_environment
# verify_environment()

In [None]:
# Send one DEBUG record through the keyword analyzer's logger
logger = logging.getLogger(
    "src.analyzers.keyword_analyzer"
)
logger.debug(
    "Testing keyword analyzer logging"
)

NOTE: The following cells do not work with the new parameter handling model.

In [None]:
# Example texts in different languages.
# Rebinds `example_texts` to a two-entry dict: one English and one Finnish
# technical passage.
example_texts = {
    "English Technical": """
        The cloud migration project improved system scalability while reducing costs.
        New DevOps practices streamlined the deployment pipeline significantly.
    """,
    
    "Finnish Technical": """
        Pilvipalveluihin siirtyminen paransi järjestelmän skaalautuvuutta ja vähensi kustannuksia.
        Uudet DevOps-käytännöt tehostivat merkittävästi käyttöönottoprosessia.
    """
}

# Analyze with automatic language detection
for name, text in example_texts.items():
    print(f"\nAnalyzing {name}:")
    results = await analyze_text(text, options)

# Example with specific language and parameters
fi_options = AnalysisOptions(
    show_confidence=True,
    show_evidence=True,
    debug_mode=True,
    language="fi",
    parameter_file="finnish_params.yaml"
)

# Analyze Finnish text with specific parameters
fi_results = await analyze_text(example_texts["Finnish Technical"], fi_options)

# Batch process Excel file with language detection
await analyze_excel_content(
    input_file="multilingual_texts.xlsx",
    output_file="analysis_results",
    content_column="content",
    parameter_file="analysis_params.yaml",
    language_column="language"  # Optional column specifying language
)

<!-- ## Analysis Functions

### Single Analysis with Debug Output
Run detailed analysis for a single text: -->


In [None]:
# Rebinds `example_texts` with three English samples (business, technical, mixed)
# and two Finnish samples under the keys "koulutus" and "tekninen".
example_texts = {
    "Business Analysis": """
        Q3 revenue increased by 15% with strong growth in enterprise sales.
        Customer retention improved while acquisition costs decreased.
        New market expansion initiatives are showing positive early results.
    """,
    
    "Technical Content": """
        The application uses microservices architecture with containerized deployments.
        Data processing pipeline incorporates machine learning models for prediction.
        System monitoring ensures high availability and performance metrics.
    """,
    
    "Mixed Content": """
        The IT department's cloud migration project reduced infrastructure costs by 25%.
        DevOps implementation improved deployment frequency while maintaining quality.
        Monthly recurring revenue from SaaS products grew steadily.
    """,
    "koulutus":
    """
        Verkko-oppimisalusta sisältää interaktiivisia moduuleja ja oman tahdin edistymisen seurannan. 
        Virtuaaliluokat mahdollistavat reaaliaikaisen yhteistyön opiskelijoiden ja ohjaajien välillä. 
        Digitaaliset arviointityökalut antavat välitöntä palautetta oppimistuloksista.
    """,
    "tekninen":
    """
        Koneoppimismalleja koulutetaan suurilla datajoukolla tunnistamaan kaavoja. 
        Neuroverkon arkkitehtuuri sisältää useita kerroksia piirteiden erottamiseen. 
        Datan esikäsittely ja piirteiden suunnittelu ovat keskeisiä vaiheita prosessissa.

    """
}

In [None]:
# # New imports
# from src.core.language_parameters import LanguageParameterManager

# # Initialize parameter manager
# param_manager = LanguageParameterManager()

# # Example analysis with automatic language detection
# text_en = "Cloud computing enables scalable infrastructure deployment."
# text_fi = "Pilvipalvelut mahdollistavat skaalautuvan infrastruktuurin käyttöönoton."

# # Analyze with automatic language detection and default parameters
# async def analyze_text_with_language(text: str, parameter_file: Optional[str] = None):
#     """Analyze text with automatic language handling."""
#     # Get language-specific parameters
#     params = param_manager.get_parameters(text, parameter_file)
    
#     # Create analyzers with parameters
#     keyword_analyzer = KeywordAnalyzer(config=params.dict())
#     theme_analyzer = ThemeAnalyzer(config=params.dict())
#     category_analyzer = CategoryAnalyzer(config=params.dict())
    
#     # Run analysis
#     results = {
#         "keywords": await keyword_analyzer.analyze(text),
#         "themes": await theme_analyzer.analyze(text),
#         "categories": await category_analyzer.analyze(text)
#     }
    
#     return results

# # Example with Excel parameters
# async def analyze_batch_with_excel_params(texts: List[str], excel_params: str):
#     """Analyze texts using parameters from Excel."""
#     # Load language-specific parameters
#     params_by_lang = param_manager.load_excel_parameters(excel_params)
    
#     results = []
#     for text in texts:
#         # Detect language
#         lang = param_manager.detect_language(text)
#         # Get parameters for language
#         params = params_by_lang.get(lang, param_manager.get_parameters(text))
        
#         # Create analyzer with language-specific parameters
#         analyzer = KeywordAnalyzer(config=params.dict())
#         result = await analyzer.analyze(text)
#         results.append(result)
    
#     return results

# # Example usage:
# # With default parameters
# results_en = await analyze_text_with_language(text_en)

# # With parameter file
# results_fi = await analyze_text_with_language(text_fi, "finnish_params.yaml")

# # With Excel parameters
# texts = [text_en, text_fi]
# batch_results = await analyze_batch_with_excel_params(texts, "analysis_params.xlsx")

In [None]:
# Sample passed to the single-analyzer cells that follow. Any other key of
# `example_texts` can be used instead of "Mixed Content".
text = example_texts["Mixed Content"]

# Display options consumed by the analyzer cells that follow; without this
# binding `options` is undefined when those cells run.
# NOTE(review): field choice mirrors the other AnalysisOptions calls in this
# notebook — adjust as needed.
options = AnalysisOptions(
    show_confidence=True,
    show_evidence=True,
    debug_mode=True,
)

In [None]:
# Keyword analysis of the selected sample; relies on `text` and `options`
# having been set by an earlier cell.
await analyze_keywords(text, options=options)


In [None]:
# Theme analysis of the selected sample; relies on `text` and `options`
# having been set by an earlier cell.
await analyze_themes(text, options=options)


In [None]:
# Category analysis of the selected sample; relies on `text` and `options`
# having been set by an earlier cell.
await analyze_categories(text, options=options)


In [None]:
# Or run full pipeline with debug info
# NOTE(review): `debug_full_pipeline` is neither imported nor defined in the
# visible notebook; this call raises NameError unless it is provided elsewhere —
# confirm where it is supposed to come from.
await debug_full_pipeline(text)


### Batch Processing from Excel
Process multiple texts from Excel file:


In [None]:
await analyze_excel_content(
    input_file="test_content.xlsx",  # Input Excel file path
    output_file="analysis_results",  # Output filename (without extension)
    content_column="content"         # Column containing text to analyze
)


## Parameters
- Configure analyzers using parameter files
- Control output detail with `AnalysisOptions` (e.g. `show_confidence`, `show_evidence`, `debug_mode`)
- Set logging level for verbosity control

## Example Outputs
The analysis provides:
- Keywords with confidence scores
- Theme identification and descriptions
- Category classification with evidence
- Confidence visualizations with Unicode bars

## Notes
- Set logging level to WARNING to minimize output
- Use debug functions for detailed analysis inspection
- Excel output combines all analysis types