In [23]:
# Import required modules
import sys
from pathlib import Path
from typing import List, Dict, Any, Tuple, Union
import logging
import asyncio

# Add project root to Python path if needed
project_root = str(Path().resolve().parent)
if project_root not in sys.path:
    sys.path.append(project_root)

# Import necessary components
from src.nb_helpers.environment import setup_notebook_env, verify_environment
from src.semantic_analyzer import SemanticAnalyzer
from src.core.config import AnalyzerConfig

from src.core.language_processing import create_text_processor
from src.core.llm.factory import create_llm
from src.loaders.parameter_handler import ParameterHandler
from src.analyzers.keyword_analyzer import KeywordAnalyzer
from src.analyzers.theme_analyzer import ThemeAnalyzer
from src.analyzers.category_analyzer import CategoryAnalyzer

import FileUtils


In [24]:
# In analyzer_demo_local_nb.ipynb and azure_notebook.ipynb
# from src.core.llm.factory import create_llm

# Setup
# config = AnalyzerConfig()
# llm = create_llm(config=config)
# analyzer = SemanticAnalyzer(llm=llm)

In [25]:
print(FileUtils.__version__)

0.5.3


In [26]:
# Set up environment and logging
setup_notebook_env(log_level="DEBUG")
verify_environment()



2024-12-05 22:34:40,699 - FileUtils.core.file_utils - INFO - Project root: /home/topi/data-science/repos/semantic-text-analyzer


INFO: Project root: /home/topi/data-science/repos/semantic-text-analyzer


2024-12-05 22:34:40,716 - FileUtils.core.file_utils - INFO - FileUtils initialized with local storage


INFO: FileUtils initialized with local storage


2024-12-05 22:34:40,769 - FileUtils.core.file_utils - INFO - Project root: /home/topi/data-science/repos/semantic-text-analyzer


INFO: Project root: /home/topi/data-science/repos/semantic-text-analyzer


2024-12-05 22:34:40,776 - FileUtils.core.file_utils - INFO - FileUtils initialized with local storage


INFO: FileUtils initialized with local storage


Environment Check Results:

Basic Setup:
-----------
✓ Project root in path
✓ FileUtils initialized
✓ .env file loaded

Environment Variables:
---------------------
✓ OPENAI_API_KEY set
✓ ANTHROPIC_API_KEY set

Project Structure:
-----------------
✓ Raw data exists
✓ Processed data exists
✓ Configuration exists
✓ Main config.yaml exists

Environment Status: Ready ✓


True

In [27]:
# Test data to use
test_texts = {
    "en": {
        "technical": """Machine learning models are trained using large datasets to recognize patterns. 
                     The neural network architecture includes multiple layers for feature extraction. 
                     Data preprocessing and feature engineering are crucial steps.""",
        "business": """Q3 financial results show 15% revenue growth and improved profit margins. 
                    Customer acquisition costs decreased while retention rates increased. 
                    Market expansion strategy focuses on emerging technology sectors."""
    },
    "fi": {
        "technical": """Koneoppimismalleja koulutetaan suurilla datajoukolla tunnistamaan kaavoja. 
                     Neuroverkon arkkitehtuuri sisältää useita kerroksia piirteiden erottamiseen. 
                     Datan esikäsittely ja piirteiden suunnittelu ovat keskeisiä vaiheita.""",
        "business": """Q3 taloudelliset tulokset osoittavat 15% liikevaihdon kasvun ja parantuneet katteet. 
                    Asiakashankinnan kustannukset laskivat ja asiakaspysyvyys parani. 
                    Markkinalaajennusstrategia keskittyy nouseviin teknologiasektoreihin."""
    }
}



In [28]:
     # llm = create_llm()
config = AnalyzerConfig()

2024-12-05 22:34:40,953 - FileUtils.core.file_utils - INFO - Project root: /home/topi/data-science/repos/semantic-text-analyzer


INFO: Project root: /home/topi/data-science/repos/semantic-text-analyzer


2024-12-05 22:34:40,956 - FileUtils.core.file_utils - INFO - FileUtils initialized with local storage


INFO: FileUtils initialized with local storage


In [29]:
async def test_individual_analyzer(
    analyzer: Union[KeywordAnalyzer, ThemeAnalyzer, CategoryAnalyzer], 
    text: str, 
    analyzer_type: str
):
    """Test individual analyzer component."""
    print(f"\nTesting {analyzer_type} Analysis")
    print("=" * 50)
    print("\nInput text:")
    print(text[:200] + "..." if len(text) > 200 else text)
    
    try:
        results = await analyzer.analyze(text)
        
        print("\nResults:")
        print("-" * 20)
        
        if isinstance(analyzer, KeywordAnalyzer):
            if results.keywords:
                print("\nKeywords:")
                for kw in results.keywords[:10]:  # Show top 10
                    print(f"• {kw.keyword:<20} ({kw.score:.2f})")
                    if kw.domain:
                        print(f"  Domain: {kw.domain}")
                
                if results.compound_words:
                    print("\nCompound Words:")
                    print(", ".join(results.compound_words))
                    
                if results.domain_keywords:
                    print("\nKeywords by Domain:")
                    for domain, kws in results.domain_keywords.items():
                        print(f"\n{domain}:")
                        print(", ".join(kws))
                        
        elif isinstance(analyzer, ThemeAnalyzer):
            if results.themes:
                print("\nThemes:")
                for theme in results.themes:
                    print(f"\n• {theme.name}")
                    print(f"  Confidence: {theme.confidence:.2f}")
                    print(f"  Description: {theme.description}")
                    if theme.keywords:
                        print(f"  Keywords: {', '.join(theme.keywords)}")
                
                if results.theme_hierarchy:
                    print("\nTheme Hierarchy:")
                    for parent, children in results.theme_hierarchy.items():
                        print(f"{parent} -> {', '.join(children)}")
                        
        elif isinstance(analyzer, CategoryAnalyzer):
            if results.categories:
                print("\nCategories:")
                for cat in results.categories:
                    print(f"\n• {cat.name}")
                    print(f"  Confidence: {cat.confidence:.2f}")
                    if cat.description:
                        print(f"  Description: {cat.description}")
                    if cat.evidence:
                        print("\n  Evidence:")
                        for ev in cat.evidence:
                            print(f"  - {ev.text} (relevance: {ev.relevance:.2f})")
                            
        if hasattr(results, 'error') and results.error:
            print(f"\nErrors occurred: {results.error}")
            
        return results
        
    except Exception as e:
        print(f"\nError in analysis: {e}")
        return None

In [30]:
# Example 1: Test individual keyword analyzer
async def test_keyword_analyzer(provider: str = "openai"):
    """Test keyword analyzer with different languages."""
    print("Testing Keyword Analyzer")
    print("=" * 50)
    
    # Initialize components
    parameter_handler = ParameterHandler("parameters_fi.xlsx")
    # llm = create_llm()
    # config = AnalyzerConfig()
    llm = create_llm(provider=provider, config=config)
    # analyzer = SemanticAnalyzer(llm=llm)
    
    # Test English
    print("\nTesting English Technical Content:")
    en_processor = create_text_processor(language="en")
    keyword_analyzer_en = KeywordAnalyzer(
        llm=llm,
        config=parameter_handler.parameters.general.model_dump(),
        language_processor=en_processor
    )
    await test_individual_analyzer(keyword_analyzer_en, test_texts["en"]["technical"], "Keyword")
    
    # Test Finnish
    print("\nTesting Finnish Technical Content:")
    fi_processor = create_text_processor(language="fi")
    keyword_analyzer_fi = KeywordAnalyzer(
        llm=llm,
        config=parameter_handler.parameters.general.model_dump(),
        language_processor=fi_processor
    )
    await test_individual_analyzer(keyword_analyzer_fi, test_texts["fi"]["technical"], "Keyword")



In [31]:
# Example 2: Test all components
async def test_components_for_language(language: str, provider: str = "openai"):
    """Test all components for a specific language."""
    print(f"\nTesting All Components for {language.upper()}")
    print("=" * 50)
    
    # Initialize components
    parameter_handler = ParameterHandler(f"parameters_{language}.xlsx")
     # llm = create_llm()
    # config = AnalyzerConfig()
    llm = create_llm(provider=provider, config=config)

    language_processor = create_text_processor(language=language)
    
    # Create analyzers
    keyword_analyzer = KeywordAnalyzer(
        llm=llm,
        config=parameter_handler.parameters.general.model_dump(),
        language_processor=language_processor
    )
    
    theme_analyzer = ThemeAnalyzer(
        llm=llm,
        config=parameter_handler.parameters.general.model_dump(),
        language_processor=language_processor
    )
    
    category_analyzer = CategoryAnalyzer(
        categories=parameter_handler.parameters.categories,
        llm=llm,
        config=parameter_handler.parameters.general.model_dump(),
        language_processor=language_processor
    )
    
    # Test technical content
    print(f"\nTesting {language.upper()} Technical Content:")
    await test_individual_analyzer(keyword_analyzer, test_texts[language]["technical"], "Keyword")
    await test_individual_analyzer(theme_analyzer, test_texts[language]["technical"], "Theme")
    await test_individual_analyzer(category_analyzer, test_texts[language]["technical"], "Category")
    
    # Test business content
    print(f"\nTesting {language.upper()} Business Content:")
    await test_individual_analyzer(keyword_analyzer, test_texts[language]["business"], "Keyword")
    await test_individual_analyzer(theme_analyzer, test_texts[language]["business"], "Theme")
    await test_individual_analyzer(category_analyzer, test_texts[language]["business"], "Category")



In [32]:
# Example 3: Quick test of full pipeline
async def test_pipeline(provider='openai'):
    """Test full pipeline with both languages."""
    print("Testing Full Pipeline")
    print("=" * 50)
    
    llm = create_llm(provider=provider, config=config)
    # analyzer = SemanticAnalyzer(llm=llm)
    # Test English pipeline
    print("\nEnglish Pipeline:")
    en_analyzer = SemanticAnalyzer(llm=llm, parameter_file="parameters_en.xlsx")
    result = await en_analyzer.analyze(test_texts["en"]["technical"])
    print(f"Success: {result.success}")
    print(f"Keywords found: {len(result.keywords.keywords)}")
    print(f"Themes found: {len(result.themes.themes)}")
    print(f"Categories found: {len(result.categories.matches)}")
    
    # Test Finnish pipeline
    print("\nFinnish Pipeline:")
    fi_analyzer = SemanticAnalyzer(llm=llm, parameter_file="parameters_fi.xlsx")
    result = await fi_analyzer.analyze(test_texts["fi"]["technical"])
    print(f"Success: {result.success}")
    print(f"Keywords found: {len(result.keywords.keywords)}")
    print(f"Themes found: {len(result.themes.themes)}")
    print(f"Categories found: {len(result.categories.matches)}")

# Run the tests
async def run_all_tests():
    """Run all tests."""
    # Test individual component
    await test_keyword_analyzer()
    
    # Test all components by language
    await test_components_for_language("en")
    await test_components_for_language("fi")
    
    # Test full pipeline
    await test_pipeline()



In [33]:
# Run in notebook
await run_all_tests()

# Or run individual tests:
# await test_keyword_analyzer()
# await test_components_for_language("fi")
# await test_pipeline()

Testing Keyword Analyzer
2024-12-05 22:34:41,133 - FileUtils.core.file_utils - INFO - Project root: /home/topi/data-science/repos/semantic-text-analyzer


INFO: Project root: /home/topi/data-science/repos/semantic-text-analyzer


2024-12-05 22:34:41,136 - FileUtils.core.file_utils - INFO - FileUtils initialized with local storage


INFO: FileUtils initialized with local storage



Testing English Technical Content:
2024-12-05 22:34:41,436 - FileUtils.core.file_utils - INFO - Project root: /home/topi/data-science/repos/semantic-text-analyzer


INFO: Project root: /home/topi/data-science/repos/semantic-text-analyzer


2024-12-05 22:34:41,439 - FileUtils.core.file_utils - INFO - FileUtils initialized with local storage


INFO: FileUtils initialized with local storage



Testing Keyword Analysis

Input text:
Machine learning models are trained using large datasets to recognize patterns. 
                     The neural network architecture includes multiple layers for feature extraction. 
                ...


INFO: HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"



Results:
--------------------

Keywords:
• machine learning     (0.95)
  Domain: technical
• neural network       (0.95)
  Domain: technical
• feature extraction   (0.90)
  Domain: technical
• data preprocessing   (0.90)
  Domain: technical
• feature engineering  (0.90)
  Domain: technical

Compound Words:
machine learning, neural network, feature extraction, data preprocessing, feature engineering

Testing Finnish Technical Content:
2024-12-05 22:34:45,287 - FileUtils.core.file_utils - INFO - Project root: /home/topi/data-science/repos/semantic-text-analyzer


INFO: Project root: /home/topi/data-science/repos/semantic-text-analyzer


2024-12-05 22:34:45,292 - FileUtils.core.file_utils - INFO - FileUtils initialized with local storage


INFO: FileUtils initialized with local storage
INFO: Loaded 747 stopwords from /home/topi/data-science/repos/semantic-text-analyzer/data/configurations/stop_words/fi.txt
INFO: Successfully initialized Voikko with path: /usr/lib/voikko



Testing Keyword Analysis

Input text:
Koneoppimismalleja koulutetaan suurilla datajoukolla tunnistamaan kaavoja. 
                     Neuroverkon arkkitehtuuri sisältää useita kerroksia piirteiden erottamiseen. 
                     Data...


INFO: HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"



Results:
--------------------

Keywords:
• koneoppimismalli     (0.95)
  Domain: technical
• datajoukko           (0.95)
  Domain: technical
• neuroverkon arkkitehtuuri (0.95)
  Domain: technical
• datan esikäsittely   (0.95)
  Domain: technical
• piirteiden suunnittelu (0.95)
  Domain: technical

Compound Words:
koneoppimismalli, datajoukko, neuroverkon arkkitehtuuri, datan esikäsittely, piirteiden suunnittelu

Testing All Components for EN
2024-12-05 22:34:48,930 - FileUtils.core.file_utils - INFO - Project root: /home/topi/data-science/repos/semantic-text-analyzer


INFO: Project root: /home/topi/data-science/repos/semantic-text-analyzer


2024-12-05 22:34:48,933 - FileUtils.core.file_utils - INFO - FileUtils initialized with local storage


INFO: FileUtils initialized with local storage


2024-12-05 22:34:49,023 - FileUtils.core.file_utils - INFO - Project root: /home/topi/data-science/repos/semantic-text-analyzer


INFO: Project root: /home/topi/data-science/repos/semantic-text-analyzer


2024-12-05 22:34:49,026 - FileUtils.core.file_utils - INFO - FileUtils initialized with local storage


INFO: FileUtils initialized with local storage



Testing EN Technical Content:

Testing Keyword Analysis

Input text:
Machine learning models are trained using large datasets to recognize patterns. 
                     The neural network architecture includes multiple layers for feature extraction. 
                ...


INFO: HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"



Results:
--------------------

Keywords:
• machine learning     (0.95)
  Domain: technical
• neural network       (0.95)
  Domain: technical
• feature extraction   (0.90)
  Domain: technical
• data preprocessing   (0.90)
  Domain: technical
• feature engineering  (0.90)
  Domain: technical

Compound Words:
machine learning, neural network, feature extraction, data preprocessing, feature engineering

Testing Theme Analysis

Input text:
Machine learning models are trained using large datasets to recognize patterns. 
                     The neural network architecture includes multiple layers for feature extraction. 
                ...


INFO: HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"



Raw LLM response: {'themes': [{'name': 'Machine Learning Fundamentals', 'description': 'The foundational concepts and processes involved in machine learning, including model training and pattern recognition.', 'confidence': 0.95, 'keywords': ['machine learning', 'train', 'recognize', 'model'], 'domain': 'general content analysis'}, {'name': 'Neural Network Architecture', 'description': 'The structure and design of neural networks, emphasizing the importance of layers and feature extraction.', 'confidence': 0.9, 'keywords': ['neural', 'network', 'architecture', 'layers', 'feature extraction'], 'domain': 'general content analysis', 'parent_theme': 'Machine Learning Fundamentals'}, {'name': 'Data Preparation Techniques', 'description': 'The essential processes of data preprocessing and feature engineering that enhance the quality of input data for machine learning models.', 'confidence': 0.85, 'keywords': ['data preprocessing', 'feature engineering', 'datasets'], 'domain': 'general conte

INFO: HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"



Processing response: {'categories': [{'category': 'Machine Learning', 'confidence': 0.95, 'explanation': 'The text discusses key concepts and processes involved in machine learning, such as model training, neural networks, and feature extraction.', 'evidence': [{'text': 'Machine learning models are trained using large datasets to recognize patterns.', 'relevance': 0.9, 'matched_keywords': ['machine', 'learning', 'models', 'trained', 'datasets', 'recognize', 'patterns'], 'context': 'The sentence introduces the concept of machine learning and its reliance on data.'}, {'text': 'The neural network architecture includes multiple layers for feature extraction.', 'relevance': 0.9, 'matched_keywords': ['neural', 'network', 'architecture', 'include', 'multiple', 'layers', 'feature', 'extraction'], 'context': 'This sentence elaborates on the structure of neural networks, a fundamental aspect of machine learning.'}, {'text': 'Data preprocessing and feature engineering are crucial steps.', 'relev

INFO: HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"



Results:
--------------------

Keywords:
• revenue growth       (0.95)
  Domain: business
• profit margins       (0.90)
  Domain: business
• customer acquisition costs (0.92)
  Domain: business
• retention rates      (0.88)
  Domain: business
• market expansion strategy (0.93)
  Domain: business
• emerging technology sectors (0.91)
  Domain: technical

Compound Words:
revenue growth, profit margins, customer acquisition costs, retention rates, market expansion strategy, emerging technology sectors

Testing Theme Analysis

Input text:
Q3 financial results show 15% revenue growth and improved profit margins. 
                    Customer acquisition costs decreased while retention rates increased. 
                    Market expansi...


INFO: HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"



Raw LLM response: {'themes': [{'name': 'Financial Performance', 'description': 'The analysis of financial results indicating revenue growth and profit margin improvements.', 'confidence': 0.95, 'keywords': ['revenue', 'profit', 'growth', 'margin'], 'domain': 'general content analysis'}, {'name': 'Customer Dynamics', 'description': 'Trends in customer acquisition costs and retention rates, highlighting the effectiveness of customer strategies.', 'confidence': 0.9, 'keywords': ['customer', 'acquisition', 'retention', 'cost'], 'domain': 'general content analysis', 'parent_theme': 'Financial Performance'}, {'name': 'Market Strategy', 'description': 'The focus on market expansion into emerging technology sectors as a strategic initiative.', 'confidence': 0.85, 'keywords': ['market', 'expansion', 'strategy', 'technology'], 'domain': 'general content analysis'}], 'evidence': {'Financial Performance': [{'text': 'Q3 financial results show 15% revenue growth and improved profit margins.', 'rele

INFO: HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"



Processing response: {'categories': [{'category': 'Financial Performance', 'confidence': 0.95, 'explanation': 'The text discusses financial results, specifically mentioning revenue growth and profit margins, which are key indicators of financial performance.', 'evidence': [{'text': 'Q3 financial results show 15% revenue growth and improved profit margins.', 'relevance': 0.9, 'matched_keywords': ['financial', 'revenue', 'growth', 'profit', 'margin', 'Q3', 'result'], 'context': 'The text explicitly states financial results and metrics.'}], 'themes': ['revenue growth', 'profit margins', 'financial analysis']}, {'category': 'Customer Strategy', 'confidence': 0.85, 'explanation': 'The text highlights customer acquisition costs and retention rates, indicating a focus on customer strategy.', 'evidence': [{'text': 'Customer acquisition costs decreased while retention rates increased.', 'relevance': 0.9, 'matched_keywords': ['customer', 'acquisition', 'cost', 'decrease', 'retention', 'increase

INFO: Project root: /home/topi/data-science/repos/semantic-text-analyzer


2024-12-05 22:35:30,214 - FileUtils.core.file_utils - INFO - FileUtils initialized with local storage


INFO: FileUtils initialized with local storage


2024-12-05 22:35:30,368 - FileUtils.core.file_utils - INFO - Project root: /home/topi/data-science/repos/semantic-text-analyzer


INFO: Project root: /home/topi/data-science/repos/semantic-text-analyzer


2024-12-05 22:35:30,371 - FileUtils.core.file_utils - INFO - FileUtils initialized with local storage


INFO: FileUtils initialized with local storage
INFO: Loaded 747 stopwords from /home/topi/data-science/repos/semantic-text-analyzer/data/configurations/stop_words/fi.txt
INFO: Successfully initialized Voikko with path: /usr/lib/voikko



Testing FI Technical Content:

Testing Keyword Analysis

Input text:
Koneoppimismalleja koulutetaan suurilla datajoukolla tunnistamaan kaavoja. 
                     Neuroverkon arkkitehtuuri sisältää useita kerroksia piirteiden erottamiseen. 
                     Data...


INFO: HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"



Results:
--------------------

Keywords:
• koneoppimismalli     (0.95)
  Domain: technical
• datajoukko           (0.95)
  Domain: technical
• neuroverkon arkkitehtuuri (0.95)
  Domain: technical
• datan esikäsittely   (0.95)
  Domain: technical
• piirteiden suunnittelu (0.95)
  Domain: technical

Compound Words:
koneoppimismalli, datajoukko, neuroverkon arkkitehtuuri, datan esikäsittely, piirteiden suunnittelu

Testing Theme Analysis

Input text:
Koneoppimismalleja koulutetaan suurilla datajoukolla tunnistamaan kaavoja. 
                     Neuroverkon arkkitehtuuri sisältää useita kerroksia piirteiden erottamiseen. 
                     Data...


INFO: HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"



Raw LLM response: {'themes': [{'name': 'Koneoppimismallien koulutus', 'description': 'Koneoppimismallien koulutus suurilla datajoukoilla on keskeinen prosessi, jossa malleja opetetaan tunnistamaan kaavoja datasta.', 'confidence': 0.95, 'keywords': ['koneoppimismalli', 'datajoukko', 'kaava'], 'domain': 'general content analysis'}, {'name': 'Neuroverkon arkkitehtuuri', 'description': 'Neuroverkon arkkitehtuuri koostuu useista kerroksista, jotka mahdollistavat piirteiden erottamisen ja analysoinnin.', 'confidence': 0.9, 'keywords': ['neuroverkko', 'arkkitehtuuri', 'kerros', 'piirre'], 'domain': 'general content analysis', 'parent_theme': 'Koneoppimismallien koulutus'}, {'name': 'Datan esikäsittely ja piirteiden suunnittelu', 'description': 'Datan esikäsittely ja piirteiden suunnittelu ovat keskeisiä vaiheita koneoppimismallien kehittämisessä, jotka vaikuttavat mallin suorituskykyyn.', 'confidence': 0.85, 'keywords': ['esikäsittely', 'suunnittelu', 'keskeinen'], 'domain': 'general content

INFO: HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"



Processing response: {'categories': [{'category': 'Machine Learning', 'confidence': 0.95, 'explanation': 'The text discusses concepts related to machine learning, including training models on large datasets, neural network architecture, and the importance of data preprocessing and feature design.', 'evidence': [{'text': 'Koneoppimismalleja koulutetaan suurilla datajoukolla tunnistamaan kaavoja.', 'relevance': 0.9, 'matched_keywords': ['koneoppimismalli', 'datajoukko', 'kaava'], 'context': 'The sentence explicitly mentions training machine learning models on large datasets to recognize patterns.'}, {'text': 'Neuroverkon arkkitehtuuri sisältää useita kerroksia piirteiden erottamiseen.', 'relevance': 0.9, 'matched_keywords': ['neuroverkon', 'arkkitehtuuri', 'kerros', 'piirre'], 'context': 'This part of the text refers to the architecture of neural networks, which is a fundamental concept in machine learning.'}, {'text': 'Datan esikäsittely ja piirteiden suunnittelu ovat keskeisiä vaiheit

INFO: HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"



Results:
--------------------

Keywords:
• taloudellinen        (0.95)
  Domain: business
• tulos                (0.95)
  Domain: business
• liikevaihto          (0.95)
  Domain: business
• kasvu                (0.95)
  Domain: business
• parantunut           (0.95)
  Domain: business
• asiakashankinnan kustannukset (0.90)
  Domain: business
• asiakaspysyvyys      (0.90)
  Domain: business
• markkinalaajennusstrategia (0.90)
  Domain: business
• nousevat teknologiasektorit (0.90)
  Domain: technical

Compound Words:
asiakashankinnan kustannukset, markkinalaajennusstrategia, nousevat teknologiasektorit

Testing Theme Analysis

Input text:
Q3 taloudelliset tulokset osoittavat 15% liikevaihdon kasvun ja parantuneet katteet. 
                    Asiakashankinnan kustannukset laskivat ja asiakaspysyvyys parani. 
                    Markkin...


INFO: HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"



Raw LLM response: {'themes': [{'name': 'Taloudellinen kasvu', 'description': 'Tämä teema käsittelee yrityksen taloudellista kehitystä, erityisesti liikevaihdon kasvua ja parantuneita katteita.', 'confidence': 0.95, 'keywords': ['liikevaihto', 'kasvu', 'taloudellinen', 'kate'], 'domain': 'general content analysis'}, {'name': 'Asiakaspysyvyys ja asiakashankinta', 'description': 'Teema keskittyy asiakassuhteiden hallintaan, mukaan lukien asiakashankinnan kustannusten lasku ja asiakaspysyvyyden parantuminen.', 'confidence': 0.9, 'keywords': ['asiakaspysyvyys', 'asiakashankinta', 'kustannus'], 'domain': 'general content analysis', 'parent_theme': 'Taloudellinen kasvu'}, {'name': 'Markkinalaajennusstrategia', 'description': 'Tämä teema käsittelee yrityksen strategiaa laajentaa markkinoitaan erityisesti nouseviin teknologiasektoreihin.', 'confidence': 0.85, 'keywords': ['markkinalaajennusstrategia', 'teknologia', 'nouseva', 'sektori'], 'domain': 'general content analysis'}], 'evidence': {'Ta

INFO: HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"



Processing response: {'categories': [{'category': 'Financial Performance', 'confidence': 0.9, 'explanation': 'The text discusses financial results, specifically mentioning revenue growth and improved margins, which are key indicators of financial performance.', 'evidence': [{'text': 'Q3 taloudelliset tulokset osoittavat 15% liikevaihdon kasvun ja parantuneet katteet.', 'relevance': 0.95, 'matched_keywords': ['taloudelliset', 'liikevaihto', 'kasvu', 'kate'], 'context': 'The text provides specific financial metrics indicating performance.'}], 'themes': ['revenue growth', 'profit margins']}, {'category': 'Customer Acquisition', 'confidence': 0.85, 'explanation': 'The text mentions a decrease in customer acquisition costs and an improvement in customer retention, which are critical aspects of customer acquisition strategies.', 'evidence': [{'text': 'Asiakashankinnan kustannukset laskivat ja asiakaspysyvyys parani.', 'relevance': 0.9, 'matched_keywords': ['asiakashankinta', 'kustannukset',

INFO: Project root: /home/topi/data-science/repos/semantic-text-analyzer


2024-12-05 22:36:41,658 - FileUtils.core.file_utils - INFO - FileUtils initialized with local storage


INFO: FileUtils initialized with local storage


2024-12-05 22:36:41,692 - FileUtils.core.file_utils - INFO - Project root: /home/topi/data-science/repos/semantic-text-analyzer


INFO: Project root: /home/topi/data-science/repos/semantic-text-analyzer


2024-12-05 22:36:41,696 - FileUtils.core.file_utils - INFO - FileUtils initialized with local storage


INFO: FileUtils initialized with local storage


2024-12-05 22:36:41,754 - FileUtils.core.file_utils - INFO - Project root: /home/topi/data-science/repos/semantic-text-analyzer


INFO: Project root: /home/topi/data-science/repos/semantic-text-analyzer


2024-12-05 22:36:41,759 - FileUtils.core.file_utils - INFO - FileUtils initialized with local storage


INFO: FileUtils initialized with local storage
INFO: Semantic analyzer initialization complete
INFO: HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO: HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"



Raw LLM response: {'themes': [{'name': 'Machine Learning Fundamentals', 'description': 'The foundational concepts and processes involved in machine learning, including model training and pattern recognition.', 'confidence': 0.95, 'keywords': ['machine learning', 'train', 'recognize', 'model'], 'domain': 'general content analysis'}, {'name': 'Neural Network Architecture', 'description': 'The structure and design of neural networks, emphasizing the importance of layers and feature extraction.', 'confidence': 0.9, 'keywords': ['neural', 'network', 'architecture', 'layers', 'feature extraction'], 'domain': 'general content analysis', 'parent_theme': 'Machine Learning Fundamentals'}, {'name': 'Data Preprocessing and Feature Engineering', 'description': 'The essential steps of preparing data and creating features to improve model performance.', 'confidence': 0.85, 'keywords': ['data preprocessing', 'feature engineering', 'crucial', 'datasets'], 'domain': 'general content analysis', 'parent_

INFO: HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"



Processing response: {'categories': [{'category': 'Machine Learning', 'confidence': 0.95, 'explanation': 'The text discusses key concepts and processes involved in machine learning, such as model training, neural networks, and feature extraction.', 'evidence': [{'text': 'Machine learning models are trained using large datasets to recognize patterns.', 'relevance': 0.9, 'matched_keywords': ['machine', 'learning', 'models', 'trained', 'datasets', 'recognize', 'patterns'], 'context': 'The sentence introduces the concept of machine learning and its reliance on datasets.'}, {'text': 'The neural network architecture includes multiple layers for feature extraction.', 'relevance': 0.9, 'matched_keywords': ['neural', 'network', 'architecture', 'include', 'multiple', 'layers', 'feature', 'extraction'], 'context': 'This sentence elaborates on the structure of neural networks, a fundamental aspect of machine learning.'}, {'text': 'Data preprocessing and feature engineering are crucial steps.', 'r

INFO: Project root: /home/topi/data-science/repos/semantic-text-analyzer


2024-12-05 22:36:48,552 - FileUtils.core.file_utils - INFO - FileUtils initialized with local storage


INFO: FileUtils initialized with local storage


2024-12-05 22:36:48,589 - FileUtils.core.file_utils - INFO - Project root: /home/topi/data-science/repos/semantic-text-analyzer


INFO: Project root: /home/topi/data-science/repos/semantic-text-analyzer


2024-12-05 22:36:48,593 - FileUtils.core.file_utils - INFO - FileUtils initialized with local storage


INFO: FileUtils initialized with local storage


2024-12-05 22:36:48,626 - FileUtils.core.file_utils - INFO - Project root: /home/topi/data-science/repos/semantic-text-analyzer


INFO: Project root: /home/topi/data-science/repos/semantic-text-analyzer


2024-12-05 22:36:48,631 - FileUtils.core.file_utils - INFO - FileUtils initialized with local storage


INFO: FileUtils initialized with local storage
INFO: Loaded 747 stopwords from /home/topi/data-science/repos/semantic-text-analyzer/data/configurations/stop_words/fi.txt
INFO: Successfully initialized Voikko with path: /usr/lib/voikko
INFO: Semantic analyzer initialization complete
INFO: HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"



Processing response: {'categories': [{'category': 'Data Science', 'confidence': 0.9, 'explanation': 'The text discusses concepts related to data processing, machine learning models, and neural network architecture, which are all key components of data science.', 'evidence': [{'text': 'Koneoppimismalleja koulutetaan suurilla datajoukolla tunnistamaan kaavoja.', 'relevance': 0.9, 'matched_keywords': ['koneoppimismalleja', 'data'], 'context': 'The sentence describes training machine learning models on large datasets to identify patterns.'}, {'text': 'Datan esikäsittely ja piirteiden suunnittelu ovat keskeisiä vaiheita.', 'relevance': 0.85, 'matched_keywords': ['esikäsittely', 'keskeinen', 'data'], 'context': 'This part emphasizes the importance of data preprocessing and feature design in the data science workflow.'}], 'themes': ['machine learning', 'data preprocessing']}], 'relationships': {'Data Science': ['Machine Learning', 'Data Analysis']}}

Processing category: {'category': 'Data S

INFO: HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO: HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"



Raw LLM response: {'themes': [{'name': 'Koneoppimismallit', 'description': 'Koneoppimismallit koulutetaan suurilla datajoukolla kaavojen tunnistamiseksi, mikä on keskeinen osa modernia data-analytiikkaa.', 'confidence': 0.95, 'keywords': ['koneoppiminen', 'datajoukot', 'kaavojen tunnistaminen'], 'domain': 'general content analysis'}, {'name': 'Neuroverkon arkkitehtuuri', 'description': 'Neuroverkon arkkitehtuuri koostuu useista kerroksista, jotka mahdollistavat piirteiden erottamisen tehokkaasti.', 'confidence': 0.9, 'keywords': ['neuroverkot', 'arkkitehtuuri', 'kerrokset'], 'domain': 'general content analysis', 'parent_theme': 'Koneoppimismallit'}, {'name': 'Datan esikäsittely', 'description': 'Datan esikäsittely ja piirteiden suunnittelu ovat keskeisiä vaiheita koneoppimismallien kehittämisessä.', 'confidence': 0.85, 'keywords': ['datan esikäsittely', 'piirteiden suunnittelu', 'kehittäminen'], 'domain': 'general content analysis', 'parent_theme': 'Koneoppimismallit'}], 'evidence': {