# Semantic Text Analyzer Demo Notebook


## Setup and Environment

In [1]:

import os
import sys
from pathlib import Path
import asyncio
import logging
from typing import Dict, Any, Optional
import pandas as pd

# Treat the parent of the current working directory as the project root and
# register it on sys.path (once) so that the `src.*` imports that follow can be
# resolved. A message is printed only when the entry was actually added.
project_root = str(Path().resolve().parent)
if sys.path.count(project_root) == 0:
    sys.path.append(project_root)
    print(f"Added {project_root} to Python path")

# Core components
from src.semantic_analyzer.analyzer import SemanticAnalyzer
from src.utils.FileUtils.file_utils import FileUtils
from src.loaders.parameter_adapter import ParameterAdapter
from src.core.config import AnalyzerConfig
from src.loaders.models import ParameterSet

# Shared FileUtils instance; later cells use it to resolve data paths.
file_utils = FileUtils()
# logging.basicConfig does nothing when the root logger already has handlers.
# NOTE(review): FileUtils() is constructed first; if it installs its own logging
# handlers, this DEBUG request is silently ignored — confirm the intended order.
logging.basicConfig(level=logging.DEBUG)
# Notebook-level logger, named after the running module.
logger = logging.getLogger(__name__)



Added C:\Users\tja\OneDrive - Rastor-instituutti ry\Tiedostot\Rastor-instituutti\kehittäminen\analytiikka\repos\semantic-text-analyzer to Python path


## Environment Verification

In [2]:
def verify_environment():
    """Check that the notebook environment is ready and print a report.

    Three groups of checks are run and printed: basic setup (project root on
    ``sys.path``, ``src`` already imported, ``file_utils`` exposing
    ``project_root``, result of loading ``.env``), the required API-key
    environment variables, and the expected project paths. When any check
    fails, setup instructions for the failing items are printed as well.

    Returns:
        bool: True when every check passed, otherwise False.
    """
    # python-dotenv is only needed by this function, so it is imported here.
    from dotenv import load_dotenv
    env_path = Path(project_root) / ".env"
    env_loaded = load_dotenv(env_path)

    # Required environment variables
    required_env_vars = [
        'OPENAI_API_KEY',
        'ANTHROPIC_API_KEY',
    ]

    # Basic checks
    basic_checks = {
        "Project root in path": project_root in sys.path,
        "Can import src": "src" in sys.modules,
        "FileUtils initialized": hasattr(file_utils, "project_root"),
        ".env file loaded": env_loaded,
    }

    # A variable that is present but empty is as unusable as a missing one, so
    # truthiness decides the check. This keeps the report consistent with the
    # setup instructions below, which also test truthiness.
    env_var_checks = {
        f"{var} set": bool(os.getenv(var))
        for var in required_env_vars
    }

    # Check for required paths using FileUtils
    expected_paths = {
        "Raw data": file_utils.get_data_path("raw"),
        "Processed data": file_utils.get_data_path("processed"),
        "Configuration": file_utils.get_data_path("configurations"),
        "Main config.yaml": Path(project_root) / "config.yaml"
    }

    path_checks = {
        f"{name} exists": path.exists()
        for name, path in expected_paths.items()
    }

    # Combine all checks
    all_checks = {
        **basic_checks,
        **env_var_checks,
        **path_checks
    }

    # Print results
    print("Environment Check Results:")
    print("=" * 50)

    def print_section(title, checks):
        """Print one titled group of check results with a pass/fail mark."""
        print(f"\n{title}:")
        print("-" * len(title))
        for check, result in checks.items():
            status = "✓" if result else "✗"
            print(f"{status} {check}")

    print_section("Basic Setup", basic_checks)
    print_section("Environment Variables", env_var_checks)
    print_section("Project Structure", path_checks)

    # Overall status
    all_passed = all(all_checks.values())
    print("\n" + "=" * 50)
    print("Environment Status:", "Ready ✓" if all_passed else "Setup needed ✗")

    if not all_passed:
        print("\nSetup Instructions:")
        if not env_loaded:
            print("- Create a .env file in the project root with required API keys")
        for var in required_env_vars:
            if not os.getenv(var):
                print(f"- Add {var} to your .env file")
        for name, path in expected_paths.items():
            # Reuse the result computed above instead of hitting the disk again.
            if not path_checks[f"{name} exists"]:
                # Paths with a suffix (config.yaml) are files, the rest are folders.
                kind = "file" if path.suffix else "directory"
                print(f"- Create {name} {kind} at {path}")

    return all_passed

# Run the checks now; as the last expression of the cell, the boolean result
# is also shown as the cell output.
verify_environment()



Environment Check Results:

Basic Setup:
-----------
✓ Project root in path
✓ Can import src
✓ FileUtils initialized
✓ .env file loaded

Environment Variables:
---------------------
✓ OPENAI_API_KEY set
✓ ANTHROPIC_API_KEY set

Project Structure:
-----------------
✓ Raw data exists
✓ Processed data exists
✓ Configuration exists
✓ Main config.yaml exists

Environment Status: Ready ✓


True

## Helper Functions and Test Data Setup

In [3]:
class NotebookAnalyzer:
    """Helper class for running analyses in the notebook.

    Holds the built-in sample texts, the path of the Excel parameter file and a
    ``ParameterAdapter`` for that file, and offers ``analyze_text`` to run a
    ``SemanticAnalyzer`` over one sample and print the outcome.
    """

    def __init__(self):
        """Prepare the sample texts, the parameter file and its adapter.

        Side effects: the sample texts are written to disk on every
        construction, and ``example_params.xlsx`` is created in the
        configurations data path when it does not exist yet.
        """
        self.file_utils = FileUtils()
        self.test_texts = self._load_test_texts()

        # Use parameter file from configurations directory
        self.parameter_file = self.file_utils.get_data_path("configurations") / "example_params.xlsx"

        # Create the parameter file if it doesn't exist
        if not self.parameter_file.exists():
            self._create_parameter_file()

        self.param_adapter = ParameterAdapter(self.parameter_file)

        logger.info(f"Initialized NotebookAnalyzer with {len(self.test_texts)} test texts")

    def _load_test_texts(self) -> Dict[str, str]:
        """Return the built-in sample texts and save a stripped copy to disk.

        Returns:
            Mapping of sample name to the raw (unstripped) text.
        """
        texts = {
            "technical": """
            Python is a high-level programming language known for its simplicity.
            It supports multiple programming paradigms including procedural and
            object-oriented programming.
            """,
            "business": """
            The company's Q3 results exceeded expectations with revenue growth of 15%.
            Customer acquisition costs decreased while retention rates improved.
            The board has approved a new strategic initiative focusing on expansion.
            """,
            "finnish": """
            Ohjelmistokehittäjä työskentelee asiakasprojektissa kehittäen
            verkkokauppajärjestelmää. Tekninen toteutus vaatii erityistä huomiota
            tietoturvan osalta.
            """
        }

        # Save texts using FileUtils (the saved copy is whitespace-stripped,
        # the returned mapping is not).
        df = pd.DataFrame([
            {"name": name, "content": content.strip()}
            for name, content in texts.items()
        ])

        self.file_utils.save_data_to_disk(
            data={"texts": df},
            output_type="raw",
            file_name="test_texts",
            output_filetype="xlsx",
            include_timestamp=False
        )

        return texts

    def _create_parameter_file(self):
        """Create the initial parameter Excel file with three sheets.

        Sheets: 'General Parameters', 'Excluded Keywords' and 'Categories'.
        """
        parameters_data = {
            'General Parameters': pd.DataFrame({
                'parameter': [
                    'max_kws',
                    'max_themes',
                    'focus_on',
                    'language',
                    'additional_context',
                    'min_confidence',
                    'include_compounds',
                    'column_name_to_analyze'
                ],
                'value': [
                    8,
                    3,
                    'business and financial content',
                    'en',
                    'Business performance analysis',
                    0.3,
                    True,
                    'text'
                ]
            }),
            'Excluded Keywords': pd.DataFrame({
                'keyword': [
                    'the', 'with', 'while', 'new',
                    'company', 'business', 'results',
                    'approximately', 'significantly',
                    'current', 'various', 'multiple'
                ],
                'reason': [
                    'Common word', 'Common word', 'Common word', 'Common word',
                    'Too generic', 'Too generic', 'Too generic',
                    'Modifier', 'Modifier',
                    'Vague term', 'Vague term', 'Vague term'
                ]
            }),
            'Categories': pd.DataFrame({
                'category': [
                    'business_performance',
                    'financial_metrics',
                    'strategy',
                    'operations'
                ],
                'description': [
                    'Business performance indicators and results',
                    'Financial and revenue related metrics',
                    'Strategic initiatives and planning',
                    'Operational metrics and processes'
                ],
                'keywords': [
                    'revenue,growth,performance,results,expectations',
                    'costs,revenue,profit,margin,acquisition',
                    'initiative,strategic,expansion,planning,board',
                    'operations,efficiency,retention,improvement,process'
                ],
                'threshold': [0.6, 0.6, 0.6, 0.6]
            })
        }

        self.file_utils.save_data_to_disk(
            data=parameters_data,
            output_type="configurations",
            file_name="example_params",
            output_filetype="xlsx",
            include_timestamp=False
        )

    def _read_excluded_keywords(self) -> set:
        """Return the keywords listed on the 'Excluded Keywords' sheet.

        Best effort: when the sheet cannot be read, a warning is logged and an
        empty set is returned so the analysis can still run.
        """
        try:
            excluded_df = pd.read_excel(self.parameter_file, sheet_name='Excluded Keywords')
            return set(excluded_df['keyword'].dropna())
        except Exception as e:
            logger.warning(f"Could not load excluded keywords: {e}")
            return set()

    async def analyze_text(self, text_key: str, **kwargs):
        """Analyze one of the built-in sample texts and print the results.

        Args:
            text_key: Name of the sample in ``self.test_texts``.
            **kwargs: Options forwarded to both the ``SemanticAnalyzer``
                constructor and its ``analyze`` call. Two keys are handled
                here instead of being forwarded:

                * ``config`` - dict merged over the default analyzer config;
                  its ``excluded_keywords`` are added to those read from the
                  parameter file rather than replacing them.
                * ``parameter_file`` - overrides ``self.parameter_file`` for
                  the analyzer when given.

        Returns:
            The results object returned by ``SemanticAnalyzer.analyze``.

        Raises:
            ValueError: If ``text_key`` is not a known sample name.
        """
        if text_key not in self.test_texts:
            raise ValueError(f"Unknown text key: {text_key}. Available keys: {list(self.test_texts.keys())}")

        # ``config`` and ``parameter_file`` are passed explicitly to the
        # analyzer below; forwarding them again through **options would raise
        # "got multiple values for keyword argument", so they are taken out.
        options = dict(kwargs)
        caller_config = dict(options.pop("config", None) or {})
        parameter_file = options.pop("parameter_file", None)
        if parameter_file is None:
            parameter_file = self.parameter_file

        # NOTE(review): the converted parameters are not used here; the call is
        # kept because it was part of the original flow and may validate the
        # file — confirm or drop.
        self.param_adapter.load_and_convert()

        # Keywords from the parameter file plus any supplied by the caller.
        extra_excluded = caller_config.pop("excluded_keywords", None) or ()
        if isinstance(extra_excluded, str):
            # A bare string means one keyword, not a sequence of characters.
            extra_excluded = (extra_excluded,)
        excluded_keywords = self._read_excluded_keywords() | set(extra_excluded)

        # Create analyzer with explicit config; caller settings win over defaults.
        analyzer = SemanticAnalyzer(
            parameter_file=parameter_file,
            config={
                "excluded_keywords": excluded_keywords,
                "min_keyword_length": 3,
                **caller_config
            },
            **options
        )

        # Run analysis
        results = await analyzer.analyze(self.test_texts[text_key], **options)
        await self.display_results(results)
        return results

    @staticmethod
    async def display_results(results: Dict[str, Any]) -> None:
        """Print each analysis section under an upper-cased banner.

        Dict payloads are printed key by key (lists and dicts on their own
        line below the key); any other payload is printed as-is.
        """
        for analysis_type, data in results.items():
            print(f"\n{'='*20} {analysis_type.upper()} {'='*20}")
            if isinstance(data, dict):
                for key, value in data.items():
                    if isinstance(value, (list, dict)):
                        print(f"\n{key}:")
                        print(value)
                    else:
                        print(f"{key}: {value}")
            else:
                print(data)

In [4]:
# Build the shared helper used by the following cells. Construction has side
# effects: it saves the sample texts as test_texts.xlsx (output type "raw") and
# creates example_params.xlsx in the configurations path when it is missing.
notebook_analyzer = NotebookAnalyzer()



2024-11-08 20:20:23 - src.utils.FileUtils.file_utils - INFO - Data saved to c:\Users\tja\OneDrive - Rastor-instituutti ry\Tiedostot\Rastor-instituutti\kehittäminen\analytiikka\repos\semantic-text-analyzer\data\raw\test_texts.xlsx
2024-11-08 20:20:23 - src.utils.FileUtils.file_utils - INFO - Successfully loaded 4 sheets from c:\Users\tja\OneDrive - Rastor-instituutti ry\Tiedostot\Rastor-instituutti\kehittäminen\analytiikka\repos\semantic-text-analyzer\data\configurations\example_params.xlsx
2024-11-08 20:20:23 - __main__ - INFO - Initialized NotebookAnalyzer with 3 test texts


### Verify setup


In [5]:
async def verify_setup():
    """Print the sample texts and parameter-file details of ``notebook_analyzer``.

    Each sample is previewed with its first 100 characters followed by "...".
    """
    helper = notebook_analyzer

    print("=== Setup Verification ===")
    print("\nAvailable test texts:")
    for label in helper.test_texts:
        preview = helper.test_texts[label][:100]  # first 100 chars only
        print(f"\n{label}:")
        print(preview + "...")

    print("\n=== Parameter File ===")
    print(f"Location: {helper.parameter_file}")
    adapter_present = helper.param_adapter is not None
    print("Parameters loaded:", adapter_present)

# Run verification (top-level await, as supported by the notebook kernel)
await verify_setup()

=== Setup Verification ===

Available test texts:

technical:

            Python is a high-level programming language known for its simplicity.
            It su...

business:

            The company's Q3 results exceeded expectations with revenue growth of 15%.
            ...

finnish:

            Ohjelmistokehittäjä työskentelee asiakasprojektissa kehittäen
            verkkokauppaj...

=== Parameter File ===
Location: c:\Users\tja\OneDrive - Rastor-instituutti ry\Tiedostot\Rastor-instituutti\kehittäminen\analytiikka\repos\semantic-text-analyzer\data\configurations\example_params.xlsx
Parameters loaded: True


In [6]:
# Verify parameter loading and stopword handling
async def verify_parameters():
    """Print the loaded parameters and the analyzer's language-processing setup.

    Returns:
        The ``SemanticAnalyzer`` created for the notebook's parameter file
        with language "en".
    """
    helper = notebook_analyzer
    loaded_params = helper.param_adapter.load_and_convert()

    print("=== Parameter Verification ===")
    print("\nExcluded Keywords from parameters:")
    if not hasattr(loaded_params, 'excluded_keywords'):
        print("No excluded keywords in parameters")
    else:
        print(sorted(loaded_params.excluded_keywords))

    # Build an English analyzer from the same parameter file.
    english_analyzer = SemanticAnalyzer(
        parameter_file=helper.parameter_file,
        language="en",
    )

    print("\n=== Language Processing Setup ===")
    processor = english_analyzer.text_processor
    print(f"Language: {processor.language}")
    print(f"\nStopwords (NLTK + custom):")
    # _stop_words is read directly from the text processor for inspection.
    print(f"Total stopwords: {len(processor._stop_words)}")
    print("\nSample stopwords (first 20):")
    first_twenty = sorted(processor._stop_words)[:20]
    print(first_twenty)

    print(f"\nExcluded keywords from config:")
    print(sorted(processor.excluded_keywords))

    return english_analyzer

# Run verification
# analyzer = await verify_parameters()

# Test analysis with verified setup
# results = await analyzer.analyze(
#     notebook_analyzer.test_texts['business'],
#     analysis_types=["keywords", "themes", "categories"]
# )

In [7]:
async def test_keyword_extraction():
    """Run keyword extraction on the "business" sample and print every stage.

    Prints the original text, the tokens left after the text processor's
    exclusion filter, and the keyword section of the analysis results.
    """
    sample = notebook_analyzer.test_texts["business"]

    # Excluded keywords come from the 'Excluded Keywords' sheet of the parameter file.
    keyword_sheet = pd.read_excel(notebook_analyzer.parameter_file, sheet_name='Excluded Keywords')
    analyzer_config = {
        "excluded_keywords": set(keyword_sheet['keyword'].dropna()),
        "min_keyword_length": 3,
        "max_keywords": 8,
        "focus": "business metrics and performance"
    }
    analyzer = SemanticAnalyzer(
        parameter_file=notebook_analyzer.parameter_file,
        language="en",
        config=analyzer_config,
    )

    print("=== Keyword Extraction Test ===")
    print("\nOriginal text:")
    print(sample)

    # Show processing steps
    raw_tokens = analyzer.text_processor.tokenize(sample)
    print("\nTokens after stopword removal:")
    kept_tokens = []
    for token in raw_tokens:
        if analyzer.text_processor.should_exclude_word(token):
            continue
        kept_tokens.append(token)
    print(kept_tokens)

    # Run analysis restricted to the keyword analyzer
    results = await analyzer.analyze(sample, analysis_types=["keywords"])

    print("\nExtracted Keywords:")
    if "keywords" not in results:
        return
    keyword_section = results["keywords"]
    print("\nKeywords:", keyword_section.get("keywords", []))
    print("\nCompound words:", keyword_section.get("compound_words", []))
    print("\nDomain keywords:", keyword_section.get("domain_keywords", {}))

# Run the test (top-level await, as supported by the notebook kernel)
await test_keyword_extraction()

2024-11-08 20:20:44 - src.utils.FileUtils.file_utils - INFO - Successfully loaded 4 sheets from c:\Users\tja\OneDrive - Rastor-instituutti ry\Tiedostot\Rastor-instituutti\kehittäminen\analytiikka\repos\semantic-text-analyzer\data\configurations\example_params.xlsx
2024-11-08 20:20:44 - src.utils.FileUtils.file_utils - INFO - Successfully loaded 4 sheets from c:\Users\tja\OneDrive - Rastor-instituutti ry\Tiedostot\Rastor-instituutti\kehittäminen\analytiikka\repos\semantic-text-analyzer\data\configurations\example_params.xlsx
2024-11-08 20:20:44 - src.core.language_processing.factory - INFO - Using default configuration
2024-11-08 20:20:46 - src.semantic_analyzer.analyzer - INFO - Initialized SemanticAnalyzer with language: en, using parameter file: c:\Users\tja\OneDrive - Rastor-instituutti ry\Tiedostot\Rastor-instituutti\kehittäminen\analytiikka\repos\semantic-text-analyzer\data\configurations\example_params.xlsx
2024-11-08 20:20:47 - src.semantic_analyzer.analyzer - INFO - Initialized

=== Keyword Extraction Test ===

Original text:

            The company's Q3 results exceeded expectations with revenue growth of 15%.
            Customer acquisition costs decreased while retention rates improved.
            The board has approved a new strategic initiative focusing on expansion.
            

Tokens after stopword removal:
['company', "'s", 'Q3', 'results', 'exceeded', 'expectations', 'revenue', 'growth', '15', '%', '.', 'Customer', 'acquisition', 'costs', 'decreased', 'retention', 'rates', 'improved', '.', 'board', 'approved', 'new', 'strategic', 'initiative', 'focusing', 'expansion', '.']


2024-11-08 20:20:56 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"



Extracted Keywords:

Keywords: ['company', 'result', 'exceed', 'expectation', 'revenue', 'growth', 'customer', 'acquisition']

Compound words: ['customer+acquisition', 'retention+rates', 'strategic+initiative']

Domain keywords: {'business': ['Q3', 'results', 'revenue', 'growth', 'acquisition', 'retention', 'strategic', 'initiative']}


In [8]:
# Run the keyword, theme and category analyses on the "business" sample.
# analyze_text prints the results itself and also returns them.
results = await notebook_analyzer.analyze_text(
    "business",
    analysis_types=["keywords", "themes", "categories"],
    language="en"
)

2024-11-08 20:21:13 - src.utils.FileUtils.file_utils - INFO - Successfully loaded 4 sheets from c:\Users\tja\OneDrive - Rastor-instituutti ry\Tiedostot\Rastor-instituutti\kehittäminen\analytiikka\repos\semantic-text-analyzer\data\configurations\example_params.xlsx


2024-11-08 20:21:13 - src.utils.FileUtils.file_utils - INFO - Successfully loaded 4 sheets from c:\Users\tja\OneDrive - Rastor-instituutti ry\Tiedostot\Rastor-instituutti\kehittäminen\analytiikka\repos\semantic-text-analyzer\data\configurations\example_params.xlsx
2024-11-08 20:21:13 - src.utils.FileUtils.file_utils - INFO - Successfully loaded 4 sheets from c:\Users\tja\OneDrive - Rastor-instituutti ry\Tiedostot\Rastor-instituutti\kehittäminen\analytiikka\repos\semantic-text-analyzer\data\configurations\example_params.xlsx
2024-11-08 20:21:13 - src.core.language_processing.factory - INFO - Using default configuration
2024-11-08 20:21:15 - src.semantic_analyzer.analyzer - INFO - Initialized SemanticAnalyzer with language: en, using parameter file: c:\Users\tja\OneDrive - Rastor-instituutti ry\Tiedostot\Rastor-instituutti\kehittäminen\analytiikka\repos\semantic-text-analyzer\data\configurations\example_params.xlsx
2024-11-08 20:21:16 - src.semantic_analyzer.analyzer - INFO - Initialized



keywords:
['company', 'result', 'exceed', 'expectation', 'revenue', 'growth', 'customer', 'acquisition']

keyword_scores:
{'company': 0.4, 'result': 0.4, 'exceed': 0.4, 'expectation': 0.4, 'revenue': 0.9400000000000001, 'growth': 0.88, 'customer': 0.4, 'acquisition': 0.8200000000000001, 'cost': 0.4, 'decrease': 0.4, 'retention': 0.8200000000000001, 'rate': 0.4, 'improve': 0.4, 'board': 0.4, 'approve': 0.4, 'new': 0.4, 'strategic': 0.4, 'initiative': 0.4, 'focus': 0.4, 'expansion': 0.4, 'Q3': 0.54, 'results': 0.51, 'exceeded': 0.48, 'expectations': 0.44999999999999996}

compound_words:
['customer+acquisition', 'retention+rates', 'strategic+initiative']

domain_keywords:
{'business': ['Q3', 'results', 'revenue', 'growth', 'acquisition', 'retention', 'strategic', 'initiative']}
success: True
language: en

error: None
success: False
language: en


categories:
[]

explanations:
{}

evidence:
{}
success: True
language: en


In [None]:
# Analysis with explicit stopword settings
# NOTE(review): `params` is not used anywhere else in this cell.
params = notebook_analyzer.param_adapter.load_and_convert()

# Extra analyzer configuration supplied by this cell.
additional_config = {
    "excluded_keywords": ["the", "with", "while", "new"],  # Common words we want to exclude
}

# Run analysis
# NOTE(review): analyze_text already works from notebook_analyzer.parameter_file,
# so the parameter_file argument below repeats that value — TODO confirm it is needed.
results = await notebook_analyzer.analyze_text(
    "business",
    analysis_types=["keywords", "themes", "categories"],
    language="en",
    parameter_file=notebook_analyzer.parameter_file,
    config=additional_config  # Add our additional config
)



## Basic Analysis Example

In [5]:
# Load parameters and analyze business text
# NOTE(review): `params` is not used anywhere else in this cell.
params = notebook_analyzer.param_adapter.load_and_convert()

# Analyze with additional settings. All keyword arguments after the text key are
# forwarded by analyze_text to the SemanticAnalyzer.
# NOTE(review): the values below repeat what _create_parameter_file writes to the
# 'General Parameters' sheet (min_confidence, include_compounds, focus_on).
results = await notebook_analyzer.analyze_text(
    "business",
    analysis_types=["keywords", "themes", "categories"],
    language="en",
    parameter_file=notebook_analyzer.parameter_file,
    min_confidence=0.3,  # Ensure we get broader results
    include_compounds=True,  # Explicitly enable compound words
    focus_on="business and financial content"  # Set specific focus
)

2024-11-08 19:47:18 - src.utils.FileUtils.file_utils - INFO - Successfully loaded 4 sheets from c:\Users\tja\OneDrive - Rastor-instituutti ry\Tiedostot\Rastor-instituutti\kehittäminen\analytiikka\repos\semantic-text-analyzer\data\configurations\example_params.xlsx
2024-11-08 19:47:18 - src.utils.FileUtils.file_utils - INFO - Successfully loaded 4 sheets from c:\Users\tja\OneDrive - Rastor-instituutti ry\Tiedostot\Rastor-instituutti\kehittäminen\analytiikka\repos\semantic-text-analyzer\data\configurations\example_params.xlsx
2024-11-08 19:47:18 - src.utils.FileUtils.file_utils - INFO - Successfully loaded 4 sheets from c:\Users\tja\OneDrive - Rastor-instituutti ry\Tiedostot\Rastor-instituutti\kehittäminen\analytiikka\repos\semantic-text-analyzer\data\configurations\example_params.xlsx
2024-11-08 19:47:18 - src.core.language_processing.factory - INFO - Using default configuration
2024-11-08 19:47:19 - src.semantic_analyzer.analyzer - INFO - Initialized SemanticAnalyzer with language: en,


=== KEYWORDS ===
keywords: ['the', 'company', 'result', 'exceed', 'expectation', 'with', 'revenue', 'growth']
keyword_scores: {'the': 0.4, 'company': 0.23300844255160014, 'result': 0.23300844255160014, 'exceed': 0.23300844255160014, 'expectation': 0.23300844255160014, 'with': 0.23300844255160014, 'revenue': 0.23300844255160014, 'growth': 0.23300844255160014, 'customer': 0.23300844255160014, 'acquisition': 0.23300844255160014, 'cost': 0.23300844255160014, 'decrease': 0.23300844255160014, 'while': 0.23300844255160014, 'retention': 0.23300844255160014, 'rate': 0.23300844255160014, 'improve': 0.23300844255160014, 'board': 0.23300844255160014, 'have': 0.23300844255160014, 'approve': 0.23300844255160014, 'new': 0.23300844255160014, 'strategic': 0.23300844255160014, 'initiative': 0.23300844255160014, 'focus': 0.23300844255160014, 'expansion': 0.6530084425516001, 'Q3 results': 0.54, 'revenue growth': 0.51, 'customer acquisition': 0.48, 'retention rates': 0.44999999999999996, 'strategic initia

In [6]:
# Display results in a more readable format
async def display_formatted_results(results):
    """Print analysis results section by section in a readable layout.

    Args:
        results: Mapping of analysis type (e.g. "keywords") to its payload.
            Dict payloads are printed key by key: list values as "  - item"
            bullets, dict values as "  key: value" lines, anything else as
            "key: value". Payloads that are not dicts are printed as-is.

    Returns:
        None. Output goes to stdout.
    """
    for analysis_type, data in results.items():
        print(f"\n{'='*20} {analysis_type.upper()} {'='*20}")
        if not isinstance(data, dict):
            # Show non-dict payloads instead of dropping them silently; this
            # matches what NotebookAnalyzer.display_results does.
            print(data)
            continue
        for key, value in data.items():
            if isinstance(value, list):
                print(f"\n{key}:")
                for item in value:
                    print(f"  - {item}")
            elif isinstance(value, dict):
                print(f"\n{key}:")
                for k, v in value.items():
                    print(f"  {k}: {v}")
            else:
                print(f"{key}: {value}")

# Pretty-print whatever the most recently run analysis cell stored in `results`.
await display_formatted_results(results)





keywords:
  - the
  - company
  - result
  - exceed
  - expectation
  - with
  - revenue
  - growth

keyword_scores:
  the: 0.4
  company: 0.23300844255160014
  result: 0.23300844255160014
  exceed: 0.23300844255160014
  expectation: 0.23300844255160014
  with: 0.23300844255160014
  revenue: 0.23300844255160014
  growth: 0.23300844255160014
  customer: 0.23300844255160014
  acquisition: 0.23300844255160014
  cost: 0.23300844255160014
  decrease: 0.23300844255160014
  while: 0.23300844255160014
  retention: 0.23300844255160014
  rate: 0.23300844255160014
  improve: 0.23300844255160014
  board: 0.23300844255160014
  have: 0.23300844255160014
  approve: 0.23300844255160014
  new: 0.23300844255160014
  strategic: 0.23300844255160014
  initiative: 0.23300844255160014
  focus: 0.23300844255160014
  expansion: 0.6530084425516001
  Q3 results: 0.54
  revenue growth: 0.51
  customer acquisition: 0.48
  retention rates: 0.44999999999999996
  strategic initiative: 0.48

compound_words:
  - cust

## Analysis with Parameters

In [7]:
# Load the converted parameters and analyze the business text with them.
params = notebook_analyzer.param_adapter.load_and_convert()

# NOTE(review): `params` is the object returned by load_and_convert(), not a file
# path, yet it is passed as parameter_file — confirm SemanticAnalyzer accepts it.
results = await notebook_analyzer.analyze_text(
    "business",
    analysis_types=["keywords", "themes", "categories"],
    language="en",
    parameter_file=params
)



2024-11-08 19:47:38 - src.utils.FileUtils.file_utils - INFO - Successfully loaded 4 sheets from c:\Users\tja\OneDrive - Rastor-instituutti ry\Tiedostot\Rastor-instituutti\kehittäminen\analytiikka\repos\semantic-text-analyzer\data\configurations\example_params.xlsx
2024-11-08 19:47:38 - src.utils.FileUtils.file_utils - INFO - Successfully loaded 4 sheets from c:\Users\tja\OneDrive - Rastor-instituutti ry\Tiedostot\Rastor-instituutti\kehittäminen\analytiikka\repos\semantic-text-analyzer\data\configurations\example_params.xlsx
2024-11-08 19:47:38 - src.utils.FileUtils.file_utils - INFO - Successfully loaded 4 sheets from c:\Users\tja\OneDrive - Rastor-instituutti ry\Tiedostot\Rastor-instituutti\kehittäminen\analytiikka\repos\semantic-text-analyzer\data\configurations\example_params.xlsx
2024-11-08 19:47:38 - src.core.language_processing.factory - INFO - Using default configuration
2024-11-08 19:47:39 - src.semantic_analyzer.analyzer - INFO - Initialized SemanticAnalyzer with language: en,


=== KEYWORDS ===
keywords: ['the', 'company', 'result', 'exceed', 'expectation', 'with', 'revenue', 'growth']
keyword_scores: {'the': 0.4, 'company': 0.23300844255160014, 'result': 0.23300844255160014, 'exceed': 0.23300844255160014, 'expectation': 0.23300844255160014, 'with': 0.23300844255160014, 'revenue': 0.23300844255160014, 'growth': 0.23300844255160014, 'customer': 0.23300844255160014, 'acquisition': 0.23300844255160014, 'cost': 0.23300844255160014, 'decrease': 0.23300844255160014, 'while': 0.23300844255160014, 'retention': 0.23300844255160014, 'rate': 0.23300844255160014, 'improve': 0.23300844255160014, 'board': 0.23300844255160014, 'have': 0.23300844255160014, 'approve': 0.23300844255160014, 'new': 0.23300844255160014, 'strategic': 0.23300844255160014, 'initiative': 0.23300844255160014, 'focus': 0.23300844255160014, 'expansion': 0.6530084425516001, 'Q3 results': 0.54, 'revenue growth': 0.51, 'customer acquisition': 0.48, 'retention rates': 0.44999999999999996, 'strategic initia

## Finnish Text Analysis

In [8]:
# Analyze the Finnish sample with language="fi"; only keywords and categories
# are requested here (no themes).
results = await notebook_analyzer.analyze_text(
    "finnish",
    analysis_types=["keywords", "categories"],
    language="fi"
)

2024-11-08 19:47:49 - src.core.language_processing.factory - INFO - Using default configuration
2024-11-08 19:47:49 - src.core.language_processing.finnish - INFO - Successfully initialized Voikko with path: C:/scripts/Voikko
2024-11-08 19:47:49 - src.core.language_processing.finnish - INFO - Loaded 847 Finnish stopwords from c:\Users\tja\OneDrive - Rastor-instituutti ry\Tiedostot\Rastor-instituutti\kehittäminen\analytiikka\repos\semantic-text-analyzer\data\configurations\stop_words\fi.txt
2024-11-08 19:47:50 - src.semantic_analyzer.analyzer - INFO - Initialized SemanticAnalyzer with language: fi, using parameter file: None
2024-11-08 19:47:51 - src.semantic_analyzer.analyzer - INFO - Initialized SemanticAnalyzer with language: fi, using parameter file: None
2024-11-08 19:47:54 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-11-08 19:47:55 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK


=== KEYWORDS ===
keywords: ['ohjelmistokehittäjä', 'työskennellä', 'asiakasprojekti', 'kehittää', 'verkkokauppajärjestelmä', 'tekninen', 'toteutus', 'vaatia', 'erityinen', 'huomio']
keyword_scores: {'ohjelmistokehittäjä': 0.9400000000000001, 'työskennellä': 0.88, 'asiakasprojekti': 0.91, 'kehittää': 0.9400000000000001, 'verkkokauppajärjestelmä': 0.97, 'tekninen': 0.4, 'toteutus': 0.8200000000000001, 'vaatia': 0.4, 'erityinen': 0.4, 'huomio': 0.79, 'tietoturpa': 0.4, 'osa': 0.4, 'tietoturva': 0.44999999999999996}
compound_words: ['verkkokauppajärjestelmä', 'asiakasprojekti']
domain_keywords: {'software_development': ['ohjelmistokehittäjä', 'kehittää', 'toteutus', 'tietoturva']}
success: True
language: fi

=== CATEGORIES ===
categories: []
explanations: {}
evidence: {}
success: True
language: fi


## Batch Analysis

In [9]:
# Analyze all texts
# NOTE(review): the NotebookAnalyzer class defined in this notebook has no
# analyze_all method, so on a fresh kernel this call would raise AttributeError.
# The saved output presumably comes from an earlier class version — TODO restore
# the method or replace this call.
all_results = await notebook_analyzer.analyze_all(
    analysis_types=["keywords", "themes"]
)

2024-11-08 19:47:55 - src.core.language_processing.factory - INFO - Using default configuration
2024-11-08 19:47:56 - src.semantic_analyzer.analyzer - INFO - Initialized SemanticAnalyzer with language: en, using parameter file: None
2024-11-08 19:47:57 - src.semantic_analyzer.analyzer - INFO - Initialized SemanticAnalyzer with language: en, using parameter file: None
2024-11-08 19:48:00 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-11-08 19:48:01 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-11-08 19:48:02 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-11-08 19:48:03 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-11-08 19:48:03 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-11-08 19:48:04 - httpx - INFO - HTTP Req


=== Text 1 ===

=== KEYWORDS ===
keywords: ['python', 'high-level', 'program', 'language', 'know', 'for', 'simplicity', 'support', 'multiple', 'paradigm']
keyword_scores: {'python': 0.18754640123200894, 'high-level': 0.6375464012320089, 'program': 0.4, 'language': 0.6675464012320089, 'know': 0.18754640123200894, 'for': 0.18754640123200894, 'simplicity': 0.6075464012320089, 'support': 0.18754640123200894, 'multiple': 0.18754640123200894, 'paradigm': 0.18754640123200894, 'include': 0.18754640123200894, 'procedural': 0.577546401232009, 'and': 0.18754640123200894, 'object-oriented': 0.5475464012320089, 'Python': 0.54, 'programming': 0.51}
compound_words: ['high-level', 'object-oriented']
domain_keywords: {'programming': ['Python', 'programming', 'procedural', 'object-oriented'], 'languages': ['language', 'high-level']}
success: True
language: en

=== THEMES ===
error: None
success: False
language: en

=== Text 2 ===

=== KEYWORDS ===
keywords: ['the', 'company', 'result', 'exceed', 'expec

## Saving Results

In [10]:
# Save results
# NOTE(review): the NotebookAnalyzer class defined in this notebook has no
# save_results method, so on a fresh kernel this call would raise AttributeError.
# The saved output presumably comes from an earlier class version — TODO restore
# the method or replace this call.
saved_path = notebook_analyzer.save_results(results, "analysis_results")
print(f"Results saved to: {saved_path}")

2024-11-08 19:48:04 - src.utils.FileUtils.file_utils - INFO - Data saved to c:\Users\tja\OneDrive - Rastor-instituutti ry\Tiedostot\Rastor-instituutti\kehittäminen\analytiikka\repos\semantic-text-analyzer\data\processed\analysis_results_20241108_194804.yaml


Results saved to: c:\Users\tja\OneDrive - Rastor-instituutti ry\Tiedostot\Rastor-instituutti\kehittäminen\analytiikka\repos\semantic-text-analyzer\data\processed\analysis_results_20241108_194804.yaml


## Custom Categories Analysis

In [11]:
# Define custom categories: each entry has a description, a keyword list and a
# numeric threshold.
categories = {
    "technical": {
        "description": "Technical content",
        "keywords": ["programming", "software", "technology"],
        "threshold": 0.7
    },
    "business": {
        "description": "Business content",
        "keywords": ["revenue", "growth", "financial"],
        "threshold": 0.6
    }
}

# Analyze the "technical" sample with the custom categories. analyze_text forwards
# `categories` to the SemanticAnalyzer; no language is passed in this call.
results = await notebook_analyzer.analyze_text(
    "technical",
    analysis_types=["categories"],
    categories=categories
)

2024-11-08 19:48:04 - src.core.language_processing.factory - INFO - Using default configuration


2024-11-08 19:48:06 - src.semantic_analyzer.analyzer - INFO - Initialized SemanticAnalyzer with language: en, using parameter file: None
2024-11-08 19:48:07 - src.semantic_analyzer.analyzer - INFO - Initialized SemanticAnalyzer with language: en, using parameter file: None
2024-11-08 19:48:08 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"



=== CATEGORIES ===
categories: []
explanations: {}
evidence: {}
success: True
language: en
