In [1]:
# Import required modules
import sys
from pathlib import Path
# from typing import List, Dict, Any, Tuple, Union
# import logging
# import asyncio

# Make the project's packages importable from this notebook.
# The project root is taken to be the parent of the current working directory
# (Path() is ".", resolved to an absolute path).
# NOTE(review): this assumes the kernel's working directory sits exactly one
# level below the project root — confirm if the notebook is moved.
project_root = str(Path().resolve().parent)
if project_root not in sys.path:  # skip the append when the cell is re-run
    sys.path.append(project_root)
print(f"Project root: {project_root}")

Project root: /mnt/batch/tasks/shared/LS_root/mounts/clusters/basic-cpu/code/Users/topi.jarvinen/semantic-text-analyzer


In [2]:
# azure_notebook.ipynb
import os
from src.az_helpers.setup_azure import init_azure_ml # setup_environment
from src.semantic_analyzer import SemanticAnalyzer

In [3]:
from src.az_helpers.az_environment import setup_notebook_env, verify_environment

In [4]:
# Configure the notebook environment with DEBUG-level logging, then run the
# environment checks. verify_environment() is the last expression of the cell,
# so its return value (True in the recorded run) is shown as the cell output.
setup_notebook_env(log_level="DEBUG")
verify_environment()


Environment Check Results:
✓ Project root in path
✓ FileUtils initialized
✓ .env file loaded
✓ OPENAI_API_KEY set
✓ ANTHROPIC_API_KEY set
✓ Raw data exists
✓ Processed data exists
✓ Configuration exists
✓ Main config.yaml exists

Environment Status: Ready ✓


True

In [5]:
# Create the FileUtils instance that is handed to SemanticAnalyzer in the next
# cell via its `file_utils` argument.
from FileUtils import FileUtils
file_utils = FileUtils()

In [6]:
# Build the analyzer from the parameters_en.xlsx parameter file, sharing the
# FileUtils instance created above.
# NOTE(review): the "azure://" prefix presumably tells the loader to read the
# file from Azure storage — confirm how SemanticAnalyzer/FileUtils resolve it.
analyzer = SemanticAnalyzer(
    parameter_file="azure://parameters/parameters_en.xlsx",
    file_utils=file_utils
)

In [7]:
# Smoke-test the analyzer on a single sentence and keep the outcome in `result`.
# Top-level `await` is valid here because the notebook kernel supports awaiting
# directly in a cell; in a plain script this call would need asyncio.run(...).
text = "Machine learning models process data efficiently."
result = await analyzer.analyze(text)


Processing response: {'categories': [{'category': 'Technical', 'confidence': 0.9, 'explanation': 'The text discusses machine learning models, which are a key concept in the technical domain, particularly in data processing and analysis.', 'evidence': [{'text': 'Machine learning models process data efficiently.', 'relevance': 0.95, 'matched_keywords': ['machine', 'learning', 'models', 'process', 'data', 'efficiently'], 'context': 'The entire sentence focuses on the capabilities of machine learning in handling data.'}], 'themes': ['data processing', 'machine learning']}], 'relationships': {'Technical': ['Business']}}

Processing category: {'category': 'Technical', 'confidence': 0.9, 'explanation': 'The text discusses machine learning models, which are a key concept in the technical domain, particularly in data processing and analysis.', 'evidence': [{'text': 'Machine learning models process data efficiently.', 'relevance': 0.95, 'matched_keywords': ['machine', 'learning', 'models', 'pro

In [14]:
"""Helper functions for handling analysis results."""

from typing import Any, Dict, Optional, Union
from pathlib import Path

import pandas as pd
from FileUtils import FileUtils, OutputFileType


def save_analysis_to_excel(
    result: Any,
    file_utils: FileUtils,
    file_name: str = "analysis_results",
    output_type: str = "processed",
    include_timestamp: Optional[bool] = None,
    include_metadata: bool = True
) -> Dict[str, str]:
    """Convert an analysis result to DataFrames and save them as one Excel file.

    One sheet is written per component: ``Keywords``, ``Themes``,
    ``Categories`` and, optionally, ``Metadata``. Every sheet has a fixed set
    of columns, so an analysis that produced no keywords/themes/matches still
    yields a sheet with the expected header row instead of a column-less one.
    Collections that are ``None`` (e.g. ``compound_parts`` or ``evidence``)
    are treated as empty rather than raising.

    Args:
        result: Analysis result object exposing ``keywords``, ``themes`` and
            ``categories`` sub-results plus ``language``, ``success`` and
            ``processing_time``.
        file_utils: FileUtils instance used to write the file.
        file_name: Name for the output file.
        output_type: Type of output directory (e.g., "processed").
        include_timestamp: Forwarded unchanged to
            ``file_utils.save_data_to_disk``.
        include_metadata: Whether to include the ``Metadata`` sheet.

    Returns:
        Dict[str, str]: The first element of the tuple returned by
        ``file_utils.save_data_to_disk`` (the saved file paths).

    Example:
        >>> file_utils = FileUtils()
        >>> save_analysis_to_excel(analysis_result, file_utils)
    """
    def _joined(values: Any, sep: str = ', ') -> str:
        # A missing/None collection becomes an empty string instead of a
        # TypeError from str.join.
        return sep.join(values or [])

    # Keywords sheet. Explicit `columns` keeps the schema when the list is empty.
    keywords_df = pd.DataFrame(
        [
            {
                'keyword': k.keyword,
                'score': k.score,
                'domain': k.domain,
                'compound_parts': _joined(k.compound_parts),
            }
            for k in (result.keywords.keywords or [])
        ],
        columns=['keyword', 'score', 'domain', 'compound_parts'],
    )

    # Themes sheet.
    themes_df = pd.DataFrame(
        [
            {
                'name': t.name,
                'description': t.description,
                'confidence': t.confidence,
                'keywords': _joined(t.keywords),
            }
            for t in (result.themes.themes or [])
        ],
        columns=['name', 'description', 'confidence', 'keywords'],
    )

    # Categories sheet. `getattr(..., None) or []` covers both a missing
    # `evidence` attribute and one that is present but None.
    categories_df = pd.DataFrame(
        [
            {
                'name': m.name,
                'confidence': m.confidence,
                'description': m.description,
                'themes': _joined(m.themes),
                'evidence': '\n'.join(
                    e.text for e in (getattr(m, 'evidence', None) or [])
                ),
            }
            for m in (result.categories.matches or [])
        ],
        columns=['name', 'confidence', 'description', 'themes', 'evidence'],
    )

    # Sheet name -> DataFrame, in the order the sheets should appear.
    data_dict = {
        'Keywords': keywords_df,
        'Themes': themes_df,
        'Categories': categories_df,
    }

    # Single-row sheet describing the run as a whole.
    if include_metadata:
        data_dict['Metadata'] = pd.DataFrame([{
            'language': result.language,
            'success': result.success,
            'processing_time': result.processing_time,
            'keywords_language': result.keywords.language,
            'themes_language': result.themes.language,
            'categories_language': result.categories.language,
            'compound_words': _joined(result.keywords.compound_words),
        }])

    # save_data_to_disk returns a 2-tuple; only the saved-files mapping is
    # returned to the caller.
    saved_files, _ = file_utils.save_data_to_disk(
        data=data_dict,
        output_filetype=OutputFileType.XLSX,
        output_type=output_type,
        file_name=file_name,
        include_timestamp=include_timestamp
    )

    return saved_files


def read_analysis_from_excel(
    file_path: Union[str, Path],
    file_utils: FileUtils,
    input_type: str = "processed"
) -> Dict[str, pd.DataFrame]:
    """Read analysis results back from an Excel file.

    Thin wrapper: the call is forwarded to ``file_utils.load_excel_sheets``
    and its return value is passed back unchanged; no extra processing is
    done here.

    Args:
        file_path: Path to the Excel file, passed through as given.
        file_utils: FileUtils instance that performs the read.
        input_type: Forwarded as the ``input_type`` keyword (default
            "processed"). Presumably selects the input directory —
            confirm against the FileUtils documentation.

    Returns:
        Dict[str, pd.DataFrame]: Whatever ``load_excel_sheets`` returns;
        per the annotation and the example below, a mapping from sheet name
        to DataFrame.

    Example:
        >>> file_utils = FileUtils()
        >>> data = read_analysis_from_excel("analysis_results.xlsx", file_utils)
        >>> keywords_df = data['Keywords']
    """
    return file_utils.load_excel_sheets(file_path, input_type=input_type)

In [17]:
# Alias the analyzer output under the name used by the docstring examples and
# by the example-usage cell further down.
analysis_result = result

In [18]:
"""Converter functions for analysis results."""

from typing import Any, Dict, List, Optional, Union
import pandas as pd


def convert_analysis_to_dataframes(result: Any) -> Dict[str, pd.DataFrame]:
    """Convert an analysis result to a dictionary of DataFrames.

    Each DataFrame is built with a fixed column list, so a component with no
    items (no keywords, themes or category matches) still produces a frame
    with the expected columns rather than a column-less one. Collections that
    are ``None`` (``compound_parts``, ``keywords``, ``themes``, ``evidence``,
    ``compound_words``) are treated as empty instead of raising.

    Args:
        result: Analysis result object exposing ``keywords``, ``themes`` and
            ``categories`` sub-results plus ``language``, ``success`` and
            ``processing_time``.

    Returns:
        Dict[str, pd.DataFrame]: DataFrames keyed ``'Keywords'``,
        ``'Themes'``, ``'Categories'`` and ``'Metadata'`` (in that order).

    Example:
        >>> dfs = convert_analysis_to_dataframes(analysis_result)
        >>> file_utils.save_data_to_disk(
        ...     data=dfs,
        ...     output_filetype="xlsx",
        ...     file_name="analysis"
        ... )
    """
    def _joined(values: Any, sep: str = ', ') -> str:
        # A missing/None collection becomes an empty string instead of a
        # TypeError from str.join.
        return sep.join(values or [])

    # Keywords: one row per extracted keyword.
    keywords_df = pd.DataFrame(
        [
            {
                'keyword': k.keyword,
                'score': k.score,
                'domain': k.domain,
                'compound_parts': _joined(k.compound_parts),
            }
            for k in (result.keywords.keywords or [])
        ],
        columns=['keyword', 'score', 'domain', 'compound_parts'],
    )

    # Themes: one row per theme, keywords flattened to a comma-separated string.
    themes_df = pd.DataFrame(
        [
            {
                'name': t.name,
                'description': t.description,
                'confidence': t.confidence,
                'keywords': _joined(t.keywords),
            }
            for t in (result.themes.themes or [])
        ],
        columns=['name', 'description', 'confidence', 'keywords'],
    )

    # Categories: evidence texts are joined with newlines. `getattr(...) or []`
    # covers both a missing `evidence` attribute and one that is None.
    categories_df = pd.DataFrame(
        [
            {
                'name': m.name,
                'confidence': m.confidence,
                'description': m.description,
                'themes': _joined(m.themes),
                'evidence': '\n'.join(
                    e.text for e in (getattr(m, 'evidence', None) or [])
                ),
            }
            for m in (result.categories.matches or [])
        ],
        columns=['name', 'confidence', 'description', 'themes', 'evidence'],
    )

    # Metadata: a single row describing the run as a whole.
    metadata_df = pd.DataFrame([{
        'language': result.language,
        'success': result.success,
        'processing_time': result.processing_time,
        'keywords_language': result.keywords.language,
        'themes_language': result.themes.language,
        'categories_language': result.categories.language,
        'compound_words': _joined(result.keywords.compound_words),
    }])

    return {
        'Keywords': keywords_df,
        'Themes': themes_df,
        'Categories': categories_df,
        'Metadata': metadata_df
    }


def convert_analysis_to_dict(result: Any) -> Dict[str, Any]:
    """Convert an analysis result to a nested dictionary.

    The output has four top-level sections: ``keywords``, ``themes``,
    ``categories`` and ``metadata``. Values are copied attribute by attribute
    from ``result``; list-valued attributes (e.g. ``compound_parts``) are
    passed through as-is.

    A category match whose ``evidence`` attribute is missing *or* ``None`` is
    exported with an empty evidence list. (A plain ``hasattr`` check would
    still try to iterate ``None`` and raise ``TypeError``.)

    Args:
        result: Analysis result object exposing ``keywords``, ``themes`` and
            ``categories`` sub-results plus ``language``, ``success``,
            ``error`` and ``processing_time``.

    Returns:
        Dict[str, Any]: Dictionary representation of the analysis result

    Example:
        >>> data_dict = convert_analysis_to_dict(analysis_result)
        >>> file_utils.save_json(
        ...     data=data_dict,
        ...     file_path="analysis"
        ... )
    """
    keywords = result.keywords
    themes = result.themes
    categories = result.categories

    keyword_items = [
        {
            'keyword': k.keyword,
            'score': k.score,
            'domain': k.domain,
            'compound_parts': k.compound_parts
        } for k in keywords.keywords
    ]

    theme_items = [
        {
            'name': t.name,
            'description': t.description,
            'confidence': t.confidence,
            'keywords': t.keywords,
            'parent_theme': t.parent_theme
        } for t in themes.themes
    ]

    category_matches = [
        {
            'name': m.name,
            'confidence': m.confidence,
            'description': m.description,
            # Missing or None evidence both collapse to an empty list.
            'evidence': [
                {
                    'text': e.text,
                    'relevance': e.relevance
                } for e in (getattr(m, 'evidence', None) or [])
            ],
            'themes': m.themes
        } for m in categories.matches
    ]

    return {
        'keywords': {
            'items': keyword_items,
            'compound_words': keywords.compound_words,
            'language': keywords.language,
            'success': keywords.success,
            'error': keywords.error
        },
        'themes': {
            'items': theme_items,
            'theme_hierarchy': themes.theme_hierarchy,
            'language': themes.language,
            'success': themes.success,
            'error': themes.error
        },
        'categories': {
            'matches': category_matches,
            'language': categories.language,
            'success': categories.success,
            'error': categories.error
        },
        'metadata': {
            'language': result.language,
            'success': result.success,
            'error': result.error,
            'processing_time': result.processing_time
        }
    }


def get_analysis_summary(result: Any) -> pd.DataFrame:
    """Build a compact summary table for an analysis result.

    The table contains, in order: one row listing the five highest-scoring
    keywords (names and two-decimal scores, comma-separated), one row per
    theme for the five most confident themes, and a final row with the
    processing time.

    Args:
        result: Analysis result object

    Returns:
        pd.DataFrame: Summary of key findings

    Example:
        >>> summary_df = get_analysis_summary(analysis_result)
        >>> file_utils.save_data_to_disk(
        ...     data={'Summary': summary_df},
        ...     output_filetype="xlsx",
        ...     file_name="analysis_summary"
        ... )
    """
    # Rank keywords by score, best first, and keep the leading five.
    ranked_keywords = list(result.keywords.keywords)
    ranked_keywords.sort(key=lambda kw: kw.score, reverse=True)
    best_keywords = ranked_keywords[:5]

    # Same for themes, ranked by confidence.
    ranked_themes = list(result.themes.themes)
    ranked_themes.sort(key=lambda th: th.confidence, reverse=True)
    best_themes = ranked_themes[:5]

    # Single row condensing all top keywords.
    keyword_names = [kw.keyword for kw in best_keywords]
    keyword_scores = [f"{kw.score:.2f}" for kw in best_keywords]
    keyword_row = {
        'Category': 'Top Keywords',
        'Item': ', '.join(keyword_names),
        'Score': ', '.join(keyword_scores)
    }

    # One row per top theme; only these rows carry a 'Description'.
    theme_rows = [
        {
            'Category': 'Theme',
            'Item': th.name,
            'Description': th.description,
            'Score': f"{th.confidence:.2f}"
        }
        for th in best_themes
    ]

    # Closing row with run metadata.
    timing_row = {
        'Category': 'Metadata',
        'Item': 'Processing Time',
        'Score': f"{result.processing_time:.2f}s"
    }

    return pd.DataFrame([keyword_row, *theme_rows, timing_row])

In [21]:
# Example usage of the converter helpers defined in the previous cell, run on
# the `analysis_result` produced earlier in the notebook.

# Convert to DataFrames (keys: Keywords, Themes, Categories, Metadata)
dfs = convert_analysis_to_dataframes(analysis_result)

# Use with FileUtils
# NOTE(review): this rebinds `file_utils`, replacing the instance created in an
# earlier cell — confirm that a second FileUtils instance is intended.
file_utils = FileUtils()

# Save as Excel: one workbook, one sheet per DataFrame in `dfs`
file_utils.save_data_to_disk(
    data=dfs,
    output_filetype="xlsx",
    output_type="processed",
    file_name="analysis_results"
)

# Convert to dict and save as JSON
data_dict = convert_analysis_to_dict(analysis_result)
file_utils.save_json(
    data=data_dict,
    file_path="analysis_results",
    output_type="processed"
)

# Get summary and save.
# This call is the last expression of the cell, so its return value (the
# saved-path mapping and None shown in the recorded output) is displayed.
summary_df = get_analysis_summary(analysis_result)
file_utils.save_data_to_disk(
    data={'Summary': summary_df},
    output_filetype="xlsx",
    output_type="processed",
    file_name="analysis_summary"
    )

({'analysis_summary_20241203_141116': '/mnt/batch/tasks/shared/LS_root/mounts/clusters/basic-cpu/code/Users/topi.jarvinen/semantic-text-analyzer/data/processed/analysis_summary_20241203_141116.xlsx'},
 None)

In [11]:
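# NOTE: kept disabled — CompleteAnalysisResult has no to_dict() method (see the
# AttributeError recorded below this cell). Use convert_analysis_to_dict(result)
# to obtain a plain dictionary instead.
# # Save results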
# file_utils.save_data_to_disk(
#     data=result.to_dict(),
#     output_filetype="csv",
#     output_type="processed",
#     file_name="analysis_result"
# )

AttributeError: 'CompleteAnalysisResult' object has no attribute 'to_dict'