### Import dependencies

In [20]:
import os
import numpy as np
import pandas as pd
from langchain_openai import AzureChatOpenAI
from langchain_core.prompts import PromptTemplate
from langchain_classic.chains import LLMChain
from langchain_core.output_parsers import StrOutputParser
from typing import Optional, List
import logging
from tqdm.auto import tqdm

# Root logging at INFO level; this module logs through its own named logger.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Raise the threshold of the httpx and openai client loggers to WARNING
# so their INFO records do not clutter the output.
for _noisy_logger in ("httpx", "openai"):
    logging.getLogger(_noisy_logger).setLevel(logging.WARNING)

In [2]:
# Load variables from a local .env file into the process environment.
# NOTE(review): presumably this is where the Azure OpenAI credentials come
# from, since no API key is passed explicitly anywhere in this file - confirm.
from dotenv import load_dotenv
load_dotenv()

True

### Read input data

In [15]:
def read_excel_file(file_path: str, nrows: int = 128) -> pd.DataFrame:
    """
    Load the first rows of an Excel workbook into a DataFrame.

    Args:
        file_path: Location of the Excel file to read.
        nrows: Upper bound on the number of rows to load (Default: 128).

    Returns:
        The DataFrame produced by ``pd.read_excel``.

    Raises:
        Exception: Any error raised while reading is logged and re-raised
            unchanged.
    """
    try:
        frame = pd.read_excel(file_path, nrows=nrows)
    except Exception as exc:
        logger.error(f"Error reading the Excel file: {exc}")
        raise
    else:
        # Report success and how many rows were actually loaded.
        logger.info(f"Successfully read the Excel file: {file_path}")
        logger.info(f"Rows loaded: {len(frame)}")
        return frame

### Define class and function to execute the summary & translation task

In [None]:
class NotesSummarizer:
    """
    Summarizes or translates free-text notes from an Excel file using
    Azure OpenAI through LangChain.

    Notes with more than ``min_words`` words are condensed into a short
    English summary; all other notes are sent to the translation prompt.
    """

    def __init__(
        self,
        model: str = "gpt-4.1",
        api_version: str = "2025-01-01-preview",
        azure_endpoint: str = "https://your-azure-openai-endpoint.openai.azure.com/",
        input_file_path: str = 'text_collection_workshops.xlsx',
        nrows: int = 128,
        temperature: float = 0,
        base_url: Optional[str] = None
    ):
        """
        Build the LLM client, the prompts and the two processing chains.

        No API key is passed explicitly.
        NOTE(review): presumably the client reads it from the environment
        (e.g. variables loaded from a .env file) - confirm.

        Args:
            model: Name of the Azure OpenAI deployment to use
            api_version: API version for Azure OpenAI
            azure_endpoint: URL of the Azure OpenAI resource
            input_file_path: Path to the input Excel file
            nrows: Number of rows to read from the Excel file (Default: 128)
            temperature: Temperature for text generation (lower = more focused)
            base_url: Accepted for backward compatibility; currently unused
        """
        self.llm = AzureChatOpenAI(
            azure_deployment=model,
            api_version=api_version,
            azure_endpoint=azure_endpoint,
            temperature=temperature
        )

        self.input_file_path = input_file_path
        self.nrows = nrows

        # Define the summarization prompt
        self.prompt_template = PromptTemplate(
            input_variables=["text"],
            template="""Summarize the following text. 
Keep the summary concise and under 15 words.
DO NOT include any personal data in the summary (e.g., names, email, locations).
The summary should be written in English, regardless of input language.

Text: {text}

Summary:"""
        )

        # Define the translation prompt
        # NOTE(review): this prompt still says "summary" / "Summary:" although
        # it drives translation. The wording is left untouched so model output
        # does not change unexpectedly - confirm whether it is intended.
        self.prompt_template_translation = PromptTemplate(
            input_variables=["text"],
            template="""Identify the language of the following text.
If it is not English, then translate the following text into English language.
Keep the length of the translation under 20 words.
DO NOT include any personal data in the summary (e.g., names, email, locations).

Text: {text}

Summary:"""
        )

        # Create chains using RunnableSequence (pipe operator):
        # prompt -> chat model -> plain string output.
        self.summarization_chain = self.prompt_template | self.llm | StrOutputParser()
        logger.info(f"Excel Notes Summarizer initialized with model: {model}")

        self.translation_chain = self.prompt_template_translation | self.llm | StrOutputParser()
        logger.info(f"Excel Notes Translator initialized with model: {model}")

    def count_words(self, text: str) -> int:
        """
        Count the number of whitespace-separated words in a text string.

        Args:
            text: Input text

        Returns:
            Number of words; 0 for anything that is not a string
            (None, NaN, numbers, lists, ...)
        """
        # Checking the type first also covers None/NaN and avoids calling
        # pd.isna on list-like values, whose result has no single truth value.
        if not isinstance(text, str):
            return 0
        return len(text.split())

    def _run_chain(self, chain, text: str, action: str) -> str:
        """Invoke ``chain`` on ``text``; on failure log it and return an 'Error: ...' string."""
        try:
            return chain.invoke({"text": text}).strip()
        except Exception as e:
            # Best effort: one failing row must not abort the whole run, so the
            # error text is returned in place of the result.
            logger.error(f"Error {action} text: {str(e)}")
            return f"Error: {str(e)}"

    def summarize_text(self, text: str) -> str:
        """
        Summarize a single text using the summarization chain.

        Args:
            text: Text to summarize

        Returns:
            Summarized text, or a string starting with "Error: " if the
            chain call failed
        """
        return self._run_chain(self.summarization_chain, text, "summarizing")

    def translate_text(self, text: str) -> str:
        """
        Translate a single text using the translation chain.

        Args:
            text: Text to translate

        Returns:
            Translated text, or a string starting with "Error: " if the
            chain call failed
        """
        # Must use the translation chain; routing through the summarization
        # chain would apply the wrong prompt to short notes.
        return self._run_chain(self.translation_chain, text, "translating")

    def process_batch(
        self,
        batch_df: pd.DataFrame,
        notes_column: str,
        min_words: int
        ) -> List[dict]:
        """
        Process a batch of rows.

        Args:
            batch_df: DataFrame batch to process
            notes_column: Column name containing notes
            min_words: Notes with more words than this are summarized;
                all others are translated

        Returns:
            One dict per row, in row order, with the keys 'summary',
            'tag' ('Summarized' or 'Translated Only') and 'word_count'
        """
        results = []

        for notes in batch_df[notes_column]:
            word_count = self.count_words(notes)

            if word_count > min_words:
                summary = self.summarize_text(notes)
                tag = 'Summarized'
            else:
                summary = self.translate_text(notes)
                tag = 'Translated Only'

            results.append({
                'summary': summary,
                'tag': tag,
                'word_count': word_count
            })

        return results

    def process_data(
        self,
        notes_column: str = "Notes",
        min_words: int = 25,
        batch_size: int = 32
    ) -> pd.DataFrame:
        """
        Read the input Excel file and add "Summary" and "Tag" columns.

        Args:
            notes_column: Name of the column containing notes
            min_words: Notes with more words than this are summarized;
                all others are translated
            batch_size: Number of rows handled per progress-bar step

        Returns:
            The loaded DataFrame with the "Summary" and "Tag" columns filled

        Raises:
            ValueError: If batch_size is not positive or notes_column is
                missing from the file
        """
        # Fail fast: a zero batch size would divide by zero and a negative one
        # would silently process nothing.
        if batch_size < 1:
            raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

        try:
            df = read_excel_file(file_path=self.input_file_path, nrows=self.nrows)

            # Check if the Notes column exists
            if notes_column not in df.columns:
                raise ValueError(f"Column '{notes_column}' not found in Excel file. Available columns: {df.columns.tolist()}")

            # Create new columns for summaries
            df["Summary"] = ""
            df["Tag"] = ""

            # Create variables for progress tracking
            total_rows = len(df)
            num_batches = (total_rows + batch_size - 1) // batch_size  # ceiling division
            summarized_count = 0
            translated_count = 0

            # Process in batches with progress bar
            with tqdm(total=num_batches, desc="Processing batches", unit="batch", ncols=100) as pbar_batch:
                for start_idx in range(0, total_rows, batch_size):
                    end_idx = min(start_idx + batch_size, total_rows)

                    # Get batch
                    batch_df = df.iloc[start_idx:end_idx]

                    # Process batch
                    results = self.process_batch(batch_df, notes_column, min_words)

                    # Write results back using the batch's own index labels, so
                    # the update does not assume positions equal labels.
                    for row_label, result in zip(batch_df.index, results):
                        df.at[row_label, "Summary"] = result['summary']
                        df.at[row_label, "Tag"] = result['tag']

                        if result['tag'] == 'Summarized':
                            summarized_count += 1
                        else:
                            translated_count += 1

                    # Update progress bar
                    pbar_batch.update(1)
                    pbar_batch.set_postfix({
                        'rows': f"{end_idx}/{total_rows}",
                        'summarized': summarized_count,
                        'translated': translated_count
                    })

            logger.info("Processing complete!")
            logger.info(f"Summarized: {summarized_count}, Translated: {translated_count}")

            return df

        except Exception as e:
            logger.error(f"Error processing Excel file: {str(e)}")
            raise


### Entry Point Function

In [19]:
def main(
        input_file_path: str,
        nrows: int=128
        ) -> pd.DataFrame:
    """
    Entry point: build a NotesSummarizer for the given file and run it.

    Args:
        input_file_path: Path to the Excel file holding the notes
        nrows: Maximum number of rows to read from that file

    Returns:
        The DataFrame returned by ``NotesSummarizer.process_data``.
    """
    # Deterministic generation (temperature 0) against the given input file.
    notes_summarizer = NotesSummarizer(
        temperature=0,
        nrows=nrows,
        input_file_path=input_file_path,
    )

    # Notes are read from the "Notes" column with a 25-word threshold.
    processed = notes_summarizer.process_data(min_words=25, notes_column="Notes")
    return processed

### Execute task

In [9]:
# Exempt loopback addresses from any configured proxy. Both spellings are set
# because tools differ in which one they read. (The original note cites local
# MCP server access as the reason.)
_LOOPBACK_HOSTS = "localhost, 127.0.0.1"
os.environ.update({"NO_PROXY": _LOOPBACK_HOSTS, "no_proxy": _LOOPBACK_HOSTS})

In [None]:
# Run the pipeline on up to 2048 rows of 'text_collection.xlsx'; main() returns
# the DataFrame produced by NotesSummarizer.process_data.
df_output = main('text_collection.xlsx', nrows=2048)

### Review output data and export

In [None]:
# Display the processed DataFrame as the cell output for a visual check.
df_output

In [None]:
# Export the result to a new Excel file; index=False leaves out the row index.
df_output.to_excel('text_collection_output_with_summaries.xlsx', index=False)