# In [None]:
import torch
from transformers import pipeline, AutoTokenizer
import nltk
import re
from typing import List, Dict

# Make sure the sentence-tokenizer data needed by sent_tokenize is installed.
# Older NLTK releases load the pickled "punkt" models, while recent releases
# (3.9+) load "punkt_tab" instead, so both resources are checked and whichever
# is missing is downloaded once.
for _resource in ('punkt', 'punkt_tab'):
    try:
        nltk.data.find(f'tokenizers/{_resource}')
    except LookupError:
        nltk.download(_resource)

from nltk.tokenize import sent_tokenize

class EnhancedSummarizer:
    """
    Multi-stage text summarizer built on a Hugging Face summarization pipeline.

    The stages are: clean the text, extract prices/percentages, split the text
    into sentence-aligned chunks, summarize each chunk, summarize the combined
    chunk summaries when there is more than one, and finally re-insert key
    figures that the model dropped.
    """

    # Sentences inserted by post_process_summary when the model drops a key
    # figure. They are class attributes so that a subclass or an instance can
    # override them for texts that are not about NVIDIA.
    PRICE_NOTE_TEMPLATE = "NVIDIA stock trades around {price}. "
    PERCENTAGE_NOTE_TEMPLATE = " Notable growth includes {percentage} performance metrics."

    # Patterns are compiled once instead of on every call.
    _URL_RE = re.compile(r'https?://\S+')
    _EMAIL_RE = re.compile(r'\S+@\S+')
    _REPEATED_DOTS_RE = re.compile(r'[.]{2,}')
    _WHITESPACE_RE = re.compile(r'\s+')
    # "$150", "$150.00", "$1,200.50" -- a sentence-ending "." or "," is not captured.
    _PRICE_RE = re.compile(r'\$\d+(?:,\d{3})*(?:\.\d+)?')
    # "50%", "12.5%" -- a dot is only part of the match when digits follow it.
    _PERCENTAGE_RE = re.compile(r'\d+(?:\.\d+)?%')

    def __init__(self, model_name: str):
        """
        Initialize the enhanced summarizer with CPU-optimized settings.
        :param model_name: The name of the summarization model to use.
        """
        self.model_name = model_name
        self.device = -1  # Force CPU usage for 16GB RAM constraint
        # The prompt is prepended to every text handed to the model
        self.prompt = "Please provide a concise and brief summary of the following text:"

        # Initialize the summarization pipeline
        self.summarizer = pipeline(
            "summarization",
            model=model_name,
            device=self.device,
            torch_dtype=torch.float32  # Use float32 for better CPU performance
        )
        # Separate tokenizer handle, used only for counting tokens
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)

    def preprocess_text(self, text: str) -> str:
        """
        Clean the input text.

        URLs and e-mail addresses are removed, runs of dots are collapsed to a
        single dot, and whitespace is normalized last so that the removals do
        not leave double spaces behind.
        :param text: The raw input text.
        :return: The cleaned text ("" for empty or missing input).
        """
        if not text:
            return ""

        text = self._URL_RE.sub('', text)
        text = self._EMAIL_RE.sub('', text)
        text = self._REPEATED_DOTS_RE.sub('.', text)

        return self._WHITESPACE_RE.sub(' ', text).strip()

    def extract_key_information(self, text: str) -> Dict[str, List[str]]:
        """
        Extract dollar amounts and percentages from the text.
        :param text: The cleaned input text.
        :return: A dictionary with the keys 'prices' and 'percentages', each
                 holding the matched strings in order of appearance.
        """
        return {
            'prices': self._PRICE_RE.findall(text),
            'percentages': self._PERCENTAGE_RE.findall(text)
        }

    def smart_chunk_text(self, text: str, max_tokens: int) -> List[str]:
        """
        Split the text into chunks, respecting sentence boundaries.

        Each sentence is tokenized exactly once and a running token count is
        kept, instead of re-tokenizing the growing chunk for every sentence.
        A single sentence longer than max_tokens is not split; it becomes a
        chunk of its own.
        :param text: The input text.
        :param max_tokens: The maximum number of tokens per chunk.
        :return: A list of text chunks.
        """
        # Special tokens (e.g. BOS/EOS) are added once per encoded chunk.
        special_tokens = self.tokenizer.num_special_tokens_to_add()

        chunks: List[str] = []
        current_sentences: List[str] = []
        current_tokens = special_tokens

        for sentence in sent_tokenize(text):
            sentence = sentence.strip()
            if not sentence:
                continue

            sentence_tokens = len(
                self.tokenizer.encode(sentence, add_special_tokens=False)
            )

            # Close the current chunk when this sentence would overflow it.
            if current_sentences and current_tokens + sentence_tokens > max_tokens:
                chunks.append(" ".join(current_sentences))
                current_sentences = []
                current_tokens = special_tokens

            current_sentences.append(sentence)
            current_tokens += sentence_tokens

        if current_sentences:
            chunks.append(" ".join(current_sentences))

        return chunks

    def generate_multi_stage_summary(self, text: str, max_tokens: int) -> str:
        """
        Generate summary using multiple stages for better quality.
        :param text: The input text to be summarized.
        :param max_tokens: The maximum number of tokens for a single model call.
        :return: The final summarized text, or a fixed message when the text is
                 empty ("Unable to process the provided text.") or every chunk
                 failed to summarize ("No summary generated.").
        """
        # Stage 1: Preprocess
        clean_text = self.preprocess_text(text)
        if not clean_text:
            return "Unable to process the provided text."

        # Stage 2: Extract key info (e.g., numbers, percentages)
        key_info = self.extract_key_information(clean_text)

        # Stage 3: Smart chunking. The prompt is prepended to every chunk, so
        # its tokens are reserved up front to keep prompt + chunk in budget.
        prompt_tokens = len(
            self.tokenizer.encode(self.prompt, add_special_tokens=False)
        )
        chunks = self.smart_chunk_text(clean_text, max(1, max_tokens - prompt_tokens))

        if not chunks:
            return "Unable to process the provided text."

        # Stage 4: Summarize chunks (best effort: a failing chunk is skipped)
        chunk_summaries = []
        for chunk in chunks:
            chunk_with_prompt = f"{self.prompt} {chunk}"
            try:
                # Adjust summary length based on chunk size
                chunk_tokens = len(self.tokenizer.encode(chunk_with_prompt))
                max_len = min(150, max(50, chunk_tokens // 4))
                min_len = min(40, max_len // 3)

                summary = self.summarizer(
                    chunk_with_prompt,
                    max_length=max_len,
                    min_length=min_len,
                    do_sample=False,
                    length_penalty=1.0,
                    repetition_penalty=1.1,
                    # Guard against a single over-long sentence exceeding the
                    # model's input limit.
                    truncation=True
                )
                chunk_summaries.append(summary[0]['summary_text'])
            except Exception as e:
                print(f"Error summarizing chunk: {e}")

        # Do not decorate the failure message with extracted figures.
        if not chunk_summaries:
            return "No summary generated."

        # Stage 5: Combine and final summarization if multiple chunks
        if len(chunk_summaries) > 1:
            result = self._summarize_combined(chunk_summaries)
        else:
            result = chunk_summaries[0]

        # Stage 6: Post-process and enhance
        return self.post_process_summary(result, key_info)

    def _summarize_combined(self, chunk_summaries: List[str]) -> str:
        """
        Condense several chunk summaries into one final summary.
        :param chunk_summaries: The per-chunk summaries, in document order.
        :return: The final summary, or the first two chunk summaries joined
                 together when the final model call fails.
        """
        combined_text = " ".join(chunk_summaries)
        combined_with_prompt = f"{self.prompt} {combined_text}"
        try:
            combined_tokens = len(self.tokenizer.encode(combined_with_prompt))
            final_summary = self.summarizer(
                combined_with_prompt,
                max_length=1024,
                # Never force the output to be longer than its own input.
                min_length=min(80, max(1, combined_tokens // 2)),
                do_sample=False,
                length_penalty=1.0,
                truncation=True
            )
            return final_summary[0]['summary_text']
        except Exception as e:
            print(f"Error generating final summary: {e}")
            return " ".join(chunk_summaries[:2])  # Fallback

    @staticmethod
    def _percentage_value(percentage: str) -> float:
        """Return the numeric value of a string such as '12.5%' (0.0 if unparsable)."""
        try:
            return float(percentage.replace('%', '').replace(',', ''))
        except ValueError:
            return 0.0

    def post_process_summary(self, summary: str, key_info: Dict) -> str:
        """
        Post-process the summary to add back key figures the model dropped.

        If neither of the first two extracted prices appears in the summary, a
        sentence built from PRICE_NOTE_TEMPLATE is prepended. If neither of the
        first two percentages above 10 appears, a sentence built from
        PERCENTAGE_NOTE_TEMPLATE is appended.
        :param summary: The generated summary text.
        :param key_info: A dictionary of key information ('prices', 'percentages').
        :return: The enhanced summary.
        """
        prices = key_info.get('prices') or []
        if prices and not any(price in summary for price in prices[:2]):
            summary = self.PRICE_NOTE_TEMPLATE.format(price=prices[0]) + summary

        percentages = key_info.get('percentages') or []
        key_percentages = [p for p in percentages if self._percentage_value(p) > 10]
        if key_percentages and not any(pct in summary for pct in key_percentages[:2]):
            summary += self.PERCENTAGE_NOTE_TEMPLATE.format(percentage=key_percentages[0])

        return summary

def get_summary(text: str, max_tokens: int, model_name: str = "facebook/bart-large-cnn") -> str:
    """
    Summarize text with the enhanced multi-stage pipeline.

    A new EnhancedSummarizer is built for the requested model on every call.
    Any exception raised while loading the model or summarizing is printed and
    turned into a fixed error string, so this function never raises.

    Smaller or more specialised models for CPU-only systems with limited RAM:
      - "sshleifer/distilbart-cnn-12-6" (smaller, faster)
      - "google/pegasus-xsum" (good for news)
      - "philschmid/bart-large-cnn-samsum" (conversational)

    :param text: The text to summarize.
    :param max_tokens: The maximum number of tokens per model call.
    :param model_name: The name of the Hugging Face model to use.
    :return: The generated summary, or an error message on failure.
    """
    failure_message = "Error: Unable to generate summary. Try with a smaller text or a different model."

    try:
        engine = EnhancedSummarizer(model_name)
        return engine.generate_multi_stage_summary(text, max_tokens)
    except Exception as exc:
        print(f"Error in summarization: {exc}")

    return failure_message

# Example usage (uncomment to test)
# sample_text = "NVIDIA's recent financial report shows a 50% increase in Q3 revenue. The company's stock is currently trading at $150.00 per share, a significant rise from last month's $100.00. Analysts attribute this growth to strong demand for their new AI chips. The company is also investing heavily in new research and development projects to stay ahead of competitors."
# print(get_summary(sample_text, max_tokens=1024))

# In [3]:
# nvidia_text = """Your paste.txt content here..."""
    
# # different modellist
# models_to_try = [
#     "sshleifer/distilbart-cnn-12-6",
#     "facebook/bart-large-cnn",        
#     "google/pegasus-xsum"
# ]
    
# for model in models_to_try:
#     try:
#         print(f"\n--- Using model: {model} ---")
#         summary = get_summary(nvidia_text, max_tokens=1024, model_name=model)
#         print(summary)
#         break
#     except Exception as e:
#         print(f"Model {model} failed: {e}")
#         continue