<a href="https://colab.research.google.com/github/Malik-Zubair123/Gemma2_Hackathon/blob/main/Edit_this.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install transformers torch



In [2]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
import os

In [17]:
class TranscriptSummarizer:
    """Summarize free-form text with a Hugging Face causal language model.

    The text is whitespace-normalized, truncated to a token budget, wrapped in
    an instruction prompt that ends with ``Summary:``, and the model's sampled
    continuation of that prompt is returned as the summary.
    """

    # Upper bound on the number of prompt tokens passed to ``generate``
    # (instruction wrapper plus the text to summarize).
    MAX_INPUT_TOKENS = 2048
    # Slack subtracted from the text budget: re-tokenizing the decoded text
    # inside the prompt may not yield exactly the same token count.
    PROMPT_TOKEN_MARGIN = 8
    # Instruction wrapper placed around the text to summarize.
    PROMPT_PREFIX = "Summarize the following text concisely:\n"
    PROMPT_SUFFIX = "\n\nSummary:"

    def __init__(self, model_name="google/gemma-2b"):
        """
        Initialize the summarization model and tokenizer

        Args:
            model_name (str): Hugging Face model to use for summarization

        Raises:
            Exception: Any error raised while loading the tokenizer or model
                is printed and then re-raised unchanged.
        """
        try:
            # Load tokenizer and model with their default configurations
            self.tokenizer = AutoTokenizer.from_pretrained(model_name)
            self.model = AutoModelForCausalLM.from_pretrained(model_name)

            # Fall back to the EOS token for padding when none is defined
            if self.tokenizer.pad_token is None:
                self.tokenizer.pad_token = self.tokenizer.eos_token
                self.model.config.pad_token_id = self.model.config.eos_token_id

        except Exception as e:
            print(f"Model loading error: {e}")
            raise

    def preprocess_text(self, text, max_length=2048):
        """
        Normalize whitespace and truncate text to a token limit

        Args:
            text (str): Input text to preprocess
            max_length (int): Maximum token length to allow

        Returns:
            str: Cleaned text, decoded back from at most ``max_length`` tokens
        """
        # Collapse every run of whitespace (including newlines) to one space
        cleaned_text = ' '.join(text.split())

        # Truncate in token space, then decode back to a plain string
        token_ids = self.tokenizer.encode(
            cleaned_text,
            truncation=True,
            max_length=max_length
        )

        return self.tokenizer.decode(token_ids, skip_special_tokens=True)

    def _text_token_budget(self):
        """Return how many tokens the text may use once the prompt wrapper is counted."""
        wrapper_tokens = len(
            self.tokenizer.encode(self.PROMPT_PREFIX + self.PROMPT_SUFFIX)
        )
        return max(
            1,
            self.MAX_INPUT_TOKENS - wrapper_tokens - self.PROMPT_TOKEN_MARGIN
        )

    def generate_summary(self, text, max_new_tokens=150):
        """
        Generate a summary of the given text with the loaded model

        Errors are reported in the returned string rather than raised.

        Args:
            text (str): Text to summarize
            max_new_tokens (int): Maximum number of new tokens to generate

        Returns:
            str: Generated summary, or a message starting with
                "Could not generate summary." when the input is empty or
                generation fails
        """
        # Nothing to summarize: report it instead of prompting with no text
        if not isinstance(text, str) or not text.strip():
            return "Could not generate summary. Error: input text is empty"

        try:
            # Truncate the text so that the full prompt, wrapper included,
            # fits in MAX_INPUT_TOKENS. Truncating the text to the full limit
            # and wrapping it afterwards would let the tokenizer's
            # right-truncation cut off the trailing "Summary:" cue.
            processed_text = self.preprocess_text(
                text,
                max_length=self._text_token_budget()
            )

            print(f"Processed Text Size: {len(processed_text)} characters")

            # Construct a prompt that explicitly requests a summary
            summary_prompt = (
                f"{self.PROMPT_PREFIX}{processed_text}{self.PROMPT_SUFFIX}"
            )

            # Truncation stays enabled as a backstop only
            inputs = self.tokenizer(
                summary_prompt,
                return_tensors="pt",
                truncation=True,
                max_length=self.MAX_INPUT_TOKENS,
                padding=True
            )
            prompt_length = inputs['input_ids'].shape[1]

            # Sample a continuation of the prompt
            summary_ids = self.model.generate(
                input_ids=inputs['input_ids'],
                attention_mask=inputs['attention_mask'],
                max_new_tokens=max_new_tokens,
                num_return_sequences=1,
                no_repeat_ngram_size=2,
                do_sample=True,
                top_k=50,
                top_p=0.95,
                temperature=0.7,
                pad_token_id=self.tokenizer.pad_token_id
            )

            # Decode only the newly generated tokens, skipping the prompt
            summary = self.tokenizer.decode(
                summary_ids[0][prompt_length:],
                skip_special_tokens=True
            )

            return summary.strip()

        except Exception as e:
            print(f"Detailed Summary Generation Error: {e}")
            import traceback
            traceback.print_exc()
            return f"Could not generate summary. Error: {str(e)}"

# Function to handle different input scenarios
def summarize_text(input_text=None, file_path=None, max_new_tokens=150, summarizer=None):
    """
    Flexible function to summarize text from different sources

    ``input_text`` takes precedence over ``file_path`` when both are given.
    The text source is resolved before any model is loaded, so a missing
    input or an unreadable file returns immediately.

    Args:
        input_text (str, optional): Direct text input
        file_path (str, optional): Path to a UTF-8 text file
        max_new_tokens (int, optional): Maximum new tokens in summary
        summarizer (optional): Object with a
            ``generate_summary(text, max_new_tokens)`` method to reuse across
            calls. When omitted, a new ``TranscriptSummarizer`` is created,
            which loads the model on every call.

    Returns:
        str: Generated summary, or an error message when no input was given
            or the file could not be read
    """
    # Determine text source first so bad input does not trigger a model load
    if input_text:
        text = input_text
    elif file_path:
        try:
            with open(file_path, 'r', encoding='utf-8') as file:
                text = file.read()
        except (OSError, ValueError) as e:
            # OSError: missing or unreadable file. ValueError covers
            # UnicodeDecodeError for content that is not valid UTF-8.
            print(f"Error reading file: {e}")
            return f"File reading error: {e}"
    else:
        return "No text provided. Please give input text or file path."

    # Only build the default summarizer once there is text to summarize
    if summarizer is None:
        summarizer = TranscriptSummarizer()

    # Generate and return summary
    return summarizer.generate_summary(text, max_new_tokens)

# Example Usage
def main():
    """Summarize a built-in sample passage and print the result."""
    # Demo input, supplied directly as text rather than read from a file
    passage = """
    Key Advantages of Plain Text Files:
Plain text will never require a subscription, lock away features, or go out of business. It's free and here forever.

Flexibility: Open Them Anywhere: Plain text files are the most flexible file format we have. They can be opened by hundreds, if not thousands, of applications. Since they are a basic component of computers, you can even open them on the computer command line.
No New Tools: We all get excited by new stuff, and for productivity junkies, we perk up with excitement to try out a new piece of software. Unfortunately tools come and go, and switching system can be wasteful and counterproductive. Switching to plain text means you never need to migrate to a new tool.
Portability: By portability I mean that your files can be moved to and from different operating systems, platforms and devices and you can still open them. Whether you are on a Mac, Linux, Windows or some future tool, you'll be able to open and edit your plain text files there.
Future-Proof: A while back I discovered several writings I did in high school using Windows 95 and a version of Microsoft Word. While I was able to eventually open the files, the formatting had been lost. This revealed a potential danger in locking your words into a format that may not be around forever. Fortunately, while other file formats may come and go, plain text files will remain.
    """

    # Cap the summary at 100 new tokens, then print the heading and the text
    result = summarize_text(input_text=passage, max_new_tokens=100)
    print("Generated Summary:", result, sep="\n")

In [18]:
main()

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Processed Text Size: 1386 characters
Generated Summary:
1. Plain-text files have been around since the first computers.
2. There are many advantages to using plain-texts. The best one is that they won't disappear. You can open, edit, share, move, print, save, email, open in a browser, etc., plain texts with any program. That means there is no need for proprietary software, no licensing fees, subscription fees or anything else. Plus, there are millions of free, powerful tools that will
