<a href="https://colab.research.google.com/github/tapasmahanand/Text_summarization_using_Streamlit_and_Groq/blob/main/Text_summarization_using_Streamlit_and_Groq.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#Cell 1 - Install Required Libraries:
!pip install streamlit PyMuPDF4LLM pandas openpyxl groq pyngrok

In [None]:
!curl -s https://ngrok-agent.s3.amazonaws.com/ngrok.asc | sudo tee /etc/apt/trusted.gpg.d/ngrok.asc >/dev/null && echo "deb https://ngrok-agent.s3.amazonaws.com buster main" | sudo tee /etc/apt/sources.list.d/ngrok.list && sudo apt update && sudo apt install ngrok

In [None]:
#Cell 2 - Import Libraries and Set Up Environment:
import streamlit as st
import fitz  # PyMuPDF4LLM
import pandas as pd
import os
import time
from groq import Groq
from google.colab import files

In [None]:
from google.colab import userdata
os.environ["GROQ_API_KEY"] = userdata.get('groq_colab_key') #krch
os.environ["NGROK_AUTH_TOKEN"] = userdata.get('NGROK_AUTH_TOKEN')

# Get ngrok auth token from environment
ngrok_token = userdata.get('NGROK_AUTH_TOKEN')
!ngrok authtoken {ngrok_token}

Authtoken saved to configuration file: /root/.config/ngrok/ngrok.yml


In [None]:
%%writefile app.py
import streamlit as st
import fitz
import pandas as pd
import os
import time
from groq import Groq
import tempfile
import io

# Shared Groq API client, keyed by the GROQ_API_KEY environment variable
client = Groq(api_key=os.getenv("GROQ_API_KEY"))

# Models to compare. "name" is the model identifier sent to the Groq API;
# "display_name" labels that model's result columns and UI sections.
MODEL_CONFIGS = {
    "model1": {"name": "gemma2-9b-it", "display_name": "Gemma2_9B"},
    "model2": {"name": "llama-3.2-3b-preview", "display_name": "LLaMA 3.2"},
}

def chunk_text(text, chunk_size=4000):
    """Break ``text`` into consecutive pieces of at most ``chunk_size`` words.

    Words are whitespace-delimited, and each returned piece is its words
    re-joined with single spaces. The final piece holds whatever words are
    left over; text containing no words yields an empty list.
    """
    tokens = text.split()
    # A piece always closes after at least one word, so sizes below 1 act like 1.
    step = max(chunk_size, 1)
    return [
        ' '.join(tokens[start:start + step])
        for start in range(0, len(tokens), step)
    ]

def extract_text_from_pdf(pdf_path):
    """Extract the text of every page of a PDF file.

    Args:
        pdf_path: Path of the PDF file to open.

    Returns:
        The text of all pages concatenated in page order, or None if the
        file could not be opened or read (the error is reported through
        ``st.error``).
    """
    try:
        # The context manager closes the document even when a page fails to
        # parse, so the file handle is released before the caller deletes
        # the file.
        with fitz.open(pdf_path) as doc:
            return "".join(page.get_text() for page in doc)
    except Exception as e:
        st.error(f"Error processing PDF {pdf_path}: {str(e)}")
        return None

def _build_chunk_prompt(chunk, part_number, total_parts, max_words=None):
    """Build the user prompt asking for a summary of one chunk of the text."""
    if total_parts > 1:
        prefix = f"This is part {part_number} of {total_parts} parts. "
        if max_words:
            return prefix + f"Please provide a brief summary of this part (the full summary across all parts should be under {max_words} words):\n\n{chunk}"
        return prefix + f"Please provide a concise summary of this part:\n\n{chunk}"

    if max_words:
        return f"Please provide a summary of the following text in under {max_words} words:\n\n{chunk}"
    return f"Please provide a concise summary of the following text:\n\n{chunk}"


def _request_completion(prompt, model_name, temperature, max_tokens):
    """Send a single-message chat request to Groq and return the reply text."""
    chat_completion = client.chat.completions.create(
        messages=[{"role": "user", "content": prompt}],
        model=model_name,
        temperature=temperature,
        max_tokens=max_tokens,  # Limiting response tokens
    )
    return chat_completion.choices[0].message.content


def get_summary_from_groq(text, model_name, max_words=None, delay_seconds=10,
                          temperature=0.7, max_tokens=500):
    """Summarize ``text`` with a Groq-hosted model.

    The text is split with ``chunk_text`` and each chunk is summarized
    separately. When more than one chunk summary is obtained, a final request
    merges them into a single summary.

    Args:
        text: Text to summarize.
        model_name: Model identifier passed to the Groq API.
        max_words: Optional word limit mentioned in the prompts; a falsy
            value requests a "concise" summary instead.
        delay_seconds: Pause before every API request, for rate limiting.
        temperature: Sampling temperature passed to the API.
        max_tokens: Response token limit passed to the API.

    Returns:
        The summary text, or None when no summary could be produced. Errors
        are reported through ``st.error`` rather than raised.
    """
    try:
        # Split text into smaller chunks
        chunks = chunk_text(text)
        total_parts = len(chunks)
        summaries = []

        for part_number, chunk in enumerate(chunks, start=1):
            # Add delay for API rate limiting
            time.sleep(delay_seconds)

            prompt = _build_chunk_prompt(chunk, part_number, total_parts, max_words)

            try:
                summary = _request_completion(prompt, model_name, temperature, max_tokens)
            except Exception as e:
                # A failed chunk is reported and skipped so the remaining
                # chunks can still be summarized.
                st.error(f"Error in chunk {part_number}: {str(e)}")
                continue

            # A reply without content would break the join below; skip it.
            if summary is not None:
                summaries.append(summary)

        if not summaries:
            return None
        if len(summaries) == 1:
            return summaries[0]

        # Several partial summaries: merge them with one more request.
        combined_text = " ".join(summaries)

        # Add delay for API rate limiting
        time.sleep(delay_seconds)

        length_hint = f'under {max_words} words' if max_words else 'concise'
        final_prompt = f"Please provide a {length_hint} summary combining these separate summaries:\n\n{combined_text}"

        return _request_completion(final_prompt, model_name, temperature, max_tokens)

    except Exception as e:
        st.error(f"Error getting summary from Groq API: {str(e)}")
        return None

def process_pdfs(uploaded_files):
    """Process PDFs and generate summaries using every configured model.

    Each uploaded file is written to a temporary .pdf file, its text is
    extracted, and every model in ``MODEL_CONFIGS`` produces a 50-word summary
    and an unlimited-length summary of that text.

    Args:
        uploaded_files: Iterable of uploaded file objects exposing ``name``
            and ``getvalue()``.

    Returns:
        A list with one dict per PDF that yielded text. Each dict holds
        ``PDF_Name`` plus ``<display_name>_50_Words`` and
        ``<display_name>_Unlimited`` for every model.
    """
    results = []

    for uploaded_file in uploaded_files:
        tmp_path = None
        try:
            with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp_file:
                tmp_path = tmp_file.name
                tmp_file.write(uploaded_file.getvalue())

            text = extract_text_from_pdf(tmp_path)
        finally:
            # Clean up the temporary file even if writing or extraction raised
            if tmp_path is not None and os.path.exists(tmp_path):
                os.unlink(tmp_path)

        if not text:
            # Tell the user which upload was dropped instead of skipping it silently
            st.warning(f"Skipping {uploaded_file.name}: no text could be extracted.")
            continue

        result = {'PDF_Name': uploaded_file.name}

        # Generate summaries for each model
        for model_config in MODEL_CONFIGS.values():
            model_name = model_config['name']
            display_name = model_config['display_name']

            # Get 50-word summary
            result[f'{display_name}_50_Words'] = get_summary_from_groq(text, model_name, max_words=50)

            # Get unlimited summary
            result[f'{display_name}_Unlimited'] = get_summary_from_groq(text, model_name)

        results.append(result)

    return results

def export_to_excel(df):
    """Serialize ``df`` to an in-memory .xlsx workbook.

    The frame is written without its index using the openpyxl engine. The
    returned ``io.BytesIO`` is rewound to the start so it can be read directly.
    """
    buffer = io.BytesIO()
    excel_writer = pd.ExcelWriter(buffer, engine='openpyxl')
    with excel_writer:
        df.to_excel(excel_writer, index=False)
    buffer.seek(0)
    return buffer

def main():
    """Render the Streamlit page: model settings, PDF upload, results, download.

    Generated results are kept in ``st.session_state`` because Streamlit
    reruns the script on every interaction. Without that, clicking the
    download button would reset the "Generate Summaries" button and discard
    the results that were just produced.
    """
    st.title("LLM PDF Summarization Comparison")

    # Model configuration in sidebar
    st.sidebar.title("Model Configuration")
    for model_key, model_config in MODEL_CONFIGS.items():
        model_config['name'] = st.sidebar.text_input(
            f"Enter {model_key} name",
            value=model_config['name']
        )
        model_config['display_name'] = st.sidebar.text_input(
            f"Enter {model_key} display name",
            value=model_config['display_name']
        )

    # File uploader for PDFs
    uploaded_files = st.file_uploader("Upload PDF files", type=['pdf'], accept_multiple_files=True)

    if uploaded_files and st.button("Generate Summaries"):
        with st.spinner("Processing PDFs..."):
            st.session_state["summary_results"] = process_pdfs(uploaded_files)
        if not st.session_state["summary_results"]:
            st.warning("No summaries could be generated from the uploaded files.")

    results = st.session_state.get("summary_results")
    if not results:
        return

    # Display results
    for result in results:
        st.subheader(f"Results for {result['PDF_Name']}")

        # One column per configured model
        columns = st.columns(len(MODEL_CONFIGS))
        for column, model_config in zip(columns, MODEL_CONFIGS.values()):
            with column:
                display_name = model_config['display_name']
                st.write(f"{display_name} Summaries:")
                # .get: stored results may predate a display-name change in the sidebar
                st.write("50 Words:", result.get(f'{display_name}_50_Words'))
                st.write("Unlimited:", result.get(f'{display_name}_Unlimited'))

    # Create Excel file and provide download button
    df = pd.DataFrame(results)
    excel_file = export_to_excel(df)
    st.download_button(
        label="Download Excel Report",
        data=excel_file,
        file_name="summary_comparison.xlsx",
        mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
    )

if __name__ == "__main__":
    main()

Writing app.py


In [None]:
from pyngrok import ngrok
import time

# Kill any existing Streamlit processes
!kill -9 $(pgrep streamlit) 2>/dev/null

# Start Streamlit
!streamlit run app.py &>/content/logs.txt & #llama3.3 vs llama3.2
# !streamlit run app_gemma.py &>/content/logs.txt & #gemma2-9b vs llama3.2
time.sleep(5)

# Create ngrok tunnel with correct configuration
ngrok_tunnel = ngrok.connect(addr="8501", proto="http", bind_tls=True)
print(f"Streamlit app URL: {ngrok_tunnel.public_url}")

# Keep the tunnel open
try:
    while True:
        time.sleep(1)
except KeyboardInterrupt:
    print("Closing ngrok tunnel...")
    ngrok.kill()

Streamlit app URL: https://3ee3-34-60-104-169.ngrok-free.app
