In [1]:
!pip -qq install langchain_openai
!pip -qq install PyPDF2
!pip -qq install langchain_community
!pip -qq install faiss-gpu-cu11
!pip -qq install streamlit pyngrok
!pip -qq install streamlit_modal
!pip -qq install nltk
!pip -qq install codebleu
!pip -qq install rouge
!pip -qq install mistralai
!pip -qq install -U langchain-mistralai
!pip -qq install plotly
!pip -qq install langchain
!pip -qq install langchain_huggingface

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m74.4/74.4 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m28.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.2/45.2 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.9/50.9 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.0/48.0 MB[0m [31m15.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.0/18.0 MB[0m [31m49.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [18]:
with open(".env", "w") as f:
    from google.colab import userdata
    GITHUB_TOKEN = userdata.get('GITHUB_TOKEN')
    f.write(f"GITHUB_TOKEN={GITHUB_TOKEN}\n")

    huggingface_api_key = userdata.get('HF_TOKEN')
    f.write(f"HF_TOKEN={huggingface_api_key}\n")

    gemma_key = userdata.get('GEMMA')
    f.write(f'GEMMA={gemma_key}\n')

    ngrok_key = userdata.get('NGROK_KEY')
    f.write(f'ngrok_token={ngrok_key}\n')

In [19]:
%%writefile analyzer.py
from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction
from rouge import Rouge
from codebleu import calc_codebleu

class EvaluationMetrics:
    def __init__(self):
        self.rouge = Rouge()
        self.smoothing_function = SmoothingFunction().method1

    def calculate_bleu_score(self, reference, hypothesis):
        reference = [reference.split()]
        hypothesis = hypothesis.split()
        score = corpus_bleu([reference], [hypothesis], smoothing_function=self.smoothing_function)
        return score

    def calculate_rouge_scores(self, reference, hypothesis):
        scores = self.rouge.get_scores(hypothesis, reference, avg=True)
        return scores

Overwriting analyzer.py


In [28]:
%%writefile summarizer.py
from PyPDF2 import PdfReader
from langchain_openai import ChatOpenAI
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser
import os
from langchain_core.runnables import RunnableSequence
from mistralai.client import MistralClient
from langchain_mistralai import ChatMistralAI
from dotenv import load_dotenv
import os


load_dotenv()
api_key = os.getenv("GITHUB_TOKEN")
hf_key = os.getenv("HF_TOKEN")
gemma_key = os.getenv("GEMMA")

class Summarizer:
    def __init__(self, document_file, summary_file):
        self.document_file = document_file
        self.summary_file = summary_file
        self.document_text=""
        self.summary_text_golden =""
        self.summarised_text=""

    def get_pdf_text(self, pdf):
        text = ""
        pdf_reader = PdfReader(pdf)
        for page in pdf_reader.pages:
            text+= page.extract_text()
        return text

    def process_documents(self):
        self.document_text = self.get_pdf_text(self.document_file) if self.document_file else ""
        self.summary_text = self.get_pdf_text(self.summary_file) if self.summary_file else ""
        return self.document_text, self.summary_text

    def summarise_documents(self, model='gpt-4', task="Summarize"):
        # Modify the prompt based on the task
        if task == "Summarize":
            template = '''Generate a Resume Summary Script (6–8 sentences) for an engaging video presentation aimed at a hiring manager.
            - Tone: Friendly, Confident, Assertive.
            - Base the script strictly on the given resume content — do not hallucinate or add unverifiable details.:\n {resume}'''

        prompt = PromptTemplate(input_variables=['resume'],template=template)
        output_parser = StrOutputParser()
        model = self._get_model_interface(model)
        chain = RunnableSequence(prompt, model, output_parser)

        summarised_text = chain.invoke({'resume': self.document_text})

        return summarised_text

    def _get_model_interface(self, model_name):
        if model_name == 'GPT-4o-mini':
            return ChatOpenAI(
                    base_url = "https://models.github.ai/inference",
                    api_key= api_key,
                    model="openai/gpt-4o-mini",
                    temperature=0.1
                    )
        if model_name == 'GPT-4.1-nano':
            return ChatOpenAI(
                    base_url = "https://models.github.ai/inference",
                    api_key=api_key,
                    model="openai/gpt-4.1-nano",
                    temperature=0.1
                    )
        if model_name == 'Mistral Small 3.1':
            return ChatMistralAI(
                    base_url = "https://models.github.ai/inference",
                    api_key= api_key,
                    model="mistral-small-2503",
                    temperature=0.1
                    )

Overwriting summarizer.py


In [22]:
%%writefile visualization.py
import plotly.graph_objs as go
import streamlit as st
import numpy as np
import pandas as pd
import os

class ScoreVisualizer:
    def __init__(self, all_scores, task):
        self.all_scores = all_scores
        self.task = task
        self.color_map = {
            'GPT-4o-mini': 'red',
            'GPT-4.1-nano': 'purple',
            'Mistral Small 3.1': 'blue',
        }

    def plot_scores(self):
        data = []
        BAR_WIDTH = 0.9 / (len(self.all_scores) + 1)

        for model, scores in self.all_scores.items():
            model_color = self.color_map.get(model, 'gray')
            showlegend = True

            if self.task == "Summarize":
                bleu_trace = go.Bar(
                    name=model,
                    x=['BLEU'],
                    y=[scores['bleu_score']],
                    marker_color=model_color,
                    width=BAR_WIDTH,
                    legendgroup=model,
                    showlegend=showlegend
                )
                data.append(bleu_trace)

                showlegend = False

                for rouge_key in ['rouge-1', 'rouge-2', 'rouge-l']:
                    rouge_trace = go.Bar(
                        name=model,
                        x=[rouge_key.upper()],
                        y=[scores['rouge_scores'][rouge_key]['f']],
                        marker_color=model_color,
                        width=BAR_WIDTH,
                        legendgroup=model,
                        showlegend=showlegend
                    )
                    data.append(rouge_trace)
        layout = go.Layout(
            title= 'Summary Evaluation Scores',
            barmode='group',
            yaxis=dict(title='Score'),
            xaxis=dict(title='Metric'),
            legend=dict(groupclick="toggleitem")
        )

        fig = go.Figure(data=data, layout=layout)
        st.plotly_chart(fig)

    def render_score_table(self):
        table_data = {}
        for model, metrics in self.all_scores.items():
            table_data[model] = {'Model': model}
            if self.task == "Summarize":
                for metric, score in metrics.items():
                    if metric == 'rouge_scores':
                        for rouge_metric, rouge_scores in score.items():
                            table_data[model][f"{rouge_metric.upper()} F1"] = round(rouge_scores['f'], 3)
                    else:
                        metric_name = "BLEU"
                        table_data[model][metric_name] = round(score, 3)

        score_df = pd.DataFrame.from_dict(table_data, orient='index').reset_index(drop=True)

        def highlight_max(s, props=''):
            return np.where(s == np.nanmax(s.to_numpy()), props, '')

        styled_df = score_df.style.apply(highlight_max, props='background-color:yellow;', axis=0,
                                         subset=pd.IndexSlice[:, :] if self.task == "Summarize" else pd.IndexSlice[:, 'Codebleu':'Dataflow Match Score'])
        st.dataframe(styled_df, use_container_width=True, hide_index=True)

Overwriting visualization.py


In [23]:
%%writefile app.py
import streamlit as st
from summarizer import Summarizer
import os
from analyzer import EvaluationMetrics
from visualization import ScoreVisualizer

class SummarizationApp:
    def __init__(self):
        self.model_options = ['GPT-4.1-nano', 'GPT-4o-mini', 'Mistral Small 3.1']

    def render_sidebar(self):
        st.sidebar.header("Upload Files")
        document_upload = st.sidebar.file_uploader("Upload Document", type=['pdf'])
        summary_upload = st.sidebar.file_uploader("Upload 'Golden Summary'", type=['pdf'])

        return document_upload, summary_upload

    def select_models(self):
        selected_models = st.sidebar.multiselect("Choose models to benchmark:", self.model_options)
        return selected_models

    def execution(self):
        run_button = st.sidebar.button("Run Benchmark")
        return run_button

    def select_task(self):
        task = st.sidebar.radio("Choose Task:", ["Summarize"])
        return task

    def run(self):
        st.title("LLM Benchmarking Tool")
        st.subheader("Summarization")

        document_upload, summary_upload= self.render_sidebar()
        selected_models = self.select_models()
        task = self.select_task()
        run_button = self.execution()

        if run_button:
            if document_upload is not None and summary_upload is not None:
                st.success("Files uploaded successfully!")
                st.write("Selected models for benchmarking:", ', '.join(selected_models))
                summarizer = Summarizer(document_upload, summary_upload)
                document_text, summary_text_golden = summarizer.process_documents()

                if task == "Summarize":
                    st.subheader("'Golden Source' Target Summary Text:")
                with st.expander("Click for 'Golden Source' text"):
                    st.write(summary_text_golden)
                all_scores={}
                for model in selected_models:
                    summarizer = Summarizer(document_upload, summary_upload)
                    document_text, summary_text_golden = summarizer.process_documents()

                    generated_summary = summarizer.summarise_documents(model, task)
                    st.subheader(model)
                    with st.expander(f"Click to see {model} generated summary"):
                        st.write(generated_summary)

                    evaluation_metrics = EvaluationMetrics()
                    if task == "Summarize":
                        bleu_score = evaluation_metrics.calculate_bleu_score(summary_text_golden, generated_summary)
                        rouge_scores = evaluation_metrics.calculate_rouge_scores(summary_text_golden, generated_summary)
                        all_scores[model] = {
                            "bleu_score": bleu_score,
                            "rouge_scores": rouge_scores
                        }
                visualizer = ScoreVisualizer(all_scores, task)
                visualizer.plot_scores()
                visualizer.render_score_table()

if __name__ == "__main__":
    app = SummarizationApp()
    app.run()

Overwriting app.py


In [25]:
from pyngrok import ngrok
from dotenv import load_dotenv
load_dotenv()
import os
ngrok_token = os.getenv("ngrok_token")
!ngrok config add-authtoken {ngrok_token}



Authtoken saved to configuration file: /root/.config/ngrok/ngrok.yml


In [26]:
public_url = ngrok.connect(addr=8501)
print(f"Streamlit URL: {public_url}")

Streamlit URL: NgrokTunnel: "https://708d48077f66.ngrok-free.app" -> "http://localhost:8501"


In [27]:
!streamlit run app.py&




Collecting usage statistics. To deactivate, set browser.gatherUsageStats to false.
[0m
[0m
[34m[1m  You can now view your Streamlit app in your browser.[0m
[0m
[34m  Local URL: [0m[1mhttp://localhost:8501[0m
[34m  Network URL: [0m[1mhttp://172.28.0.12:8501[0m
[34m  External URL: [0m[1mhttp://34.148.230.72:8501[0m
[0m
2025-08-20 05:38:30.764109: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1755668310.797473    6351 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1755668310.807416    6351 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1755668310.836879    6351 computation_placer.cc:177] computation placer already regist

In [None]:
!kill streamlit

/bin/bash: line 1: kill: streamlit: arguments must be process or job IDs
