In [1]:
# !pip install uptrain rouge datasets umap-learn matplotlib py7zr torch

In [2]:
from datasets import load_dataset
import json
import numpy as np
import os
import uptrain
from rouge import Rouge 
import random
from matplotlib import pyplot as plt
import pandas as pd
import subprocess
import zipfile
from transformers import AutoTokenizer, AutoModel, AutoModelForSeq2SeqLM
import torch
import torch.nn.functional as F

PyTorch is available but CUDA is not. Defaulting to SciPy for SVD


#### Following functions are for testing purposes only

In [3]:
# Mean Pooling - Take attention mask into account for correct averaging
def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[
        0
    ]  # First element of model_output contains all token embeddings
    input_mask_expanded = (
        attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    )
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(
        input_mask_expanded.sum(1), min=1e-9
    )


# Function to get bert embeddings from sentences list
def convert_sentence_to_emb(sentences, sentence_emb_model, device):
    # Load model from HuggingFace Hub
    tokenizer = AutoTokenizer.from_pretrained(sentence_emb_model)
    model = AutoModel.from_pretrained(sentence_emb_model).to(device)

    # Tokenize sentences
    encoded_input = tokenizer(
        sentences, padding=True, truncation=True, return_tensors="pt"
    ).to(device)

    # Compute token embeddings
    with torch.no_grad():
        model_output = model(**encoded_input)

    # Perform pooling
    embs = mean_pooling(model_output, encoded_input["attention_mask"])

    # Normalize embeddings
    return np.array(F.normalize(embs, p=2, dim=1).cpu())


def get_summary_and_embs(tokenizer, model, text, device, max_new_tokens=30):
    prefix = "summarize: "
    this_batch = [prefix + doc for doc in text if doc is not None]
    # Text encoder
    input_embs = tokenizer(
        this_batch, truncation=True, padding=True, return_tensors="pt"
    ).input_ids.to(device)

    # Getting output values
    output_embs = model.generate(input_embs, max_new_tokens=max_new_tokens)

    # Text decoder
    summaries = tokenizer.batch_decode(output_embs, skip_special_tokens=True)
    bert_embs = convert_sentence_to_emb(summaries, sentence_emb_model, device)
    return summaries, output_embs.cpu().numpy(), bert_embs

In [4]:
samsum_dataset = load_dataset("samsum")
golden_dataset = samsum_dataset["test"][0:100]



  0%|          | 0/3 [00:00<?, ?it/s]

In [5]:
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [6]:
summary_model = "yasminesarraj/flan-t5-small-samsum"

In [7]:
sentence_emb_model = "sentence-transformers/paraphrase-MiniLM-L6-v2"
# sentence_emb_model = "sentence-transformers/all-MiniLM-L6-v2"

In [8]:
# %%time
# summaries, output_embs, bert_embs = get_summary_and_embs(tokenizer, model, golden_dataset['dialogue'], device, max_new_tokens=30)

In [9]:
cfg = {
    "checks": [{
        'type': uptrain.Monitor.LLM_EVALUATION,
        "measurable_args": {
            "type": uptrain.MeasurableType.INPUT_FEATURE,
            "feature_name": "dialogues"
        },
        "embedding_model": sentence_emb_model,
        "llm_model_args": {"model_name": summary_model},
        "distance_types": ["cosine_distance", "l2_distance"],
    }],
    "logging_args": {
        "st_logging": True,
        "use_new_handler": True,
        "run_background_streamlit": False
    }
}

In [10]:
framework = uptrain.Framework(cfg_dict=cfg)
ids = framework.log(inputs={"dialogues": golden_dataset['dialogue']}, outputs=None)
framework.log(identifiers=ids, gts=golden_dataset["summary"])

Deleting contents of the folder:  uptrain_smart_data
Deleting contents of the log folder at: uptrain_logs
To start the streamlit dashboard, run the following command:  streamlit run /Users/vipul/Downloads/uptrain_repos/uptrain/uptrain/core/classes/logging/new_st_run.py  -- uptrain_logs


[0,
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 31,
 32,
 33,
 34,
 35,
 36,
 37,
 38,
 39,
 40,
 41,
 42,
 43,
 44,
 45,
 46,
 47,
 48,
 49,
 50,
 51,
 52,
 53,
 54,
 55,
 56,
 57,
 58,
 59,
 60,
 61,
 62,
 63,
 64,
 65,
 66,
 67,
 68,
 69,
 70,
 71,
 72,
 73,
 74,
 75,
 76,
 77,
 78,
 79,
 80,
 81,
 82,
 83,
 84,
 85,
 86,
 87,
 88,
 89,
 90,
 91,
 92,
 93,
 94,
 95,
 96,
 97,
 98,
 99]