In [1]:
from datasets import load_dataset
import json
import numpy as np
import os
import uptrain
from rouge import Rouge 
import random
from matplotlib import pyplot as plt
import pandas as pd
import subprocess
import zipfile

PyTorch is available but CUDA is not. Defaulting to SciPy for SVD


In [2]:
# Load both conversation-summarization corpora (SAMSum first, then DialogSum),
# keeping the same module-level names the rest of the notebook relies on.
samsum_dataset, dialogsum_dataset = (
    load_dataset(hub_name) for hub_name in ("samsum", "knkarthick/dialogsum")
)



  0%|          | 0/3 [00:00<?, ?it/s]



  0%|          | 0/3 [00:00<?, ?it/s]

In [3]:
remote_url = "https://oodles-dev-training-data.s3.amazonaws.com/conversation_summarization_data.zip"
data_dir = 'data'
if not os.path.exists(data_dir):
    archive_name = 'conversation_summarization_data.zip'
    # Pass an argument list (no shell) and pin the output name with -O so a
    # stale archive in the working directory cannot be extracted by mistake.
    # NOTE: despite its name, this variable holds wget's exit status (0 == success).
    file_downloaded_ok = subprocess.call(["wget", "-O", archive_name, remote_url],
                                         stdout=subprocess.DEVNULL, stderr=subprocess.STDOUT)
    if file_downloaded_ok != 0:
        # wget -O may leave an empty/partial file behind on failure; clean it up
        # and fail loudly instead of reporting a successful download.
        if os.path.exists(archive_name):
            os.remove(archive_name)
        raise RuntimeError(
            f"Downloading {remote_url} failed (wget exit status {file_downloaded_ok})."
        )
    print("Data downloaded.")
    with zipfile.ZipFile(archive_name, 'r') as zip_ref:
        zip_ref.extractall("./")
    print("Prepared Model Outputs.")
    os.remove(archive_name)
else:
    print("Skipping data download as it already exists.")

Skipping data download as it already exists.


In [4]:
"""
Using training data (i.e., SAMSum train), we generate and save a reference 
dataset to be used by the UpTrain framework. This dataset is used to detect 
drift, apply dimensionality reductions and compare visualizations.
"""
def generate_reference_dataset(summary, output_summaries_file, bert_embs_file, file_name, dataset_label):
    """
    Build the reference dataset and save it as JSON at ``file_name``.

    Each record combines a reference summary, the model's output summary and
    the embedding of that output, keyed by its position in the embeddings file.
    If ``file_name`` already exists nothing is read or written.

    Args:
        summary: Indexable collection of reference summaries; entry ``idx`` is
            paired with embedding ``idx``.
        output_summaries_file: Path to a JSON file with the model's output
            summaries (indexed like ``summary``).
        bert_embs_file: Path to a JSON file with one embedding per output
            summary; its length determines the number of records.
        file_name: Destination path of the generated reference dataset.
        dataset_label: Value stored under ``'dataset_label'`` in every record.
            Records are only produced when this is a ``str``; for any other
            type an empty dataset is written.

    Returns:
        None. Progress is reported via ``print``.
    """
    if os.path.exists(file_name):
        print("Reference dataset exists. Skipping generating again.")
        return

    # Load model output summaries
    with open(output_summaries_file) as f:
        output_summaries = json.load(f)

    # Load respective BERT embeddings of output summaries
    with open(bert_embs_file) as f:
        bert_embs = list(json.load(f))

    data = []
    # The label check does not depend on the record, so it is done once here
    # rather than on every loop iteration.
    if isinstance(dataset_label, str):
        data = [
            {
                'id': idx,
                'dataset_label': dataset_label,
                'summary': summary[idx],
                'bert_embs': list(emb),
                'output': output_summaries[idx],
            }
            for idx, emb in enumerate(bert_embs)
        ]

    with open(file_name, "w") as f:
        json.dump(data, f, cls=uptrain.UpTrainEncoder)
    print("Generated reference dataset.")

In [15]:
"""
Run the model in production. For now, only the first
batch (`batch_size` data points) of SAMSum test is logged;
the full run over SAMSum test and DialogSum train is disabled.
"""
def run_production(framework, batch_size=20):
    """
    Log one batch of pre-computed SAMSum test predictions to ``framework``.

    Only the first ``batch_size`` data points of the SAMSum test split are
    logged, in a single ``framework.log`` call. The remaining SAMSum test data
    and the DialogSum data are not processed by this function.

    Reads the module-level ``samsum_dataset`` and ``data_dir``; the model output
    summaries and their embeddings are loaded from JSON files in ``data_dir``.

    Args:
        framework: Object exposing ``log(inputs=..., outputs=...)``.
        batch_size: Number of data points to log. If fewer embeddings than
            this are available, nothing is logged.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    d_type = 'test'
    dataset_name = 'samsum'
    dataset = samsum_dataset[d_type][0:batch_size]

    with open(os.path.join(data_dir, f"out_{d_type}_{dataset_name}_summaries.json")) as f:
        all_summaries = json.load(f)

    # Note: We use sentence BERT embeddings generated from here:
    # https://huggingface.co/sentence-transformers
    # But any other embeddings, such as the ones generated by the
    # encoder can be used as well.
    with open(os.path.join(data_dir, f"out_{d_type}_{dataset_name}_bert_embs.json")) as f:
        all_bert_embs = json.load(f)

    # Track what was actually logged so the final message is well-defined even
    # when there are not enough embeddings to fill one batch.
    num_logged = 0
    if len(all_bert_embs) >= batch_size:
        idxs = slice(0, batch_size)
        inputs = {
            'id': list(range(batch_size)),
            'bert_embs': np.array(all_bert_embs[idxs]),
            'dataset_label': [dataset_name]*batch_size,
            'dialog': dataset['dialogue'][idxs],
            'summary': dataset['summary'][idxs],
        }
        framework.log(inputs=inputs, outputs=all_summaries[idxs])
        num_logged = batch_size
    print(f"{num_logged} predictions logged for {dataset_name} {d_type}")

In [16]:
# Get the locations of training-related data and outputs
output_summaries_file = os.path.join(data_dir, 'out_train_samsum_summaries.json')
bert_embs_file = os.path.join(data_dir, 'out_train_samsum_bert_embs.json')
reference_dataset_file = os.path.join(data_dir, 'reference_dataset.json')

# Generate and save reference dataset
generate_reference_dataset(samsum_dataset['train']['summary'], output_summaries_file, 
                           bert_embs_file, reference_dataset_file, 'reference')

Reference dataset exists. Skipping generating again.


In [17]:
# UMAP visualization check configured on the 'bert_embs' input feature.
umap_check = {
    "type": uptrain.Visual.UMAP,
    "measurable_args": {
        "type": uptrain.MeasurableType.INPUT_FEATURE,
        "feature_name": "bert_embs",
    },
    "label_args": [
        {
            "type": uptrain.MeasurableType.INPUT_FEATURE,
            "feature_name": "dataset_label",
        },
        {
            "type": uptrain.MeasurableType.GRAMMAR_SCORE,
            "feature_name": "summary",
        },
    ],
    "hover_args": [
        {
            "type": uptrain.MeasurableType.INPUT_FEATURE,
            "feature_name": "id",
        },
        {
            "type": uptrain.MeasurableType.PREDICTION,
        },
        {
            "type": uptrain.MeasurableType.INPUT_FEATURE,
            "feature_name": "summary",
        },
    ],
    "update_freq": 10,
    # Currently disabled: seeding the check with the saved reference dataset.
    # "initial_dataset": reference_dataset_file,
    "do_clustering": False,
}

In [18]:
# Framework configuration: the single UMAP check, with "st_logging" switched on.
config = dict(
    checks=[umap_check],
    logging_args=dict(st_logging=True),
)

# Instantiate the UpTrain framework from the configuration above.
framework = uptrain.Framework(cfg_dict=config)

Deleting the folder:  uptrain_smart_data
Deleting the folder:  uptrain_logs


In [19]:
# Log the first batch of SAMSum test predictions using the default batch size.
run_production(framework=framework)

20 predictions logged for samsum test
