In [2]:
# All imports should be here
import os
import pickle

from datasets import Dataset
from datasets import load_dataset
from huggingface_hub.hf_api import HfFolder

In [3]:
# Privacy
# Prefer the HF_TOKEN environment variable so the secret never has to be
# written into (and committed with) the notebook. The literal below is only
# a placeholder fallback for when the variable is unset or empty.
huggingface_token = os.environ.get('HF_TOKEN') or '<TOKEN>'

In [4]:
# Config
# Pickle file that holds all the already processed sentences
file_name_summary_all_sentances = "all_sentances_processed.pkl"

# Hub name under which the new dataset is pushed
new_dataset_name = 'VitaliiVrublevskyi/mrpc_llama_2_v2'

In [5]:
# Save the Hugging Face token locally (log in), then load the MRPC
# configuration of the GLUE benchmark.
HfFolder.save_token(huggingface_token)

dataset = load_dataset(path="glue", name="mrpc")

In [6]:
def load_already_processed_sentances(file_name=None):
    """Load the cache of already processed sentences from a pickle file.

    Args:
        file_name: Path of the pickle file to read. When omitted, the
            notebook-level ``file_name_summary_all_sentances`` is used.

    Returns:
        The unpickled object, or an empty dict when the file does not exist
        or its content cannot be unpickled. Any other error (permissions,
        interrupts, programming errors) is propagated instead of being
        silently swallowed.
    """
    if file_name is None:
        file_name = file_name_summary_all_sentances

    try:
        with open(file_name, 'rb') as file:
            # SECURITY: unpickling can execute arbitrary code. Only load
            # files that were produced by this project.
            return pickle.load(file)
    except FileNotFoundError:
        # Nothing has been processed yet.
        return {}
    except (EOFError, pickle.UnpicklingError) as error:
        # Keep the best-effort behaviour for a truncated/corrupt cache, but
        # make it visible instead of failing silently.
        print(f'Could not read {file_name!r} ({error!r}); using an empty cache')
        return {}

# Load the cache once; get_clean_summaries() looks sentences up in it.
processed_sentances = load_already_processed_sentances()

In [7]:
def get_all_sentances_from_dataset(dataset):
    """Collect every distinct sentence of the 'train' split.

    Both the 'sentence1' and the 'sentence2' field of each element are
    gathered; duplicates are dropped. The result is returned as a list
    whose order is the iteration order of the underlying set.
    """
    unique_sentences = {
        row[column]
        for split_name in ('train',)
        for row in dataset[split_name]
        for column in ('sentence1', 'sentence2')
    }
    return list(unique_sentences)

# Distinct sentences found in the loaded dataset's train split
all_sentances = get_all_sentances_from_dataset(dataset)

In [8]:
def get_clean_summaries(sentances=None, processed=None):
    """Extract a one-line summary from every stored raw output.

    Args:
        sentances: Iterable of sentences to handle. Defaults to the
            notebook-level ``all_sentances``.
        processed: Mapping from a sentence to the list of raw texts stored
            for it. Defaults to the notebook-level ``processed_sentances``.

    Returns:
        Dict mapping each sentence to the list of cleaned summaries, one per
        raw text, in the same order.

    Raises:
        KeyError: If a sentence has no entry in ``processed``.
    """
    if sentances is None:
        sentances = all_sentances
    if processed is None:
        processed = processed_sentances

    answer_marker = 'Answer:'
    prompt_leftover = ' a concise summary of this text in 20 words:'

    clean_summaries = {}
    for s in sentances:
        summaries = []
        for p in processed[s]:
            # Skip the marker plus the single character that follows it.
            # NOTE(review): when the marker is absent find() returns -1, so
            # the slice starts at offset len('Answer:'); that fallback is
            # kept unchanged here - confirm it is intended.
            start = p.find(answer_marker) + len(answer_marker) + 1
            # It may be too big, so keep only the first line. partition()
            # returns the whole text when there is no newline; the previous
            # find()/slice combination cut off the last character then.
            potential_summary = p[start:].partition('\n')[0]
            summaries.append(potential_summary.replace(prompt_leftover, ''))
        clean_summaries[s] = summaries

    return clean_summaries

# Mapping: sentence -> list of cleaned summaries
clean_summaries = get_clean_summaries()

In [9]:
def enriched_train_set_generator(train_split=None, summaries=None):
    """Yield the enriched training examples.

    For every pair of the training split, each variant of ``sentence1`` (its
    summaries followed by the original sentence) is combined with each
    variant of ``sentence2``. Every combination keeps the label of the
    source pair and is tagged with the category ``'llama2'``.

    Args:
        train_split: Iterable of dicts with the keys 'sentence1',
            'sentence2' and 'label'. Defaults to ``dataset['train']``.
        summaries: Mapping from a sentence to its list of summaries.
            Defaults to the notebook-level ``clean_summaries``.

    Yields:
        Dicts with the keys 'label', 'sentence1', 'sentence2', 'category'.
    """
    if train_split is None:
        train_split = dataset['train']
    if summaries is None:
        summaries = clean_summaries

    for s in train_split:
        # Build fresh lists instead of appending to the ones stored in
        # `summaries`, so the shared mapping is not modified by a run.
        # dict.fromkeys keeps only unique values while preserving order,
        # which makes the emitted order reproducible (a set does not).
        first_variants = list(dict.fromkeys([*summaries[s['sentence1']], s['sentence1']]))
        second_variants = list(dict.fromkeys([*summaries[s['sentence2']], s['sentence2']]))

        for s1 in first_variants:
            for s2 in second_variants:
                yield {
                    'label': s['label'],
                    'sentence1': s1,
                    'sentence2': s2,
                    'category': 'llama2',
                }

# Build the train Dataset from the examples yielded by the generator
enriched_dataset_train = Dataset.from_generator(enriched_train_set_generator)

Generating train split: 0 examples [00:00, ? examples/s]

In [10]:
def enriched_validation_set_generator(split=None):
    """Yield the validation examples unchanged, tagged as 'original'.

    Only the 'label', 'sentence1' and 'sentence2' fields are copied; a
    'category' field with the value 'original' is added.

    Args:
        split: Iterable of dicts with the keys 'label', 'sentence1' and
            'sentence2'. Defaults to ``dataset['validation']``.

    Yields:
        Dicts with the keys 'label', 'sentence1', 'sentence2', 'category'.
    """
    if split is None:
        split = dataset['validation']

    for s in split:
        yield {
            'label': s['label'],
            'sentence1': s['sentence1'],
            'sentence2': s['sentence2'],
            'category': 'original',
        }
        
# Build the validation Dataset from the examples yielded by the generator
enriched_dataset_validation = Dataset.from_generator(enriched_validation_set_generator)

In [11]:
def enriched_test_set_generator(split=None):
    """Yield the test examples unchanged, tagged as 'original'.

    Only the 'label', 'sentence1' and 'sentence2' fields are copied; a
    'category' field with the value 'original' is added.

    Args:
        split: Iterable of dicts with the keys 'label', 'sentence1' and
            'sentence2'. Defaults to ``dataset['test']``.

    Yields:
        Dicts with the keys 'label', 'sentence1', 'sentence2', 'category'.
    """
    if split is None:
        split = dataset['test']

    for s in split:
        yield {
            'label': s['label'],
            'sentence1': s['sentence1'],
            'sentence2': s['sentence2'],
            'category': 'original',
        }
        
# Build the test Dataset from the examples yielded by the generator
enriched_dataset_test = Dataset.from_generator(enriched_test_set_generator)

In [12]:
# Upload each split to the Hub under the same dataset name
for split_name, split_dataset in (
    ('train', enriched_dataset_train),
    ('validation', enriched_dataset_validation),
    ('test', enriched_dataset_test),
):
    split_dataset.push_to_hub(new_dataset_name, split=split_name)


Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/28 [00:00<?, ?ba/s]

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Downloading metadata:   0%|          | 0.00/557 [00:00<?, ?B/s]

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

Downloading metadata:   0%|          | 0.00/672 [00:00<?, ?B/s]

In [13]:
# Load the dataset back by its Hub name to check what was uploaded
uploaded_dataset = load_dataset(new_dataset_name)

Downloading readme:   0%|          | 0.00/770 [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/1.16M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/73.5k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/299k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/27739 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/408 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1725 [00:00<?, ? examples/s]

In [14]:
# Show the splits, features and row counts of the reloaded dataset
uploaded_dataset

DatasetDict({
    train: Dataset({
        features: ['label', 'sentence1', 'sentence2', 'category'],
        num_rows: 27739
    })
    validation: Dataset({
        features: ['label', 'sentence1', 'sentence2', 'category'],
        num_rows: 408
    })
    test: Dataset({
        features: ['label', 'sentence1', 'sentence2', 'category'],
        num_rows: 1725
    })
})