# Generate HuggingFace Dataset

## Imports

In [92]:
import os

import pandas as pd
import numpy as np
from datasets import Dataset, DatasetDict, load_from_disk

from sentence_transformers import SentenceTransformer

## Load dataset

### Loading the prompts training CSV

In [15]:
# Read the prompts table from the relative ./data directory. The bare name on
# the last line makes the frame the cell's displayed output.
prompts_train = pd.read_csv('./data/prompts_train.csv')
prompts_train

Unnamed: 0,prompt_id,prompt_question,prompt_title,prompt_text
0,39c16e,Summarize at least 3 elements of an ideal trag...,On Tragedy,Chapter 13 \r\nAs the sequel to what has alrea...
1,3b9047,"In complete sentences, summarize the structure...",Egyptian Social Structure,Egyptian society was structured like a pyramid...
2,814d6b,Summarize how the Third Wave developed over su...,The Third Wave,Background \r\nThe Third Wave experiment took ...
3,ebad26,Summarize the various ways the factory would u...,Excerpt from The Jungle,"With one member trimming beef in a cannery, an..."


### Loading the summaries training CSV

In [8]:
# Read the student summaries table from the relative ./data directory and
# preview only the first rows rather than the whole frame.
summaries_train = pd.read_csv('./data/summaries_train.csv')
summaries_train.head()

Unnamed: 0,student_id,prompt_id,text,content,wording
0,000e8c3c7ddb,814d6b,The third wave was an experimentto see how peo...,0.205683,0.380538
1,0020ae56ffbf,ebad26,They would rub it up with soda to make the sme...,-0.548304,0.506755
2,004e978e639e,3b9047,"In Egypt, there were many occupations and soci...",3.128928,4.231226
3,005ab0199905,3b9047,The highest class was Pharaohs these people we...,-0.210614,-0.471415
4,0070c9e7af47,814d6b,The Third Wave developed rapidly because the ...,3.272894,3.219757


## Generate HF Dataset

### Load sentence transformers model

In [22]:
# Notebook-level encoder: preprocess_prompts and preprocess_summaries read this
# global `model` and call its `encode` method, so this cell must run first.
model = SentenceTransformer('all-mpnet-base-v2')

### Preprocess prompts dataset

In [58]:
def preprocess_prompts(prompts_df: pd.DataFrame, encoder=None,
                       output_dir: str = "./data/preprocessed_data") -> pd.DataFrame:
    """Embed each prompt's question and text, then cache the result as a pickle.

    Args:
        prompts_df: Frame with `prompt_id`, `prompt_question` and `prompt_text`
            columns. Any other column (e.g. `prompt_title`) is ignored.
        encoder: Object exposing `encode(list_of_sentences)` and returning one
            embedding per sentence, in order. When omitted, the notebook-level
            `model` is used.
        output_dir: Directory that receives `prompts_train.pickle`. It is
            created when missing.

    Returns:
        A new frame with one row per input prompt and the columns `prompt_id`,
        `prompt_question` (embedding of the question) and `prompt_text`
        (embedding of the text).
    """
    if encoder is None:
        # Fall back to the encoder loaded earlier in the notebook.
        encoder = model

    prompt_ids = []
    question_embeddings = []
    text_embeddings = []

    for prompt_id, prompt_question, prompt_text in zip(
        prompts_df['prompt_id'], prompts_df['prompt_question'], prompts_df['prompt_text']
    ):
        # Encode the question and the text together as a two-sentence batch.
        embeddings = encoder.encode([prompt_question, prompt_text])

        prompt_ids.append(prompt_id)
        question_embeddings.append(embeddings[0])
        text_embeddings.append(embeddings[1])

    # Build the frame once; concatenating inside the loop re-copies every
    # accumulated row on each iteration.
    new_data = pd.DataFrame({
        'prompt_id': prompt_ids,
        'prompt_question': question_embeddings,
        'prompt_text': text_embeddings,
    })

    os.makedirs(output_dir, exist_ok=True)
    new_data.to_pickle(f"{output_dir}/prompts_train.pickle")
    return new_data

In [59]:
# Embed the prompts. Besides returning the frame, preprocess_prompts also
# writes it to a pickle under ./data/preprocessed_data.
preprocessed_prompts_train = preprocess_prompts(prompts_train)
preprocessed_prompts_train

Unnamed: 0,prompt_id,prompt_question,prompt_text
0,39c16e,"[0.0091599105, -0.029818784, 0.0031730002, 0.0...","[-0.010795923, 0.025710443, -0.015630066, 0.01..."
1,3b9047,"[-0.012024703, 0.016922273, 0.015488309, 0.002...","[0.058067385, 0.042484563, 0.012715496, 0.0067..."
2,814d6b,"[0.054656938, -0.06503344, 0.011860348, -0.053...","[0.04722372, 0.06517054, 0.00032050064, -0.025..."
3,ebad26,"[0.0067965747, 0.07661588, -0.009160591, 0.033...","[-0.013152148, 0.059951786, -0.0068048476, -0...."


### Preprocess summaries dataset

In [63]:
def normalize_value(x, min=-2, max=4):
    """Linearly rescale `x` so that `min` maps to -1 and `max` maps to 1.

    No clipping is applied: an `x` outside [min, max] yields a result outside
    [-1, 1]. The parameter names shadow the built-in `min`/`max` within this
    function body only.
    """
    span = max - min
    fraction = (x - min) / span
    return 2 * fraction - 1

In [84]:
def preprocess_summaries(summaries_df: pd.DataFrame, prompts_df: pd.DataFrame, encoder=None,
                         output_dir: str = "./data/preprocessed_data") -> pd.DataFrame:
    """Embed every summary, attach its prompt's embeddings, and cache a pickle.

    Args:
        summaries_df: Frame with `student_id`, `prompt_id`, `text`, `content`
            and `wording` columns.
        prompts_df: Frame with `prompt_id`, `prompt_question` and `prompt_text`
            columns. The `prompt_question` / `prompt_text` values are copied
            unchanged into every matching summary row.
        encoder: Object exposing `encode(list_of_sentences)` and returning one
            embedding per sentence. When omitted, the notebook-level `model`
            is used.
        output_dir: Directory that receives `summaries_train.pickle`. It is
            created when missing.

    Returns:
        A new frame with one row per summary and the columns `student_id`,
        `prompt_id`, `prompt_question`, `prompt_text`, `text` (embedding of the
        summary), `content`, `wording`, `normalized_content` and
        `normalized_wording` (the two scores passed through `normalize_value`).

    Raises:
        ValueError: If a summary's `prompt_id` matches zero or several rows of
            `prompts_df`.
    """
    if encoder is None:
        # Fall back to the encoder loaded earlier in the notebook.
        encoder = model

    # Index the prompts once instead of filtering prompts_df for every summary.
    prompts_by_id = {}
    for known_id, question, passage in zip(
        prompts_df['prompt_id'], prompts_df['prompt_question'], prompts_df['prompt_text']
    ):
        prompts_by_id.setdefault(known_id, []).append((question, passage))

    columns = {
        'student_id': [],
        'prompt_id': [],
        'prompt_question': [],
        'prompt_text': [],
        'text': [],
        'content': [],
        'wording': [],
        'normalized_content': [],
        'normalized_wording': [],
    }

    total = len(summaries_df)
    rows = zip(
        summaries_df['student_id'],
        summaries_df['prompt_id'],
        summaries_df['text'],
        summaries_df['content'],
        summaries_df['wording'],
    )
    # Count positions ourselves: index labels need not be 0..n-1 integers.
    for position, (student_id, prompt_id, text, content, wording) in enumerate(rows, start=1):
        print(f"\r{position}/{total}", end="")

        matches = prompts_by_id.get(prompt_id, [])
        if len(matches) != 1:
            raise ValueError(
                f"expected exactly one prompt with prompt_id={prompt_id!r}, found {len(matches)}"
            )
        prompt_question, prompt_text = matches[0]

        # encode() takes a batch, so wrap the text and unwrap the single result.
        text_embedding = encoder.encode([text])[0]

        columns['student_id'].append(student_id)
        columns['prompt_id'].append(prompt_id)
        columns['prompt_question'].append(prompt_question)
        columns['prompt_text'].append(prompt_text)
        columns['text'].append(text_embedding)
        columns['content'].append(content)
        columns['wording'].append(wording)
        columns['normalized_content'].append(normalize_value(content))
        columns['normalized_wording'].append(normalize_value(wording))

    # Build the frame once; concatenating inside the loop re-copies every
    # accumulated row on each iteration.
    new_data = pd.DataFrame(columns)

    os.makedirs(output_dir, exist_ok=True)
    new_data.to_pickle(f"{output_dir}/summaries_train.pickle")
    return new_data

In [85]:
# Pass the embedded prompts frame (not the raw CSV frame): preprocess_summaries
# copies its prompt_question / prompt_text values into every summary row.
preprocess_summaries_train = preprocess_summaries(summaries_train, preprocessed_prompts_train)
preprocess_summaries_train

7165/7165

Unnamed: 0,student_id,prompt_id,prompt_question,prompt_text,text,content,wording,normalized_content,normalized_wording
0,fffbccfd8a08,ebad26,"[0.0067965747, 0.07661588, -0.009160591, 0.033...","[-0.013152148, 0.059951786, -0.0068048476, -0....","[0.0332336, 0.12647918, 0.0012254969, -0.01061...",1.771596,0.547742,0.257199,-0.150753


In [94]:
# Reload the frame from the pickle path that preprocess_summaries writes, so
# later cells can run without repeating the embedding loop.
# NOTE: pickle files can execute code on load; only read files you produced.
preprocess_summaries_train = pd.read_pickle("./data/preprocessed_data/summaries_train.pickle")

In [95]:
# Display the reloaded frame as the cell output.
preprocess_summaries_train

Unnamed: 0,student_id,prompt_id,prompt_question,prompt_text,text,content,wording,normalized_content,normalized_wording
0,000e8c3c7ddb,814d6b,"[0.054656938, -0.06503344, 0.011860348, -0.053...","[0.04722372, 0.06517054, 0.00032050064, -0.025...","[0.041094355, -0.006460443, 0.008264857, -0.06...",0.205683,0.380538,-0.264772,-0.206487
1,0020ae56ffbf,ebad26,"[0.0067965747, 0.07661588, -0.009160591, 0.033...","[-0.013152148, 0.059951786, -0.0068048476, -0....","[0.033534568, 0.112103865, 0.013419032, 0.0166...",-0.548304,0.506755,-0.516101,-0.164415
2,004e978e639e,3b9047,"[-0.012024703, 0.016922273, 0.015488309, 0.002...","[0.058067385, 0.042484563, 0.012715496, 0.0067...","[0.022079999, 0.030259458, 0.016786981, 0.0001...",3.128928,4.231226,0.709643,1.077075
3,005ab0199905,3b9047,"[-0.012024703, 0.016922273, 0.015488309, 0.002...","[0.058067385, 0.042484563, 0.012715496, 0.0067...","[0.041241955, -0.01394414, 0.015658101, 0.0315...",-0.210614,-0.471415,-0.403538,-0.490472
4,0070c9e7af47,814d6b,"[0.054656938, -0.06503344, 0.011860348, -0.053...","[0.04722372, 0.06517054, 0.00032050064, -0.025...","[0.008337094, 0.04310772, 0.0132706165, -0.052...",3.272894,3.219757,0.757631,0.739919
...,...,...,...,...,...,...,...,...,...
7160,ff7c7e70df07,ebad26,"[0.0067965747, 0.07661588, -0.009160591, 0.033...","[-0.013152148, 0.059951786, -0.0068048476, -0....","[0.035653163, 0.043147907, -0.030623915, -0.00...",0.205683,0.380538,-0.264772,-0.206487
7161,ffc34d056498,3b9047,"[-0.012024703, 0.016922273, 0.015488309, 0.002...","[0.058067385, 0.042484563, 0.012715496, 0.0067...","[-0.035234254, 0.033364758, 0.02028155, 0.0068...",-0.308448,0.048171,-0.436149,-0.317276
7162,ffd1576d2e1b,3b9047,"[-0.012024703, 0.016922273, 0.015488309, 0.002...","[0.058067385, 0.042484563, 0.012715496, 0.0067...","[0.022816774, 0.011155369, -0.0006411369, 0.02...",-1.408180,-0.493603,-0.802727,-0.497868
7163,ffe4a98093b2,39c16e,"[0.0091599105, -0.029818784, 0.0031730002, 0.0...","[-0.010795923, 0.025710443, -0.015630066, 0.01...","[0.012417577, -0.052821647, -0.0033073293, -0....",-0.393310,0.627128,-0.464437,-0.124291


### Serialize and deserialize functions

In [96]:
def serialize_array(arr):
    """Return the raw bytes of `arr` via its `tobytes()` method.

    Only the element data is kept; the caller has to remember the dtype and
    shape in order to rebuild the array later.
    """
    raw_bytes = arr.tobytes()
    return raw_bytes
    
def deserialize_array(binary_string, dtype, shape):
    """Rebuild an array from raw bytes, interpreted as `dtype` and reshaped to `shape`.

    The buffer is wrapped with `np.frombuffer`, so no element data is copied
    here; `dtype` and `shape` must match what was originally serialized.
    """
    flat_view = np.frombuffer(binary_string, dtype=dtype)
    return flat_view.reshape(shape)

In [97]:
def preprocess_data(data):
    """Return a copy of `data` whose embedding columns hold raw bytes.

    The `prompt_question`, `prompt_text` and `text` columns are converted with
    `serialize_array`. Cells that are already `bytes` are left untouched, so
    applying the function to its own output is a no-op instead of an error.

    Args:
        data: Frame containing the three embedding columns.

    Returns:
        A new frame; the frame passed in is not modified.
    """
    def _to_bytes(value):
        # Already-serialized cells pass through, which keeps re-runs harmless.
        if isinstance(value, bytes):
            return value
        return serialize_array(value)

    embedding_columns = ['prompt_question', 'prompt_text', 'text']
    # assign() builds a new frame, leaving the caller's frame as it was.
    return data.assign(**{
        column: data[column].apply(_to_bytes) for column in embedding_columns
    })

In [98]:
# Convert the three embedding columns to raw bytes. The result is bound to the
# same name, so the array-valued frame is no longer reachable after this cell.
preprocess_summaries_train = preprocess_data(preprocess_summaries_train)

### Converting the pandas DataFrame to a Hugging Face dataset

In [99]:
# Wrap the serialized summaries frame in a `datasets.Dataset`.
train_dataset = Dataset.from_pandas(preprocess_summaries_train)

In [100]:
# Map split names to datasets; only a 'train' split exists here.
dataset = dict(train=train_dataset)

In [101]:
# Make sure the target directory exists, then wrap the split mapping in a
# DatasetDict and write it there.
path = './data/hugging_face/'
os.makedirs(path, exist_ok=True)

dataset = DatasetDict(dataset)
dataset.save_to_disk(path)

Saving the dataset (0/1 shards):   0%|          | 0/7165 [00:00<?, ? examples/s]

### Test the HF dataset

In [102]:
# Later you can load the entire DatasetDict back from disk like this
# Read the DatasetDict back from `path`, the directory it was saved to in the
# previous cell.
dataset_dict = DatasetDict.load_from_disk(path)

In [103]:
# Sanity-check the first training example: each byte column must decode back
# to a float32 vector, and the scalar scores must be readable as stored.
first_example = dataset_dict['train'][0]

print(dataset_dict)

# `(768)` is the plain int 768 (not a tuple); reshape accepts it as-is.
for column in ('prompt_question', 'prompt_text', 'text'):
    print()
    print(deserialize_array(first_example[column], np.float32, (768)).shape)

for column in ('content', 'normalized_content'):
    print()
    print(first_example[column])

DatasetDict({
    train: Dataset({
        features: ['student_id', 'prompt_id', 'prompt_question', 'prompt_text', 'text', 'content', 'wording', 'normalized_content', 'normalized_wording'],
        num_rows: 7165
    })
})

(768,)

(768,)

(768,)

0.205682506482641

-0.2647724978391196
