In [1]:
import pandas as pd
import re

from sklearn.model_selection import train_test_split

from datasets import load_dataset

In [2]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# Checkpoint whose tokenizer defines the token units used for every length
# measurement and prompt/answer cut in this notebook.
MODEL_NAME = 'mistralai/Mistral-7B-v0.3'

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# Pad on the left and reuse the end-of-sequence token as the padding token.
# NOTE(review): presumably set up for batched generation with a decoder-only
# model whose tokenizer ships without a pad token -- confirm against the
# generation code; nothing in this notebook pads a batch.
tokenizer.padding_side = 'left'
tokenizer.pad_token = tokenizer.eos_token


def clean_text(text):
    """Reduce ``text`` to ASCII letters, digits, whitespace and ``.,!?;:``.

    Characters outside that set are deleted outright (not transliterated), so
    accented letters simply disappear. Leading and trailing whitespace is
    stripped from the result.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    removal_patterns = (
        # 1) Every run of non-ASCII characters. This pass must come first:
        #    ``\s`` below also matches non-ASCII whitespace in ``str`` patterns,
        #    which the second pass alone would therefore keep.
        r'[^\x00-\x7F]+',
        # 2) Anything left that is not a letter, digit, whitespace or one of
        #    the punctuation marks . , ! ? ; :
        r'[^a-zA-Z0-9\s.,!?;:]',
    )

    cleaned = text
    for pattern in removal_patterns:
        cleaned = re.sub(pattern, '', cleaned)

    return cleaned.strip()


def split_text(human_text, prompt_length, answer_length):
    """Split a human-written text into a prompt and its reference continuation.

    The text is first normalised with ``clean_text``, then tokenized with the
    module-level ``tokenizer`` and cut by token position: the first
    ``prompt_length`` tokens form the prompt and the following
    ``answer_length`` tokens form the answer.

    Args:
        human_text: Raw human-written text.
        prompt_length: Number of leading tokens to use as the prompt.
        answer_length: Number of tokens after the prompt to use as the answer.

    Returns:
        A ``(prompt, answer)`` tuple of decoded strings. Either part is shorter
        than requested (possibly empty) when the cleaned text has fewer tokens
        than ``prompt_length + answer_length``.
    """
    # add_special_tokens=False keeps only tokens that come from the text. If
    # the tokenizer is configured to prepend a beginning-of-sequence marker,
    # the default call would spend one of the `prompt_length` slots on it and
    # decode() would write the marker's text into the prompt string.
    # No truncation/padding/tensor conversion is requested: for a single
    # string the plain list of ids is all that is needed, and asking for
    # truncation without a max_length only produces a warning.
    token_ids = tokenizer(clean_text(human_text), add_special_tokens=False)['input_ids']

    answer_end = prompt_length + answer_length
    prompt = tokenizer.decode(token_ids[:prompt_length])
    answer = tokenizer.decode(token_ids[prompt_length:answer_end])

    return prompt, answer

## Data Preparation

### Data
1. SQuAD (`rajpurkar/squad`)
2. WikiText-103 (`Salesforce/wikitext`, `wikitext-103-v1`)

### Methodology

Both datasets contain human-written text. We use this text to prompt an LLM and let it generate a few sentences that continue the text.

As described in the bibliography, we take the first `PRIME` tokens of the human text as the prompt and let the model generate the continuation.

Thus, we will have two samples starting from the same `PRIME` tokens:
1. Human-written - positive sample
2. Machine-generated - negative sample

In [3]:
def word_count_fn(text):
    """Return the number of tokens the module-level ``tokenizer`` yields for ``text``.

    Despite the name, this counts tokenizer tokens, not whitespace-separated
    words. Any special tokens the tokenizer adds by default are included in
    the count, exactly as before.

    Args:
        text: A single string to measure.

    Returns:
        The length of the ``input_ids`` sequence as an ``int``.
    """
    # A single string needs no padding and no tensor conversion, and asking
    # for truncation without a max_length only triggers the "Asking to
    # truncate to max_length but no maximum length is provided" warning
    # without truncating anything -- so the plain call gives the same count.
    return len(tokenizer(text)['input_ids'])

In [4]:
# Load SQuAD. Only the passage text ("context") is kept, and repeated
# contexts are dropped so that each passage appears once per split.
squad_ds = load_dataset("rajpurkar/squad")

# Training split
train_squad = (
    squad_ds['train']
    .to_pandas()[['context']]
    .drop_duplicates(subset='context')
)
print(f'The number of training samples is {len(train_squad)}')

# Validation split
val_squad = (
    squad_ds['validation']
    .to_pandas()[['context']]
    .drop_duplicates(subset='context')
)
print(f'The number of validation samples is {len(val_squad)}')

The number of training samples is 18891
The number of validation samples is 2067


In [5]:
# Load WikiText-103 and materialise each split as a pandas DataFrame.
wiki_ds = load_dataset("Salesforce/wikitext", "wikitext-103-v1")

train_wiki, val_wiki, test_wiki = (
    wiki_ds[split_name].to_pandas()
    for split_name in ('train', 'validation', 'test')
)

In [6]:
# Add a 'word_count' column (tokenizer-token count, see word_count_fn) to
# every frame. WikiText keeps its text in 'text', SQuAD in 'context'.
frames_and_text_columns = (
    # wikitext
    (train_wiki, 'text'),
    (val_wiki, 'text'),
    (test_wiki, 'text'),
    # squad
    (train_squad, 'context'),
    (val_squad, 'context'),
)

for frame, text_column in frames_and_text_columns:
    frame['word_count'] = frame[text_column].apply(word_count_fn)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [7]:
# Summary statistics of the token counts, WikiText splits first, then SQuAD.
for frame in (train_wiki, val_wiki, test_wiki, train_squad, val_squad):
    print(frame['word_count'].describe())

count    1.801350e+06
mean     7.563011e+01
std      1.026569e+02
min      1.000000e+00
25%      1.000000e+00
50%      1.300000e+01
75%      1.380000e+02
max      1.781000e+03
Name: word_count, dtype: float64
count    3760.000000
mean       76.341489
std        99.312526
min         1.000000
25%         1.000000
50%        14.000000
75%       141.000000
max       538.000000
Name: word_count, dtype: float64
count    4358.000000
mean       74.891923
std       100.971810
min         1.000000
25%         1.000000
50%        13.000000
75%       132.000000
max       600.000000
Name: word_count, dtype: float64
count    18891.000000
mean       176.584723
std         76.554855
min         27.000000
25%        129.000000
50%        163.000000
75%        212.000000
max       1053.000000
Name: word_count, dtype: float64
count    2067.000000
mean      183.435897
std        80.565770
min        31.000000
25%       134.000000
50%       168.000000
75%       216.000000
max       839.000000
Name: word_count, dtype: float64

In [8]:
# Keep only texts whose token count lies strictly inside a length window.
# The lower bound is the number of tokens needed to cut a full prompt plus a
# full answer; the upper bound discards very long texts.
prompt_length: int = 20
answer_length: int = 140

low_thres: int = prompt_length + answer_length
high_thres: int = 1000


def _within_length_window(frame):
    """Return the rows of ``frame`` with low_thres < word_count < high_thres."""
    counts = frame['word_count']
    return frame[(counts > low_thres) & (counts < high_thres)]


train_wiki = _within_length_window(train_wiki)
val_wiki = _within_length_window(val_wiki)
test_wiki = _within_length_window(test_wiki)
train_squad = _within_length_window(train_squad)
val_squad = _within_length_window(val_squad)

In [9]:
# Transformations of SQuAD: carve a held-out test set (10%) out of the
# training contexts.
# random_state pins the shuffle so the same rows land in the test split on
# every run; 42 matches the seed used for the WikiText down-sampling.
# NOTE: this cell rebinds `train_squad`, so running it a second time without
# re-running the cells above splits the already reduced training set again.
train_squad, test_squad = train_test_split(train_squad, test_size=0.1, random_state=42)

print(len(train_squad))
print(len(val_squad))
print(len(test_squad))

8751
1143
973


In [12]:
# Row counts of the WikiText splits (train, validation, test).
# NOTE(review): the recorded execution counts show this cell was run after
# the down-sampling cell that follows it, so the recorded training size is
# the size after sampling; on a top-to-bottom run this prints the size before.
for wiki_frame in (train_wiki, val_wiki, test_wiki):
    print(len(wiki_frame))

36000
789
846


In [11]:
# Down-sample the WikiText training split to at most 36000 rows, with a
# fixed seed so the same rows are drawn on every run.
# min(): DataFrame.sample raises ValueError when n exceeds the number of rows
# (sampling without replacement), which would happen if the length thresholds
# were tightened; capping n keeps the cell runnable and re-runnable.
train_wiki = train_wiki.sample(n=min(36000, len(train_wiki)), random_state=42)

In [13]:
def dep_split_text(text, prompt_length, answer_length):
    """Split ``text`` into three parts by whitespace-separated words.

    Unlike ``split_text``, lengths here are measured in words, not tokens.

    Args:
        text: The string to split.
        prompt_length: Number of leading words in the first part.
        answer_length: Number of words in the second part.

    Returns:
        A 3-tuple of strings ``(first_batch, next_batch, rest)``: the first
        ``prompt_length`` words, the next ``answer_length`` words, and all
        remaining words, each re-joined with single spaces.
    """
    words = text.split()
    answer_end = prompt_length + answer_length

    word_groups = (
        words[:prompt_length],
        words[prompt_length:answer_end],
        words[answer_end:],
    )
    return tuple(' '.join(group) for group in word_groups)


In [14]:
# Cut every WikiText passage into a prompt and its human-written
# continuation, stored in new 'prompt' and 'answer' columns.
for wiki_frame in (train_wiki, val_wiki, test_wiki):
    prompt_answer_pairs = wiki_frame['text'].apply(
        split_text, args=(prompt_length, answer_length)
    )
    # Each element is a (prompt, answer) tuple; expand it into two columns.
    wiki_frame[['prompt', 'answer']] = prompt_answer_pairs.apply(pd.Series)

In [15]:
# Cut every SQuAD context into a prompt and its human-written continuation,
# stored in new 'prompt' and 'answer' columns.
for squad_frame in (train_squad, test_squad, val_squad):
    prompt_answer_pairs = squad_frame['context'].apply(
        split_text, args=(prompt_length, answer_length)
    )
    # Each element is a (prompt, answer) tuple; expand it into two columns.
    squad_frame[['prompt', 'answer']] = prompt_answer_pairs.apply(pd.Series)

In [16]:
# Persist the prepared WikiText splits (row index not written).
wiki_exports = (
    (train_wiki, "../data/wikitext/train.csv"),
    (val_wiki, "../data/wikitext/val.csv"),
    (test_wiki, "../data/wikitext/test.csv"),
)
for wiki_frame, csv_path in wiki_exports:
    wiki_frame.to_csv(csv_path, index=False)

In [17]:
# Persist the prepared SQuAD splits (row index not written).
squad_exports = (
    (train_squad, "../data/squad/train.csv"),
    (val_squad, "../data/squad/val.csv"),
    (test_squad, "../data/squad/test.csv"),
)
for squad_frame, csv_path in squad_exports:
    squad_frame.to_csv(csv_path, index=False)