# Download dataset and store

In [1]:
from datasets import load_dataset
import random
import pandas as pd
import re
import pickle

random.seed(2023)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
origin_data = load_dataset('aadityaubhat/GPT-wiki-intro')
print(origin_data)

DatasetDict({
    train: Dataset({
        features: ['id', 'url', 'title', 'wiki_intro', 'generated_intro', 'title_len', 'wiki_intro_len', 'generated_intro_len', 'prompt', 'generated_text', 'prompt_tokens', 'generated_text_tokens'],
        num_rows: 150000
    })
})


In [3]:
# origin_data olny has train set, so we get it 
origin_data = origin_data['train']

We random select 1500 data points

In [4]:
# random select 1500 of origin data
select_index = random.sample(range(0, len(origin_data)), 1500)
origin_data = origin_data.select(select_index)

# save subset data
with open('./dataset/origin_data_3000.pickle', 'wb') as f:
    pickle.dump(origin_data, f)

# Get from origin data and make it to our dataset

In [5]:
def format_sentence(sentence):
    sentence = sentence.replace('\n', ' ')
    sentence = sentence.replace('\t', ' ')
    sentence = sentence.replace('\r', ' ')
    
    sentence = sentence.lower()
    sentence = sentence.strip()
    
    emoji_pattern = re.compile("["
                        u"\U0001F600-\U0001F64F"  # emoticons
                        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                        u"\U0001F680-\U0001F6FF"  # transport & map symbols
                        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                        u"\U00002702-\U000027B0"
                        u"\U000024C2-\U0001F251"
                        "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', sentence)

In [6]:
# we only use the 'wiki_intro' and 'generated_intro', and target is 0 or 1, 
# 0 means it is a wiki_intro, 1 means it is a generated_intro
dataset = []
for i in range(len(origin_data)):
    sentence = origin_data[i]['wiki_intro']
    if sentence == None:
        continue
    sentence = format_sentence(sentence)
    dataset.append([sentence, 0])
    
    sentence = origin_data[i]['generated_intro']
    if sentence == None:
        continue
    sentence = format_sentence(sentence)
    dataset.append([sentence, 1])

# shuffle the dataset
random.shuffle(dataset)

dataset = pd.DataFrame(dataset, columns=['text', 'label'])

print('dataset size: ', len(dataset))

dataset size:  3000


In [7]:
# save the dataset for later use    
with open('./dataset/text_3000.pickle', 'wb') as f:
    pickle.dump(dataset, f)

get the first 300 to choose hyperparameters

In [8]:
dataset_300 = dataset[:300]
with open('./dataset/text_300.pickle', 'wb') as f:
    pickle.dump(dataset_300, f)

# Load local dataset

In [9]:
import pickle

In [10]:
# read the dataset
dataset = pd.read_pickle('./dataset/text_3000.pickle')

print('dataset size: ', len(dataset))

dataset size:  3000


In [11]:
dataset.head()

Unnamed: 0,text,label
0,"""logic of empire"" is a science fiction novel b...",1
1,"major general richard hutton davies, (14 nove...",1
2,elgin reptiles is the name given to a group of...,1
3,"dubgaill and finngaill, or dubgenti and finnge...",0
4,chang teh-ming (; born 1938) is a taiwanese ph...,1


# Get GPT prompt

In [12]:
origin_data = pickle.load(open('./dataset/origin_data_3000.pickle', 'rb'))

In [13]:
# save prompts
prompts = origin_data['prompt']
with open('./dataset/prompts_3000.pickle', 'wb') as f:
    pickle.dump(prompts, f)

In [14]:
prompts = pickle.load(open('./dataset/prompts_3000.pickle', 'rb'))
prompts[:5]

["200 word wikipedia style introduction on 'Long Haired Businessmen'\n    Long Haired Businessmen is a comedy series",
 '200 word wikipedia style introduction on \'Duke Groner\'\n    Edward "Duke" Groner (March 24, 1908 –',
 '200 word wikipedia style introduction on \'Final Song\'\n    "Final Song" is a song by Danish',
 "200 word wikipedia style introduction on 'Gary Glitter'\n    Paul Francis Gadd (born 8 May 1944),",
 "200 word wikipedia style introduction on 'Kalinago'\n    The Kalinago, also known as the Island"]