# Download and store the dataset

In [26]:
import os
import random

import pandas as pd
from datasets import load_dataset

  from .autonotebook import tqdm as notebook_tqdm


In [27]:
# load the GPT-wiki-intro dataset and print a summary of its splits and columns
origin_data = load_dataset(path='aadityaubhat/GPT-wiki-intro')
print(origin_data)

DatasetDict({
    train: Dataset({
        features: ['id', 'url', 'title', 'wiki_intro', 'generated_intro', 'title_len', 'wiki_intro_len', 'generated_intro_len', 'prompt', 'generated_text', 'prompt_tokens', 'generated_text_tokens'],
        num_rows: 150000
    })
})


In [28]:
def format_sentence(sentence):
    """Normalise a raw text sample into one lowercase line.

    Each newline, tab and carriage return is replaced by a single space,
    the text is lowercased, and leading/trailing whitespace is removed.
    Runs of whitespace inside the text are not collapsed.
    """
    # one translation pass replaces all three control characters with spaces
    flattened = sentence.translate(str.maketrans('\n\t\r', '   '))
    return flattened.lower().strip()

In [29]:
# origin_data only has a 'train' split, so select it. The isinstance guard keeps
# this cell re-runnable: once origin_data is already the split (no longer the
# dict-like DatasetDict), indexing it with 'train' again would fail.
if isinstance(origin_data, dict):
    origin_data = origin_data['train']

# we only use the first 15000 rows, i.e. up to 30000 samples (15000 per class)
length = 15000

# fixed seed so the shuffle, and therefore the train/test split, is reproducible
seed = 42

# we only use 'wiki_intro' and 'generated_intro'; the label is 0 or 1:
# 0 means the text is a wiki_intro, 1 means it is a generated_intro
dataset = []
for i in range(min(length, len(origin_data))):  # never index past the end of the split
    row = origin_data[i]  # fetch the row once instead of once per column
    for column, label in (('wiki_intro', 0), ('generated_intro', 1)):
        sentence = row[column]
        if sentence is None:
            # a missing text ends this row: a missing wiki_intro also skips
            # the row's generated_intro, a missing generated_intro keeps the
            # wiki_intro that was already appended
            break
        dataset.append([format_sentence(sentence), label])

# shuffle the dataset with a private, seeded generator so the global random
# state is left untouched
random.Random(seed).shuffle(dataset)

# split the dataset into train set and test set
split_ratio = 0.9
split_index = int(len(dataset) * split_ratio)
train_set = dataset[:split_index]
test_set = dataset[split_index:]

train_set = pd.DataFrame(train_set, columns=['text', 'label'])
test_set = pd.DataFrame(test_set, columns=['text', 'label'])

print('train set size: ', len(train_set))
print('test set size: ', len(test_set))

train set size:  27000
test set size:  3000


In [30]:
# save the dataset for later use
# create the output directory first: to_csv does not create missing parent
# directories and would raise if './dataset' did not exist yet
os.makedirs('./dataset', exist_ok=True)
train_set.to_csv('./dataset/train_set.csv', index=False)
test_set.to_csv('./dataset/test_set.csv', index=False)

# Load the local dataset

In [31]:
import pandas as pd

In [32]:
# read the dataset
train_set = pd.read_csv('./dataset/train_set.csv')
test_set = pd.read_csv('./dataset/test_set.csv')

print('train set size: ', len(train_set))
print('test set size: ', len(test_set))

train set size:  27000
test set size:  3000


In [33]:
# preview the first five rows of the training split
train_set.head(n=5)

Unnamed: 0,text,label
0,"robert d. beach (born july 21, 1959, in glouce...",1
1,hairspray is a 1988 american comedy film direc...,1
2,trent carroll (born 28 april 1978) is a retire...,1
3,pavitra papi (punjabi: ਪਵਿੱਤਰ ਪਾਪੀ) is a 1970 ...,0
4,"algolagnia (; from , álgos, ""pain"", and , lagn...",0
