# Download and store the dataset

In [3]:
import os
import random
import re

import pandas as pd
from datasets import load_dataset

In [2]:
# Load the 'aadityaubhat/GPT-wiki-intro' dataset and print a summary of its
# splits and columns.
origin_data = load_dataset(path='aadityaubhat/GPT-wiki-intro')
print(origin_data)

DatasetDict({
    train: Dataset({
        features: ['id', 'url', 'title', 'wiki_intro', 'generated_intro', 'title_len', 'wiki_intro_len', 'generated_intro_len', 'prompt', 'generated_text', 'prompt_tokens', 'generated_text_tokens'],
        num_rows: 150000
    })
})


In [4]:
# Maps newline, tab and carriage return to a single space each.
_WHITESPACE_TO_SPACE = str.maketrans('\n\t\r', '   ')

# Compiled once at definition time instead of on every call.
# Every range is listed explicitly: one wide span such as U+24C2..U+1F251 would
# also cover CJK ideographs, kana and Hangul and silently delete them from the text.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # flags (iOS)
    "\U00002702-\U000027B0"  # dingbats
    "\U00002600-\U000026FF"  # miscellaneous symbols
    "\U000024C2"             # circled M
    "\U0001F170-\U0001F19A"  # enclosed alphanumeric supplement (squared letters)
    "\U0001F200-\U0001F251"  # enclosed ideographic supplement
    "\U0000FE0F"             # variation selector-16 (emoji presentation)
    "]+"
)


def format_sentence(sentence):
    """Normalize a piece of text for use as a classifier sample.

    Newlines, tabs and carriage returns are each replaced by a single space,
    emoji are removed, and the result is lower-cased and stripped of leading
    and trailing whitespace. Non-emoji Unicode text (e.g. CJK or Hangul
    characters) is preserved.

    Args:
        sentence: The raw text (a ``str``).

    Returns:
        The cleaned, lower-cased text.
    """
    sentence = sentence.translate(_WHITESPACE_TO_SPACE)

    # Remove emoji before lower-casing: some symbols (e.g. the circled M) have a
    # lowercase form that would otherwise slip past the pattern.
    sentence = _EMOJI_PATTERN.sub('', sentence)

    # Strip last so whitespace left behind by a removed leading/trailing emoji
    # does not survive.
    return sentence.lower().strip()

In [5]:
# origin_data only has a train split, so select it.
# The isinstance guard keeps this cell re-runnable: DatasetDict subclasses dict,
# so once origin_data has been rebound to the train split the lookup is skipped.
if isinstance(origin_data, dict):
    origin_data = origin_data['train']

# we only use the first 1500 rows; each row yields one human-written and one
# generated text, i.e. up to 3000 samples in total
length = 1500

# fixed seed so the shuffle (and therefore the train/test split) is reproducible
SEED = 42

# we only use 'wiki_intro' and 'generated_intro'; the label is 0 or 1:
# 0 means the text is a wiki_intro, 1 means it is a generated_intro
dataset = []
for i in range(min(length, len(origin_data))):
    row = origin_data[i]  # fetch the row once instead of once per column
    wiki_intro = row['wiki_intro']
    generated_intro = row['generated_intro']
    # skip the whole pair if either text is missing so both classes stay balanced
    if wiki_intro is None or generated_intro is None:
        continue
    dataset.append([format_sentence(wiki_intro), 0])
    dataset.append([format_sentence(generated_intro), 1])

# shuffle the dataset with a dedicated, seeded generator (global RNG state is untouched)
random.Random(SEED).shuffle(dataset)

# split the dataset into train set and test set
split_ratio = 0.9
split_index = int(len(dataset) * split_ratio)
train_set = dataset[:split_index]
test_set = dataset[split_index:]

train_set = pd.DataFrame(train_set, columns=['text', 'label'])
test_set = pd.DataFrame(test_set, columns=['text', 'label'])

# sanity check: the two splits together cover every sample exactly once
assert len(train_set) + len(test_set) == len(dataset)

print('train set size: ', len(train_set))
print('test set size: ', len(test_set))

train set size:  2700
test set size:  300


In [6]:
# save the dataset for later use
# create the target directory first: to_csv does not create missing parent folders
os.makedirs('./dataset', exist_ok=True)
train_set.to_csv('./dataset/train_set.csv', index=False)
test_set.to_csv('./dataset/test_set.csv', index=False)

# Load local dataset

In [7]:
import pandas as pd

In [8]:
# Reload both splits from the CSV files under ./dataset and report their row counts.
train_set, test_set = map(
    pd.read_csv,
    ('./dataset/train_set.csv', './dataset/test_set.csv'),
)

print('train set size: ', train_set.shape[0])
print('test set size: ', test_set.shape[0])

train set size:  2700
test set size:  300


In [9]:
# Preview the first five rows of the training split.
train_set.head(n=5)

Unnamed: 0,text,label
0,events march 17 – anton rubinstein is named f...,0
1,"caladenia lindleyana, commonly known as the li...",1
2,paweł kontny schr (29 june 1910 – 10 january 1...,1
3,supported employment refers to service provisi...,0
4,highlands is a census-designated place (cdp) l...,0
