# Download dataset and store

In [6]:
import os
import random

import pandas as pd
from datasets import load_dataset

In [7]:
origin_data = load_dataset('aadityaubhat/GPT-wiki-intro')
print(origin_data)

DatasetDict({
    train: Dataset({
        features: ['id', 'url', 'title', 'wiki_intro', 'generated_intro', 'title_len', 'wiki_intro_len', 'generated_intro_len', 'prompt', 'generated_text', 'prompt_tokens', 'generated_text_tokens'],
        num_rows: 150000
    })
})


In [8]:
def format_sentence(sentence):
    """Normalize one text sample.

    Newline, tab and carriage-return characters are each replaced by a single
    space, the text is lower-cased, surrounding whitespace is stripped, and
    the result is limited to its first 512 characters.
    """
    for control_char in ('\n', '\t', '\r'):
        sentence = sentence.replace(control_char, ' ')

    normalized = sentence.lower().strip()

    # keep at most 512 characters so the text is not too long for bert
    # (slicing a shorter string returns it unchanged)
    return normalized[:512]

In [9]:
# origin_data only has a train split, so we select it
origin_data = origin_data['train']

# we only use the first 15000 rows, i.e. up to 30000 samples (15000 per class);
# the cap protects against a dataset that is shorter than expected
length = min(15000, len(origin_data))

# fix the seed so the shuffle, and therefore the train/test split, is reproducible
random.seed(42)

# we only use 'wiki_intro' and 'generated_intro'; the target is 0 or 1:
# 0 means it is a wiki_intro, 1 means it is a generated_intro
dataset = []
for i in range(length):
    row = origin_data[i]  # fetch the row once instead of once per column
    wiki_intro = row['wiki_intro']
    generated_intro = row['generated_intro']
    # skip the whole pair when either text is missing so both classes stay balanced
    if wiki_intro is None or generated_intro is None:
        continue
    dataset.append([format_sentence(wiki_intro), 0])
    dataset.append([format_sentence(generated_intro), 1])

# shuffle the dataset so both labels are mixed before splitting
random.shuffle(dataset)

# split the dataset into train set and test set
split_ratio = 0.9
split_index = int(len(dataset) * split_ratio)
train_set = dataset[:split_index]
test_set = dataset[split_index:]

train_set = pd.DataFrame(train_set, columns=['text', 'label'])
test_set = pd.DataFrame(test_set, columns=['text', 'label'])

print('train set size: ', len(train_set))
print('test set size: ', len(test_set))

train set size:  27000
test set size:  3000


In [10]:
# save the dataset for later use
# to_csv does not create missing parent directories, so make sure the folder exists first
os.makedirs('./dataset', exist_ok=True)
train_set.to_csv('./dataset/train_set.csv', index=False)
test_set.to_csv('./dataset/test_set.csv', index=False)

# Load local dataset

In [1]:
import pandas as pd

In [2]:
# load the train/test splits that were saved as CSV files
train_set, test_set = (
    pd.read_csv(csv_path)
    for csv_path in ('./dataset/train_set.csv', './dataset/test_set.csv')
)

# report the number of rows in each split
print('train set size: ', train_set.shape[0])
print('test set size: ', test_set.shape[0])

train set size:  27000
test set size:  3000


In [3]:
# preview the first five rows of the training split
train_set.head(n=5)

Unnamed: 0,text,label
0,the oliveira lima library (also known as the ...,0
1,", better known as , is a japanese popular sing...",0
2,julian marryshow (1918 – 17 july 2012) was a g...,0
3,joseph mason cox (1763–1818) was an early amer...,1
4,the crucifixion darkness is an episode in thre...,0


# Sent2Vec

In [4]:
# https://pypi.org/project/sent2vec/
from sent2vec.vectorizer import Vectorizer
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


## For a small dataset we vectorize all texts in a single run

In [5]:
def vectorize_small(dataset):
    """Vectorize every text of ``dataset`` in one Vectorizer run.

    ``dataset`` must provide a 'text' and a 'label' column. The returned
    DataFrame has one row per input row, with a 'vector' column holding the
    entries of ``Vectorizer.vectors`` and a 'label' column holding the labels.
    """
    encoder = Vectorizer(pretrained_weights='distilbert-base-uncased')
    encoder.run(list(dataset['text']))

    # build the result in one step; one vector per row, in input order
    return pd.DataFrame({
        'vector': list(encoder.vectors),
        'label': list(dataset['label']),
    })

## For a large dataset we vectorize in batches (1000 texts per batch by default)

In [10]:
def vectorize_large(dataset, n=1000):
    """Vectorize the texts of ``dataset`` in batches of ``n`` rows.

    Args:
        dataset: DataFrame with a 'text' and a 'label' column.
        n: number of texts handed to the Vectorizer per run (default 1000).

    Returns:
        DataFrame with one row per input row: a 'vector' column holding the
        entries of ``Vectorizer.vectors`` and a 'label' column holding the
        labels, both in input order.

    Raises:
        ValueError: if ``n`` is smaller than 1.
    """
    if n < 1:
        raise ValueError('n must be a positive integer')

    # Read from the `dataset` argument; the previous version read the global
    # `train_set`, so any other input was silently ignored.
    texts = list(dataset['text'])
    labels = list(dataset['label'])

    vectors = []
    for start in range(0, len(texts), n):
        # A fresh Vectorizer per batch, exactly as before.
        # NOTE(review): reusing one instance would avoid reloading the model,
        # but whether run() keeps vectors from earlier calls is not visible
        # here - confirm against sent2vec before hoisting.
        vectorizer = Vectorizer(pretrained_weights='distilbert-base-uncased')
        # slicing past the end is safe, so the last batch may be shorter
        vectorizer.run(texts[start:start + n])
        vectors.extend(list(vectorizer.vectors))

    # Build the frame once instead of writing through chained indexing
    # (vector_set['vector'][i:end] = ...), which pandas does not guarantee
    # to write back to the frame.
    return pd.DataFrame({'vector': vectors, 'label': labels})

In [11]:
# vectorize the training split in batches and store the result as CSV for later use
train_set_vector = vectorize_large(dataset=train_set)
train_set_vector.to_csv(
    './dataset/train_set_vector.csv',
    index=False,
)

Initializing Bert distilbert-base-uncased
Vectorization done on cpu
Initializing Bert distilbert-base-uncased
Vectorization done on cpu
Initializing Bert distilbert-base-uncased
Vectorization done on cpu
Initializing Bert distilbert-base-uncased
Vectorization done on cpu
Initializing Bert distilbert-base-uncased
Vectorization done on cpu
Initializing Bert distilbert-base-uncased
Vectorization done on cpu
Initializing Bert distilbert-base-uncased
Vectorization done on cpu
Initializing Bert distilbert-base-uncased
Vectorization done on cpu
Initializing Bert distilbert-base-uncased
Vectorization done on cpu
Initializing Bert distilbert-base-uncased
Vectorization done on cpu
Initializing Bert distilbert-base-uncased
Vectorization done on cpu
Initializing Bert distilbert-base-uncased
Vectorization done on cpu
Initializing Bert distilbert-base-uncased
Vectorization done on cpu
Initializing Bert distilbert-base-uncased
Vectorization done on cpu
Initializing Bert distilbert-base-uncased
Vector

In [6]:
# vectorize the test split in a single run and store the result as CSV for later use
test_set_vector = vectorize_small(dataset=test_set)
test_set_vector.to_csv(
    './dataset/test_set_vector.csv',
    index=False,
)

Initializing Bert distilbert-base-uncased
Vectorization done on cpu


We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.
