In [1]:
import pickle
import pandas as pd
import numpy as np

# https://pypi.org/project/sent2vec/
from sent2vec.vectorizer import Vectorizer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the two raw text splits (each frame carries a 'text' and a 'label' column).
dataset_3000 = pd.read_pickle('./dataset/text_3000.pickle')
dataset_300 = pd.read_pickle('./dataset/text_300.pickle')

# Sanity check: show (rows, columns) of each split, one per line.
print(dataset_3000.shape, dataset_300.shape, sep='\n')

(3000, 2)
(300, 2)


In [3]:
# Preview the first five rows of the larger text split.
dataset_3000.head(n=5)

Unnamed: 0,text,label
0,"""logic of empire"" is a science fiction novel b...",1
1,"major general richard hutton davies, (14 nove...",1
2,elgin reptiles is the name given to a group of...,1
3,"dubgaill and finngaill, or dubgenti and finnge...",0
4,chang teh-ming (; born 1938) is a taiwanese ph...,1


In [2]:
def vectorize(dataset, pretrained_weights='distilbert-base-uncased', max_chars=512):
    """Embed every text of ``dataset`` with sent2vec and pair the vectors with their labels.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame providing a ``'text'`` column (strings) and a ``'label'`` column.
        It is not modified.
    pretrained_weights : str, optional
        Model name handed to ``Vectorizer``. Defaults to
        ``'distilbert-base-uncased'``, the value that was previously hard-coded.
    max_chars : int or None, optional
        Texts longer than this many characters are cut down to their first
        ``max_chars`` characters before being embedded. Defaults to 512.
        ``None`` disables the truncation.

    Returns
    -------
    pandas.DataFrame
        Columns ``'vector'`` and ``'label'`` on a fresh 0..n-1 index, one row
        per input row, in input order. An empty input yields an empty frame
        with the same two columns.
    """
    # The model only accepts inputs up to a bounded length (the original note
    # cites 512). NOTE: the cap is applied to *characters*, not tokens, so it
    # is a conservative approximation of that limit.
    # Slicing is a no-op for texts that are already short enough.
    texts = [text[:max_chars] for text in dataset['text']]
    labels = list(dataset['label'])

    if not texts:
        # Nothing to embed: skip loading the model altogether.
        return pd.DataFrame(columns=['vector', 'label'])

    vectorizer = Vectorizer(pretrained_weights=pretrained_weights)
    vectorizer.run(texts)

    # list(...) splits the result into one entry per input text so each row
    # of the 'vector' column holds a single embedding.
    return pd.DataFrame({'vector': list(vectorizer.vectors), 'label': labels})

In [5]:
# Embed the 3000-sample split, then persist the result so later cells can
# reload it instead of recomputing the embeddings.
dataset_3000_vector = vectorize(dataset=dataset_3000)
with open('./dataset/vector_3000_sent2vec.pickle', mode='wb') as f:
    pickle.dump(dataset_3000_vector, f)

Initializing Bert distilbert-base-uncased
Vectorization done on cpu


We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


In [6]:
# Embed the 300-sample split, then persist the result so later cells can
# reload it instead of recomputing the embeddings.
dataset_300_vector = vectorize(dataset=dataset_300)
with open('./dataset/vector_300_sent2vec.pickle', mode='wb') as f:
    pickle.dump(dataset_300_vector, f)

Initializing Bert distilbert-base-uncased
Vectorization done on cpu


# Read from pickle

In [7]:
# Reload the persisted embedding frames from disk.
# NOTE: only unpickle files you produced yourself; loading a pickle runs
# whatever code the file encodes.
dataset_3000_vector = pd.read_pickle('./dataset/vector_3000_sent2vec.pickle')
dataset_300_vector = pd.read_pickle('./dataset/vector_300_sent2vec.pickle')

# Row counts of the two splits.
print('train set size: ', dataset_3000_vector.shape[0])
print('test set size: ', dataset_300_vector.shape[0])

# Embedding dimensionality, read off the row labelled 0 of each frame.
print('train set vector size: ', len(dataset_3000_vector.loc[0, 'vector']))
print('test set vector size: ', len(dataset_300_vector.loc[0, 'vector']))

train set size:  3000
test set size:  300
train set vector size:  768
test set vector size:  768


In [8]:
# Preview the first five (vector, label) rows of the larger split.
dataset_3000_vector.head(n=5)

Unnamed: 0,vector,label
0,"[-0.2395496, -0.32720825, -0.10405706, -0.2154...",1
1,"[-0.029372599, 0.11873061, -0.13061704, -0.356...",1
2,"[-0.3491774, -0.10110219, -0.3394671, -0.22729...",1
3,"[-0.2235898, 0.11179226, -0.26332092, -0.47221...",0
4,"[-0.19014409, 0.021835754, -0.32255918, -0.341...",1


# For GPT 3.5

In [3]:
# Load the '_Turbo' text splits for the GPT 3.5 section.
# NOTE: this rebinds dataset_3000 / dataset_300, replacing the frames that
# were loaded from text_3000.pickle / text_300.pickle earlier on.
dataset_3000 = pd.read_pickle('./dataset/text_3000_Turbo.pickle')
dataset_300 = pd.read_pickle('./dataset/text_300_Turbo.pickle')

# Sanity check: show (rows, columns) of each split, one per line.
print(dataset_3000.shape, dataset_300.shape, sep='\n')

(3000, 2)
(300, 2)


In [4]:
# Embed the 3000-sample '_Turbo' split and persist it under its own file name.
# NOTE: dataset_3000_vector is rebound here, replacing any earlier value.
dataset_3000_vector = vectorize(dataset=dataset_3000)
with open('./dataset/vector_3000_Turbo_sent2vec.pickle', mode='wb') as f:
    pickle.dump(dataset_3000_vector, f)

Initializing Bert distilbert-base-uncased
Vectorization done on cpu


We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


In [5]:
# Embed the 300-sample '_Turbo' split and persist it under its own file name.
# NOTE: dataset_300_vector is rebound here, replacing any earlier value.
dataset_300_vector = vectorize(dataset=dataset_300)
with open('./dataset/vector_300_Turbo_sent2vec.pickle', mode='wb') as f:
    pickle.dump(dataset_300_vector, f)

Initializing Bert distilbert-base-uncased
Vectorization done on cpu
