In [1]:
import pickle
import pandas as pd
import numpy as np

# https://pypi.org/project/sent2vec/
from sent2vec.vectorizer import Vectorizer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the train and test splits from the local ./dataset directory.
train_set = pd.read_csv('./dataset/train_set.csv')
test_set = pd.read_csv('./dataset/test_set.csv')

# Report the row count of each split as a quick sanity check.
for caption, frame in (('train set size: ', train_set),
                       ('test set size: ', test_set)):
    print(caption, len(frame))

train set size:  27000
test set size:  3000


## For a small dataset, vectorize all texts in a single run

In [4]:
def vectorize_small(dataset):
    """Embed every text of ``dataset`` with a single vectorizer run.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Must provide a ``'text'`` column (items must support ``len`` and
        slicing; presumably ``str``) and a ``'label'`` column.

    Returns
    -------
    pandas.DataFrame
        Frame indexed ``0 .. len(dataset) - 1`` with two columns:
        ``'vector'`` (one entry per item of ``vectorizer.vectors``) and
        ``'label'`` (values copied from ``dataset['label']``).
    """
    # Cap every text at 512 before it reaches the model. The original note
    # attributes this to BERT's input limit; be aware the cap applied here
    # counts characters, not tokens.
    clipped_texts = [
        text[:512] if len(text) > 512 else text
        for text in dataset['text']
    ]

    embedder = Vectorizer(pretrained_weights='distilbert-base-uncased')
    embedder.run(clipped_texts)

    result = pd.DataFrame(columns=['vector', 'label'], index=range(len(dataset)))
    # list(...) splits the vectorizer output into one entry per row.
    result['vector'] = list(embedder.vectors)
    result['label'] = list(dataset['label'])
    return result

## For a large dataset, vectorize the texts in batches of 1000

In [5]:
def vectorize_large(dataset, n=1000):
    """Embed the texts of ``dataset`` in consecutive chunks of ``n`` rows.

    Each chunk is vectorized separately and the per-chunk results are
    stitched together in row order.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Must provide a ``'text'`` column (items must support ``len`` and
        slicing; presumably ``str``) and a ``'label'`` column.
    n : int, default 1000
        Number of rows handed to the vectorizer per run.

    Returns
    -------
    pandas.DataFrame
        Frame indexed ``0 .. len(dataset) - 1`` with two columns:
        ``'vector'`` (one entry per item of each chunk's
        ``vectorizer.vectors``) and ``'label'`` (values copied from
        ``dataset['label']``).

    Raises
    ------
    ValueError
        If ``n`` is smaller than 1, or if the vectorizer returns a number
        of vectors that does not match the number of rows.
    """
    if n < 1:
        raise ValueError('n must be a positive integer')

    total = len(dataset)
    texts = dataset['text']

    # Collect the vectors in a plain list and assign the column once at the
    # end; writing through chained indexing (frame[col][a:b] = ...) is not
    # guaranteed to modify the frame.
    vectors = []
    for start in range(0, total, n):
        end = min(start + n, total)
        # Positional slice, so the chunking does not depend on the index
        # labels of ``dataset``.
        # Cap every text at 512 before it reaches the model. The original
        # note attributes this to BERT's input limit; the cap applied here
        # counts characters, not tokens.
        chunk = [
            text[:512] if len(text) > 512 else text
            for text in texts.iloc[start:end]
        ]
        # A fresh Vectorizer is built per chunk, as before.
        # NOTE(review): reusing one instance would avoid reloading the model,
        # but only if run() does not accumulate results across calls - confirm.
        vectorizer = Vectorizer(pretrained_weights='distilbert-base-uncased')
        vectorizer.run(chunk)
        vectors.extend(vectorizer.vectors)

    vector_set = pd.DataFrame(columns=['vector', 'label'], index=range(total))
    vector_set['vector'] = vectors
    vector_set['label'] = list(dataset['label'])
    return vector_set

In [6]:
# Vectorize the training split in chunks and persist the result so later
# runs can reload it instead of recomputing the embeddings.
train_set_vector = vectorize_large(train_set)
with open('./dataset/train_set_vector.pickle', 'wb') as pickle_file:
    pickle.dump(train_set_vector, pickle_file)

Initializing Bert distilbert-base-uncased
Vectorization done on cpu


We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


Initializing Bert distilbert-base-uncased
Vectorization done on cpu
Initializing Bert distilbert-base-uncased
Vectorization done on cpu
Initializing Bert distilbert-base-uncased
Vectorization done on cpu
Initializing Bert distilbert-base-uncased
Vectorization done on cpu
Initializing Bert distilbert-base-uncased
Vectorization done on cpu
Initializing Bert distilbert-base-uncased
Vectorization done on cpu
Initializing Bert distilbert-base-uncased
Vectorization done on cpu
Initializing Bert distilbert-base-uncased
Vectorization done on cpu
Initializing Bert distilbert-base-uncased
Vectorization done on cpu
Initializing Bert distilbert-base-uncased
Vectorization done on cpu
Initializing Bert distilbert-base-uncased
Vectorization done on cpu
Initializing Bert distilbert-base-uncased
Vectorization done on cpu
Initializing Bert distilbert-base-uncased
Vectorization done on cpu
Initializing Bert distilbert-base-uncased
Vectorization done on cpu
Initializing Bert distilbert-base-uncased
Vector

In [7]:
# Vectorize the test split in one pass and persist the result to disk.
test_set_vector = vectorize_small(test_set)
with open('./dataset/test_set_vector.pickle', 'wb') as pickle_file:
    pickle.dump(test_set_vector, pickle_file)

Initializing Bert distilbert-base-uncased
Vectorization done on cpu


# Read from pickle

In [8]:
# Reload the persisted vector frames.
# NOTE: unpickling executes arbitrary code; only load pickle files that were
# produced locally by this notebook.
train_set_vector = pd.read_pickle('./dataset/train_set_vector.pickle')
test_set_vector = pd.read_pickle('./dataset/test_set_vector.pickle')

# Row counts of both frames.
for caption, frame in (('train set size: ', train_set_vector),
                       ('test set size: ', test_set_vector)):
    print(caption, len(frame))

# Length of the vector stored under index label 0 in each frame.
for caption, frame in (('train set vector size: ', train_set_vector),
                       ('test set vector size: ', test_set_vector)):
    print(caption, len(frame['vector'][0]))

train set size:  27000
test set size:  3000
train set vector size:  768
test set vector size:  768
