In [14]:
import pandas as pd
from transformers import BertTokenizer, BertModel
import torch
from torch.utils.data import DataLoader, TensorDataset
from torch.nn.utils.rnn import pad_sequence

In [15]:
# Check if GPU is available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Using device: {device}')

Using device: cuda


In [16]:
# Load the dataset
data = pd.read_csv('sentiment140/training.1600000.processed.noemoticon.csv', 
                   encoding='latin-1', 
                   names=['target', 'ids', 'date', 'flag', 'user', 'text'])

# Use a smaller subset of the data for testing
#data = data.sample(n=1000, random_state=42)

In [17]:
# Extract texts and labels
texts = data['text'].tolist()
labels = data['target'].tolist()


In [18]:
# Load the tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [24]:
# Tokenize in smaller batches
batch_size = 1000  # Adjust based on your memory capacity
input_ids_list = []
attention_mask_list = []
max_len = 0

In [25]:
# Tokenize the texts
for i in range(0, len(texts), batch_size):
    batch_texts = texts[i:i + batch_size]
    inputs = tokenizer(batch_texts, return_tensors='pt', padding=True, truncation=True, max_length=512)
    max_len = max(max_len, inputs['input_ids'].size(1))
    input_ids_list.append(inputs['input_ids'])
    attention_mask_list.append(inputs['attention_mask'])
    print(f'Tokenized batch {i//batch_size + 1}/{len(texts)//batch_size + 1}')


Tokenized batch 1/1601
Tokenized batch 2/1601
Tokenized batch 3/1601
Tokenized batch 4/1601
Tokenized batch 5/1601
Tokenized batch 6/1601
Tokenized batch 7/1601
Tokenized batch 8/1601
Tokenized batch 9/1601
Tokenized batch 10/1601
Tokenized batch 11/1601
Tokenized batch 12/1601
Tokenized batch 13/1601
Tokenized batch 14/1601
Tokenized batch 15/1601
Tokenized batch 16/1601
Tokenized batch 17/1601
Tokenized batch 18/1601
Tokenized batch 19/1601
Tokenized batch 20/1601
Tokenized batch 21/1601
Tokenized batch 22/1601
Tokenized batch 23/1601
Tokenized batch 24/1601
Tokenized batch 25/1601
Tokenized batch 26/1601
Tokenized batch 27/1601
Tokenized batch 28/1601
Tokenized batch 29/1601
Tokenized batch 30/1601
Tokenized batch 31/1601
Tokenized batch 32/1601
Tokenized batch 33/1601
Tokenized batch 34/1601
Tokenized batch 35/1601
Tokenized batch 36/1601
Tokenized batch 37/1601
Tokenized batch 38/1601
Tokenized batch 39/1601
Tokenized batch 40/1601
Tokenized batch 41/1601
Tokenized batch 42/1601
T

In [26]:
# Pad sequences to the maximum length
input_ids_padded = [torch.cat([batch, torch.full((batch.size(0), max_len - batch.size(1)), tokenizer.pad_token_id)], dim=1) for batch in input_ids_list]
attention_mask_padded = [torch.cat([batch, torch.full((batch.size(0), max_len - batch.size(1)), 0)], dim=1) for batch in attention_mask_list]


In [27]:
# Concatenate all batches
input_ids = torch.cat(input_ids_padded, dim=0)
attention_mask = torch.cat(attention_mask_padded, dim=0)
labels = torch.tensor(labels)

print('Tokenization and padding complete')

Tokenization and padding complete


In [29]:
# Create a dataset and DataLoader
dataset = TensorDataset(input_ids, attention_mask, labels)
batch_size = 1  # Adjust based on your GPU memory
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False)

In [30]:
# Load the BERT model
model = BertModel.from_pretrained('bert-base-uncased')
model.to(device)
model.eval()

output_path = 'bert_features.pt'  # Path to save the features

all_features = []

In [31]:
# Process data in batches
with torch.no_grad():
    for i, batch in enumerate(dataloader):
        input_ids, attention_mask, _ = [t.to(device) for t in batch]
        outputs = model(input_ids, attention_mask=attention_mask)
        features = outputs.last_hidden_state  # (batch_size, sequence_length, hidden_size)
        all_features.append(features.cpu())  # Move features back to CPU to concatenate later

        # Save features batch-wise to avoid memory overflow
        if (i + 1) % 1000 == 0:  # Save every 1000 batches
            torch.save(torch.cat(all_features, dim=0), output_path)
            all_features = []  # Reset list to free up memory
            print(f'Saved features up to batch {i + 1}')

        del input_ids, attention_mask, features  # Clear variables to free up memory
        torch.cuda.empty_cache()  # Clear CUDA cache to free up memory

Saved features up to batch 1000
Saved features up to batch 2000
Saved features up to batch 3000
Saved features up to batch 4000
Saved features up to batch 5000
Saved features up to batch 6000
Saved features up to batch 7000
Saved features up to batch 8000
Saved features up to batch 9000
Saved features up to batch 10000
Saved features up to batch 11000
Saved features up to batch 12000
Saved features up to batch 13000
Saved features up to batch 14000
Saved features up to batch 15000
Saved features up to batch 16000
Saved features up to batch 17000
Saved features up to batch 18000
Saved features up to batch 19000
Saved features up to batch 20000
Saved features up to batch 21000
Saved features up to batch 22000
Saved features up to batch 23000
Saved features up to batch 24000
Saved features up to batch 25000
Saved features up to batch 26000
Saved features up to batch 27000
Saved features up to batch 28000
Saved features up to batch 29000
Saved features up to batch 30000
Saved features up t

In [32]:
# Save remaining features
if all_features:
    torch.save(torch.cat(all_features, dim=0), output_path)
    print('Saved remaining features')

print('All features saved')

# Load and print the shape of the resulting features
all_features = torch.load(output_path)
print(all_features.shape)  # (total_number_of_sentences, sequence_length, hidden_size)

All features saved
torch.Size([1000, 230, 768])


In [33]:
print(len(texts))

1600000


In [34]:
data.head(10)

Unnamed: 0,target,ids,date,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."
5,0,1467811372,Mon Apr 06 22:20:00 PDT 2009,NO_QUERY,joy_wolf,@Kwesidei not the whole crew
6,0,1467811592,Mon Apr 06 22:20:03 PDT 2009,NO_QUERY,mybirch,Need a hug
7,0,1467811594,Mon Apr 06 22:20:03 PDT 2009,NO_QUERY,coZZ,@LOLTrish hey long time no see! Yes.. Rains a...
8,0,1467811795,Mon Apr 06 22:20:05 PDT 2009,NO_QUERY,2Hood4Hollywood,@Tatiana_K nope they didn't have it
9,0,1467812025,Mon Apr 06 22:20:09 PDT 2009,NO_QUERY,mimismo,@twittera que me muera ?
