In [30]:
from transformers import BertTokenizer, BertModel
import torch

In [31]:
# Fetch the pre-trained uncased BERT base checkpoint: tokenizer plus encoder.
# output_hidden_states=True asks the model to return its hidden states as well,
# which the embedding cells below read from the output.
checkpoint = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(checkpoint)
model = BertModel.from_pretrained(checkpoint, output_hidden_states=True)

In [70]:
from nltk.corpus import movie_reviews
import nltk
import string

# NOTE: this cell needs the NLTK `movie_reviews` and `stopwords` corpora to be
# available locally (e.g. fetched beforehand with `nltk.download(...)`); it does
# not download them itself.

# Load every review as a sequence of word tokens: positives first, then negatives.
pos_reviews = [movie_reviews.words(fileid) for fileid in movie_reviews.fileids('pos')]
neg_reviews = [movie_reviews.words(fileid) for fileid in movie_reviews.fileids('neg')]
reviews = pos_reviews + neg_reviews
# Labels follow the same order as `reviews`: 1 = positive, 0 = negative.
labels = [1] * len(pos_reviews) + [0] * len(neg_reviews)

stop_words = set(nltk.corpus.stopwords.words('english'))
punctuation = set(string.punctuation)  # set of single punctuation characters
filtered_reviews = []

for review in reviews:
    words = []
    for word in review:
        token = word.lower()  # lower-case once instead of once per comparison
        if token in stop_words:
            continue
        # Drop tokens made up solely of punctuation. Testing character by character
        # also removes multi-character tokens such as "--" or "...", which a plain
        # membership test against the single-character set lets through.
        # (An empty token, should one ever occur, is dropped as well.)
        if all(char in punctuation for char in token):
            continue
        words.append(token)
    # Keep only the first 20 surviving tokens of each review as its text.
    filtered_reviews.append(" ".join(words[:20]))

In [32]:
# Example sentence to embed; the following cells tokenize it and feed it to `model`.
input_text = "I love natural language processing."

In [33]:
# Split the sentence into tokens and wrap them in BERT's [CLS] / [SEP] markers
tokenized_text = ["[CLS]", *tokenizer.tokenize(input_text), "[SEP]"]
# Map every token to its vocabulary id
input_ids = tokenizer.convert_tokens_to_ids(tokenized_text)

In [34]:
# Turn the id list into a tensor and prepend a batch dimension of size 1
input_tensor = torch.tensor(input_ids).unsqueeze(0)

In [35]:
# Forward pass only: gradient tracking is switched off because nothing is trained here
with torch.no_grad():
    outputs = model(input_ids=input_tensor)

In [36]:
# Position 2 of the model output carries the hidden states
# (returned because the model was loaded with output_hidden_states=True)
hidden_states = outputs[2]
# The final entry is the last layer's hidden state for each token
word_embeddings = hidden_states[-1]

# Sentence embedding = mean of the token embeddings (dim=1), with singleton dims squeezed away
sentence_embedding = word_embeddings.mean(dim=1).squeeze()

print("Word Embeddings:", word_embeddings, sep="\n")
print("Sentence Embedding:", sentence_embedding, sep="\n")

Word Embeddings:
tensor([[[-0.0419,  0.0434, -0.2534,  ..., -0.2518,  0.2330,  0.6576],
         [ 0.6340,  0.1373, -0.5598,  ..., -0.0658,  0.5974,  0.5542],
         [ 1.1149,  1.2050,  0.3571,  ..., -0.0769,  0.4818,  0.4114],
         ...,
         [ 0.7493,  0.3590, -0.4548,  ..., -0.9851, -0.3001, -0.3983],
         [ 0.5699,  0.1807, -0.3210,  ...,  0.3424, -0.4560, -0.3952],
         [ 0.7647,  0.1535, -0.1123,  ...,  0.2889, -0.6728, -0.2865]]])
Sentence Embedding:
tensor([ 4.2820e-01,  3.5509e-01, -1.8181e-01, -4.7634e-02,  1.4228e-01,
        -2.1608e-01, -6.9977e-03,  7.1022e-01,  9.0334e-03, -2.1341e-01,
        -9.2669e-02, -8.8439e-02, -2.0178e-02,  6.1247e-02, -1.3540e-01,
        -3.8201e-02,  1.1978e-01,  1.1908e-01, -1.6284e-01,  3.4451e-01,
         3.7351e-01,  2.1084e-01, -2.2698e-01,  4.1619e-01,  6.0871e-01,
        -2.1485e-01, -4.0662e-01,  1.5206e-01, -4.7932e-01, -3.1712e-01,
        -5.8468e-02, -4.1601e-02, -2.3040e-01,  1.2369e-01, -1.9613e-01,
        -3

In [37]:
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import AdamW
from torch.utils.data import DataLoader, TensorDataset
import torch
from sklearn.model_selection import train_test_split

In [58]:
# Text-classification inputs.
# Earlier toy example, kept for reference only (commented out):
# texts = ["I love natural language processing.", "This movie is great!", "I don't like this product.", "The weather today is nice."]
# labels = [1, 1, 0, 1]  # Binary labels (1: positive, 0: negative)
# Use the preprocessed movie reviews instead. The matching `labels` list
# (1 = positive, 0 = negative) is built in the same cell that creates `filtered_reviews`.
texts = filtered_reviews

In [65]:
# Sanity check: there should be exactly one label per text
len(texts), len(labels)

(2000, 2000)

In [66]:
# Reload the tokenizer and swap in BERT with a sequence-classification head.
# NOTE: this rebinds `tokenizer` and `model`, replacing the plain encoder loaded earlier.
checkpoint = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(checkpoint)
# num_labels=2 -> binary classification
model = BertForSequenceClassification.from_pretrained(checkpoint, num_labels=2)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [67]:
# Encode all texts in one batch as PyTorch tensors, with padding and truncation enabled
tokenized_texts = tokenizer(
    texts,
    return_tensors="pt",
    truncation=True,
    padding=True,
)


In [72]:
# Split data into training and testing sets.
# Ids, attention masks and labels go through ONE train_test_split call so all three
# are cut with the same row selection by construction. Previously the masks were
# split in a second, separate call and stayed aligned only because both calls
# happened to share the same random_state and number of rows.
# train_test_split returns a (train, test) pair per input array, in input order.
(
    input_ids_train, input_ids_test,
    attention_masks_train, attention_masks_test,
    labels_train, labels_test,
) = train_test_split(
    tokenized_texts['input_ids'],
    tokenized_texts['attention_mask'],
    labels,
    test_size=0.2,     # hold out 20% of the examples for testing
    random_state=42,   # fixed seed -> the same split on every run
)


In [73]:
# Wrap each split in a TensorDataset of (input ids, attention mask, label) and batch it.
train_labels_tensor = torch.tensor(labels_train)
train_data = TensorDataset(input_ids_train, attention_masks_train, train_labels_tensor)
# Training batches are reshuffled each epoch
train_loader = DataLoader(dataset=train_data, batch_size=4, shuffle=True)

test_labels_tensor = torch.tensor(labels_test)
test_data = TensorDataset(input_ids_test, attention_masks_test, test_labels_tensor)
# Test batches keep their original order
test_loader = DataLoader(dataset=test_data, batch_size=4, shuffle=False)


In [74]:
# Set optimizer and learning rate scheduler.
# Use PyTorch's own AdamW: the `AdamW` class exported by `transformers` is deprecated
# in favour of `torch.optim.AdamW`. `weight_decay` and `eps` are pinned to the legacy
# transformers defaults (0.0 and 1e-6) so optimisation behaves as before; torch's own
# defaults differ (0.01 and 1e-8).
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, weight_decay=0.0, eps=1e-6)
# StepLR with step_size=1 multiplies the learning rate by gamma (0.9) on every
# scheduler.step() call.
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.9)


In [None]:
# Fine-tune the classifier for three passes over the training data.
model.train()
for epoch in range(3):
    total_loss = 0
    for input_ids_batch, attention_masks_batch, labels_batch in train_loader:
        optimizer.zero_grad()
        # Passing `labels` makes the model return the loss alongside its predictions
        outputs = model(
            input_ids_batch,
            attention_mask=attention_masks_batch,
            labels=labels_batch,
        )
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    # The learning rate is decayed once per epoch, after all batches
    scheduler.step()
    print(f"Epoch {epoch + 1}, Average Loss: {total_loss / len(train_loader)}")


In [None]:
# Evaluation on test data: switch the model to eval mode and start from empty buffers
model.eval()
predictions, true_labels = [], []


In [None]:
# Run the classifier over the held-out split and collect predictions and gold labels.
# The cell is self-contained on purpose:
#   * eval mode is set here, so evaluation never runs in training mode if the
#     training cell was re-run since the last model.eval() call;
#   * both result lists are reset here, so re-running the cell does not append a
#     second copy of the results to the previous ones.
model.eval()
predictions = []
true_labels = []
with torch.no_grad():  # inference only, no gradients needed
    for batch in test_loader:
        input_ids_batch, attention_masks_batch, labels_batch = batch
        outputs = model(input_ids_batch, attention_mask=attention_masks_batch)
        logits = outputs.logits
        # Predicted class = index of the largest logit in each row
        predictions.extend(torch.argmax(logits, dim=1).tolist())
        true_labels.extend(labels_batch.tolist())

In [None]:
# Accuracy = share of test examples whose predicted class equals the true label
matches = torch.tensor(predictions).eq(torch.tensor(true_labels))
accuracy = matches.sum().item() / len(true_labels)
print(f"Test Accuracy: {accuracy * 100:.2f}%")


In [None]:
# Single ad-hoc sentence used to spot-check the fine-tuned classifier.
# Note: this rebinds `input_text`, which the embedding cells earlier in the notebook also use.
input_text = "I love this movie!"

In [None]:
# Encode the single sentence as PyTorch tensors, using the same options as the training texts
tokenized_input = tokenizer(
    input_text,
    return_tensors="pt",
    truncation=True,
    padding=True,
)

In [None]:
# Classify the sample sentence with the trained model
model.eval()
with torch.no_grad():
    outputs = model(**tokenized_input)
    logits = outputs.logits
    # Index of the largest logit is the predicted class id
    prediction = logits.argmax(dim=1).item()

# Class id 1 is reported as positive, anything else as negative
if prediction == 1:
    predicted_label = "positive"
else:
    predicted_label = "negative"

print(f"Input Text: {input_text}")
print(f"Predicted Label: {predicted_label}")