In [1]:
!pip install torch transformers pandas scikit-learn


Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Collecting transformers
  Downloading transformers-4.36.2-py3-none-any.whl (8.2 MB)
[K     |████████████████████████████████| 8.2 MB 20.4 MB/s eta 0:00:01
Collecting tokenizers<0.19,>=0.14
  Downloading tokenizers-0.15.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
[K     |████████████████████████████████| 3.8 MB 119.9 MB/s eta 0:00:01
Collecting huggingface-hub<1.0,>=0.19.3
  Downloading huggingface_hub-0.20.1-py3-none-any.whl (330 kB)
[K     |████████████████████████████████| 330 kB 109.5 MB/s eta 0:00:01
Installing collected packages: huggingface-hub, tokenizers, transformers
  Attempting uninstall: huggingface-hub
    Found existing installation: huggingface-hub 0.18.0
    Uninstalling huggingface-hub-0.18.0:
      Successfully uninstalled huggingface-hub-0.18.0
Successfully installed huggingface-hub-0.20.1 tokenizers-0.15.0 transformers-4.36.2


In [2]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import pandas as pd
from tqdm.notebook import tqdm

In [None]:
# Load the raw tweet corpus. Malformed rows are skipped with a warning.
# `on_bad_lines` supersedes `error_bad_lines=False`, which was deprecated in
# pandas 1.3 and removed in pandas 2.0.
data = pd.read_csv("twitter_data.csv", on_bad_lines="warn")


In [8]:
# Keep only the label and the tweet text.
# errors='ignore' makes the cell safe to re-run: `data` is rebound in place,
# so on a second execution the two columns are already gone.
data = data.drop(columns=['ItemID', 'SentimentSource'], errors='ignore')

In [9]:
# Preview the frame after the column drop (pandas elides the middle rows of a long frame).
data

Unnamed: 0,Sentiment,SentimentText
0,0,is so sad for my APL frie...
1,0,I missed the New Moon trail...
2,1,omg its already 7:30 :O
3,0,.. Omgaga. Im sooo im gunna CRy. I'...
4,0,i think mi bf is cheating on me!!! ...
...,...,...
1578607,1,Zzzzzz.... Finally! Night tweeters!
1578608,1,"Zzzzzzz, sleep well people"
1578609,0,ZzzZzZzzzZ... wait no I have homework.
1578610,0,"ZzZzzzZZZZzzz meh, what am I doing up again?"


In [10]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
train_df, test_df = train_test_split(
    data,
    test_size=0.2,
    random_state=42,
)


In [11]:
# Preview the training split; the index shows the rows were shuffled by the split.
train_df

Unnamed: 0,Sentiment,SentimentText
363906,1,@p3cia hihi.. already looked
1002673,1,@lizzylou62 Good luck with the exams!
1257527,0,The krispy kreme in CT is so closed
495883,1,@TomJ93 because of what @_nanu_ said
445457,0,@TellYaFriday I have nothing else to do...i'm...
...,...,...
259178,1,@kristinburbey Hey new friend.... what is up?
1414414,0,got burnt during work. hurts...
131932,1,@DommeJezebel ya know radiant Dommes must refu...
671155,0,@BUNCHiEB but i just had mccalisters


In [12]:
class CustomDataset(Dataset):
    """Map-style dataset that tokenizes one text per item for sequence classification.

    Args:
        texts: Sequence of raw texts (list, NumPy array, pandas Series, ...).
        labels: Sequence of integer-convertible class labels, aligned with ``texts``.
        tokenizer: Callable invoked as
            ``tokenizer(text, truncation=True, padding='max_length',
            max_length=max_len, return_tensors='pt')`` that returns a mapping
            containing ``'input_ids'`` and ``'attention_mask'`` tensors.
        max_len: Length every encoded sequence is truncated/padded to.

    Raises:
        ValueError: If ``texts`` and ``labels`` do not have the same length.
    """

    def __init__(self, texts, labels, tokenizer, max_len):
        # Materialise as plain lists so that item lookup is always positional.
        # A pandas Series indexed with an int does a *label* lookup, which fails
        # (or returns the wrong row) once the frame has been shuffled or split.
        self.texts = list(texts)
        self.labels = list(labels)
        if len(self.texts) != len(self.labels):
            raise ValueError(
                f"texts and labels must have the same length, "
                f"got {len(self.texts)} and {len(self.labels)}"
            )
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of (text, label) pairs."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize the ``idx``-th text and return model-ready tensors.

        Returns:
            dict with 1-D ``'input_ids'`` and ``'attention_mask'`` tensors and a
            scalar ``'label'`` tensor of dtype ``torch.long``.
        """
        # str() guards against non-string cells (e.g. NaN read from the CSV).
        text = str(self.texts[idx])
        label = int(self.labels[idx])
        encoding = self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_len,
            return_tensors='pt',
        )

        return {
            # Flatten to 1-D so that batching stacks items along a new leading dimension.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long)
        }


In [44]:
# Pretrained BERT encoder with a freshly initialised 2-class classification head.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# Prefer the second GPU when one exists; otherwise fall back to the first GPU,
# and to the CPU when CUDA is unavailable. Requesting 'cuda:1' unconditionally
# fails on single-GPU machines even though torch.cuda.is_available() is True.
if torch.cuda.is_available():
    gpu_index = 1 if torch.cuda.device_count() > 1 else 0
    device = torch.device(f'cuda:{gpu_index}')
else:
    device = torch.device('cpu')
model.to(device)
device

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


device(type='cuda', index=1)

In [61]:
# Hyper-parameters for fine-tuning.
batch_size, max_len = 64, 128
epochs, lr = 2, 2e-5


def _build_dataset(frame):
    """Wrap the text and label columns of `frame` in a CustomDataset."""
    return CustomDataset(
        frame['SentimentText'].values,
        frame['Sentiment'].values,
        tokenizer,
        max_len,
    )


train_dataset = _build_dataset(train_df)
test_dataset = _build_dataset(test_df)

# Only the training batches are shuffled; evaluation order stays fixed.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)


In [62]:
# Use PyTorch's AdamW: the AdamW class shipped with transformers is deprecated
# in favour of torch.optim.AdamW. eps and weight_decay are set explicitly to the
# defaults of the transformers implementation (1e-6 and 0.0) so that the
# optimisation hyper-parameters stay the same; torch's own defaults differ.
optimizer = torch.optim.AdamW(model.parameters(), lr=lr, eps=1e-6, weight_decay=0.0)
# NOTE(review): loss_fn is not referenced by the training loop in this notebook,
# which reads the loss from the model output instead — kept in case other cells use it.
loss_fn = torch.nn.CrossEntropyLoss()


In [63]:
# NOTE(review): this rebinds `tqdm`, replacing the `tqdm.notebook` variant imported
# in the notebook's import cell; every later cell gets the plain-text progress bar.
from tqdm import tqdm

# Fine-tune the classifier for `epochs` passes over the training loader.
for epoch in range(epochs):
    model.train()
    train_loss = 0.0  # running sum of the per-batch losses of this epoch

    progress_bar = tqdm(train_loader, desc=f'Epoch {epoch + 1}', leave=False)

    for batch in progress_bar:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        optimizer.zero_grad()

        # Passing `labels` makes the model return the loss with its outputs.
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        loss.backward()
        optimizer.step()

        # Read the scalar once and reuse it for both the total and the progress bar.
        batch_loss = loss.item()
        train_loss += batch_loss
        progress_bar.set_postfix({'Batch Loss': batch_loss})

    avg_train_loss = train_loss / len(train_loader)
    print(f'Epoch {epoch + 1}/{epochs}, Average Training Loss: {avg_train_loss:.4f}')





                                                                                                                        

Epoch 1/2, Average Training Loss: 0.3238


                                                                                                                        

Epoch 2/2, Average Training Loss: 0.2676




In [66]:
# Evaluation pass: collect the predicted and true class of every held-out example.
model.eval()
test_predictions = []
test_labels = []
with torch.no_grad():  # inference only, no gradients needed
    for batch in tqdm(test_loader, desc='Test'):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        # No labels are passed here, so only the logits are used.
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits

        # The predicted class is the index of the largest logit in each row.
        predicted = torch.max(logits, 1).indices

        test_predictions.extend(predicted.cpu().numpy())
        test_labels.extend(labels.cpu().numpy())

Test: 100%|█████████████████████████████████████████████████████████████████████████| 4934/4934 [04:16<00:00, 19.22it/s]


In [68]:
# Fraction of held-out tweets whose predicted class matches the true label.
accuracy = accuracy_score(y_true=test_labels, y_pred=test_predictions)
print(f'Test Accuracy: {accuracy:.4f}')

Test Accuracy: 0.8749
