In [19]:
!pip install transformers
!pip install torch



In [20]:
from transformers import RobertaTokenizer
import pandas as pd
from torch.utils.data import DataLoader
from torch import cuda
from tqdm import tqdm
import torch
import torch.nn as nn
from transformers import RobertaModel
from torch.utils.data import Dataset

In [21]:
# Choose the compute device once: the GPU when PyTorch reports one, the CPU otherwise.
from torch import cuda
device = 'cpu'
if cuda.is_available():
    device = 'cuda'

In [22]:
# Open Colab's upload widget so the training CSV can be placed in the runtime;
# the recorded output below shows it being saved as file102.csv.
# NOTE(review): `uploaded` is not read anywhere else in the visible notebook.
from google.colab import files
uploaded = files.upload()

Saving file102.csv to file102.csv


In [23]:
# Key hyperparameters used by the dataset, the data loaders and the optimizer.
MAX_LEN = 256            # passed to the tokenizer as max_length for every tweet
TRAIN_BATCH_SIZE = 8
VALID_BATCH_SIZE = 4

# EPOCHS is assigned in the cell that launches the training loop.
LEARNING_RATE = 1e-05

# `truncation` and `do_lower_case` were dropped from this call: truncation is chosen
# per encode call, and lower-casing is a BERT-tokenizer option that RoBERTa's
# case-sensitive byte-level BPE tokenizer does not use.
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')


In [25]:
# Read the uploaded training tweets and preview them twice: once with every
# column, once narrowed to the text / sentiment pair.
train = pd.read_csv("file102.csv")
print(train.head())

new_df = train.loc[:, ['tweet_text', 'sentiment']]
print(new_df.head())

   Unnamed: 0                                         tweet_text  emotion_at  \
0           0  .@wesley83 I have a 3G iPhone. After 3 hrs twe...         1.0   
1           1  @jessedee Know about @fludapp ? Awesome iPad/i...         2.0   
2           2  @swonderlin Can not wait for #iPad 2 also. The...         3.0   
3           3  @sxsw I hope this year's festival isn't as cra...         2.0   
4           4  @sxtxstate great stuff on Fri #SXSW: Marissa M...         4.0   

   sentiment  tweet_len  word_count  polarity company  
0        2.0        127          23 -0.250000   apple  
1        1.0        139          22  0.466667   apple  
2        1.0         79          15 -0.155556   apple  
3        2.0         82          15  0.000000   apple  
4        1.0        131          17  0.800000  google  
                                          tweet_text  sentiment
0  .@wesley83 I have a 3G iPhone. After 3 hrs twe...        2.0
1  @jessedee Know about @fludapp ? Awesome iPad/i...   

In [26]:
# Inspect the sentiment label column; the recorded output shows 8589 float64 values.
train.sentiment

0       2.0
1       1.0
2       1.0
3       2.0
4       1.0
       ... 
8584    1.0
8585    3.0
8586    3.0
8587    3.0
8588    3.0
Name: sentiment, Length: 8589, dtype: float64

In [27]:
# Inspect the emotion_at label column (used as the second target by SentimentData);
# the recorded output shows 8589 float64 values.
train.emotion_at

0       1.0
1       2.0
2       3.0
3       2.0
4       4.0
       ... 
8584    3.0
8585    0.0
8586    0.0
8587    0.0
8588    0.0
Name: emotion_at, Length: 8589, dtype: float64

In [28]:
class SentimentData(Dataset):
    """Map-style dataset that tokenizes tweets and exposes two label columns.

    Every item is a dict of tensors: ``ids``, ``mask`` and ``token_type_ids``
    come from the tokenizer, ``sentiment_targets`` from the ``sentiment``
    column and ``emotion_targets`` from the ``emotion_at`` column.

    Args:
        dataframe: pandas DataFrame providing ``tweet_text``, ``sentiment``
            and ``emotion_at`` columns. Its index may be arbitrary (e.g. the
            result of filtering or a train/validation split).
        tokenizer: object exposing ``encode_plus`` (e.g. ``RobertaTokenizer``).
        max_len: length every encoded tweet is padded or truncated to.
    """

    def __init__(self, dataframe, tokenizer, max_len):
        self.data = dataframe
        self.tokenizer = tokenizer
        self.text = dataframe.tweet_text
        self.sentiment_targets = self.data.sentiment
        self.product_targets = self.data.emotion_at
        self.max_len = max_len

    def __len__(self):
        """Return the number of tweets in the dataset."""
        return len(self.text)

    def __getitem__(self, index):
        """Encode the tweet at position ``index`` and return it with its labels."""
        # Samplers hand out positions 0..len-1, so look rows up by position
        # (.iloc); label-based lookup raises KeyError once the frame's index
        # is no longer 0..n-1.
        text = str(self.text.iloc[index])
        # Collapse runs of whitespace (including newlines) into single spaces.
        text = " ".join(text.split())

        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            # Explicit padding/truncation replace the deprecated
            # pad_to_max_length flag and the implicit-truncation warning.
            padding='max_length',
            truncation=True,
            return_token_type_ids=True
        )
        ids = torch.tensor(inputs['input_ids'], dtype=torch.long)
        mask = torch.tensor(inputs['attention_mask'], dtype=torch.long)
        token_type_ids = torch.tensor(inputs["token_type_ids"], dtype=torch.long)

        return {
            'ids': ids,
            'mask': mask,
            'token_type_ids': token_type_ids,
            # Kept as float for backward compatibility; the training loop casts
            # it to long before computing the loss.
            'sentiment_targets': torch.tensor(self.sentiment_targets.iloc[index], dtype=torch.float),
            'emotion_targets': torch.tensor(self.product_targets.iloc[index], dtype=torch.long)
        }

In [42]:
# Wrap the training frame in the tokenizing dataset. Printing it only shows the
# default object repr, since SentimentData defines no __repr__.
training_set = SentimentData(dataframe=train, tokenizer=tokenizer, max_len=MAX_LEN)
print(training_set)

# Keyword bundles for DataLoader. train_params is unpacked when the training
# loader is built; test_params is not consumed anywhere in the visible notebook.
train_params = dict(batch_size=TRAIN_BATCH_SIZE, shuffle=True, num_workers=0)

test_params = dict(batch_size=VALID_BATCH_SIZE, shuffle=True, num_workers=0)


<__main__.SentimentData object at 0x78de4d613be0>


In [30]:
# Batch the training set using the options collected in train_params.
training_loader = DataLoader(dataset=training_set, **train_params)
print(training_loader)


<torch.utils.data.dataloader.DataLoader object at 0x78de54f409d0>


In [72]:
class RobertaClass(nn.Module):
    """RoBERTa encoder feeding a shared projection and two classification heads.

    Args:
        num_classes_sentiment: output width of the sentiment head (default 5).
        num_classes_product: output width of the product head (default 10).
    """

    def __init__(self, num_classes_sentiment=5, num_classes_product=10):
        super().__init__()
        self.l1 = RobertaModel.from_pretrained("roberta-base")
        # 768 is the input width of every linear layer below, so it has to
        # match the width of the encoder output that is fed into them.
        self.pre_classifier = nn.Linear(768, 768)
        self.dropout = nn.Dropout(0.3)
        self.classifier_sentiment = nn.Linear(768, num_classes_sentiment)
        self.classifier_product = nn.Linear(768, num_classes_product)

    def forward(self, input_ids, attention_mask, token_type_ids):
        """Return ``(sentiment_logits, product_logits)`` for a batch.

        All three arguments are forwarded unchanged to the encoder.
        """
        encoder_out = self.l1(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )
        # Take position 0 along the second axis of the encoder's first output
        # (presumably the per-token hidden states, so this is the first token
        # of each sequence - TODO confirm).
        first_token = encoder_out[0][:, 0]

        # Shared trunk: linear projection -> ReLU -> dropout.
        shared = self.dropout(nn.functional.relu(self.pre_classifier(first_token)))

        # Both heads read the same shared features.
        sentiment_logits = self.classifier_sentiment(shared)
        product_logits = self.classifier_product(shared)
        return sentiment_logits, product_logits


In [34]:
# Build the two-headed network and place it on the selected device
# (Module.to returns the module itself, so the chained form is equivalent).
model = RobertaClass().to(device)

# A single cross-entropy criterion is applied to both heads; Adam updates every parameter.
loss_function = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

def calcuate_accuracy(preds, targets):
    """Count how many predictions equal their targets.

    Despite the name, the result is a raw count of matches (a Python number),
    not a ratio or percentage.
    """
    hits = preds == targets
    return hits.sum().item()

def train_model(epoch):
    """Run one training pass over ``training_loader`` and print progress.

    Uses the notebook-level ``model``, ``optimizer``, ``loss_function``,
    ``training_loader``, ``device`` and ``EPOCHS``. The sentiment and emotion
    cross-entropy losses are summed into one loss per batch. Running averages
    are printed every 500 batches and once more at the end of the pass.

    Args:
        epoch: zero-based epoch index; only used in the printed headers.

    Returns:
        None.
    """
    tr_loss = 0
    n_correct_sentiment = 0
    n_correct_emotion = 0
    nb_tr_steps = 0
    nb_tr_examples = 0
    model.train()

    for batch_idx, data in enumerate(training_loader, 0):
        ids = data['ids'].to(device, dtype=torch.long)
        mask = data['mask'].to(device, dtype=torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
        # Both targets are cast to long because they are compared with class indices below.
        targets_sentiment = data['sentiment_targets'].to(device, dtype=torch.long)
        targets_emotion = data['emotion_targets'].to(device, dtype=torch.long)

        optimizer.zero_grad()
        outputs_sentiment, outputs_emotion = model(ids, mask, token_type_ids)

        loss_sentiment = loss_function(outputs_sentiment, targets_sentiment)
        loss_emotion = loss_function(outputs_emotion, targets_emotion)

        # Combine the two task losses with equal weight.
        loss = loss_sentiment + loss_emotion
        loss.backward()
        optimizer.step()

        tr_loss += loss.item()

        # Predictions are only needed for bookkeeping, so detach them from the graph.
        predicted_sentiment = outputs_sentiment.detach().argmax(dim=1)
        predicted_emotion = outputs_emotion.detach().argmax(dim=1)

        n_correct_sentiment += calcuate_accuracy(predicted_sentiment, targets_sentiment)
        n_correct_emotion += calcuate_accuracy(predicted_emotion, targets_emotion)

        nb_tr_steps += 1
        nb_tr_examples += targets_sentiment.size(0)

        if batch_idx % 500 == 0:
            loss_step = tr_loss / nb_tr_steps
            accu_sentiment_step = (n_correct_sentiment * 100) / nb_tr_examples
            accu_emotion_step = (n_correct_emotion * 100) / nb_tr_examples

            print(f"Epoch [{epoch + 1}/{EPOCHS}] - Batch [{batch_idx}/{len(training_loader)}]")
            print(f"  Loss: {loss_step:.4f}")
            print(f"  Sentiment Accuracy: {accu_sentiment_step:.2f}%")
            print(f"  Emotion Accuracy: {accu_emotion_step:.2f}%")

    # An empty loader would otherwise fail with ZeroDivisionError in the summary below.
    if nb_tr_steps == 0 or nb_tr_examples == 0:
        print(f"\nEpoch [{epoch + 1}/{EPOCHS}] - no batches in training_loader; nothing to train.")
        return

    epoch_loss = tr_loss / nb_tr_steps
    epoch_accu_sentiment = (n_correct_sentiment * 100) / nb_tr_examples
    epoch_accu_emotion = (n_correct_emotion * 100) / nb_tr_examples

    print(f"\nEpoch [{epoch + 1}/{EPOCHS}] - Summary:")
    print(f"  Total Loss: {epoch_loss:.4f}")
    print(f"  Total Sentiment Accuracy: {epoch_accu_sentiment:.2f}%")
    print(f"  Total Emotion Accuracy: {epoch_accu_emotion:.2f}%")
    print("--------------------------------------------------------")


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [35]:
# Number of full passes over the training loader. train_model reads this
# module-level name when formatting its progress headers, so it must be set
# before the loop starts.
EPOCHS = 10
for epoch in range(EPOCHS):
    train_model(epoch)


Epoch [1/10] - Batch [0/1074]
  Loss: 3.9149
  Sentiment Accuracy: 12.50%
  Emotion Accuracy: 0.00%
Epoch [1/10] - Batch [500/1074]
  Loss: 2.2603
  Sentiment Accuracy: 63.77%
  Emotion Accuracy: 61.23%
Epoch [1/10] - Batch [1000/1074]
  Loss: 1.9107
  Sentiment Accuracy: 68.08%
  Emotion Accuracy: 65.66%

Epoch [1/10] - Summary:
  Total Loss: 1.8691
  Total Sentiment Accuracy: 68.70%
  Total Emotion Accuracy: 66.32%
--------------------------------------------------------
Epoch [2/10] - Batch [0/1074]
  Loss: 1.4379
  Sentiment Accuracy: 87.50%
  Emotion Accuracy: 87.50%
Epoch [2/10] - Batch [500/1074]
  Loss: 1.2519
  Sentiment Accuracy: 77.92%
  Emotion Accuracy: 75.25%
Epoch [2/10] - Batch [1000/1074]
  Loss: 1.2098
  Sentiment Accuracy: 78.42%
  Emotion Accuracy: 76.04%

Epoch [2/10] - Summary:
  Total Loss: 1.2043
  Total Sentiment Accuracy: 78.50%
  Total Emotion Accuracy: 76.05%
--------------------------------------------------------
Epoch [3/10] - Batch [0/1074]
  Loss: 1.132

In [36]:
# Persist the trained network and the tokenizer vocabulary to the working directory.
output_model_file = 'pytorch_roberta_sentiment.bin'
output_vocab_file = './'

# NOTE(review): this stores the whole module object rather than its state_dict.
# The reload cell further down calls .state_dict() on the object returned by
# torch.load, so the save and load formats have to be changed together.
model_to_save = model
torch.save(model_to_save, output_model_file)
tokenizer.save_vocabulary(output_vocab_file)

print('All files saved')
print('This tutorial is completed')

All files saved
This tutorial is completed


In [37]:
# Open Colab's upload widget again, this time for the test CSV; the recorded
# output below shows it saved as "ML Assignment Dataset - Test.csv".
# NOTE(review): `files` was already imported by the earlier upload cell, and
# this call overwrites the earlier `uploaded` value.
from google.colab import files
uploaded = files.upload()

Saving ML Assignment Dataset - Test.csv to ML Assignment Dataset - Test.csv


In [38]:
# Load the held-out tweets.
# NOTE(review): the upload cell above recorded the file being saved as
# "ML Assignment Dataset - Test.csv", not "test.csv" - confirm that a file with
# this name exists in the runtime, otherwise a fresh run fails here.
test_df = pd.read_csv("test.csv")

In [39]:
# Display the raw test frame; the recorded output shows the columns "Unnamed: 0" and "Tweet".
test_df

Unnamed: 0,Tweet
0,Hand-Held ���Hobo�۪: Drafthouse launches ���Ho...
1,Again? RT @mention Line at the Apple store is ...
2,Boooo! RT @mention Flipboard is developing an ...
3,Thanks to @mention for publishing the news of ...
4,���@mention &quot;Apple has opened a pop-up st...
...,...
499,Hey is anyone doing #sxsw signing up for the g...
500,@mention you can buy my used iPad and I'll pic...
501,@mention You could buy a new iPad 2 tmrw at th...
502,"Guys, if you ever plan on attending #SXSW, you..."


In [40]:
# Rename exactly the "Tweet" column so the test frame uses the same text column
# name as the training data. An exact-match rename (rather than a substring
# replace over all column names) cannot touch other columns that merely
# contain "Tweet", and it is a no-op when the cell is run again.
test_df = test_df.rename(columns={"Tweet": "tweet_text"})

In [41]:
# Display the test frame again to confirm the text column is now called "tweet_text".
test_df

Unnamed: 0,tweet_text
0,Hand-Held ���Hobo�۪: Drafthouse launches ���Ho...
1,Again? RT @mention Line at the Apple store is ...
2,Boooo! RT @mention Flipboard is developing an ...
3,Thanks to @mention for publishing the news of ...
4,���@mention &quot;Apple has opened a pop-up st...
...,...
499,Hey is anyone doing #sxsw signing up for the g...
500,@mention you can buy my used iPad and I'll pic...
501,@mention You could buy a new iPad 2 tmrw at th...
502,"Guys, if you ever plan on attending #SXSW, you..."


In [75]:
# Reload the saved network and classify one hand-written tweet.
output_model_file = 'pytorch_roberta_sentiment.bin'
# Directory the vocabulary was written to by tokenizer.save_vocabulary; defined
# here so this cell does not depend on the save cell having run in this session.
output_vocab_file = './'

# SECURITY: torch.load unpickles the file, which can execute arbitrary code -
# only load checkpoints you produced yourself.
# map_location='cpu' lets a checkpoint saved on a GPU runtime load on a CPU-only
# one; the weights are copied into a CPU model below either way.
loaded_model = torch.load(output_model_file, map_location='cpu')

# Extract the state dictionary from the loaded model
state_dict = loaded_model.state_dict()

# Create an instance of your model and load the state dictionary
model = RobertaClass(num_classes_sentiment=5, num_classes_product=10)
model.load_state_dict(state_dict)
model.eval()  # disable dropout for inference

tokenizer = RobertaTokenizer.from_pretrained(output_vocab_file)

# Example: Process a tweet using the loaded model
tweet = "I love this apple iPad! It makes my life so much easier. 😃"
inputs = tokenizer.encode_plus(
            tweet,
            None,
            add_special_tokens=True,
            # Same length budget as training; explicit padding/truncation replace
            # the deprecated pad_to_max_length flag and the implicit-truncation warning.
            max_length=MAX_LEN,
            padding='max_length',
            truncation=True,
            return_token_type_ids=True
)

# Give every input a leading batch dimension of 1, token_type_ids included, so
# all three tensors have matching shapes.
ids = torch.tensor(inputs['input_ids'], dtype=torch.long).unsqueeze(0)
mask = torch.tensor(inputs['attention_mask'], dtype=torch.long).unsqueeze(0)
token_type_ids = torch.tensor(inputs["token_type_ids"], dtype=torch.long).unsqueeze(0)

# Forward pass through the loaded model
with torch.no_grad():
    outputs_sentiment, outputs_product = model(ids, mask, token_type_ids)

# Get predicted classes (index of the largest logit per head)
_, predicted_sentiment = torch.max(outputs_sentiment, 1)
_, predicted_product = torch.max(outputs_product, 1)


# Print the results
print(f"Predicted Sentiment: {predicted_sentiment.item()}")
print(f"Predicted Product: {predicted_product.item()}")


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Predicted Sentiment: 1
Predicted Product: 0
