In [1]:
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings("ignore")

In [2]:
df = pd.read_csv('Dataset/Twitter/train.csv', encoding = "ISO-8859-1", header = None)[[0, 5]].rename(columns = {0: 'target', 5: 'text'})
df['target'] = np.where(df['target']==4, 0, 1)
df.head()

Unnamed: 0,target,text
0,1,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,1,is upset that he can't update his Facebook by ...
2,1,@Kenichan I dived many times for the ball. Man...
3,1,my whole body feels itchy and like its on fire
4,1,"@nationwideclass no, it's not behaving at all...."


In [3]:
from sklearn.model_selection import train_test_split
# df1 = df[799900:800100]
df1 = df[793000:-793000]
# df1 = df
df1.head()

Unnamed: 0,target,text
793000,1,"Ended up doing the tai chi, pilates and yoga c..."
793001,1,"@Footdr69 OhhhH!! Anbosol, stat!"
793002,1,got woken up and can't go back to sleep peopl...
793003,1,I wore my red flats to work today so i could m...
793004,1,@ekrelly notto the no rehab thing


In [4]:
df1['target'].value_counts()

1    7000
0    7000
Name: target, dtype: int64

In [5]:
from pandarallel import pandarallel
pandarallel.initialize(progress_bar=True)

INFO: Pandarallel will run on 14 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.

https://nalepae.github.io/pandarallel/troubleshooting/


In [6]:
def custom_standardization(input_data):
    import preprocessor as p
    processed_data = p.clean(input_data)
    lowercase_value = processed_data.lower()
    return lowercase_value

# def remove_stopwords(text):
#     import nltk
#     from nltk.corpus import stopwords
#     stop_words = stopwords.words('english')
#     text = ' '.join(word for word in text.split(' ') if word not in stop_words and len(word) > 3)
#     return text

# def stemm_text(text):
#     import nltk
#     from nltk.corpus import stopwords
#     stemmer = nltk.SnowballStemmer("english")
#     text = ' '.join(stemmer.stem(word) for word in text.split(' '))
#     return text

In [7]:
df1["text"] = df1["text"].parallel_apply(custom_standardization)
# df1['text'] = df1['text'].parallel_apply(remove_stopwords)
# df1['text'] = df1['text'].parallel_apply(stemm_text)
df1.head()

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=1000), Label(value='0 / 1000'))), …

Unnamed: 0,target,text
793000,1,"ended up doing the tai chi, pilates and yoga c..."
793001,1,"ohhhh!! anbosol, stat!"
793002,1,got woken up and can't go back to sleep people...
793003,1,i wore my red flats to work today so i could m...
793004,1,notto the no rehab thing


In [8]:
max(df1['text'].str.split().apply(lambda x: len(x)))

32

In [9]:
train_df, test_df = train_test_split(df1, test_size=0.2, random_state = 99)

In [10]:
import torch
import numpy as np
from transformers import BertTokenizer
from transformers import AutoTokenizer, AutoModel

tokenizer = AutoTokenizer.from_pretrained("cardiffnlp/twitter-roberta-base")
labels = {0: 0, 1: 1}

class Dataset(torch.utils.data.Dataset):

    def __init__(self, df):

        self.labels = [labels[label] for label in df['target']]
        self.texts = [tokenizer(text, 
                               padding='max_length', max_length = 64, truncation=True,
                                return_tensors="pt") for text in tqdm(df['text'])]
    def get_batch_labels(self, idx):
        # Fetch a batch of labels
        return np.array(self.labels[idx])

    def get_batch_texts(self, idx):
        # Fetch a batch of inputs
        return self.texts[idx]

    def classes(self):
        return self.labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):

        batch_texts = self.get_batch_texts(idx)
        batch_y = self.get_batch_labels(idx)

        return batch_texts, batch_y

In [11]:
from torch import nn
from transformers import BertConfig, BertModel

class BertClassifier(nn.Module):

    def __init__(self, dropout=0.1):
        
        super(BertClassifier, self).__init__()
        
        self.bert = AutoModel.from_pretrained("cardiffnlp/twitter-roberta-base")
        self.dropout = nn.Dropout(dropout)
        self.linear = nn.Linear(768, 192)
        self.linear1 = nn.Linear(192, 64)
        self.linear2 = nn.Linear(64, 2)
        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()

    def forward(self, input_id, mask):
        _, pooled_output = self.bert(input_ids= input_id,attention_mask=mask,return_dict=False)
        dropout_output = self.dropout(pooled_output)
        linear_output1 = self.linear(dropout_output)
        linear_output1 = self.relu(linear_output1)
        linear_output2 = self.linear1(linear_output1)
        linear_output2 = self.relu(linear_output2)
        linear_output3 = self.linear2(linear_output2)
        final_layer_relu = self.relu(linear_output3)
        final_layer = self.sigmoid(linear_output3)
        
        return final_layer

In [12]:
from torch.optim import Adam
from tqdm.notebook import tqdm
from alive_progress import alive_bar
import time

In [13]:
print(torch.__version__)
print(torch.version.cuda)
print(torch.backends.cudnn.version())
print(torch.cuda.get_device_name(0))
print(torch.cuda.get_device_properties(0))

1.13.1+cu117
11.7
8500
NVIDIA GeForce RTX 3060 Laptop GPU
_CudaDeviceProperties(name='NVIDIA GeForce RTX 3060 Laptop GPU', major=8, minor=6, total_memory=6143MB, multi_processor_count=30)


In [14]:
torch.cuda.current_device()

0

In [15]:
%%time
# torch.cuda.empty_cache()
torch.backends.cudnn.benchmark = True

def train(model, train_data, val_data, learning_rate, epochs):

    train, val = Dataset(train_data), Dataset(val_data)
    
    train_dataloader = torch.utils.data.DataLoader(train, batch_size = 64, shuffle=False) #, num_workers=4)
    val_dataloader = torch.utils.data.DataLoader(val, batch_size = 64) #, num_workers = 4)

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    criterion = nn.CrossEntropyLoss()
    optimizer = Adam(model.parameters(), lr= learning_rate)

    if use_cuda:
            model = model.cuda()
            criterion = criterion.cuda()
            
    # model, optimizer, dataloader = accelerator.prepare(model, optimizer, train_dataloader)

    for epoch_num in range(epochs):

            total_acc_train = 0
            total_loss_train = 0
            count = 0
            print(f'EPOCH NUMBER {epoch_num}')
            for train_input, train_label in tqdm(train_dataloader):
                train_label = train_label.to(device)
                mask = train_input['attention_mask'].to(device)
                input_id = train_input['input_ids'].squeeze(1).to(device)
                output = model(input_id, mask)

                batch_loss = criterion(output, train_label.long())
                total_loss_train += batch_loss.item()

                acc = (output.argmax(dim=1) == train_label).sum().item()
                total_acc_train += acc

                model.zero_grad(set_to_none = True)
                batch_loss.backward()
                optimizer.step()              
            
            total_acc_val = 0
            total_loss_val = 0

            with torch.no_grad():

                for val_input, val_label in tqdm(val_dataloader):
                    count += 1
                    val_label = val_label.to(device)
                    mask = val_input['attention_mask'].to(device)
                    input_id = val_input['input_ids'].squeeze(1).to(device)

                    output = model(input_id, mask)

                    batch_loss = criterion(output, val_label.long())
                    total_loss_val += batch_loss.item()
                    
                    acc = (output.argmax(dim=1) == val_label).sum().item()
                    total_acc_val += acc
            
            print(
                f'Epochs: {epoch_num + 1} | Train Loss: {total_loss_train / len(train_data): .3f} \
                | Train Accuracy: {total_acc_train / len(train_data): .3f} \
                | Val Loss: {total_loss_val / len(val_data): .3f} \
                | Val Accuracy: {total_acc_val / len(val_data): .3f}'
            )
            
    return model
                  
EPOCHS = 5
model = BertClassifier()
LR = 8e-6
              
final_model = train(model, (train_df), (test_df), LR, EPOCHS)

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base were not used when initializing RobertaModel: ['lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'lm_head.decoder.bias', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/11200 [00:00<?, ?it/s]

  0%|          | 0/2800 [00:00<?, ?it/s]

EPOCH NUMBER 0


  0%|          | 0/175 [00:00<?, ?it/s]

  0%|          | 0/44 [00:00<?, ?it/s]

Epochs: 1 | Train Loss:  0.010                 | Train Accuracy:  0.783                 | Val Loss:  0.008                 | Val Accuracy:  0.831
EPOCH NUMBER 1


  0%|          | 0/175 [00:00<?, ?it/s]

  0%|          | 0/44 [00:00<?, ?it/s]

Epochs: 2 | Train Loss:  0.008                 | Train Accuracy:  0.858                 | Val Loss:  0.008                 | Val Accuracy:  0.846
EPOCH NUMBER 2


  0%|          | 0/175 [00:00<?, ?it/s]

  0%|          | 0/44 [00:00<?, ?it/s]

Epochs: 3 | Train Loss:  0.007                 | Train Accuracy:  0.888                 | Val Loss:  0.008                 | Val Accuracy:  0.825
EPOCH NUMBER 3


  0%|          | 0/175 [00:00<?, ?it/s]

  0%|          | 0/44 [00:00<?, ?it/s]

Epochs: 4 | Train Loss:  0.007                 | Train Accuracy:  0.896                 | Val Loss:  0.007                 | Val Accuracy:  0.842
EPOCH NUMBER 4


  0%|          | 0/175 [00:00<?, ?it/s]

  0%|          | 0/44 [00:00<?, ?it/s]

Epochs: 5 | Train Loss:  0.006                 | Train Accuracy:  0.910                 | Val Loss:  0.007                 | Val Accuracy:  0.846
CPU times: total: 3min 29s
Wall time: 7min 58s


64 = 38.2s
32 = 42.7s
16 = 58.6s 

In [16]:
test = pd.read_csv('Dataset/Twitter/test.csv', encoding = "ISO-8859-1", usecols = ["Sentiment","SentimentText"]).rename(columns = {'Sentiment': 'target', 'SentimentText': 'text'})
test['target'] = np.where(test['target']==1, 0, 1)
print(len(test))
test.head()

1578614


Unnamed: 0,target,text
0,1,is so sad for my APL frie...
1,1,I missed the New Moon trail...
2,0,omg its already 7:30 :O
3,1,.. Omgaga. Im sooo im gunna CRy. I'...
4,1,i think mi bf is cheating on me!!! ...


In [17]:
from sklearn.model_selection import train_test_split
# df1 = df[799900:800100]
test1 = test[787800:-787800]

In [18]:
test1['target'].value_counts()

0    1786
1    1228
Name: target, dtype: int64

In [19]:
from pandarallel import pandarallel
pandarallel.initialize(progress_bar=True)

INFO: Pandarallel will run on 14 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.

https://nalepae.github.io/pandarallel/troubleshooting/


In [20]:
def custom_standardization(input_data):
    import preprocessor as p
    processed_data = p.clean(input_data)
    lowercase_value = processed_data.lower()
    return lowercase_value

In [21]:
test1["text"] = test1["text"].parallel_apply(custom_standardization)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=216), Label(value='0 / 216'))), HB…

In [22]:
def evaluate(model, test_data):
    # test_data = pd.DataFrame([['I want to kill myself', 1], 
    #                           ['I want to die', 1]], columns = ['text', 'target'])
    test = Dataset(test_data)
    test_dataloader = torch.utils.data.DataLoader(test, batch_size=32)

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    if use_cuda:
        model = final_model.cuda()

    total_acc_test = 0
    with torch.no_grad():
        count = 0
        for test_input, test_label in tqdm(test_dataloader):
            # print(test_data.iloc[count]['text'], test_data.iloc[count]['target'])
            # count += 1
            # if(count == 10):
            #     return 
            
            test_label = test_label.to(device)
            mask = test_input['attention_mask'].to(device)
            input_id = test_input['input_ids'].squeeze(1).to(device)

            output = model(input_id, mask)
            # print(output, output.argmax(dim=1))

            acc = (output.argmax(dim=1) == test_label).sum().item()
            total_acc_test += acc
    
    print(f'Test Accuracy: {total_acc_test / len(test_data): .3f}')
    
evaluate(model, test1)

# GELU - 86.4 | 14000 
# RELU6 - 85 | 14000

  0%|          | 0/3014 [00:00<?, ?it/s]

  0%|          | 0/95 [00:00<?, ?it/s]

Test Accuracy:  0.867


In [23]:
print(test1.iloc[0]['text'])

yeah i know. maybe we can hook up later tonight or tomorrow ... i'm here til mon. i'll text you


In [24]:
print(model.parameters)

<bound method Module.parameters of BertClassifier(
  (bert): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
       