In [27]:
# Insert code here.
import pandas as pd
import numpy as np
import random
import re
import time
import datetime
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm, neighbors
from sklearn.preprocessing import LabelEncoder
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW, BertConfig, AutoModel
import torch
from torch.utils.data import TensorDataset, random_split, DataLoader, RandomSampler, SequentialSampler
from transformers import BertForSequenceClassification
from transformers import get_linear_schedule_with_warmup
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import classification_report
from torch.utils.data import Dataset
from tqdm import tqdm
import pytorch_lightning as pl
import neptune
with open("token", "r") as f:
    token = f.read()
neptune.init(project_qualified_name='tathagataraha/contro-base',
             api_token=token,
             )
# from sentence_transformers import SentenceTransformer
# sent_encoder = SentenceTransformer('bert-base-nli-mean-tokens')

Project(tathagataraha/contro-base)

In [28]:
if torch.cuda.is_available():    

    # Tell PyTorch to use the GPU.    
    device = torch.device("cuda:0")

    print('There are %d GPU(s) available.' % torch.cuda.device_count())

    print('We will use the GPU:', torch.cuda.get_device_name(0))

# If not...
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
torch.cuda.empty_cache()

There are 4 GPU(s) available.
We will use the GPU: GeForce RTX 2080 Ti


In [29]:
models = ['bert-base-uncased','roberta-base', 'vinai/bertweet-base']
labels = ['is_cont', 'humor_rating', 'humor_controversy', 'offense_rating']
model_num = 1
label_num = 0

In [30]:
params = {
    'model' : models[model_num],
    'label' : labels[label_num],
    'valid_size' : 0.2,
    'rnd' : 42,
    'max_len' : 64,
    'train_batch' : 32,
    'valid_batch' : 32,
    'epochs' : 4,
    'lr' : 1e-05,
    'dropout' : 0.1
}

In [31]:
params

{'model': 'roberta-base',
 'label': 'is_cont',
 'valid_size': 0.2,
 'rnd': 42,
 'max_len': 64,
 'train_batch': 32,
 'valid_batch': 32,
 'epochs': 4,
 'lr': 1e-05,
 'dropout': 0.1}

In [33]:

tokenizer = AutoTokenizer.from_pretrained(params['model'])

In [34]:
train = pd.read_csv('data/dataset.csv')
import pickle
with open('data/extracted.pickle', 'rb') as f:
    data = pickle.load(f)

# valid = pd.read_csv('data/public_dev.csv')
# train = pd.concat([train, valid])

In [35]:
train=pd.DataFrame.from_dict(data)[['tweet_raw_text', 'task_1']].rename(columns = {'tweet_raw_text':'text', 'task_1':'is_cont'})

In [36]:
from sklearn.model_selection import train_test_split
# train_x, valid_x, train_y, valid_y = model_selection.train_test_split(train['tweet'], train['label'])
train_x, valid_x, train_y, valid_y = train_test_split(train.text, train[params['label']], test_size=params['valid_size'], random_state=params['rnd'])

In [37]:
def count_words(text):
    try:
        return len(text.split())
    except:
        print(text)
        return None

In [38]:
total = 0
maxw = 0
large_count = 0
for i in train.text:
    temp = count_words(i)
    total += temp
    maxw = temp if temp > maxw else maxw
    large_count += 1 if temp > 64 else 0
total/len(train.text), maxw, large_count, len(train.text)

(19.564784053156146, 54, 0, 2709)

In [39]:
def to_int(i):
    return int(i)
train.is_cont = train.is_cont.apply(to_int)

In [40]:
class MultiLabelDataset(Dataset):

    def __init__(self, dataframe, tokenizer, max_len, t = False):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.text = dataframe.text
#         self.emoji = dataframe.emoji
#         self.hash = dataframe.segmented_hash
        self.t = t
        if not self.t:
            self.targets = self.data[params['label']]
        self.max_len = max_len
#         print(self.targets)
    def __len__(self):
        return len(self.text)

    def __getitem__(self, index):
        text = str(self.text[index])
        text = " ".join(text.split())
        inputs = self.tokenizer.encode_plus(
            text,
            None,
            truncation=True,
            add_special_tokens=True,
            max_length=self.max_len,
            pad_to_max_length=True,
            return_attention_mask = True,
            return_token_type_ids=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]
#         h_text = self.hash[index]
#         h_text = " ".join(h_text)
#         inputs = self.tokenizer.encode_plus(
#             h_text,
#             None,
#             truncation=True,
#             add_special_tokens=True,
#             max_length=self.max_len,
#             pad_to_max_length=True,
#             return_attention_mask = True,
#             return_token_type_ids=True
#         )
#         h_ids = inputs['input_ids']
#         h_mask = inputs['attention_mask']
#         h_token_type_ids = inputs["token_type_ids"]
#         h_inputs
#         emoji = getEmojiEmbeddings(self.emoji[index])
        if self.t:
            return {
                'ids': torch.tensor(ids, dtype=torch.long),
                'mask': torch.tensor(mask, dtype=torch.long),
                'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
#                 'h_ids': torch.tensor(h_ids, dtype=torch.long),
#                 'h_mask': torch.tensor(h_mask, dtype=torch.long),
#                 'h_token_type_ids': torch.tensor(h_token_type_ids, dtype=torch.long),
#                 'emoji' : torch.tensor(emoji, dtype=torch.long),
            }
        else:
            return {
                'ids': torch.tensor(ids, dtype=torch.long),
                'mask': torch.tensor(mask, dtype=torch.long),
                'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
#                 'h_ids': torch.tensor(h_ids, dtype=torch.long),
#                 'h_mask': torch.tensor(h_mask, dtype=torch.long),
#                 'h_token_type_ids': torch.tensor(h_token_type_ids, dtype=torch.long),
#                 'emoji' : torch.tensor(emoji, dtype=torch.long),
                'targets': torch.tensor(self.targets[index], dtype=torch.long)
            }

In [41]:
# Creating the dataset and dataloader for the neural network
# train_size = 0.85
train_data=train.sample(frac=1 - params['valid_size'],random_state=params['rnd'])
test_data=train.drop(train_data.index).reset_index(drop=True)
train_data = train_data.reset_index(drop=True)
# train_data, test_data = train_test_split(train, test_size=params['valid_size'])

test_data=test_data.reset_index(drop=True)
train_data = train_data.reset_index(drop=True)


print("FULL Dataset: {}".format(train.shape))
print("TRAIN Dataset: {}".format(train_data.shape))
print("TEST Dataset: {}".format(test_data.shape))

training_set = MultiLabelDataset(train_data, tokenizer, params['max_len'])
testing_set = MultiLabelDataset(test_data, tokenizer, params['max_len'])

FULL Dataset: (2709, 2)
TRAIN Dataset: (2167, 2)
TEST Dataset: (542, 2)


In [42]:
train_params = {'batch_size': params['train_batch'],
                'shuffle': True,
                'num_workers': 0
                }

test_params = {'batch_size': params['valid_batch'],
                'shuffle': True,
                'num_workers': 0
                }

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

In [43]:
class LMModelClassifier(pl.LightningModule):
    def __init__(self, params):
        super().__init__()
        self.save_hyperparameters()
        self.l1 = AutoModel.from_pretrained(params['model'])
        self.pre_classifier_1 = torch.nn.Linear(768, 768)
        self.dropout = torch.nn.Dropout(params['dropout'])
        self.total_loss = 0
        self.batch_count = 0
        self.epoch = 0
        self.classifier = torch.nn.Linear(768, 2)
        self.preds = []
        self.targets = []
        self.test_preds = []

    def forward(self, input_ids, attention_mask, token_type_ids):
        output_1 = self.l1(input_ids=input_ids, attention_mask=attention_mask)
        hidden_state_1 = output_1[0]
        pooler_1 = hidden_state_1[:, 0]
        pooler_1 = self.pre_classifier_1(pooler_1)
        pooler_1 = torch.nn.Tanh()(pooler_1)
        pooler_1 = self.dropout(pooler_1)
        output = self.classifier(pooler_1)
        return output
    
    def configure_optimizers(self):
        optimizer = torch.optim.Adam(params =  self.parameters(), lr=params['lr'])
        return optimizer

    def training_step(self, batch, batch_nb):
        ids = batch['ids']
        mask = batch['mask']
        token_type_ids = batch['token_type_ids']
        targets = batch['targets']
        outputs = self.forward(ids, mask, token_type_ids)
        loss = torch.nn.CrossEntropyLoss()(outputs, targets)
        self.total_loss += loss
        self.batch_count += 1
        logger_logs = {'training_loss': loss}
        logger_logs = {'losses': logger_logs} # optional (MUST ALL BE TENSORS)
        output = {
            'loss': loss, # required
            'progress_bar': {'training_loss': loss}, # optional (MUST ALL BE TENSORS)
            'log': logger_logs
        }
        # return a dict
        return output
    
    def on_epoch_end(self):
        self.epoch += 1
        print(f'Epoch: {self.epoch}, Loss:  {self.total_loss/self.batch_count}')
        self.total_loss=0
        self.batch_count=0
    
    def validation_step(self, batch, batch_idx):
        ids = batch['ids']
        mask = batch['mask']
        token_type_ids = batch['token_type_ids']
        targets = batch['targets']
        outputs = self.forward(ids, mask, token_type_ids)
        loss = torch.nn.CrossEntropyLoss()(outputs, targets)
        labels_hat = torch.argmax(outputs, dim=1)
        self.preds.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())
        self.targets.extend(targets.cpu().detach().numpy().tolist())
        val_acc = torch.sum(targets == labels_hat).item() / (len(targets) * 1.0)
        output = {
            'val_loss': loss,
            'val_acc': torch.tensor(val_acc), # everything must be a tensor
        }
        return output
    
    def validation_epoch_end(self, validation_step_outputs):
        self.preds = list(np.argmax(np.array(self.preds), axis=1).flatten())
        print(classification_report(self.targets, self.preds, digits=4))
        self.preds = []
        self.targets = []
    
    def test_step(self, batch, batch_idx):
        ids = batch['ids']
        mask = batch['mask']
        token_type_ids = batch['token_type_ids']
        outputs = self.forward(ids, mask, token_type_ids)
        labels_hat = torch.argmax(outputs, dim=1)
        self.test_preds.extend(labels_hat.cpu().detach().numpy().tolist())
        

In [44]:
checkpoint_callback = pl.callbacks.ModelCheckpoint(
    dirpath='/scratch/tr/',
    filename=params['model']+'-{epoch}',
    save_top_k = -1
)


In [47]:
model = LMModelClassifier(params)
trainer = pl.Trainer(max_epochs=params['epochs'], callbacks=[checkpoint_callback], gpus=1)


GPU available: True, used: True
TPU available: None, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3]


In [48]:
trainer.fit(model, train_dataloader=training_loader, val_dataloaders=testing_loader)

Set SLURM handle signals.

  | Name             | Type         | Params
--------------------------------------------------
0 | l1               | RobertaModel | 124 M 
1 | pre_classifier_1 | Linear       | 590 K 
2 | dropout          | Dropout      | 0     
3 | classifier       | Linear       | 1.5 K 
--------------------------------------------------
125 M     Trainable params
0         Non-trainable params
125 M     Total params


Validation sanity check: |          | 0/? [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0     0.3906    1.0000    0.5618        25
           1     0.0000    0.0000    0.0000        39

    accuracy                         0.3906        64
   macro avg     0.1953    0.5000    0.2809        64
weighted avg     0.1526    0.3906    0.2195        64



Training: |          | 0/? [00:00<?, ?it/s]

Please use self.log(...) inside the lightningModule instead.

# log on a step or aggregate epoch metric to the logger and/or progress bar
# (inside LightningModule)
self.log('train_loss', loss, on_step=True, on_epoch=True, prog_bar=True)
Please use self.log(...) inside the lightningModule instead.

# log on a step or aggregate epoch metric to the logger and/or progress bar
# (inside LightningModule)
self.log('train_loss', loss, on_step=True, on_epoch=True, prog_bar=True)


Validating: |          | 0/? [00:00<?, ?it/s]

              precision    recall  f1-score   support

           0     0.9009    0.9330    0.9167       224
           1     0.9516    0.9277    0.9395       318

    accuracy                         0.9299       542
   macro avg     0.9262    0.9304    0.9281       542
weighted avg     0.9306    0.9299    0.9301       542

Epoch: 1, Loss:  0.35507112741470337


Please use self.log(...) inside the lightningModule instead.

# log on a step or aggregate epoch metric to the logger and/or progress bar
# (inside LightningModule)
self.log('train_loss', loss, on_step=True, on_epoch=True, prog_bar=True)
Please use self.log(...) inside the lightningModule instead.

# log on a step or aggregate epoch metric to the logger and/or progress bar
# (inside LightningModule)
self.log('train_loss', loss, on_step=True, on_epoch=True, prog_bar=True)


Validating: |          | 0/? [00:00<?, ?it/s]

              precision    recall  f1-score   support

           0     0.9383    0.9509    0.9446       224
           1     0.9651    0.9560    0.9605       318

    accuracy                         0.9539       542
   macro avg     0.9517    0.9534    0.9525       542
weighted avg     0.9540    0.9539    0.9539       542

Epoch: 2, Loss:  0.09490055590867996


Please use self.log(...) inside the lightningModule instead.

# log on a step or aggregate epoch metric to the logger and/or progress bar
# (inside LightningModule)
self.log('train_loss', loss, on_step=True, on_epoch=True, prog_bar=True)
Please use self.log(...) inside the lightningModule instead.

# log on a step or aggregate epoch metric to the logger and/or progress bar
# (inside LightningModule)
self.log('train_loss', loss, on_step=True, on_epoch=True, prog_bar=True)


Validating: |          | 0/? [00:00<?, ?it/s]

              precision    recall  f1-score   support

           0     0.9280    0.9777    0.9522       224
           1     0.9837    0.9465    0.9647       318

    accuracy                         0.9594       542
   macro avg     0.9558    0.9621    0.9585       542
weighted avg     0.9606    0.9594    0.9595       542

Epoch: 3, Loss:  0.048472046852111816


Please use self.log(...) inside the lightningModule instead.

# log on a step or aggregate epoch metric to the logger and/or progress bar
# (inside LightningModule)
self.log('train_loss', loss, on_step=True, on_epoch=True, prog_bar=True)
Please use self.log(...) inside the lightningModule instead.

# log on a step or aggregate epoch metric to the logger and/or progress bar
# (inside LightningModule)
self.log('train_loss', loss, on_step=True, on_epoch=True, prog_bar=True)


Validating: |          | 0/? [00:00<?, ?it/s]

              precision    recall  f1-score   support

           0     0.9471    0.9598    0.9534       224
           1     0.9714    0.9623    0.9668       318

    accuracy                         0.9613       542
   macro avg     0.9593    0.9610    0.9601       542
weighted avg     0.9614    0.9613    0.9613       542

Epoch: 4, Loss:  0.03261565789580345


1

In [49]:
testdf = pd.read_csv('data/final_golden.csv')
with open('data/extracted_golden.pickle', 'rb') as f:
    data = pickle.load(f)
test = pd.DataFrame.from_dict(data)[['tweet_raw_text', 'task_1']].rename(columns = {'tweet_raw_text':'text', 'task_1':'is_cont'})
test['is_cont'] = testdf['is_cont']
test.is_cont = test.is_cont.apply(to_int)

In [50]:
# test['is_cont'] == testdf['is_cont']

In [51]:
# test = pd.read_csv('data/golden.csv')
real = test['is_cont'].values
test_data = test.reset_index(drop=True)
testing = MultiLabelDataset(test_data, tokenizer, params['max_len'], t=False)
test_params = {'batch_size': params['valid_batch'],
                'shuffle': False,
                'num_workers': 0
                }
testing_loader = DataLoader(testing, **test_params)
for i in range(params['epochs']):
    model = LMModelClassifier.load_from_checkpoint('/scratch/tr/'+params['model']+'-epoch='+str(i)+'.ckpt')
    model.test_preds = []
    trainer.test(model=model, test_dataloaders=testing_loader)
    print(classification_report(real, model.test_preds, digits=4))

Set SLURM handle signals.


Testing: |          | 0/? [00:00<?, ?it/s]



--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{}
--------------------------------------------------------------------------------
              precision    recall  f1-score   support

           0     0.8655    0.4728    0.6116       313
           1     0.5429    0.8950    0.6759       219

    accuracy                         0.6466       532
   macro avg     0.7042    0.6839    0.6437       532
weighted avg     0.7327    0.6466    0.6380       532



Set SLURM handle signals.


Testing: |          | 0/? [00:00<?, ?it/s]



--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{}
--------------------------------------------------------------------------------
              precision    recall  f1-score   support

           0     0.7826    0.5751    0.6630       313
           1     0.5596    0.7717    0.6488       219

    accuracy                         0.6560       532
   macro avg     0.6711    0.6734    0.6559       532
weighted avg     0.6908    0.6560    0.6571       532



Set SLURM handle signals.


Testing: |          | 0/? [00:00<?, ?it/s]



--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{}
--------------------------------------------------------------------------------
              precision    recall  f1-score   support

           0     0.8186    0.5335    0.6460       313
           1     0.5549    0.8311    0.6654       219

    accuracy                         0.6560       532
   macro avg     0.6868    0.6823    0.6557       532
weighted avg     0.7101    0.6560    0.6540       532



Set SLURM handle signals.


Testing: |          | 0/? [00:00<?, ?it/s]



--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{}
--------------------------------------------------------------------------------
              precision    recall  f1-score   support

           0     0.7733    0.6102    0.6821       313
           1     0.5719    0.7443    0.6468       219

    accuracy                         0.6654       532
   macro avg     0.6726    0.6773    0.6645       532
weighted avg     0.6904    0.6654    0.6676       532



In [None]:
with open('preds.pickle','wb') as f:
    pickle.dump(model.test_preds, f)

In [None]:
model.test_preds = []
trainer.test(test_dataloaders=testing_loader)


In [None]:
# df = pd.DataFrame()
# df['id'] = list(range(8001,9001))
# df['is_humor'] = list(model.test_preds)
# df.to_csv('preds1.csv', index=False)

In [None]:
print(classification_report(real, model.test_preds, digits=4))
# model.test_preds == real