In [1]:
# Insert code here.
import pandas as pd
import numpy as np
import random
import re
import time
import datetime
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm, neighbors
from sklearn.preprocessing import LabelEncoder
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW, BertConfig, AutoModel
import torch
from torch.utils.data import TensorDataset, random_split, DataLoader, RandomSampler, SequentialSampler
from transformers import BertForSequenceClassification
from transformers import get_linear_schedule_with_warmup
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import classification_report
from torch.utils.data import Dataset
from tqdm import tqdm
import pytorch_lightning as pl
import neptune
with open("token", "r") as f:
    token = f.read()
neptune.init(project_qualified_name='tathagataraha/contro-base',
             api_token=token,
             )
# from sentence_transformers import SentenceTransformer
# sent_encoder = SentenceTransformer('bert-base-nli-mean-tokens')

Project(tathagataraha/contro-base)

In [2]:
if torch.cuda.is_available():    

    # Tell PyTorch to use the GPU.    
    device = torch.device("cuda:0")

    print('There are %d GPU(s) available.' % torch.cuda.device_count())

    print('We will use the GPU:', torch.cuda.get_device_name(0))

# If not...
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
torch.cuda.empty_cache()

There are 3 GPU(s) available.
We will use the GPU: GeForce RTX 2080 Ti


In [9]:
models = ['bert-base-uncased', 'distilbert-base-uncased-finetuned-sst-2-english', 'textattack/roberta-base-SST-2','roberta-base', 'google/electra-base-discriminator', 'xlnet-base-cased', 'xlm-roberta-base']
labels = ['is_cont', 'humor_rating', 'humor_controversy', 'offense_rating']
model_num = 3
label_num = 0

In [10]:
params = {
    'model' : models[model_num],
    'label' : labels[label_num],
    'valid_size' : 0.2,
    'rnd' : 42,
    'max_len' : 64,
    'train_batch' : 32,
    'valid_batch' : 32,
    'epochs' : 10,
    'lr' : 1e-05,
    'dropout' : 0.1
}

In [11]:
params

{'model': 'roberta-base',
 'label': 'is_cont',
 'valid_size': 0.2,
 'rnd': 42,
 'max_len': 64,
 'train_batch': 32,
 'valid_batch': 32,
 'epochs': 10,
 'lr': 1e-05,
 'dropout': 0.1}

In [12]:

tokenizer = AutoTokenizer.from_pretrained(params['model'])

In [14]:
train = pd.read_csv('data/dataset.csv')
# valid = pd.read_csv('data/public_dev.csv')
# train = pd.concat([train, valid])

In [15]:
from sklearn.model_selection import train_test_split
# train_x, valid_x, train_y, valid_y = model_selection.train_test_split(train['tweet'], train['label'])
train_x, valid_x, train_y, valid_y = train_test_split(train.text, train[params['label']], test_size=params['valid_size'], random_state=params['rnd'])

In [16]:
def count_words(text):
    try:
        return len(text.split())
    except:
        print(text)
        return None

In [17]:
total = 0
maxw = 0
large_count = 0
for i in train.text:
    temp = count_words(i)
    total += temp
    maxw = temp if temp > maxw else maxw
    large_count += 1 if temp > 64 else 0
total/len(train.text), maxw, large_count, len(train.text)

(22.31782945736434, 55, 0, 2709)

In [18]:
train

Unnamed: 0,time,id,text,user_id,user_name,no_of_retweets,no_of_likes,is_cont
0,2021-02-25 13:07:35,1364925009316237316,ACLU: There is no way to prosecute [Assange] ...,16589206,wikileaks,223,415,1
1,2021-02-23 20:16:52,1364308266285731846,America’s Top Newspaper Editors Alarmed by Ass...,16589206,wikileaks,167,309,1
2,2021-02-23 09:59:57,1364153013250781188,"""Enough is Enough"" - Australian Opposition Lea...",16589206,wikileaks,723,1658,1
3,2021-02-23 09:37:41,1364147407215296512,Australian opposition leader Anthony Albanese ...,16589206,wikileaks,1144,3018,1
4,2021-02-22 15:42:10,1363876745515040771,Reporters Committee reviews Judge Merrick Garl...,16589206,wikileaks,98,138,1
...,...,...,...,...,...,...,...,...
2704,2021-02-17 23:45:09,1362186354692620289,"""Too many moves!""\n\nWhich Steph handles are y...",19923144,NBA,86,637,0
2705,2021-02-17 23:30:00,1362182540241551363,"Always bringing the energy, listen in to Steph...",19923144,NBA,158,1635,0
2706,2021-02-17 23:00:01,1362174996353810433,Where else can you see Steph Curry take 3-poin...,19923144,NBA,116,946,0
2707,2021-02-17 22:10:00,1362162407100084225,Where else can a kid from Cameroon become a fa...,19923144,NBA,108,791,0


In [19]:
class MultiLabelDataset(Dataset):

    def __init__(self, dataframe, tokenizer, max_len, t = False):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.text = dataframe.text
#         self.emoji = dataframe.emoji
#         self.hash = dataframe.segmented_hash
        self.t = t
        if not self.t:
            self.targets = self.data[params['label']]
        self.max_len = max_len
#         print(self.targets)
    def __len__(self):
        return len(self.text)

    def __getitem__(self, index):
        text = str(self.text[index])
        text = " ".join(text.split())
        inputs = self.tokenizer.encode_plus(
            text,
            None,
            truncation=True,
            add_special_tokens=True,
            max_length=self.max_len,
            pad_to_max_length=True,
            return_attention_mask = True,
            return_token_type_ids=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]
#         h_text = self.hash[index]
#         h_text = " ".join(h_text)
#         inputs = self.tokenizer.encode_plus(
#             h_text,
#             None,
#             truncation=True,
#             add_special_tokens=True,
#             max_length=self.max_len,
#             pad_to_max_length=True,
#             return_attention_mask = True,
#             return_token_type_ids=True
#         )
#         h_ids = inputs['input_ids']
#         h_mask = inputs['attention_mask']
#         h_token_type_ids = inputs["token_type_ids"]
#         h_inputs
#         emoji = getEmojiEmbeddings(self.emoji[index])
        if self.t:
            return {
                'ids': torch.tensor(ids, dtype=torch.long),
                'mask': torch.tensor(mask, dtype=torch.long),
                'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
#                 'h_ids': torch.tensor(h_ids, dtype=torch.long),
#                 'h_mask': torch.tensor(h_mask, dtype=torch.long),
#                 'h_token_type_ids': torch.tensor(h_token_type_ids, dtype=torch.long),
#                 'emoji' : torch.tensor(emoji, dtype=torch.long),
            }
        else:
            return {
                'ids': torch.tensor(ids, dtype=torch.long),
                'mask': torch.tensor(mask, dtype=torch.long),
                'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
#                 'h_ids': torch.tensor(h_ids, dtype=torch.long),
#                 'h_mask': torch.tensor(h_mask, dtype=torch.long),
#                 'h_token_type_ids': torch.tensor(h_token_type_ids, dtype=torch.long),
#                 'emoji' : torch.tensor(emoji, dtype=torch.long),
                'targets': torch.tensor(self.targets[index], dtype=torch.long)
            }

In [20]:
# Creating the dataset and dataloader for the neural network
# train_size = 0.85
train_data=train.sample(frac=1 - params['valid_size'],random_state=params['rnd'])
test_data=train.drop(train_data.index).reset_index(drop=True)
train_data = train_data.reset_index(drop=True)
# train_data, test_data = train_test_split(train, test_size=params['valid_size'])

test_data=test_data.reset_index(drop=True)
train_data = train_data.reset_index(drop=True)


print("FULL Dataset: {}".format(train.shape))
print("TRAIN Dataset: {}".format(train_data.shape))
print("TEST Dataset: {}".format(test_data.shape))

training_set = MultiLabelDataset(train_data, tokenizer, params['max_len'])
testing_set = MultiLabelDataset(test_data, tokenizer, params['max_len'])

FULL Dataset: (2709, 8)
TRAIN Dataset: (2167, 8)
TEST Dataset: (542, 8)


In [21]:
train_params = {'batch_size': params['train_batch'],
                'shuffle': True,
                'num_workers': 0
                }

test_params = {'batch_size': params['valid_batch'],
                'shuffle': True,
                'num_workers': 0
                }

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

In [22]:
class LMModelClassifier(pl.LightningModule):
    def __init__(self, params):
        super().__init__()
        self.save_hyperparameters()
        self.l1 = AutoModel.from_pretrained(params['model'])
        self.pre_classifier_1 = torch.nn.Linear(768, 768)
        self.dropout = torch.nn.Dropout(params['dropout'])
        self.total_loss = 0
        self.batch_count = 0
        self.epoch = 0
        self.classifier = torch.nn.Linear(768, 2)
        self.preds = []
        self.targets = []
        self.test_preds = []

    def forward(self, input_ids, attention_mask, token_type_ids):
        output_1 = self.l1(input_ids=input_ids, attention_mask=attention_mask)
        hidden_state_1 = output_1[0]
        pooler_1 = hidden_state_1[:, 0]
        pooler_1 = self.pre_classifier_1(pooler_1)
        pooler_1 = torch.nn.Tanh()(pooler_1)
        pooler_1 = self.dropout(pooler_1)
        output = self.classifier(pooler_1)
        return output
    
    def configure_optimizers(self):
        optimizer = torch.optim.Adam(params =  self.parameters(), lr=params['lr'])
        return optimizer

    def training_step(self, batch, batch_nb):
        ids = batch['ids']
        mask = batch['mask']
        token_type_ids = batch['token_type_ids']
        targets = batch['targets']
        outputs = self.forward(ids, mask, token_type_ids)
        loss = torch.nn.CrossEntropyLoss()(outputs, targets)
        self.total_loss += loss
        self.batch_count += 1
        logger_logs = {'training_loss': loss}
        logger_logs = {'losses': logger_logs} # optional (MUST ALL BE TENSORS)
        output = {
            'loss': loss, # required
            'progress_bar': {'training_loss': loss}, # optional (MUST ALL BE TENSORS)
            'log': logger_logs
        }
        # return a dict
        return output
    
    def on_epoch_end(self):
        self.epoch += 1
        print(f'Epoch: {self.epoch}, Loss:  {self.total_loss/self.batch_count}')
        self.total_loss=0
        self.batch_count=0
    
    def validation_step(self, batch, batch_idx):
        ids = batch['ids']
        mask = batch['mask']
        token_type_ids = batch['token_type_ids']
        targets = batch['targets']
        outputs = self.forward(ids, mask, token_type_ids)
        loss = torch.nn.CrossEntropyLoss()(outputs, targets)
        labels_hat = torch.argmax(outputs, dim=1)
        self.preds.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())
        self.targets.extend(targets.cpu().detach().numpy().tolist())
        val_acc = torch.sum(targets == labels_hat).item() / (len(targets) * 1.0)
        output = {
            'val_loss': loss,
            'val_acc': torch.tensor(val_acc), # everything must be a tensor
        }
        return output
    
    def validation_epoch_end(self, validation_step_outputs):
        self.preds = list(np.argmax(np.array(self.preds), axis=1).flatten())
        print(classification_report(self.targets, self.preds, digits=4))
        self.preds = []
        self.targets = []
    
    def test_step(self, batch, batch_idx):
        ids = batch['ids']
        mask = batch['mask']
        token_type_ids = batch['token_type_ids']
        outputs = self.forward(ids, mask, token_type_ids)
        labels_hat = torch.argmax(outputs, dim=1)
        self.test_preds.extend(labels_hat.cpu().detach().numpy().tolist())
        

In [23]:
model = LMModelClassifier(params)
trainer = pl.Trainer(max_epochs=params['epochs'], gpus=1)


Downloading:   0%|          | 0.00/501M [00:00<?, ?B/s]

GPU available: True, used: True
TPU available: None, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2]


In [24]:
trainer.fit(model, train_dataloader=training_loader, val_dataloaders=testing_loader)

Set SLURM handle signals.

  | Name             | Type         | Params
--------------------------------------------------
0 | l1               | RobertaModel | 124 M 
1 | pre_classifier_1 | Linear       | 590 K 
2 | dropout          | Dropout      | 0     
3 | classifier       | Linear       | 1.5 K 
--------------------------------------------------
125 M     Trainable params
0         Non-trainable params
125 M     Total params


Validation sanity check: |          | 0/? [00:00<?, ?it/s]

              precision    recall  f1-score   support

           0     0.4167    0.9615    0.5814        26
           1     0.7500    0.0789    0.1429        38

    accuracy                         0.4375        64
   macro avg     0.5833    0.5202    0.3621        64
weighted avg     0.6146    0.4375    0.3210        64





Training: |          | 0/? [00:00<?, ?it/s]

Please use self.log(...) inside the lightningModule instead.

# log on a step or aggregate epoch metric to the logger and/or progress bar
# (inside LightningModule)
self.log('train_loss', loss, on_step=True, on_epoch=True, prog_bar=True)
Please use self.log(...) inside the lightningModule instead.

# log on a step or aggregate epoch metric to the logger and/or progress bar
# (inside LightningModule)
self.log('train_loss', loss, on_step=True, on_epoch=True, prog_bar=True)


Validating: |          | 0/? [00:00<?, ?it/s]

              precision    recall  f1-score   support

           0     0.9813    0.9375    0.9589       224
           1     0.9573    0.9874    0.9721       318

    accuracy                         0.9668       542
   macro avg     0.9693    0.9625    0.9655       542
weighted avg     0.9672    0.9668    0.9667       542

Epoch: 1, Loss:  0.34426048398017883


Please use self.log(...) inside the lightningModule instead.

# log on a step or aggregate epoch metric to the logger and/or progress bar
# (inside LightningModule)
self.log('train_loss', loss, on_step=True, on_epoch=True, prog_bar=True)
Please use self.log(...) inside the lightningModule instead.

# log on a step or aggregate epoch metric to the logger and/or progress bar
# (inside LightningModule)
self.log('train_loss', loss, on_step=True, on_epoch=True, prog_bar=True)


Validating: |          | 0/? [00:00<?, ?it/s]

              precision    recall  f1-score   support

           0     0.9817    0.9598    0.9707       224
           1     0.9721    0.9874    0.9797       318

    accuracy                         0.9760       542
   macro avg     0.9769    0.9736    0.9752       542
weighted avg     0.9761    0.9760    0.9760       542

Epoch: 2, Loss:  0.07345932722091675


Please use self.log(...) inside the lightningModule instead.

# log on a step or aggregate epoch metric to the logger and/or progress bar
# (inside LightningModule)
self.log('train_loss', loss, on_step=True, on_epoch=True, prog_bar=True)
Please use self.log(...) inside the lightningModule instead.

# log on a step or aggregate epoch metric to the logger and/or progress bar
# (inside LightningModule)
self.log('train_loss', loss, on_step=True, on_epoch=True, prog_bar=True)


Validating: |          | 0/? [00:00<?, ?it/s]

              precision    recall  f1-score   support

           0     0.9774    0.9643    0.9708       224
           1     0.9751    0.9843    0.9797       318

    accuracy                         0.9760       542
   macro avg     0.9762    0.9743    0.9752       542
weighted avg     0.9760    0.9760    0.9760       542

Epoch: 3, Loss:  0.04145366698503494


Please use self.log(...) inside the lightningModule instead.

# log on a step or aggregate epoch metric to the logger and/or progress bar
# (inside LightningModule)
self.log('train_loss', loss, on_step=True, on_epoch=True, prog_bar=True)
Please use self.log(...) inside the lightningModule instead.

# log on a step or aggregate epoch metric to the logger and/or progress bar
# (inside LightningModule)
self.log('train_loss', loss, on_step=True, on_epoch=True, prog_bar=True)


Validating: |          | 0/? [00:00<?, ?it/s]

              precision    recall  f1-score   support

           0     0.9819    0.9688    0.9753       224
           1     0.9782    0.9874    0.9828       318

    accuracy                         0.9797       542
   macro avg     0.9800    0.9781    0.9790       542
weighted avg     0.9797    0.9797    0.9797       542

Epoch: 4, Loss:  0.020212244242429733


Please use self.log(...) inside the lightningModule instead.

# log on a step or aggregate epoch metric to the logger and/or progress bar
# (inside LightningModule)
self.log('train_loss', loss, on_step=True, on_epoch=True, prog_bar=True)
Please use self.log(...) inside the lightningModule instead.

# log on a step or aggregate epoch metric to the logger and/or progress bar
# (inside LightningModule)
self.log('train_loss', loss, on_step=True, on_epoch=True, prog_bar=True)


Validating: |          | 0/? [00:00<?, ?it/s]

              precision    recall  f1-score   support

           0     0.9819    0.9688    0.9753       224
           1     0.9782    0.9874    0.9828       318

    accuracy                         0.9797       542
   macro avg     0.9800    0.9781    0.9790       542
weighted avg     0.9797    0.9797    0.9797       542

Epoch: 5, Loss:  0.010359393432736397


Please use self.log(...) inside the lightningModule instead.

# log on a step or aggregate epoch metric to the logger and/or progress bar
# (inside LightningModule)
self.log('train_loss', loss, on_step=True, on_epoch=True, prog_bar=True)
Please use self.log(...) inside the lightningModule instead.

# log on a step or aggregate epoch metric to the logger and/or progress bar
# (inside LightningModule)
self.log('train_loss', loss, on_step=True, on_epoch=True, prog_bar=True)


Validating: |          | 0/? [00:00<?, ?it/s]

              precision    recall  f1-score   support

           0     0.9319    0.9777    0.9542       224
           1     0.9837    0.9497    0.9664       318

    accuracy                         0.9613       542
   macro avg     0.9578    0.9637    0.9603       542
weighted avg     0.9623    0.9613    0.9614       542

Epoch: 6, Loss:  0.01491096056997776


Please use self.log(...) inside the lightningModule instead.

# log on a step or aggregate epoch metric to the logger and/or progress bar
# (inside LightningModule)
self.log('train_loss', loss, on_step=True, on_epoch=True, prog_bar=True)
Please use self.log(...) inside the lightningModule instead.

# log on a step or aggregate epoch metric to the logger and/or progress bar
# (inside LightningModule)
self.log('train_loss', loss, on_step=True, on_epoch=True, prog_bar=True)


Validating: |          | 0/? [00:00<?, ?it/s]

              precision    recall  f1-score   support

           0     0.9773    0.9598    0.9685       224
           1     0.9720    0.9843    0.9781       318

    accuracy                         0.9742       542
   macro avg     0.9747    0.9720    0.9733       542
weighted avg     0.9742    0.9742    0.9741       542

Epoch: 7, Loss:  0.011103135533630848


Please use self.log(...) inside the lightningModule instead.

# log on a step or aggregate epoch metric to the logger and/or progress bar
# (inside LightningModule)
self.log('train_loss', loss, on_step=True, on_epoch=True, prog_bar=True)
Please use self.log(...) inside the lightningModule instead.

# log on a step or aggregate epoch metric to the logger and/or progress bar
# (inside LightningModule)
self.log('train_loss', loss, on_step=True, on_epoch=True, prog_bar=True)


Validating: |          | 0/? [00:00<?, ?it/s]

              precision    recall  f1-score   support

           0     0.9908    0.9598    0.9751       224
           1     0.9723    0.9937    0.9829       318

    accuracy                         0.9797       542
   macro avg     0.9815    0.9768    0.9790       542
weighted avg     0.9799    0.9797    0.9797       542

Epoch: 8, Loss:  0.01077239029109478


Please use self.log(...) inside the lightningModule instead.

# log on a step or aggregate epoch metric to the logger and/or progress bar
# (inside LightningModule)
self.log('train_loss', loss, on_step=True, on_epoch=True, prog_bar=True)
Please use self.log(...) inside the lightningModule instead.

# log on a step or aggregate epoch metric to the logger and/or progress bar
# (inside LightningModule)
self.log('train_loss', loss, on_step=True, on_epoch=True, prog_bar=True)


Validating: |          | 0/? [00:00<?, ?it/s]

              precision    recall  f1-score   support

           0     0.9689    0.9732    0.9710       224
           1     0.9811    0.9780    0.9795       318

    accuracy                         0.9760       542
   macro avg     0.9750    0.9756    0.9753       542
weighted avg     0.9760    0.9760    0.9760       542

Epoch: 9, Loss:  0.009355619549751282


Please use self.log(...) inside the lightningModule instead.

# log on a step or aggregate epoch metric to the logger and/or progress bar
# (inside LightningModule)
self.log('train_loss', loss, on_step=True, on_epoch=True, prog_bar=True)
Please use self.log(...) inside the lightningModule instead.

# log on a step or aggregate epoch metric to the logger and/or progress bar
# (inside LightningModule)
self.log('train_loss', loss, on_step=True, on_epoch=True, prog_bar=True)


Validating: |          | 0/? [00:00<?, ?it/s]

              precision    recall  f1-score   support

           0     0.9907    0.9554    0.9727       224
           1     0.9693    0.9937    0.9814       318

    accuracy                         0.9779       542
   macro avg     0.9800    0.9745    0.9770       542
weighted avg     0.9782    0.9779    0.9778       542

Epoch: 10, Loss:  0.0023249483201652765


1

In [17]:
test = pd.read_csv('data/public_dev.csv')
test_data = test.reset_index(drop=True)
testing = MultiLabelDataset(test_data, tokenizer, params['max_len'], t=True)
test_params = {'batch_size': params['valid_batch'],
                'shuffle': False,
                'num_workers': 0
                }
testing_loader = DataLoader(testing, **test_params)

In [18]:
model.test_preds = []
trainer.test(test_dataloaders=testing_loader)


Set SLURM handle signals.


Testing: |          | 0/? [00:00<?, ?it/s]



--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{}
--------------------------------------------------------------------------------


[{}]

In [21]:
df = pd.DataFrame()
df['id'] = list(range(8001,9001))
df['is_humor'] = list(model.test_preds)
df.to_csv('preds1.csv', index=False)