In [1]:
# Insert code here.
import pandas as pd
import numpy as np
import random
import re
import time
import datetime
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm, neighbors
from sklearn.preprocessing import LabelEncoder
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW, BertConfig, AutoModel
import torch
from torch.utils.data import TensorDataset, random_split, DataLoader, RandomSampler, SequentialSampler
from transformers import BertForSequenceClassification
from transformers import get_linear_schedule_with_warmup
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import classification_report, mean_squared_error, f1_score, accuracy_score
from torch.utils.data import Dataset
from tqdm import tqdm
import pytorch_lightning as pl
import neptune
import string
import pickle
import sys
with open("token", "r") as f:
    token = f.read()
neptune.init(project_qualified_name='tathagataraha/haha-cls1',
                 api_token=token,
             )
# from sentence_transformers import SentenceTransformer
# sent_encoder = SentenceTransformer('bert-base-nli-mean-tokens')

Project(tathagataraha/haha-cls1)

In [2]:
if torch.cuda.is_available():    

    # Tell PyTorch to use the GPU.    
    device = torch.device("cuda:0")

    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))

# If not...
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
torch.cuda.empty_cache()

There are 4 GPU(s) available.
We will use the GPU: GeForce RTX 2080 Ti


In [3]:
models = ['bert-base-uncased', 'roberta-base', 'google/electra-base-discriminator', 'xlnet-base-cased', '/scratch/tr/bert-base-uncased-tapt','/scratch/tr/distilgpt2-tapt', '/scratch/tr/roberta-base-tapt']
labels = ['is_humor', 'humor_rating', 'humor_controversy', 'offense_rating']
model_num = 3
label_num = 0

In [4]:
params = {
    'model' : models[model_num],
    'label' : labels[label_num],
    'valid_size' : 0.2,
    'rnd' : 42,
    'max_len' : 64,
    'train_batch' : 32,
    'valid_batch' : 32,
    'epochs' : 10,
    'lr' : 1e-05,
    'dropout' : 0.1,
    'file': ''.join(random.choices(string.ascii_lowercase + string.digits, k = 20)),
    'lexical' : 1,
    'hurtlex' : 1,
    'gpu' : 0
}

In [5]:
pl.seed_everything(params['rnd'])
params

{'model': 'xlnet-base-cased',
 'label': 'is_humor',
 'valid_size': 0.2,
 'rnd': 42,
 'max_len': 64,
 'train_batch': 32,
 'valid_batch': 32,
 'epochs': 10,
 'lr': 1e-05,
 'dropout': 0.1,
 'file': 'rbh6894bsn7n8c7kf5p2',
 'lexical': 1,
 'hurtlex': 1,
 'gpu': 0}

In [6]:



tokenizer = AutoTokenizer.from_pretrained(params['model'])

In [7]:
train = pd.read_csv('data/train.csv')
valid = pd.read_csv('data/dev.csv')
train = pd.concat([train, valid])

In [8]:
trainvec = pd.read_csv('hurtlex_features/trainvec.csv')
trainvec = trainvec.values.tolist()
devvec = pd.read_csv('hurtlex_features/devvec.csv')
devvec = devvec.values.tolist()
trainvec.extend(devvec)

In [9]:
for i in range(len(trainvec)):
    trainvec[i][1] = int(trainvec[i][1][1])
    trainvec[i][-1] = int(trainvec[i][-1][1])
    trainvec[i] = trainvec[i][1:]

In [10]:
with open('lexical_features/train.pickle', 'rb') as f:
    lx = pickle.load(f)
with open('lexical_features/dev.pickle', 'rb') as f:
    lx2 = pickle.load(f)
lx.extend(lx2)
print(len(lx))

9000


In [11]:
params['lx'] = len(lx[0])
params['hurt'] = len(trainvec[0])

In [12]:
train['hurtlex'] = trainvec
train['lexical'] = lx

In [13]:
train

Unnamed: 0,id,text,is_humor,humor_rating,humor_controversy,offense_rating,hurtlex,lexical
0,1,TENNESSEE: We're the best state. Nobody even c...,1,2.42,1.0,0.20,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[114, 10, 22, 0, 0, 1, 0, 0, 2, 0, 0, 0, 0, 0,..."
1,2,A man inserted an advertisement in the classif...,1,2.50,1.0,1.10,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[189, 9, 6, 4, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0..."
2,3,How many men does it take to open a can of bee...,1,1.95,0.0,2.40,"[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[112, 3, 3, 0, 2, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0..."
3,4,Told my mom I hit 1200 Twitter followers. She ...,1,2.11,1.0,0.00,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[139, 4, 6, 4, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0..."
4,5,Roses are dead. Love is fake. Weddings are bas...,1,2.78,0.0,0.10,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[72, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
...,...,...,...,...,...,...,...,...
995,8996,boss: what are you doing inventor of the bagpi...,1,2.06,1.0,0.25,"[0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[73, 2, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
996,8997,I told him his views were pretty extreme and i...,0,,,0.10,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[149, 2, 1, 2, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0..."
997,8998,"""Mum, all the black kids call each other Nigga...",1,1.94,0.0,2.95,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[132, 10, 5, 0, 1, 1, 1, 1, 2, 0, 0, 0, 0, 0, ..."
998,8999,"In honor of Fathers Day, I'm gonna bring you ""...",0,,,0.00,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[240, 14, 22, 6, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0,..."


In [14]:
from sklearn.model_selection import train_test_split
# train_x, valid_x, train_y, valid_y = model_selection.train_test_split(train['tweet'], train['label'])
train_x, valid_x, train_y, valid_y = train_test_split(train.text, train[params['label']], test_size=params['valid_size'], random_state=params['rnd'])

In [15]:
def count_words(text):
    try:
        return len(text.split())
    except:
        print(text)
        return None

In [16]:
total = 0
maxw = 0
large_count = 0
for i in train.text:
    temp = count_words(i)
    total += temp
    maxw = temp if temp > maxw else maxw
    large_count += 1 if temp > 64 else 0
total/len(train.text), maxw, large_count, len(train.text)

(20.712666666666667, 63, 0, 9000)

In [17]:
train

Unnamed: 0,id,text,is_humor,humor_rating,humor_controversy,offense_rating,hurtlex,lexical
0,1,TENNESSEE: We're the best state. Nobody even c...,1,2.42,1.0,0.20,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[114, 10, 22, 0, 0, 1, 0, 0, 2, 0, 0, 0, 0, 0,..."
1,2,A man inserted an advertisement in the classif...,1,2.50,1.0,1.10,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[189, 9, 6, 4, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0..."
2,3,How many men does it take to open a can of bee...,1,1.95,0.0,2.40,"[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[112, 3, 3, 0, 2, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0..."
3,4,Told my mom I hit 1200 Twitter followers. She ...,1,2.11,1.0,0.00,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[139, 4, 6, 4, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0..."
4,5,Roses are dead. Love is fake. Weddings are bas...,1,2.78,0.0,0.10,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[72, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
...,...,...,...,...,...,...,...,...
995,8996,boss: what are you doing inventor of the bagpi...,1,2.06,1.0,0.25,"[0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[73, 2, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
996,8997,I told him his views were pretty extreme and i...,0,,,0.10,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[149, 2, 1, 2, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0..."
997,8998,"""Mum, all the black kids call each other Nigga...",1,1.94,0.0,2.95,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[132, 10, 5, 0, 1, 1, 1, 1, 2, 0, 0, 0, 0, 0, ..."
998,8999,"In honor of Fathers Day, I'm gonna bring you ""...",0,,,0.00,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[240, 14, 22, 6, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0,..."


In [18]:
class MultiLabelDataset(Dataset):

    def __init__(self, dataframe, tokenizer, max_len, t = False):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.text = dataframe.text
        self.hurtlex = dataframe.hurtlex
        self.lexical = dataframe.lexical
#         self.emoji = dataframe.emoji
#         self.hash = dataframe.segmented_hash
        self.t = t
        if not self.t:
            self.targets = self.data[params['label']]
        self.max_len = max_len
#         print(self.targets)
    def __len__(self):
        return len(self.text)

    def __getitem__(self, index):
        text = str(self.text[index])
        text = " ".join(text.split())
        hurt = np.asarray(self.hurtlex[index])
        lx = np.asarray(self.lexical[index])
        inputs = self.tokenizer.encode_plus(
            text,
            None,
            truncation=True,
            add_special_tokens=True,
            max_length=self.max_len,
            pad_to_max_length=True,
            return_attention_mask = True,
            return_token_type_ids=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]
#         h_text = self.hash[index]
#         h_text = " ".join(h_text)
#         inputs = self.tokenizer.encode_plus(
#             h_text,
#             None,
#             truncation=True,
#             add_special_tokens=True,
#             max_length=self.max_len,
#             pad_to_max_length=True,
#             return_attention_mask = True,
#             return_token_type_ids=True
#         )
#         h_ids = inputs['input_ids']
#         h_mask = inputs['attention_mask']
#         h_token_type_ids = inputs["token_type_ids"]
#         h_inputs
#         emoji = getEmojiEmbeddings(self.emoji[index])
        if self.t:
            return {
                'ids': torch.tensor(ids, dtype=torch.long),
                'mask': torch.tensor(mask, dtype=torch.long),
                'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
                'hurtlex' : torch.tensor(hurt, dtype=torch.long),
                'lexical' : torch.tensor(lx, dtype=torch.long)
#                 'h_ids': torch.tensor(h_ids, dtype=torch.long),
#                 'h_mask': torch.tensor(h_mask, dtype=torch.long),
#                 'h_token_type_ids': torch.tensor(h_token_type_ids, dtype=torch.long),
#                 'emoji' : torch.tensor(emoji, dtype=torch.long),
            }
        else:
            return {
                'ids': torch.tensor(ids, dtype=torch.long),
                'mask': torch.tensor(mask, dtype=torch.long),
                'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
                'hurtlex' : torch.tensor(hurt, dtype=torch.long),
                'lexical' : torch.tensor(lx, dtype=torch.long),
#                 'h_ids': torch.tensor(h_ids, dtype=torch.long),
#                 'h_mask': torch.tensor(h_mask, dtype=torch.long),
#                 'h_token_type_ids': torch.tensor(h_token_type_ids, dtype=torch.long),
#                 'emoji' : torch.tensor(emoji, dtype=torch.long),
                'targets': torch.tensor(self.targets[index], dtype=torch.long)
            }

In [19]:
# Creating the dataset and dataloader for the neural network
# train_size = 0.85
# train_data=train.sample(frac=1 - params['valid_size'],random_state=params['rnd'])
# test_data=train.drop(train_data.index).reset_index(drop=True)
# train_data = train_data.reset_index(drop=True)
train_data, test_data = train_test_split(train, test_size=params['valid_size'],random_state=params['rnd'])

test_data=test_data.reset_index(drop=True)
train_data = train_data.reset_index(drop=True)
valid_data = test_data

print("FULL Dataset: {}".format(train.shape))
print("TRAIN Dataset: {}".format(train_data.shape))
print("TEST Dataset: {}".format(test_data.shape))

training_set = MultiLabelDataset(train_data, tokenizer, params['max_len'])
testing_set = MultiLabelDataset(test_data, tokenizer, params['max_len'])

FULL Dataset: (9000, 8)
TRAIN Dataset: (7200, 8)
TEST Dataset: (1800, 8)


In [24]:
valid_data.to_csv('data/task_1_val.csv', index=False)

In [20]:
train_params = {'batch_size': params['train_batch'],
                'shuffle': True,
                'num_workers': 0
                }

test_params = {'batch_size': params['valid_batch'],
                'shuffle': False,
                'num_workers': 0
                }

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

In [21]:
class LMModelClassifier(pl.LightningModule):
    def __init__(self, params):
        super().__init__()
        self.save_hyperparameters()
        self.l1 = AutoModel.from_pretrained(params['model'])
        self.pre_classifier_1 = torch.nn.Linear(768, 768)
        self.final_layer_dim = 768
        if params['lexical']:
            self.final_layer_dim += params['lx']
            self.hidden_lex = torch.nn.Linear(params['lx'], params['lx'])
        if params['hurtlex']:
            self.final_layer_dim += params['hurt']
            self.hidden_hurt = torch.nn.Linear(params['hurt'], params['hurt'])
        if params['lexical'] or params['hurtlex']:
            self.pre_classifier_2 = torch.nn.Linear(self.final_layer_dim, self.final_layer_dim)
#         print(self.pre_classifier_2)
        self.classifier = torch.nn.Linear(self.final_layer_dim, 2)
        self.dropout = torch.nn.Dropout(params['dropout'])
        self.total_loss = 0
        self.val_loss = 0
        self.val_batch = 0
        self.batch_count = 0
        self.epoch = 0
        self.macrof1 = 0
        self.acc = 0
        self.f1 = 0
        
        self.preds = []
        self.targets = []
        self.test_preds = []

        
    def forward(self, input_ids, attention_mask, token_type_ids, lx, hurt):
        output_1 = self.l1(input_ids=input_ids, attention_mask=attention_mask)
        hidden_state_1 = output_1[0]
        pooler_1 = hidden_state_1[:, 0]
        pooler_1 = self.pre_classifier_1(pooler_1)
        pooler_1 = torch.nn.Tanh()(pooler_1)
        pooler_1 = self.dropout(pooler_1)
#         print(pooler_1.shape)
        if params['lexical']:
            pooler_2 = self.hidden_lex(lx)
            pooler_2 = torch.nn.Tanh()(pooler_2)
            pooler_2 = self.dropout(pooler_2)
#             print(pooler_2.shape)
            pooler_1 = torch.cat((pooler_1, pooler_2), 1)
        
        if params['hurtlex']:
            pooler_2 = self.hidden_hurt(hurt)
            pooler_2 = torch.nn.Tanh()(pooler_2)
            pooler_2 = self.dropout(pooler_2)
#             print(pooler_2.shape)
            pooler_1 = torch.cat((pooler_1, pooler_2), 1)
#         print(pooler_1.shape)
        if params['lexical'] or params['hurtlex']:
#             print(pooler_1.shape)
            pooler_1 = self.pre_classifier_2(pooler_1)
            pooler_1 = torch.nn.Tanh()(pooler_1)
            pooler_1 = self.dropout(pooler_1)
        output = self.classifier(pooler_1)
        return output
    
    def configure_optimizers(self):
        optimizer = torch.optim.Adam(params =  self.parameters(), lr=params['lr'])
        return optimizer

    def training_step(self, batch, batch_nb):
        ids = batch['ids']
        mask = batch['mask']
        token_type_ids = batch['token_type_ids']
        targets = batch['targets']
        lx = batch['lexical']
        hurt = batch['hurtlex']
        lx = lx.to(torch.float32)
        hurt = hurt.to(torch.float32)
        outputs = self.forward(ids, mask, token_type_ids, lx, hurt)
        loss = torch.nn.CrossEntropyLoss()(outputs, targets)
        self.total_loss += loss
        self.batch_count += 1
        logger_logs = {'training_loss': loss}
        logger_logs = {'losses': logger_logs} # optional (MUST ALL BE TENSORS)
        output = {
            'loss': loss, # required
            'progress_bar': {'training_loss': loss}, # optional (MUST ALL BE TENSORS)
            'log': logger_logs
        }
        # return a dict
        return output
    def on_epoch_end(self):
        self.epoch += 1
        params['epoch'] = self.epoch
#         params['file'] = ''.join(random.choices(string.ascii_lowercase + string.digits, k = 20))
#         trainer.checkpoint_callback.format_checkpoint_name(0, 0, metrics=dict(file=params['file']))
#         trainer.checkpoint_callback.on_epoch_end()
        print(f'Epoch: {self.epoch}, Loss:  {self.total_loss/self.batch_count}')
        neptune.create_experiment(params = params)
        neptune.log_metric('train_loss', self.total_loss/self.batch_count)
        neptune.log_metric('val_loss', self.val_loss/self.val_batch)
        neptune.log_metric('acc', self.acc)
        neptune.log_metric('f1', self.f1)
        neptune.log_metric('macrof1', self.macrof1)
#         neptune.log_metric('rmse', self.rms)
        self.total_loss=0
        self.batch_count=0
#     def validation_step(self, batch, batch_idx):
#         ids = batch['ids']
#         mask = batch['mask']
#         token_type_ids = batch['token_type_ids']
#         targets = batch['targets']
#         targets = targets.to(torch.float32)
#         lx = batch['lexical']
#         hurt = batch['hurtlex']
#         outputs = self.forward(ids, mask, token_type_ids, lx, hurt)
# #         outputs = self.forward(ids, mask, token_type_ids)
#         loss = torch.nn.CrossEntropyLoss()(outputs.view(-1), targets.view(-1))
# #         labels_hat = torch.argmax(outputs, dim=1)
#         self.preds.extend(outputs.cpu().detach().numpy().tolist())
#         self.targets.extend(targets.cpu().detach().numpy().tolist())
#         self.val_loss += loss
#         self.val_batch += 1
# #         val_acc = torch.sum(targets == labels_hat).item() / (len(targets) * 1.0)
#         output = {
#             'val_loss': loss,
# #             'val_acc': torch.tensor(val_acc), # everything must be a tensor
#         }
#         return output
    def validation_step(self, batch, batch_idx):
        ids = batch['ids']
        mask = batch['mask']
        token_type_ids = batch['token_type_ids']
        targets = batch['targets']
        lx = batch['lexical']
        hurt = batch['hurtlex']
        lx = lx.to(torch.float32)
        hurt = hurt.to(torch.float32)
        outputs = self.forward(ids, mask, token_type_ids, lx, hurt)
        loss = torch.nn.CrossEntropyLoss()(outputs, targets)
        labels_hat = torch.argmax(outputs, dim=1)
        self.preds.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())
        self.targets.extend(targets.cpu().detach().numpy().tolist())
        self.val_loss += loss
        self.val_batch += 1
        val_acc = torch.sum(targets == labels_hat).item() / (len(targets) * 1.0)
        output = {
            'val_loss': loss,
            'val_acc': torch.tensor(val_acc), # everything must be a tensor
        }
        return output
    
    def validation_epoch_end(self, validation_step_outputs):
        self.preds = list(np.argmax(np.array(self.preds), axis=1).flatten())
        print(classification_report(self.targets, self.preds, digits=4))
        self.f1 = f1_score(self.targets, self.preds)
        self.macrof1 = f1_score(self.targets, self.preds, average='macro')
        self.acc = accuracy_score(self.targets, self.preds)
        self.preds = []
        self.targets = []
    
    def test_step(self, batch, batch_idx):
        ids = batch['ids']
        mask = batch['mask']
        token_type_ids = batch['token_type_ids']
        outputs = self.forward(ids, mask, token_type_ids)
        labels_hat = torch.argmax(outputs, dim=1)
        self.test_preds.extend(labels_hat.cpu().detach().numpy().tolist())
        

In [22]:
checkpoint_callback = pl.callbacks.ModelCheckpoint(
    dirpath='/scratch/tr/haha_cls1',
    filename=params['file']+'-{epoch}',
    save_top_k = -1
)


In [23]:
model = LMModelClassifier(params)
trainer = pl.Trainer(max_epochs=params['epochs'], callbacks=[checkpoint_callback], gpus=[params['gpu']])


GPU available: True, used: True
TPU available: None, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3]


In [24]:
trainer.fit(model, train_dataloader=training_loader, val_dataloaders=testing_loader)

Set SLURM handle signals.

  | Name             | Type       | Params
------------------------------------------------
0 | l1               | XLNetModel | 116 M 
1 | pre_classifier_1 | Linear     | 590 K 
2 | hidden_lex       | Linear     | 272   
3 | hidden_hurt      | Linear     | 306   
4 | pre_classifier_2 | Linear     | 642 K 
5 | classifier       | Linear     | 1.6 K 
6 | dropout          | Dropout    | 0     
------------------------------------------------
117 M     Trainable params
0         Non-trainable params
117 M     Total params


Validation sanity check: |          | 0/? [00:00<?, ?it/s]



              precision    recall  f1-score   support

           0     0.3913    0.3462    0.3673        26
           1     0.5854    0.6316    0.6076        38

    accuracy                         0.5156        64
   macro avg     0.4883    0.4889    0.4875        64
weighted avg     0.5065    0.5156    0.5100        64





Training: |          | 0/? [00:00<?, ?it/s]

Please use self.log(...) inside the lightningModule instead.

# log on a step or aggregate epoch metric to the logger and/or progress bar
# (inside LightningModule)
self.log('train_loss', loss, on_step=True, on_epoch=True, prog_bar=True)
Please use self.log(...) inside the lightningModule instead.

# log on a step or aggregate epoch metric to the logger and/or progress bar
# (inside LightningModule)
self.log('train_loss', loss, on_step=True, on_epoch=True, prog_bar=True)


Validating: |          | 0/? [00:00<?, ?it/s]

              precision    recall  f1-score   support

           0     0.8604    0.9263    0.8921       692
           1     0.9517    0.9061    0.9283      1108

    accuracy                         0.9139      1800
   macro avg     0.9060    0.9162    0.9102      1800
weighted avg     0.9166    0.9139    0.9144      1800

Epoch: 1, Loss:  0.3676433861255646
https://ui.neptune.ai/tathagataraha/haha-cls1/e/HAH2-421


Please use self.log(...) inside the lightningModule instead.

# log on a step or aggregate epoch metric to the logger and/or progress bar
# (inside LightningModule)
self.log('train_loss', loss, on_step=True, on_epoch=True, prog_bar=True)
Please use self.log(...) inside the lightningModule instead.

# log on a step or aggregate epoch metric to the logger and/or progress bar
# (inside LightningModule)
self.log('train_loss', loss, on_step=True, on_epoch=True, prog_bar=True)


1

In [25]:
del trainer, model
torch.cuda.empty_cache()

In [26]:
test = pd.read_csv('data/public_test.csv')

trainvec = pd.read_csv('hurtlex_features/public_testvec.csv')
trainvec = trainvec.values.tolist()
npt = 'haha-cls1'
startpath = '/scratch/tr/haha_cls1/'
savepath = 'test_preds/task_1/'
with open('lexical_features/public_test.pickle', 'rb') as f:
    lx = pickle.load(f)
for i in range(len(trainvec)):
    trainvec[i][1] = int(trainvec[i][1][1])
    trainvec[i][-1] = int(trainvec[i][-1][1])
    trainvec[i] = trainvec[i][1:]
test['hurtlex'] = trainvec
test['lexical'] = lx
test_data = test.reset_index(drop=True)
for i in range(params['epochs']):
    tokenizer = AutoTokenizer.from_pretrained(params['model'])
    testing = MultiLabelDataset(test_data, tokenizer, params['max_len'], t=True)
    test_params = {'batch_size': params['valid_batch'],
                'shuffle': False,
                'num_workers': 0
                }
    testing_loader = DataLoader(testing, **test_params)
#     model = LMModelClassifier.load_from_checkpoint(startpath+'2c1kyhlwp739kr31cpsx-epoch=0.ckpt')
    model = LMModelClassifier.load_from_checkpoint(startpath+params['file']+'-epoch='+str(i)+'.ckpt')
    device = torch.device("cuda:"+str(params['gpu']))
    model.to(device)
    model.eval()
    test_preds = []
    test_conf = []
    with torch.no_grad():
        for _, data in tqdm(enumerate(testing_loader, 0)):
            ids = data['ids'].to(device, dtype = torch.long)
            mask = data['mask'].to(device, dtype = torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
            lx = data['lexical'].to(device, dtype = torch.float)
            hurt = data['hurtlex'].to(device, dtype = torch.float)
            outputs = model.forward(ids, mask, token_type_ids, lx, hurt)
            outputs = outputs.cpu().numpy()
#             print(outputs)
            labels_hat = np.argmax(outputs, axis=1)
            conf = np.max(outputs, axis=1)
            test_preds.extend(labels_hat.tolist())
            test_conf.extend(list(conf))
    df = pd.DataFrame()
    df['id'] = list(range(9001,10001))
    df[params['label']] = list(test_preds)
    df['conf'] = list(test_conf)
    df.to_csv(savepath+params['file']+'-epoch='+str(i)+'.csv', index=False)
#     train=train.reset_index(drop=True)
    training = MultiLabelDataset(valid_data, tokenizer, params['max_len'])
    testing_loader = DataLoader(training, **test_params)
    test_preds = []
    test_conf = []
    savepath = 'train_preds/task_1/'
    with torch.no_grad():
        for _, data in tqdm(enumerate(testing_loader, 0)):
            ids = data['ids'].to(device, dtype = torch.long)
            mask = data['mask'].to(device, dtype = torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
            lx = data['lexical'].to(device, dtype = torch.float)
            hurt = data['hurtlex'].to(device, dtype = torch.float)
            outputs = model.forward(ids, mask, token_type_ids, lx, hurt)
            outputs = outputs.cpu().numpy()
#             print(outputs)
            labels_hat = np.argmax(outputs, axis=1)
            conf = np.max(outputs, axis=1)
            test_preds.extend(labels_hat.tolist())
            test_conf.extend(list(conf))
    df = pd.DataFrame()
    df['id'] = list(range(1,1801))
    df[params['label']] = list(test_preds)
    df['conf'] = list(test_conf)
    df.to_csv(savepath+params['file']+'-epoch='+str(i)+'.csv', index=False)
#     print(len(test_preds),'dsf' ,test_preds[0:10])

32it [00:03,  8.37it/s]
57it [00:06,  8.37it/s]
32it [00:03,  8.48it/s]
57it [00:06,  8.38it/s]


FileNotFoundError: [Errno 2] No such file or directory: '/scratch/tr/haha_cls1/46d97s2f2a49m23loqci-epoch=2.ckpt'