In [164]:
# Insert code here.
import pandas as pd
import numpy as np
import random
import re
import time
import datetime
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm, neighbors
from sklearn.preprocessing import LabelEncoder
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW, BertConfig, AutoModel
import torch
from torch.utils.data import TensorDataset, random_split, DataLoader, RandomSampler, SequentialSampler
from transformers import BertForSequenceClassification
from transformers import get_linear_schedule_with_warmup
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import classification_report
from torch.utils.data import Dataset
from tqdm import tqdm
# from sentence_transformers import SentenceTransformer
# sent_encoder = SentenceTransformer('bert-base-nli-mean-tokens')

In [165]:
if torch.cuda.is_available():    

    # Tell PyTorch to use the GPU.    
    device = torch.device("cuda:2")

    print('There are %d GPU(s) available.' % torch.cuda.device_count())

    print('We will use the GPU:', torch.cuda.get_device_name(0))

# If not...
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
torch.cuda.empty_cache()

There are 4 GPU(s) available.
We will use the GPU: GeForce RTX 2080 Ti


In [166]:
models = ['bert-base-uncased', 'distilbert-base-uncased-finetuned-sst-2-english', 'textattack/roberta-base-SST-2','roberta-base', 'google/electra-base-discriminator', 'xlnet-base-cased', 'xlm-roberta-base', '/scratch/covid-tapt', '/scratch/covid-tapt/checkpoint-500']
model_num = 8
tokenizer = AutoTokenizer.from_pretrained(models[model_num])


In [167]:
train = pd.read_csv('../datasets/covid/Constraint_English_Train - Sheet1.csv')
test = pd.read_csv('../datasets/covid/Constraint_English_Val - Sheet1.csv')

In [187]:
import pickle
with open('train.pickle','rb') as f:
    train = pickle.load(f)
    train = pd.DataFrame.from_dict(train)
    train.drop(train.head(1).index, inplace=True)
with open('valid.pickle','rb') as f:
    valid = pickle.load(f)
    valid = pd.DataFrame.from_dict(valid)
    valid.drop(valid.head(1).index, inplace=True)
with open('test.pickle','rb') as f:
    test = pickle.load(f)
    del test['task_1']
    test = pd.DataFrame.from_dict(test)
#     test.drop(test.head(1).index, inplace=True)
#     test = pd.DataFrame.from_dict(test)
# test = pd.read_csv('data/valid.tsv', sep='\t')

In [188]:
train = pd.concat([train, valid])
test.head(10)

Unnamed: 0,tweet_id,full_tweet,tweet_raw_text,hashtags,smiley,emoji,url,mentions,numerals,reserved_word,emotext,segmented_hash,clean
0,1,Our daily update is published. States reported...,Our daily update is published. States reported...,[],[],[],[https://t.co/wzSYMe0Sht],[],"[734, 39, 532, 30, 22]",[],[],[],Our daily update is published. States reported...
1,2,Alfalfa is the only cure for COVID-19.,Alfalfa is the only cure for COVID-19.,[],[],[],[],[],[],[],[],[],Alfalfa is the only cure for COVID-19.
2,3,President Trump Asked What He Would Do If He W...,President Trump Asked What He Would Do If He W...,"[#donaldtrump, #coronavirus]",[],[],[https://t.co/3MEWhusRZI],[],[],[],[],"[donald trump, coronavirus]",President Trump Asked What He Would Do If He W...
3,4,States reported 630 deaths. We are still seein...,States reported deaths. We are still seeing a ...,[],[],[],[https://t.co/LBmcot3h9a],[],"[630, 28]",[],[],[],States reported 630 deaths. We are still seein...
4,5,This is the sixth time a global health emergen...,This is the sixth time a global health emergen...,[],[],[],[https://t.co/JvKC0PTett],[@DrTedros],[],[],[],[],This is the sixth time a global health emergen...
5,6,Low #vitaminD was an independent predictor of ...,Low was an independent predictor of worse prog...,[#vitaminD],[],[],"[https://t.co/CGD6Kphn31, https://t.co/chtni8K...",[],[],[],[],[vitamin d],Low was an independent predictor of worse pr...
6,7,A common question: why are the cumulative outc...,A common question: why are the cumulative outc...,[#s],[],[],[],[],[],[],[],[s],A common question: why are the cumulative outc...
7,8,The government should consider bringing in any...,The government should consider bringing in any...,[],[],[],[https://t.co/pdOls6cqoN],[],[],[],[],[],The government should consider bringing in any...
8,9,Our daily update is published. We’ve now track...,Our daily update is published. Weve now tracke...,[],[],[],"[https://t.co/PZrmH4bl5Y, https://t.co/2588xW5...",[],"[2.9, 119, 1]",[],[],[],Our daily update is published. We’ve now track...
9,10,Breakdown of testing: 4 air crew 97 hotel &amp...,Breakdown of testing: air crew hotel &amp; hea...,[],[],[],[],[],"[4, 97, 71, 2, 200]",[],[],[],Breakdown of testing: 4 air crew 97 hotel &amp...


In [189]:
labels = ['fake','real']
def label_encode(val):
    return labels.index(val)

In [190]:
train['label'] = train.task_1.apply(label_encode)
train['tweet'] = train.full_tweet
test['tweet'] = test.full_tweet

In [191]:
valid.emoji.sample(10)

1588    []
1906    []
1883    []
829     []
977     []
983     []
769     []
1017    []
368     []
1294    []
Name: emoji, dtype: object

In [192]:
train = train.reset_index(drop=True)
REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;]')
BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')
STOPWORDS = []

def clean_text(text):
    """
        text: a string
        
        return: modified initial string
    """
    text = text.lower() # lowercase text
    text = REPLACE_BY_SPACE_RE.sub(' ', text) # replace REPLACE_BY_SPACE_RE symbols by space in text. substitute the matched string in REPLACE_BY_SPACE_RE with space.
    text = BAD_SYMBOLS_RE.sub('', text) # remove symbols which are in BAD_SYMBOLS_RE from text. substitute the matched string in BAD_SYMBOLS_RE with nothing. 
#    text = re.sub(r'\W+', '', text)
    text = ' '.join(word for word in text.split() if word not in STOPWORDS) # remove stopwors from text
    return text
train.tweet = train.tweet.apply(clean_text)
train.tweet = train.tweet.str.replace('\d+', '')

In [193]:
# test.label = test.label.apply(label_encode)
test = test.reset_index(drop=True)
test.tweet = test.tweet.apply(clean_text)
test.tweet = test.tweet.str.replace('\d+', '')

In [194]:
train.tweet.sample(10)

5689    extremely good news bendigo hits covidzero but...
5117     new cases of #covid lagos ogun fct borno kadu...
4489    georgian homeopath discussed a homeopathic dru...
4490    coronavirus can be transmitted through mosquit...
5438    covid update there are two new cases of covid ...
1640    rt cdcdirector cdcs guidelines to combat the s...
6327    rt cdcdirector cdcgov to award  million to  st...
6445    breathlessness excessive fatigue and muscle ac...
5206    all family members has died infected by corona...
8210      at the start was reasonably necessary but it...
Name: tweet, dtype: object

In [195]:
# split the dataset into training and validation datasets 
from sklearn.model_selection import train_test_split
# train_x, valid_x, train_y, valid_y = model_selection.train_test_split(train['tweet'], train['label'])
train_x, valid_x, train_y, valid_y = train_test_split(train['tweet'], train['label'], test_size=0.2)

In [196]:
def count_words(text):
    try:
        return len(text.split())
    except:
        print(text)
        return None

In [197]:
total = 0
maxw = 0
large_count = 0
for i in train_x:
    temp = count_words(i)
    total += temp
    maxw = temp if temp > maxw else maxw
    large_count += 1 if temp > 120 else 0
total/len(train_x), maxw, large_count, len(train_x)

(27.31673481308411, 1446, 5, 6848)

In [198]:
# MAX_LENGTH = 50
posts = train.values
categories = train.values

In [199]:
# Sections of config

# Defining some key variables that will be used later on in the training
MAX_LEN = 128
TRAIN_BATCH_SIZE = 16
VALID_BATCH_SIZE = 32
EPOCHS = 10
LEARNING_RATE = 1e-05


In [200]:
import gensim.models as gsm
e2v = gsm.KeyedVectors.load_word2vec_format('emoji2vec.bin', binary=True)
# happy_vector = e2v['😂']    # Produces an embedding vector of length 300

# Download the bin file from here https://github.com/uclnlp/emoji2vec/blob/master/pre-trained/emoji2vec.bin

def getEmojiEmbeddings(emojiList,dim=300,verbose = False):
  """ Generates an emoji vector by averaging the emoji representation for each emoji. If no emoji returns an empty list of dimension dim"""
  if dim < 300:
    raise IndexError("Dim has to be greater than 300")
  result = np.zeros(dim)
  if (len(emojiList) == 0):
    return result
  else:
    embs = None
    for i in emojiList:
      if verbose:
        if i not in e2v.vocab:
          print(i)
    embs = np.mean([e2v[i] for i in emojiList if i in e2v.vocab], axis=0)
  if np.any(np.isnan(embs)):
    return result
  result[:300] = embs
  return result
getEmojiEmbeddings(valid.emoji.values[0])

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0.

In [201]:
ids = tokenizer.encode_plus(
            valid.full_tweet.values[0],
            None,
            truncation=True,
            add_special_tokens=True,
            max_length=128,
            pad_to_max_length=True,
            return_attention_mask = True,
            return_token_type_ids=True
        )['input_ids']
torch.tensor(ids, dtype=torch.long).shape, torch.tensor(getEmojiEmbeddings(valid.emoji.values[0]), dtype=torch.long).shape



(torch.Size([128]), torch.Size([300]))

In [202]:
class MultiLabelDataset(Dataset):

    def __init__(self, dataframe, tokenizer, max_len, t = False):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.text = dataframe.tweet
        self.emoji = dataframe.emoji
        self.hash = dataframe.segmented_hash
        self.t = t
        if not self.t:
            self.targets = self.data.label
        self.max_len = max_len

    def __len__(self):
        return len(self.text)

    def __getitem__(self, index):
        text = str(self.text[index])
        text = " ".join(text.split())
        inputs = self.tokenizer.encode_plus(
            text,
            None,
            truncation=True,
            add_special_tokens=True,
            max_length=self.max_len,
            pad_to_max_length=True,
            return_attention_mask = True,
            return_token_type_ids=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]
        h_text = self.hash[index]
        h_text = " ".join(h_text)
        inputs = self.tokenizer.encode_plus(
            h_text,
            None,
            truncation=True,
            add_special_tokens=True,
            max_length=self.max_len,
            pad_to_max_length=True,
            return_attention_mask = True,
            return_token_type_ids=True
        )
        h_ids = inputs['input_ids']
        h_mask = inputs['attention_mask']
        h_token_type_ids = inputs["token_type_ids"]
#         h_inputs
        emoji = getEmojiEmbeddings(self.emoji[index])
        if self.t:
            return {
                'ids': torch.tensor(ids, dtype=torch.long),
                'mask': torch.tensor(mask, dtype=torch.long),
                'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
                'h_ids': torch.tensor(h_ids, dtype=torch.long),
                'h_mask': torch.tensor(h_mask, dtype=torch.long),
                'h_token_type_ids': torch.tensor(h_token_type_ids, dtype=torch.long),
                'emoji' : torch.tensor(emoji, dtype=torch.long),
            }
        else:
            return {
                'ids': torch.tensor(ids, dtype=torch.long),
                'mask': torch.tensor(mask, dtype=torch.long),
                'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
                'h_ids': torch.tensor(h_ids, dtype=torch.long),
                'h_mask': torch.tensor(h_mask, dtype=torch.long),
                'h_token_type_ids': torch.tensor(h_token_type_ids, dtype=torch.long),
                'emoji' : torch.tensor(emoji, dtype=torch.long),
                'targets': torch.tensor(self.targets[index], dtype=torch.long)
            }

In [203]:
# Creating the dataset and dataloader for the neural network

train_size = 0.8
train_data=train.sample(frac=train_size,random_state=200)
test_data=train.drop(train_data.index).reset_index(drop=True)
train_data = train_data.reset_index(drop=True)


print("FULL Dataset: {}".format(train.shape))
print("TRAIN Dataset: {}".format(train_data.shape))
print("TEST Dataset: {}".format(test_data.shape))

training_set = MultiLabelDataset(train_data, tokenizer, MAX_LEN)
testing_set = MultiLabelDataset(test_data, tokenizer, MAX_LEN)

FULL Dataset: (8560, 16)
TRAIN Dataset: (6848, 16)
TEST Dataset: (1712, 16)


In [204]:
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

test_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

In [148]:
class BERTClass(torch.nn.Module):
    def __init__(self):
        super(BERTClass, self).__init__()
        self.l1 = AutoModel.from_pretrained(models[model_num])
        self.l2 = AutoModel.from_pretrained(models[model_num])
        
        self.pre_classifier_1 = torch.nn.Linear(768, 768)
        self.pre_classifier_2 = torch.nn.Linear(768, 768)
        self.dropout = torch.nn.Dropout(0.1)
        self.pre_classifier_3 = torch.nn.Linear(1836, 1836)
#         self.pre_classifier_3 = torch.nn.Linear(768, 100)
        self.classifier = torch.nn.Linear(1836, 2)

    def forward(self, input_ids, attention_mask, token_type_ids, h_ids, h_mask, h_token_type_ids, emoji):
        output_1 = self.l1(input_ids=input_ids, attention_mask=attention_mask)
        hidden_state_1 = output_1[0]
        pooler_1 = hidden_state_1[:, 0]
        pooler_1 = self.pre_classifier_1(pooler_1)
        pooler_1 = torch.nn.Tanh()(pooler_1)
        pooler_1 = self.dropout(pooler_1)
        output_2 = self.l2(input_ids=h_ids, attention_mask=h_mask)
        hidden_state_2 = output_2[0]
        pooler_2 = hidden_state_2[:, 0]
        pooler_2 = self.pre_classifier_2(pooler_2)
        pooler_2 = torch.nn.Tanh()(pooler_2)
        pooler_2 = self.dropout(pooler_2)
        pooler_3 = torch.cat((pooler_1, pooler_2), 1)
        pooler_3 = torch.cat((pooler_3, emoji), 1)
#         print(pooler_1.shape,hidden_state_1.shape, pooler_2.shape, emoji.type(torch.FloatTensor).shape)
#         pooler_3 = torch.nn.Tanh()(emoji.type(torch.FloatTensor))
#         pooler_3 = self.dropout(pooler_3)
#         print(pooler_3.shape)
        pooler_3 = self.pre_classifier_3(pooler_3)
#         pooler_3 = self.pre_classifier_3(pooler_2)
        pooler_3 = torch.nn.Tanh()(pooler_3)
        pooler_3 = self.dropout(pooler_3)
        output = self.classifier(pooler_3)
        return output

model = BERTClass()
model.to(device)

BERTClass(
  (l1): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-05, elementw

In [149]:
# from torchsummary import summary
# print(repr(model))


In [150]:
params = list(model.named_parameters())

print('The BERT model has {:} different named parameters.\n'.format(len(params)))

print('==== Embedding Layer ====\n')

for p in params[0:5]:
    print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))

print('\n==== First Transformer ====\n')

for p in params[5:21]:
    print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))

print('\n==== Output Layer ====\n')

for p in params[-4:]:
    print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))


The BERT model has 406 different named parameters.

==== Embedding Layer ====

l1.embeddings.word_embeddings.weight                    (50265, 768)
l1.embeddings.position_embeddings.weight                  (514, 768)
l1.embeddings.token_type_embeddings.weight                  (1, 768)
l1.embeddings.LayerNorm.weight                                (768,)
l1.embeddings.LayerNorm.bias                                  (768,)

==== First Transformer ====

l1.encoder.layer.0.attention.self.query.weight            (768, 768)
l1.encoder.layer.0.attention.self.query.bias                  (768,)
l1.encoder.layer.0.attention.self.key.weight              (768, 768)
l1.encoder.layer.0.attention.self.key.bias                    (768,)
l1.encoder.layer.0.attention.self.value.weight            (768, 768)
l1.encoder.layer.0.attention.self.value.bias                  (768,)
l1.encoder.layer.0.attention.output.dense.weight          (768, 768)
l1.encoder.layer.0.attention.output.dense.bias                (

In [151]:
def loss_fn(outputs, targets):
    return torch.nn.CrossEntropyLoss()(outputs, targets)

In [152]:
optimizer = torch.optim.Adam(params =  model.parameters(), lr=LEARNING_RATE)

In [28]:
def train(epoch):
    total_train_loss = 0
    count = 0
    model.train()
    for _,data in tqdm(enumerate(training_loader, 0)):
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
        h_ids = data['h_ids'].to(device, dtype = torch.long)
        h_mask = data['h_mask'].to(device, dtype = torch.long)
        h_token_type_ids = data['h_token_type_ids'].to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype = torch.long)
        emoji = data['emoji'].to(device, dtype = torch.long)
        outputs = model(ids, mask, token_type_ids, h_ids, h_mask, h_token_type_ids, emoji)
        optimizer.zero_grad()
#         loss = outputs.loss
        loss = loss_fn(outputs, targets)
#         if _%50==0:
#             print(f'Epoch: {epoch}, Loss:  {loss.item()}')
        total_train_loss += loss.item()
        count += 1
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    model.eval()
    fin_targets=[]
    fin_outputs=[]
    print(f'Epoch: {epoch}, Loss:  {total_train_loss/count}')
    with torch.no_grad():
        for _, data in tqdm(enumerate(testing_loader, 0)):
            ids = data['ids'].to(device, dtype = torch.long)
            mask = data['mask'].to(device, dtype = torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
            h_ids = data['h_ids'].to(device, dtype = torch.long)
            h_mask = data['h_mask'].to(device, dtype = torch.long)
            h_token_type_ids = data['h_token_type_ids'].to(device, dtype = torch.long)
            targets = data['targets'].to(device, dtype = torch.long)
            emoji = data['emoji'].to(device, dtype = torch.long)
            outputs = model(ids, mask, token_type_ids, h_ids, h_mask, h_token_type_ids, emoji)
            fin_targets.extend(targets.cpu().detach().numpy().tolist())
            fin_outputs.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())
    fin_outputs = list(np.argmax(np.array(fin_outputs), axis=1).flatten())
    print(classification_report(fin_outputs, fin_targets))
    torch.save(model, '/scratch/epoch_'+str(epoch))
    return fin_outputs, fin_targets
#     final_outputs = np.array(fin_outputs) >=0.5
#     final = []
#     final_t = []
#     final_fine = [[],[],[],[]]
#     final_fine_t = [[],[],[],[]]
#     for (i,j) in zip(final_outputs, fin_targets):
#         output_sum = sum(i)
#         target_sum = sum(j)
#         if output_sum == 0:
#             final.append(0)
#         else:
#             final.append(1)
#         if target_sum == 0:
#             final_t.append(0)
#         else:
#             final_t.append(1)
#         for p in range(4):
#             final_fine[p].append(int(i[p]))
#             final_fine_t[p].append(int(j[p]))
#     print("Coarse:")
#     print(classification_report(final, final_t))
#     for i in range(4):
#         print("Fine", i)
    
#     return fin_outputs, fin_targets

In [29]:
for epoch in range(EPOCHS):
    out, tar = train(epoch)
#     break

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
428it [02:05,  3.41it/s]
1it [00:00,  5.62it/s]

Epoch: 0, Loss:  0.1893800597737987


54it [00:09,  5.56it/s]


              precision    recall  f1-score   support

           0       0.96      0.97      0.96       776
           1       0.98      0.96      0.97       936

    accuracy                           0.97      1712
   macro avg       0.97      0.97      0.97      1712
weighted avg       0.97      0.97      0.97      1712



428it [02:07,  3.36it/s]
1it [00:00,  5.59it/s]

Epoch: 1, Loss:  0.070123767337727


54it [00:09,  5.62it/s]


              precision    recall  f1-score   support

           0       0.96      0.97      0.96       783
           1       0.97      0.97      0.97       929

    accuracy                           0.97      1712
   macro avg       0.97      0.97      0.97      1712
weighted avg       0.97      0.97      0.97      1712



428it [02:06,  3.38it/s]
1it [00:00,  5.61it/s]

Epoch: 2, Loss:  0.034703462969845764


54it [00:09,  5.61it/s]


              precision    recall  f1-score   support

           0       0.97      0.98      0.97       774
           1       0.99      0.97      0.98       938

    accuracy                           0.98      1712
   macro avg       0.98      0.98      0.98      1712
weighted avg       0.98      0.98      0.98      1712



428it [02:06,  3.37it/s]
1it [00:00,  5.57it/s]

Epoch: 3, Loss:  0.018472962436152277


54it [00:09,  5.61it/s]


              precision    recall  f1-score   support

           0       0.95      0.98      0.97       761
           1       0.99      0.96      0.97       951

    accuracy                           0.97      1712
   macro avg       0.97      0.97      0.97      1712
weighted avg       0.97      0.97      0.97      1712



428it [02:06,  3.37it/s]
1it [00:00,  5.59it/s]

Epoch: 4, Loss:  0.015005094774436505


54it [00:09,  5.61it/s]


              precision    recall  f1-score   support

           0       0.96      0.98      0.97       773
           1       0.99      0.97      0.98       939

    accuracy                           0.98      1712
   macro avg       0.98      0.98      0.98      1712
weighted avg       0.98      0.98      0.98      1712



428it [02:06,  3.38it/s]
1it [00:00,  5.60it/s]

Epoch: 5, Loss:  0.010796221844905963


54it [00:09,  5.61it/s]


              precision    recall  f1-score   support

           0       0.95      0.99      0.97       760
           1       0.99      0.96      0.98       952

    accuracy                           0.97      1712
   macro avg       0.97      0.97      0.97      1712
weighted avg       0.97      0.97      0.97      1712



428it [02:07,  3.37it/s]
1it [00:00,  5.54it/s]

Epoch: 6, Loss:  0.020441482442616246


54it [00:09,  5.58it/s]


              precision    recall  f1-score   support

           0       0.97      0.98      0.97       783
           1       0.98      0.97      0.98       929

    accuracy                           0.97      1712
   macro avg       0.97      0.97      0.97      1712
weighted avg       0.97      0.97      0.97      1712



428it [02:07,  3.36it/s]
1it [00:00,  5.56it/s]

Epoch: 7, Loss:  0.004851448877153101


54it [00:09,  5.58it/s]


              precision    recall  f1-score   support

           0       0.95      0.99      0.97       758
           1       0.99      0.96      0.98       954

    accuracy                           0.97      1712
   macro avg       0.97      0.97      0.97      1712
weighted avg       0.97      0.97      0.97      1712



428it [02:06,  3.37it/s]
1it [00:00,  5.59it/s]

Epoch: 8, Loss:  0.009885061413540178


54it [00:09,  5.61it/s]


              precision    recall  f1-score   support

           0       0.97      0.98      0.97       776
           1       0.98      0.97      0.98       936

    accuracy                           0.98      1712
   macro avg       0.97      0.98      0.98      1712
weighted avg       0.98      0.98      0.98      1712



428it [02:06,  3.37it/s]
1it [00:00,  5.62it/s]

Epoch: 9, Loss:  0.001821532530722715


54it [00:09,  5.61it/s]


              precision    recall  f1-score   support

           0       0.92      0.99      0.95       731
           1       0.99      0.94      0.96       981

    accuracy                           0.96      1712
   macro avg       0.96      0.96      0.96      1712
weighted avg       0.96      0.96      0.96      1712



In [30]:
out[0:10], tar[0:10]

([1, 1, 0, 1, 0, 0, 0, 1, 1, 1], [1, 1, 0, 1, 0, 0, 0, 1, 1, 1])

In [205]:
# Creating the dataset and dataloader for the neural network
model = torch.load('/scratch/epoch_4')
# train_size = 0.8
# test_data=test.sample(frac=1,random_state=200)
# test_data=train.drop(train_data.index).reset_index(drop=True)
test_data = test.reset_index(drop=True)
testing = MultiLabelDataset(test_data, tokenizer, MAX_LEN, t=True)
test_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': False,
                'num_workers': 0
                }
testing_loader = DataLoader(testing, **test_params)

In [206]:
model.eval()
fin_targets=[]
fin_outputs=[]
# print(f'Epoch: {epoch}, Loss:  {total_train_loss/count}')
with torch.no_grad():
    for _, data in tqdm(enumerate(testing_loader, 0)):
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
        h_ids = data['h_ids'].to(device, dtype = torch.long)
        h_mask = data['h_mask'].to(device, dtype = torch.long)
        h_token_type_ids = data['h_token_type_ids'].to(device, dtype = torch.long)
#         targets = data['targets'].to(device, dtype = torch.long)
        emoji = data['emoji'].to(device, dtype = torch.long)
        outputs = model(ids, mask, token_type_ids, h_ids, h_mask, h_token_type_ids, emoji)
#         fin_targets.extend(targets.cpu().detach().numpy().tolist())
        fin_outputs.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())
fin_outputs = list(np.argmax(np.array(fin_outputs), axis=1).flatten())
# print(classification_report(fin_outputs, fin_targets))

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
67it [00:11,  5.80it/s]


In [209]:
fin_outputs[0:20]

[1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1]

In [208]:
test_data.head(10)

Unnamed: 0,tweet_id,full_tweet,tweet_raw_text,hashtags,smiley,emoji,url,mentions,numerals,reserved_word,emotext,segmented_hash,clean,tweet
0,1,Our daily update is published. States reported...,Our daily update is published. States reported...,[],[],[],[https://t.co/wzSYMe0Sht],[],"[734, 39, 532, 30, 22]",[],[],[],Our daily update is published. States reported...,our daily update is published states reported ...
1,2,Alfalfa is the only cure for COVID-19.,Alfalfa is the only cure for COVID-19.,[],[],[],[],[],[],[],[],[],Alfalfa is the only cure for COVID-19.,alfalfa is the only cure for covid
2,3,President Trump Asked What He Would Do If He W...,President Trump Asked What He Would Do If He W...,"[#donaldtrump, #coronavirus]",[],[],[https://t.co/3MEWhusRZI],[],[],[],[],"[donald trump, coronavirus]",President Trump Asked What He Would Do If He W...,president trump asked what he would do if he w...
3,4,States reported 630 deaths. We are still seein...,States reported deaths. We are still seeing a ...,[],[],[],[https://t.co/LBmcot3h9a],[],"[630, 28]",[],[],[],States reported 630 deaths. We are still seein...,states reported deaths we are still seeing a ...
4,5,This is the sixth time a global health emergen...,This is the sixth time a global health emergen...,[],[],[],[https://t.co/JvKC0PTett],[@DrTedros],[],[],[],[],This is the sixth time a global health emergen...,this is the sixth time a global health emergen...
5,6,Low #vitaminD was an independent predictor of ...,Low was an independent predictor of worse prog...,[#vitaminD],[],[],"[https://t.co/CGD6Kphn31, https://t.co/chtni8K...",[],[],[],[],[vitamin d],Low was an independent predictor of worse pr...,low #vitamind was an independent predictor of ...
6,7,A common question: why are the cumulative outc...,A common question: why are the cumulative outc...,[#s],[],[],[],[],[],[],[],[s],A common question: why are the cumulative outc...,a common question why are the cumulative outco...
7,8,The government should consider bringing in any...,The government should consider bringing in any...,[],[],[],[https://t.co/pdOls6cqoN],[],[],[],[],[],The government should consider bringing in any...,the government should consider bringing in any...
8,9,Our daily update is published. We’ve now track...,Our daily update is published. Weve now tracke...,[],[],[],"[https://t.co/PZrmH4bl5Y, https://t.co/2588xW5...",[],"[2.9, 119, 1]",[],[],[],Our daily update is published. We’ve now track...,our daily update is published weve now tracked...
9,10,Breakdown of testing: 4 air crew 97 hotel &amp...,Breakdown of testing: air crew hotel &amp; hea...,[],[],[],[],[],"[4, 97, 71, 2, 200]",[],[],[],Breakdown of testing: 4 air crew 97 hotel &amp...,breakdown of testing air crew hotel amp heal...


In [210]:
test['label'] = np.array(fin_outputs)

In [211]:
len(fin_outputs)

2140

In [212]:
len(test.full_tweet.values)

2140

In [59]:
test.sample(10)

Unnamed: 0,tweet_id,full_tweet,tweet_raw_text,hashtags,smiley,emoji,url,mentions,numerals,reserved_word,emotext,segmented_hash,clean,tweet
1566,1568,The opening ceremony of the London 2012 Olympi...,The opening ceremony of the London Olympics an...,[],[],[],[],[],[2012],[],[],[],The opening ceremony of the London 2012 Olympi...,the opening ceremony of the london olympics a...
1169,1171,RT @drharshvardhan: .@MoHFW_INDIA has decided ...,: . has decided to deploy high level Central t...,"[#UttarPradesh, #Jharkhand, #Chhattisgarh]",[],[],[],"[@drharshvardhan, @MoHFW_INDIA]",[],[RT],[],"[uttar pradesh, jharkhand, chhattisgarh]",RT : . has decided to deploy high level Cent...,rt drharshvardhan mohfw_india has decided to d...
283,285,Our daily update is published. States reported...,Our daily update is published. States reported...,[],[],[],[https://t.co/hDJwxhheNS],[],"[727, 39, 475, 30]",[],[],[],Our daily update is published. States reported...,our daily update is published states reported ...
1434,1436,(4/4)India's calibrated testing strategy formu...,(4/4)India's calibrated testing strategy formu...,[],[],[],[],[],[],[],[],[],(4/4)India's calibrated testing strategy formu...,indias calibrated testing strategy formulati...
1224,1226,A video of a policeman taking down a man wande...,A video of a policeman taking down a man wande...,[],[],[],[],[],[],[],[],[],A video of a policeman taking down a man wande...,a video of a policeman taking down a man wande...
1705,1707,Our daily update is published. We’ve now track...,Our daily update is published. Weve now tracke...,[],[],[],"[https://t.co/PZrmH4bl5Y, https://t.co/dTN3Ivm...",[],"[167, 2]",[],[],[],Our daily update is published. We’ve now track...,our daily update is published weve now tracked...
1249,1251,We also just a number of new cases for Texas—1...,We also just a number of new cases for Texas18...,[],[],[],[https://t.co/GtgfKkinE8],[],[],[],[],[],We also just a number of new cases for Texas—1...,we also just a number of new cases for texasin...
271,273,#CoronaVirusUpdates: #COVID19 testing status u...,: testing status update: stated that samples t...,"[#CoronaVirusUpdates, #COVID19, #StaySafe, #In...",[],[],[https://t.co/6G5M8rsZ66],[@ICMRDELHI],"[66279462, 22, 2020, 953683, 22, 2020]",[],[],"[corona virus updates, covid 19, stay safe, in...",: testing status update: stated that 6627...,#coronavirusupdates #covid testing status upda...
149,151,There are 4 #COVID19 Govt. testing lab in #Agr...,There are Govt. testing lab in . Kindly refer ...,"[#COVID19, #Agra, #UttarPradesh, #COVID__19, #...",[],[],"[https://t.co/SQCvfE2ZNc, https://t.co/nPLgGkj...",[],[4],[],[],"[covid 19, agra, uttar pradesh, covid _ _ 19, ...",There are 4 Govt. testing lab in . Kindly...,there are #covid govt testing lab in #agra #u...
1278,1280,As per @ICMRDELHI it is not recommended to rel...,As per it is not recommended to rely on numeri...,"[#COVID19, #COVID__19, #COVID, #COVID_19, #COV...",[],[],"[https://t.co/o61SDKSKpg, https://t.co/GzEg37D...",[@ICMRDELHI],[],[],[],"[covid 19, covid _ _ 19, covid, covid _ 19, co...",As per it is not recommended to rely on nume...,as per icmrdelhi it is not recommended to rely...


In [214]:
def label_decode(val):
    return labels[val]
test.label = test.label.apply(label_decode)

In [215]:

test.to_csv(path_or_buf='answers2.txt', index=False, columns = ['tweet_id', 'label'] )