In [7]:
import os
import os.path
import json
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from transformers import BertModel, BertTokenizer, AdamW, get_linear_schedule_with_warmup
from torch.optim import lr_scheduler

import logging
logging.basicConfig(level=logging.ERROR)

import warnings
warnings.filterwarnings("ignore")

In [8]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
data = pd.read_csv("train.csv")

In [9]:
data.sample(6)

Unnamed: 0,id,keyword,location,text,target
5342,7626,pandemonium,Dallas Fort-Worth,Pandemonium In Aba As Woman Delivers Baby With...,0
7258,10392,whirlwind,Frostburg,HELP I'M IN A WHIRLWIND OF NOSTALGIA,0
7449,10660,wounds,United States,Having your wounds kissed by Someone who doesn...,0
378,543,army,,One Direction Is my pick for http://t.co/q2eBl...,0
2521,3623,desolation,,Free Kindle Book - Aug 3-7 - Thriller - Desola...,0
83,120,accident,"Arlington, TX",#TruckCrash Overturns On #FortWorth Interstate...,1


In [10]:
train_maxlen = 140
dev_maxlen = 140
batch_size = 16
epochs = 10
bert_model = 'bert-base-uncased'
learning_rate = 3e-5

In [11]:
class Tokenize_dataset:
  """
  This class tokenizes the dataset using bert tokenizer
  """

  def __init__(self, text, targets, tokenizer, max_len):
    self.text = text
    self.tokenizer = tokenizer
    self.max_len = max_len
    self.targets = targets

  def __len__(self):
    return len(self.targets)

  def __getitem__(self, item):
    text = str(self.text[item])
    targets = self.targets[item]
    """
    Using encode_plus instead of encode as it helps to provide additional information that we need
    """
    inputs = self.tokenizer.encode_plus(
        str(text),
        add_special_tokens = True,
        max_length = self.max_len,
        pad_to_max_length = True
    )

    ids = inputs["input_ids"]
    mask = inputs["attention_mask"]
    token_type_ids = inputs["token_type_ids"]

    return {
        "ids": torch.tensor(ids, dtype=torch.long),
        "mask": torch.tensor(mask, dtype=torch.long),
        "token_type_ids": torch.tensor(token_type_ids, dtype=torch.long),
        "targets": torch.tensor(targets, dtype=torch.long)
    }

In [12]:
def loss_function(outputs, targets):
	"""
	This function defines the loss function we use in the model which since is multiclass is crossentropy
	"""
	return nn.CrossEntropyLoss()(outputs, targets)

In [13]:
def train_function(data_loader, model, optimizer, device):
  """
  Function defines the training that we will happen over the entire dataset
  """
  model.train()

  running_loss = 0.0
  """
  looping over the entire training dataset
  """
  for i, data in enumerate(data_loader):
    mask = data["mask"].to(device, dtype=torch.long)
    ids = data["ids"].to(device, dtype=torch.long)
    token_type_ids = data["token_type_ids"].to(device, dtype=torch.long)
    target = data["targets"].to(device, dtype=torch.long)
    optimizer.zero_grad()

    output = model(ids=ids, mask=mask, token_type_ids=token_type_ids)
    
    loss = loss_function(output, target)
    loss.backward()
    optimizer.step()
    """
    calculating loss and running loss
    """
    running_loss += loss.item()
    if i % 10 == 0 and i!=0:
      temp = f'Batch index = {i}\tRunning Loss = {running_loss/10}'
      print(temp)
      running_loss = 0.0

In [14]:
def eval_function(data_loader, model, device):
  """
  This function defines the loop over the dev set.
  """
  model.eval()
  correct_labels = 0
  tot = 0
  """
  no_grad as this is evaluation set and we dont want the model to update weights
  """
  with torch.no_grad():
    for i, data in enumerate(data_loader):
      mask = data["mask"].to(device, dtype=torch.long)
      ids = data["ids"].to(device, dtype=torch.long)
      token_type_ids = data["token_type_ids"].to(device, dtype=torch.long)
      targets = data["targets"].to(device, dtype=torch.long)
      outputs = model(ids=ids, mask=mask, token_type_ids=token_type_ids)

      max_probs, predicted = torch.max(outputs, 1)
      tot = tot + targets.size(0)
      correct_labels = correct_labels + torch.sum(predicted==targets)

      print(f"Batch Index: {i}\tPredicted: {predicted}\tTargets: {targets}")
    """
    basic metrics for accuracy calculation
    """
    accuracy = correct_labels / tot * 100
    print(accuracy)
  return accuracy

In [15]:
class CompleteModel(nn.Module):
  """
  The model architecture is defined here which is a fully connected layer + normalization on top of a BERT model
  """

  def __init__(self, bert):
    super(CompleteModel, self).__init__()
    self.bert = BertModel.from_pretrained(bert)
    self.drop = nn.Dropout(p=0.25)
    self.out = nn.Linear(self.bert.config.hidden_size, 2) # Number of output classes = 3, positive, negative and N(none)

  def forward(self, ids, mask, token_type_ids):
    _, pooled_output = self.bert(ids, attention_mask=mask, token_type_ids=token_type_ids, return_dict=False)
    output = self.drop(pooled_output)
    return self.out(output)

In [23]:
def run():
  training_set_path = "train.csv"
    #validation_set_path = '/content/drive/MyDrive/dataset/dev/' + str(location) + '_' + str(aspect) + '.csv'
  df_train = pd.read_csv(training_set_path)
    #df_valid = pd.read_csv(validation_set_path)
      
  df_train['target'] = df_train['target']
   # df_valid['target'] = df_valid['target'].map(sentiment_mapping)
  df_train = df_train.reset_index(drop=True)
   # df_valid = df_valid.reset_index(drop=True)
  tokenizer = BertTokenizer.from_pretrained(bert_model)
  train_dataset = Tokenize_dataset(
        text = df_train['text'].values,
        targets = df_train['target'].values,
        tokenizer = tokenizer,
        max_len = train_maxlen
  )
  class_counts = []
  for i in range(3):
    class_counts.append(df_train[df_train['target']==i].shape[0])
  print(f"Class Counts: {class_counts}")
      
  num_samples = sum(class_counts)
  print(num_samples)
  labels = df_train['target'].values
  class_weights = []
  for i in range(len(class_counts)):
      if class_counts[i] != 0:
          class_weights.append(num_samples/class_counts[i])
      else:
          class_weights.append(0)
  weights = [class_weights[labels[i]] for i in range(int(num_samples))]
  sampler = torch.utils.data.sampler.WeightedRandomSampler(torch.DoubleTensor(weights), int(num_samples))
  train_data_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size = batch_size,
        shuffle = False,
        sampler = sampler
    )
  #valid_dataset = Tokenize_dataset(
    #    text = df_valid['text'].values,
     #   targets = df_valid['sentiment'].values,
    #   tokenizer = tokenizer,
     #   max_len = dev_maxlen
   # )
   # valid_data_loader = torch.utils.data.DataLoader(
    #    valid_dataset,
     #   batch_size = batch_size,
   #     shuffle = False
#    )
  device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
  print(f"Device: {device}")
  model = CompleteModel(bert_model).to(device)
  optimizer = AdamW(model.parameters(), lr=learning_rate)
  scheduler = lr_scheduler.StepLR(
        optimizer,
        step_size = 1,
        gamma = 0.8
    )
  for epoch in range(epochs):
    train_function(data_loader=train_data_loader, model=model, optimizer=optimizer, device=device)
        #accuracy = eval_function(data_loader=valid_data_loader, model=model, device=device, location=location, aspect=aspect)
  print("\nEpoch = "+ str(epoch))
  print("\nLearning Rate = " + str(scheduler.get_lr()[0])+"\n")
  scheduler.step()
  torch.save(model, 'NLP_disaster' + '/'+ str(epoch) + '.bin')
  
if __name__ == "__main__":
  run()

Class Counts: [4342, 3271, 0]
7613
Device: cuda:0


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max 

Batch index = 10	Running Loss = 0.7792901515960693
Batch index = 20	Running Loss = 0.6368680596351624
Batch index = 30	Running Loss = 0.5220463186502456
Batch index = 40	Running Loss = 0.47894205451011657
Batch index = 50	Running Loss = 0.4591632753610611
Batch index = 60	Running Loss = 0.4566039204597473
Batch index = 70	Running Loss = 0.3605636447668076
Batch index = 80	Running Loss = 0.4686270236968994
Batch index = 90	Running Loss = 0.418293134868145
Batch index = 100	Running Loss = 0.4143214523792267
Batch index = 110	Running Loss = 0.47180355489254
Batch index = 120	Running Loss = 0.41585087925195696
Batch index = 130	Running Loss = 0.3955050319433212
Batch index = 140	Running Loss = 0.37610195875167846
Batch index = 150	Running Loss = 0.44535134732723236
Batch index = 160	Running Loss = 0.4521370127797127
Batch index = 170	Running Loss = 0.4811849623918533
Batch index = 180	Running Loss = 0.36152465343475343
Batch index = 190	Running Loss = 0.3512241765856743
Batch index = 200	R

KeyboardInterrupt: 

In [17]:
MAX_LEN = 140

tokenizer = BertTokenizer.from_pretrained(bert_model)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

df = pd.read_csv("test.csv")
result = []
idees = []
model = torch.load("NLP_disaster/9.bin")
for i in range(len(df)):
  id_test = df.loc[i, 'id']
  text = df.loc[i,'text']


  inputs = tokenizer.encode_plus(
          str(text),
          add_special_tokens = True,
          max_length = MAX_LEN,
          pad_to_max_length = True,
      )
  ids = torch.tensor(inputs["input_ids"], dtype=torch.long).unsqueeze(0)
  mask = torch.tensor(inputs["attention_mask"], dtype=torch.long).unsqueeze(0)
  token_type_ids = torch.tensor(inputs["token_type_ids"], dtype=torch.long).unsqueeze(0)

  ids = ids.to(device, dtype=torch.long)
  mask = mask.to(device, dtype=torch.long)
  token_type_ids = token_type_ids.to(device, dtype=torch.long)

#model = models_set[f"{location}{aspect}"]
  outputs = model(ids=ids, mask=mask, token_type_ids=token_type_ids)
  prob_max, predicted = torch.max(outputs, 1)

  predicted = predicted.detach().cpu().numpy()

        # Add the predicted to the json only if it is not N(none)
        # Reverse mapping from numbers to sentiments
  idees.append(id_test)
  result.append(predicted[0])

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [18]:
print(idees)

[0, 2, 3, 9, 11, 12, 21, 22, 27, 29, 30, 35, 42, 43, 45, 46, 47, 51, 58, 60, 69, 70, 72, 75, 84, 87, 88, 90, 94, 99, 101, 103, 106, 108, 111, 115, 116, 122, 123, 124, 125, 127, 140, 142, 147, 148, 150, 152, 154, 155, 166, 167, 169, 177, 179, 181, 186, 188, 189, 192, 200, 202, 206, 207, 214, 217, 223, 224, 227, 228, 230, 233, 234, 236, 239, 250, 255, 257, 259, 275, 278, 282, 284, 286, 288, 292, 295, 300, 304, 305, 306, 308, 311, 317, 319, 323, 324, 325, 326, 333, 339, 342, 343, 350, 351, 357, 359, 362, 366, 367, 369, 373, 374, 376, 377, 378, 379, 382, 385, 387, 388, 391, 392, 395, 399, 400, 403, 405, 408, 411, 414, 416, 417, 422, 425, 428, 430, 431, 433, 434, 439, 441, 449, 458, 460, 464, 473, 488, 491, 494, 497, 500, 505, 507, 508, 510, 511, 515, 525, 529, 532, 534, 537, 539, 541, 545, 547, 548, 549, 553, 554, 555, 557, 562, 566, 572, 573, 582, 586, 587, 590, 591, 593, 595, 596, 597, 601, 602, 605, 610, 616, 618, 620, 626, 627, 629, 632, 634, 639, 645, 647, 648, 650, 663, 666, 668, 670

In [19]:
print(result)

[0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [20]:
dt = pd.DataFrame(list(zip(idees, result)), columns = ['id', 'target'])

In [21]:
dt.head()

Unnamed: 0,id,target
0,0,0
1,2,1
2,3,1
3,9,1
4,11,1


In [22]:
dt.to_csv('submissions.csv')

In [27]:
from torchsummary import summary

summary(model, (30522, 768))

TypeError: forward() missing 2 required positional arguments: 'mask' and 'token_type_ids'

In [26]:
print(model)

CompleteModel(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_aff