# BERT test

## 0. import libs

In [23]:
import os
import numpy as np
import pandas as pd
from tqdm import tqdm

import torch
import torch.nn as nn
from torch.utils.data import DataLoader

from transformers import AutoTokenizer

from utils import getData, clean
from data import DataProcessor
from model import AutoRegressor

## 1. Set model

In [2]:
# set cofig
CONFIG = dict(
    seed = 12345,
    pretrained_model = 'bert-base-uncased',
    output_dir = '../models/bert_regression_mini',
    train_file = '4th/v0/train.csv',
    dev_file = '4th/v0/dev.csv',
    train_batch_size = 32,
    dev_batch_size = 32,
    lr = 5e-5,
    epochs = 5,
    num_class = 1,
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu"),
    device_ids = [0,1]
)

### load pretrained model & dataset

In [26]:
# # download transformers pretrained model
# tokenizer = AutoTokenizer.from_pretrained(CONFIG['pretrained_model'])
# tokenizer.save_pretrained(os.path.join(CONFIG['output_dir'], 'tokenizer.pt'))
# bert = AutoModel.from_pretrained(CONFIG['pretrained_model'])
# bert.save_pretrained(os.path.join(CONFIG['output_dir'], 'bert.pt'))

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


ModuleAttributeError: 'BertRegressor' object has no attribute 'save_pretrained'

In [3]:
# init bert pretrained model
tokenizer = AutoTokenizer.from_pretrained(CONFIG['pretrained_model'])
model = AutoRegressor(CONFIG['pretrained_model'], CONFIG['num_class'])

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


## 2. Load dataset

In [4]:
# load dataset
# train dataset
train_df = getData(data_path=CONFIG['train_file'])
# data processing with tokenizing
train_data = DataProcessor(train_df, tokenizer, is_eval=False)
train_dataloader = DataLoader(train_data, batch_size=CONFIG['train_batch_size'], shuffle=True, num_workers=4)

# dev dataset
dev_df = getData(data_path=CONFIG['dev_file'])
# data processing with tokenizing
dev_data = DataProcessor(dev_df, tokenizer, is_eval=False)
dev_dataloader = DataLoader(dev_data, batch_size=CONFIG['dev_batch_size'], shuffle=True, num_workers=4)

Read 4th/v0/train.csv ...
Read 4th/v0/dev.csv ...


## 3. Train or Predict

In [6]:
def train(model, epochs, train_dataloader, dev_dataloader, criterion, optimizer, scheduler, device, output_dir):

    torch.cuda.empty_cache()
    model.train()
    for epoch_num in range(epochs):

            total_loss_train = 0.0

            print(f"[Epochs : {epoch_num+1}/{epochs}]")
            for i, (train_input, train_label) in enumerate(tqdm(train_dataloader)):
                input_id = train_input['input_ids'].squeeze(1).to(device)
                mask = train_input['attention_mask'].squeeze(1).to(device)

                output = model(input_id, mask)
                output = torch.squeeze(output, 1)
                del input_id
                del mask
                
                train_label = train_label.to(device)
                batch_loss = criterion(output.float(), train_label.float())
                del train_label

                total_loss_train += batch_loss.item()

                optimizer.zero_grad()
                batch_loss.backward()
                optimizer.step()
                scheduler.step()

                if i%10000 == 0:  
                    print(f'Epochs: {epoch_num + 1} | Train Loss: {batch_loss: .3f}')
                    torch.save(model.state_dict(),\
                            os.path.join(output_dir, f'bert_regression-{epoch_num+1}-{i}.pt'))

            # validate using our dev set 
            model.eval()
            total_loss_dev = 0.0

            with torch.no_grad():
                for dev_input, dev_label in dev_dataloader:
                    dev_label = dev_label.to(device)
                    input_id = dev_input['input_ids'].squeeze(1).to(device)
                    mask = dev_input['attention_mask'].squeeze(1).to(device)

                    output = model(input_id, mask)
                    output = torch.squeeze(output, 1)

                    batch_loss = criterion(output.float(), dev_label.float())
                    total_loss_dev += batch_loss.item()

                    del dev_label
                    del input_id
                    del mask
            
            print(
                f'Epochs: {epoch_num + 1} | Train Loss: {total_loss_train / len(train_dataloader): .3f} \
                | Val Loss: {total_loss_dev / len(dev_dataloader): .3f}')

            torch.save(model.state_dict(),\
                    os.path.join(output_dir, f'bert_regression-{epoch_num+1}-{len(train_dataloader)}.pt'))

### load pretrained model

In [4]:
checkpoint = torch.load(os.path.join(CONFIG['output_dir'], 'model_ckpt.pt'))
# checkpoint = torch.load(os.path.join(CONFIG['output_dir'], 'model_ckpt-55587.pt'))
model.load_state_dict(checkpoint)
# if torch.cuda.device_count() > 1:
#     model = nn.DataParallel(model, device_ids=CONFIG['device_ids'])
model.to(CONFIG['device'])

BertRegressor(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True

## 4. Prediction

### Load Validation and Test data

In [5]:
# Validation data 
df_val = getData(data_path="4th/validation_cleaned.csv")
# Test data
df_sub = getData(data_path="4th/comments_to_score.csv")

Read 4th/validation_cleaned.csv ...
Read 4th/comments_to_score.csv ...


In [25]:
df_sub = clean(df_sub, 'text')
df_sub.head(5)

  data = data.str.replace(r'([a-zA-Z]+)([/!?.])([a-zA-Z]+)',r'\1 \2 \3')
  data = data.str.replace(r'([*!?\'])\1\1{2,}',r'\1\1\1')
  data = data.str.replace(r'([*!?\']+)',r' \1 ')
  data = data.str.replace(r'([a-zA-Z])\1{2,}\b',r'\1\1')
  data = data.str.replace(r'([a-zA-Z])\1\1{2,}\B',r'\1\1\1')
  data = data.str.replace(r'[ ]{2,}|\n',' ')
  data = data.str.replace(pattern, '')


0        Gjalexei, you asked about whether there is...
1    Looks like be have an abuser , can you please ...
2    I confess to having complete (and apparently b...
3      Freud ' s ideas are certainly much discussed...
4    It is not just you. This is a laundry list of ...
Name: text, dtype: object

In [6]:
val1_data = DataProcessor(df_val['less_toxic'], tokenizer, is_eval=True)
val1_dataloader = DataLoader(val1_data, batch_size=CONFIG['dev_batch_size'], shuffle=False, num_workers=4)

val2_data = DataProcessor(df_val['more_toxic'], tokenizer, is_eval=True)
val2_dataloader = DataLoader(val2_data, batch_size=CONFIG['dev_batch_size'], shuffle=False, num_workers=4)

In [7]:
def predict(model, val_dataloader, device):

	torch.cuda.empty_cache()
	model.eval()

	outputs = []
	with torch.no_grad():
		for i, val_input in enumerate(tqdm(val_dataloader)):
			input_id = val_input['input_ids'].squeeze(1).to(device)
			mask = val_input['attention_mask'].squeeze(1).to(device)

			output = model(input_id, mask)
			if i == 0:
				print(output.shape)
			#output = torch.squeeze(output, 1)
			outputs.extend(output.detach().cpu().numpy())

			del input_id
			del mask

	return outputs

## 3. Validation
- final validation and submission

In [8]:
device = CONFIG['device']
print('predict less toxic')
p1 = predict(model, val1_dataloader, device)	
print('predict more toxic')
p2 = predict(model, val2_dataloader, device)	

  0%|          | 0/941 [00:00<?, ?it/s]

predict less toxic
torch.Size([32, 1])


100%|██████████| 941/941 [02:47<00:00,  5.63it/s]
  0%|          | 0/941 [00:00<?, ?it/s]

predict more toxic
torch.Size([32, 1])


100%|██████████| 941/941 [02:48<00:00,  5.60it/s]


In [17]:
p1 = np.asarray(p1)
p2 = np.asarray(p2)

In [18]:
print(f'Validation Accuracy is { np.round((p1 < p2).mean() * 100,2)}')

Validation Accuracy is 66.23


### Analyze bad predictions

In [19]:
df_val['p1'] = p1
df_val['p2'] = p2
df_val['diff'] = np.abs(p2 - p1)
df_val['correct'] = (p1 < p2).astype('int')

In [20]:
### Incorrect predictions with similar scores
df_val[df_val.correct == 0].sort_values('diff', ascending=True).head(20)

Unnamed: 0,worker,less_toxic,more_toxic,p1,p2,diff,correct
22995,554,Good move on your part by listing the Billy R...,User:Lagoo Sab is a sockpuppet of User:Ketabt...,0.09423,0.094181,4.9e-05,0
22996,316,Good move on your part by listing the Billy R...,User:Lagoo Sab is a sockpuppet of User:Ketabt...,0.09423,0.094181,4.9e-05,0
14357,302,ASK YOUR MOTHER <<<>>,Bisexual pride flag the bisexual pride flag...,0.091989,0.091826,0.000164,0
14356,208,ASK YOUR MOTHER <<<>>,Bisexual pride flag the bisexual pride flag...,0.091989,0.091826,0.000164,0
14355,393,ASK YOUR MOTHER <<<>>,Bisexual pride flag the bisexual pride flag...,0.091989,0.091826,0.000164,0
21725,675,". (Ah, did I hurt you ? ) Just call this a res...",YOUR ZEALOUS CONTRIBUTIONS TO THE DAHN YOGA ...,0.101272,0.101091,0.000181,0
21724,74,". (Ah, did I hurt you ? ) Just call this a res...",YOUR ZEALOUS CONTRIBUTIONS TO THE DAHN YOGA ...,0.101272,0.101091,0.000181,0
21723,417,". (Ah, did I hurt you ? ) Just call this a res...",YOUR ZEALOUS CONTRIBUTIONS TO THE DAHN YOGA ...,0.101272,0.101091,0.000181,0
8057,485,Tens of thousands of innocent Muslims ? Jaysu...,ps i love to destroy. i guess creating is g...,0.205743,0.205559,0.000184,0
5265,74,im going to punch jim wales one day. ill find ...,hiding behind these bitches wikipedians who ke...,0.502962,0.502755,0.000207,0


In [21]:
### Incorrect predictions with dis-similar scores
df_val[df_val.correct == 0].sort_values('diff', ascending=False).head(20)

Unnamed: 0,worker,less_toxic,more_toxic,p1,p2,diff,correct
25062,246,CaliforniaAliBaba is a Bitch Go fuck yourself ...,I ' ve heard a snippet on their myspace page b...,0.655871,0.01638,0.639491,0
28241,118,you are a stupid fuck and your mother ' s c...,::: According to Stratfordian authority H ....,0.646331,0.014239,0.632091,0
9800,488,"Move it to just Milwaukee, you FAT JACKASS !!!",Vandalism Please refrain from adding nonsense...,0.640709,0.014517,0.626192,0
13424,633,Everybody lock ur dicks cuz theyre gay !!!,Useless This page is not helpful to Wikipedi...,0.610338,-0.01026,0.620598,0
2823,338,Have I made myself clear yet ? It should be p...,Please stop adding nonsense to Wikipedia. It ...,0.620994,0.006543,0.614451,0
17936,146,ya EXTRA Ur fuckin EXTRA tryna bryte up urs...,"::::::Whatever, my policy is that I think that...",0.613233,-0.000233,0.613466,0
14301,193,Warnings you asshole gave me mutiple warnin...,Editor issue I noticed you have had a bit of a...,0.60095,-0.012351,0.613301,0
3331,216,"Hey fuck you cunt bag ! ,you dont own wikpedia...",Please stop your disruptive editing. If you...,0.643886,0.031957,0.611929,0
3330,639,"Hey fuck you cunt bag ! ,you dont own wikpedia...",Please stop your disruptive editing. If you...,0.643886,0.031957,0.611929,0
13641,439,"You can suck my cock, motherfucker ALSO, NI...",Speedy deletion of Allan kamuyu A page you...,0.631355,0.019654,0.611702,0


## 4. Predict on test data

In [26]:
# Predict using pipeline
df_sub['score'] = test_preds_arr.mean(axis=1)

In [27]:
# Cases with duplicates scores
df_sub['score'].count() - df_sub['score'].nunique()

12

In [28]:
same_score = df_sub['score'].value_counts().reset_index()[:10]
same_score

Unnamed: 0,index,score
0,0.572927,2
1,0.464975,2
2,0.303382,2
3,0.022798,2
4,0.230748,2
5,0.130651,2
6,0.0645,2
7,0.124117,2
8,0.230264,2
9,0.14535,2


In [29]:
df_sub[df_sub['score'].isin(same_score['index'].tolist())]

Unnamed: 0,comment_id,text,score
1832,95080362,"""\n\nPlease do not add nonsense to Wikipedia. ...",0.022798
2842,160935265,"""\n\nPlease do not add nonsense to Wikipedia. ...",0.022798
4832,275797183,Hi\n\nCould you please learn to interact like ...,0.0645
4833,275812977,Could you please learn to interact like a sent...,0.0645
5140,298854514,"her!\n\nPoop, pee, toot, fart, gas, diareah!\n...",0.464975
5190,301925517,"her!\n\nPoop, pee, toot, fart, gas, diareah!\n...",0.464975
5752,339478276,I'm gonna beat you to a bloody pulp then sho...,0.230748
5753,339478966,I'm gonna beat you to a bloody pulp then shoo...,0.230748
5832,345043812,JIMBO SAID I COULD EDIT HIS PAGE. YOU ARE A MO...,0.303382
5833,345043888,JIMBO SAID I COULD EDIT HIS PAGE. YOU ARE A M...,0.303382


In [30]:
df_sub.sample(5)

Unnamed: 0,comment_id,text,score
3294,186197494,"""\nFor copying and pasting of what I felt stro...",0.141626
2167,116257386,Dude! \nThat was an attempt at saying somethi...,0.160497
7070,457417171,You simply display your ignorance. Fatuorum,0.274591
4347,242591983,"""\n\nSockpuppetry case\n \nYou have been accus...",0.012607
1370,70880071,Now let's see who's gonna start crying like a ...,0.279953


In [31]:
# save submission
df_sub[['comment_id', 'score']].to_csv("submission.csv", index=False)