In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from transformers import BertTokenizer, BertModel, logging
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

logging.set_verbosity_error()

In [2]:
df_train = pd.read_csv('../input/jigsaw-toxic-comment-classification-challenge/train.csv')
df_train

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0
...,...,...,...,...,...,...,...,...
159566,ffe987279560d7ff,""":::::And for the second time of asking, when ...",0,0,0,0,0,0
159567,ffea4adeee384e90,You should be ashamed of yourself \n\nThat is ...,0,0,0,0,0,0
159568,ffee36eab5c267c9,"Spitzer \n\nUmm, theres no actual article for ...",0,0,0,0,0,0
159569,fff125370e4aaaf3,And it looks like it was actually you who put ...,0,0,0,0,0,0


In [3]:
df_test = pd.read_csv('../input/jigsaw-toxic-severity-rating/comments_to_score.csv')
df_test

Unnamed: 0,comment_id,text
0,114890,"""\n \n\nGjalexei, you asked about whether ther..."
1,732895,"Looks like be have an abuser , can you please ..."
2,1139051,I confess to having complete (and apparently b...
3,1434512,"""\n\nFreud's ideas are certainly much discusse..."
4,2084821,It is not just you. This is a laundry list of ...
...,...,...
7532,504235362,"Go away, you annoying vandal."
7533,504235566,This user is a vandal.
7534,504308177,""" \n\nSorry to sound like a pain, but one by f..."
7535,504570375,Well it's pretty fucking irrelevant now I'm un...


In [4]:
df_sample_submission = pd.read_csv('../input/jigsaw-toxic-severity-rating/sample_submission.csv')
df_sample_submission

Unnamed: 0,comment_id,score
0,114890,0.5
1,732895,0.5
2,1139051,0.5
3,1434512,0.5
4,2084821,0.5
...,...,...
7532,504235362,0.5
7533,504235566,0.5
7534,504308177,0.5
7535,504570375,0.5


In [5]:
def check_imbalance(row):
    toxity = row[2:].sum()
    if toxity > 0:
        return 1
    else:
        return 0

In [6]:
df_train['is_toxic'] = df_train.apply(check_imbalance, axis=1)
sample_numb = len(df_train.loc[df_train['is_toxic'] == 0]) - len(df_train.loc[df_train['is_toxic'] == 1])
not_toxic_df = df_train.loc[df_train['is_toxic'] == 0].drop('is_toxic', axis=1).reset_index(drop=True)
toxic_df = df_train.loc[df_train['is_toxic'] == 1].sample(n=sample_numb, replace=True, random_state=0, axis=0).drop('is_toxic', axis=1).reset_index(drop=True)
oversampled_df = pd.concat([not_toxic_df, toxic_df], axis=0)
oversampled_df.index = range(len(oversampled_df))
oversampled_df

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0
...,...,...,...,...,...,...,...,...
270462,b1466d2e90cefed3,Mctrain \n\nThis talk of the article being a h...,1,0,1,0,0,0
270463,4920382ef3caf461,"Fuck Andrew Quah, father of the bonsai and chi...",1,0,1,0,0,0
270464,067ca425add7d922,suck my dick. suck mybig harry ballsack and sh...,1,0,1,0,1,0
270465,38d8bff6a474f8b1,What the fuck? \n\nCluebot NG is in your house...,1,0,1,0,0,0


In [7]:
category_weights = {
    'toxic': 0.5,
    'severe_toxic': 1.5,
    'obscene': 0.25,
    'threat': 1.5,
    'insult': 0.8,
    'identity_hate': 1.5
}

for category, weight in category_weights.items():
    oversampled_df[category] = oversampled_df[category] * weight

oversampled_df['score'] = oversampled_df.drop(['id', 'comment_text'], axis=1).mean(axis=1)
oversampled_df

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,score
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0.0,0.0,0.00,0.0,0.0,0.0,0.000000
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0.0,0.0,0.00,0.0,0.0,0.0,0.000000
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0.0,0.0,0.00,0.0,0.0,0.0,0.000000
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0.0,0.0,0.00,0.0,0.0,0.0,0.000000
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0.0,0.0,0.00,0.0,0.0,0.0,0.000000
...,...,...,...,...,...,...,...,...,...
270462,b1466d2e90cefed3,Mctrain \n\nThis talk of the article being a h...,0.5,0.0,0.25,0.0,0.0,0.0,0.125000
270463,4920382ef3caf461,"Fuck Andrew Quah, father of the bonsai and chi...",0.5,0.0,0.25,0.0,0.0,0.0,0.125000
270464,067ca425add7d922,suck my dick. suck mybig harry ballsack and sh...,0.5,0.0,0.25,0.0,0.8,0.0,0.258333
270465,38d8bff6a474f8b1,What the fuck? \n\nCluebot NG is in your house...,0.5,0.0,0.25,0.0,0.0,0.0,0.125000


In [8]:
#tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
tokenizer = BertTokenizer.from_pretrained('../input/bert-uncased')
train_df, val_df = train_test_split(oversampled_df, test_size=0.2, random_state=0, shuffle=True)
train_df.index = range(len(train_df))
val_df.index = range(len(val_df))
print(f'train_len: {len(train_df)}, val_len: {len(val_df)}')

train_len: 216373, val_len: 54094


In [9]:
class TextDataset(Dataset):
    def __init__(self, data, tokenizer, max_length, is_test=False):
        super(TextDataset, self).__init__()
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.is_test = is_test
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, index):
        x = self.data.iloc[index, 1]
        if self.is_test:
            targets = torch.tensor(self.data.iloc[index, 0])
        else:
            targets = torch.tensor(self.data.iloc[index, -1])
        
        encoded = self.tokenizer(x, add_special_tokens=True, max_length=self.max_length,
                                return_token_type_ids=False, padding='max_length',
                                truncation=True, return_attention_mask=True,
                                return_tensors='pt')
        
        input_ids = encoded['input_ids'].squeeze()
        attention_mask = encoded['attention_mask'].squeeze()
        
        return input_ids, attention_mask, targets

In [10]:
train_dataset = TextDataset(train_df, tokenizer, max_length=256)
val_dataset = TextDataset(val_df, tokenizer, max_length=256)
test_dataset = TextDataset(df_test, tokenizer, max_length=256, is_test=True)

BATCH_SIZE = 16
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
train_loader = DataLoader(dataset=train_dataset, batch_size=BATCH_SIZE, shuffle=True, pin_memory=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=BATCH_SIZE, pin_memory=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=BATCH_SIZE, pin_memory=True)

#bert_model = BertModel.from_pretrained('bert-base-uncased')
bert_model = BertModel.from_pretrained('../input/bert-base-uncased')

In [11]:
class TextNet(nn.Module):
    def __init__(self, bert_model):
        super(TextNet, self).__init__()
        self.bert_model = bert_model
        self.fc = nn.Linear(768, 1)
    
    def forward(self, input_ids, attention_mask):
        out = self.bert_model(input_ids, attention_mask, return_dict=True)['pooler_output']
        return self.fc(out)

In [12]:
def train_epoch(model, train_loader, criterion, optimizer, DEVICE):
    model.train()
    
    losses = []
    
    for data in tqdm(train_loader):
        input_ids, attention_mask, targets = data
        input_ids = input_ids.to(DEVICE)
        attention_mask = attention_mask.to(DEVICE)
        targets = targets.to(DEVICE)

        output = model(input_ids, attention_mask)

        loss = criterion(output.squeeze().float(), targets.float())
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        losses.append(loss.item())

    return np.mean(losses)

In [13]:
def val_epoch(model, val_loader, criterion, DEVICE):
    model.eval()
    
    losses = []
    
    with torch.no_grad():
        for data in tqdm(val_loader):
            input_ids, attention_mask, targets = data
            input_ids = input_ids.to(DEVICE)
            attention_mask = attention_mask.to(DEVICE)
            targets = targets.to(DEVICE)

            output = model(input_ids, attention_mask)

            loss = criterion(output.squeeze().float(), targets.float())

            losses.append(loss.item())

    return np.mean(losses)

In [14]:
def make_submission(model, test_loader, DEVICE, submission_data):
    model.eval()
    
    current_ind = 0
    
    for data in tqdm(test_loader):
        input_ids, attention_mask, _ = data
        input_ids = input_ids.to(DEVICE)
        attention_mask = attention_mask.to(DEVICE)
        
        preds = model(input_ids, attention_mask).cpu().tolist()
        submission_data.loc[current_ind:current_ind + len(preds) - 1, 'score'] = preds
        current_ind += len(preds)
    
    print(f'submission_data: {submission_data}')
    
    submission_data.to_csv('submission.csv', index=False)

In [15]:
EPOCHS = 1
LEARNING_RATE = 2e-5

criterion = nn.MSELoss()

model = TextNet(bert_model).to(DEVICE)

submission_data = df_test[['comment_id']]
submission_data['score'] = 0.0

optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

best_val_loss = np.inf

for epoch in range(EPOCHS):
    print(f'Epoch: {epoch+1}/{EPOCHS}')
    print('-' * 10)

    print('Training')
    train_loss = train_epoch(model, train_loader, criterion, optimizer, DEVICE)

    print('Validating')
    val_loss = val_epoch(model, val_loader, criterion, DEVICE)

    print(f'Train Loss: {train_loss}\t Val Loss: {val_loss}')

    if val_loss < best_val_loss:
        best_val_loss = val_loss
        torch.save(model.state_dict(), 'toxicity_best_model.pth.tar')

print('Make submission')
make_submission(model, test_loader, DEVICE, submission_data)

Epoch: 1/1
----------
Training


100%|██████████| 13524/13524 [1:44:46<00:00,  2.15it/s]


Validating


100%|██████████| 3381/3381 [10:26<00:00,  5.40it/s]


Train Loss: 0.004002951711843482	 Val Loss: 0.0016211633204258307
Make submission


100%|██████████| 472/472 [01:32<00:00,  5.11it/s]

submission_data:       comment_id     score
0         114890  0.010086
1         732895  0.010412
2        1139051  0.034749
3        1434512  0.010657
4        2084821  0.098227
...          ...       ...
7532   504235362  0.112159
7533   504235566  0.101456
7534   504308177  0.020198
7535   504570375  0.119463
7536   504598250  0.013401

[7537 rows x 2 columns]



