In [1]:
import os
import sys
import pandas as pd
import matplotlib.pyplot as plt
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch
from transformers import AutoTokenizer, BertModel

In [2]:
from tqdm import tqdm

In [3]:
# Filesystem locations: the disaster-tweets CSV directory and the local
# bert-base-uncased checkpoint directory.
data_root = "/data/koushurui/Data/kaggle/disaster-tweets"
pipe_root = "/data/koushurui/Code/LLMS/google-bert/bert-base-uncased"
# All three CSV files sit directly under data_root.
train_pth, test_pth, sample_pth = (
    os.path.join(data_root, csv_name)
    for csv_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

In [4]:
# Read both splits into memory and record how many rows each one has.
train_csv, test_csv = pd.read_csv(train_pth), pd.read_csv(test_pth)
num_train_examples, num_test_examples = len(train_csv), len(test_csv)

In [5]:
class MyDataset(Dataset):
    """Tweet dataset that tokenizes one CSV row per item.

    Args:
        data_root: Directory containing ``train.csv`` and ``test.csv``.
        model_root: Path handed to ``AutoTokenizer.from_pretrained``.
        subset: ``'train'`` reads ``train.csv`` and yields the ``target``
            column as the label; any other value reads ``test.csv``.

    Each item is a 2-tuple ``(features, label)`` where ``features`` is
    ``(input_ids, attention_mask, token_type_ids)``, each squeezed to drop
    the leading batch dimension added by ``return_tensors='pt'``.
    For non-train subsets ``label`` is the placeholder ``-1``; the 2-tuple
    shape is kept so that ``for data, _ in loader`` loops keep working.
    """

    # Label emitted for rows that have no ground truth (non-train subsets).
    _NO_LABEL = -1

    def __init__(self, data_root, model_root, subset='train') -> None:
        super().__init__()
        self.subset = subset
        csv_name = 'train.csv' if subset == 'train' else 'test.csv'
        self.df_data = pd.read_csv(os.path.join(data_root, csv_name))
        self.tokenizer = AutoTokenizer.from_pretrained(model_root)

    def __len__(self):
        """Number of rows in the loaded CSV."""
        return len(self.df_data)

    def __getitem__(self, index):
        """Tokenize the row at position ``index`` and return ``(features, label)``."""
        # Positional lookup: a DataLoader hands out positions 0..len-1, which
        # only coincide with index labels when the frame has a default RangeIndex.
        text = self.df_data['text'].iloc[index]

        encoded = self.tokenizer(
            text=text,
            padding='max_length',
            truncation=True,
            max_length=128,
            return_tensors='pt',
        )
        features = tuple(
            encoded[key].squeeze(0)
            for key in ('input_ids', 'attention_mask', 'token_type_ids')
        )

        if self.subset == 'train':
            return features, int(self.df_data['target'].iloc[index])
        # Explicit placeholder instead of accidentally returning the features twice.
        return features, self._NO_LABEL

In [6]:
# Training split; the tokenizer is loaded from the checkpoint directory pipe_root.
dataset = MyDataset(data_root, pipe_root, subset='train')

In [7]:
class Params:
    """Hyper-parameters read by the data loaders, the optimizer and the training loop."""

    # Optimization
    epochs = 25
    lr = 2e-5
    weight_decay = 1e-2

    # Data loading
    batch_size = 256
    shuffle = True
    num_workers = 0
    

In [8]:
# Batched iterator over the training set, configured entirely from Params.
train_iter = DataLoader(
    dataset,
    batch_size=Params.batch_size,
    shuffle=Params.shuffle,
    num_workers=Params.num_workers,
)

In [9]:
class BertClassifer(nn.Module):
    """BERT encoder followed by dropout and a linear classification head.

    Args:
        model_path: Path handed to ``BertModel.from_pretrained``.
        num_classes: Number of output logits (default 2).
        dropout: Dropout probability applied to the pooled output (default 0.5).
    """

    def __init__(self, model_path, num_classes=2, dropout=0.5) -> None:
        super().__init__()
        self.bert = BertModel.from_pretrained(model_path)
        # Size the head from the loaded checkpoint's config rather than
        # assuming a 768-wide encoder, so other BERT sizes work unchanged.
        self.linear = nn.Linear(self.bert.config.hidden_size, num_classes)
        self.dropout = nn.Dropout(dropout)

    def forward(self, X):
        """Return class logits for a batch.

        Args:
            X: Indexable of ``(input_ids, attention_mask, token_type_ids)``.

        Returns:
            The output of the linear head applied to the dropped-out pooled
            encoder output.
        """
        input_ids, attention_mask, token_type_ids = X[0], X[1], X[2]
        encoder_out = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            return_dict=False,
        )
        # With return_dict=False the encoder returns a tuple; index 1 is the
        # pooled output. Indexing (rather than 2-way unpacking) tolerates
        # extra trailing elements.
        pooled = encoder_out[1]
        return self.linear(self.dropout(pooled))

In [10]:
# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [11]:
# Classifier, objective and optimizer. AdamW is given every parameter of the
# model, so the encoder is fine-tuned together with the linear head.
model = BertClassifer(pipe_root)
loss = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=Params.lr,
    weight_decay=Params.weight_decay,
)

In [12]:
# Move the model and the loss module to the target device, then wrap the model
# in DataParallel when more than one CUDA device is reported.
model = model.to(device)
loss = loss.to(device)
if torch.cuda.device_count() > 1:
    model = nn.DataParallel(model)

In [13]:
# Fine-tune for Params.epochs passes over train_iter, showing the running
# batch loss on the progress bar and the epoch average once the pass is done.
for epoch in range(Params.epochs):
    # Re-enable dropout in case an evaluation cell was run before this one.
    model.train()
    processor = tqdm(train_iter, dynamic_ncols=True)
    total_loss = 0.0
    it = 0
    for data, label in processor:
        label = label.to(device)
        data = (data[0].to(device), data[1].to(device), data[2].to(device))

        optimizer.zero_grad()
        output = model(data)
        ls = loss(output, label)
        ls.backward()
        optimizer.step()

        batch_loss = ls.item()
        total_loss += batch_loss
        it += 1
        # Iterating the bar already advances it, so only the description is
        # refreshed here; a manual update() would count each batch again.
        processor.set_description(f"epoch: {epoch+1}, loss: {batch_loss}")

    # The bar is closed once its iterable is exhausted, so a description set
    # after the loop is never rendered. Print the epoch average instead, and
    # skip it when the loader produced no batches.
    if it:
        tqdm.write(f"epoch: {epoch+1}, mean loss: {total_loss / it}")

epoch: 1, loss: 0.35335609316825867: 100%|██████████| 30/30 [00:36<00:00,  1.20s/it]
epoch: 2, loss: 0.3745112717151642: 100%|██████████| 30/30 [00:22<00:00,  1.33it/s] 
epoch: 3, loss: 0.3581593334674835: 100%|██████████| 30/30 [00:22<00:00,  1.33it/s] 
epoch: 4, loss: 0.2701689600944519: 100%|██████████| 30/30 [00:22<00:00,  1.33it/s] 
epoch: 5, loss: 0.1919027417898178: 100%|██████████| 30/30 [00:22<00:00,  1.34it/s] 
epoch: 6, loss: 0.1446167528629303: 100%|██████████| 30/30 [00:22<00:00,  1.33it/s] 
epoch: 7, loss: 0.07978209108114243: 100%|██████████| 30/30 [00:22<00:00,  1.34it/s]
epoch: 8, loss: 0.053553394973278046: 100%|██████████| 30/30 [00:22<00:00,  1.34it/s]
epoch: 9, loss: 0.05055006220936775: 100%|██████████| 30/30 [00:22<00:00,  1.33it/s] 
epoch: 10, loss: 0.045316994190216064: 100%|██████████| 30/30 [00:22<00:00,  1.33it/s]
epoch: 11, loss: 0.028251200914382935: 100%|██████████| 30/30 [00:22<00:00,  1.33it/s]
epoch: 12, loss: 0.03933781385421753: 100%|██████████| 30/3

In [14]:
# Test split: same tokenizer checkpoint, rows read from test.csv under data_root.
test_dataset = MyDataset(data_root, pipe_root, subset='test')

In [15]:
# Load the sample submission file, then display the test frame
# (the bare last expression is what the cell renders).
sample_csv = pd.read_csv(sample_pth)
test_csv

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan
...,...,...,...,...
3258,10861,,,EARTHQUAKE SAFETY LOS ANGELES ÛÒ SAFETY FASTE...
3259,10865,,,Storm in RI worse than last hurricane. My city...
3260,10868,,,Green Line derailment in Chicago http://t.co/U...
3261,10874,,,MEG issues Hazardous Weather Outlook (HWO) htt...


In [16]:
# Unshuffled loader, so batches arrive in the row order of the test dataset.
test_iter = DataLoader(
    test_dataset,
    batch_size=Params.batch_size,
    shuffle=False,
    num_workers=0,
)

In [17]:
# Predict a class for every test tweet and write the submission CSV.
model.eval()  # switch dropout off for inference
batch_preds = []
with torch.no_grad():
    for batch in test_iter:
        # Only the feature triple is needed; the second element yielded by
        # the dataset is ignored.
        data = batch[0]
        data = (data[0].to(device), data[1].to(device), data[2].to(device))
        output = model(data)
        batch_preds.append(torch.argmax(output, dim=1).cpu())

pred = torch.cat(batch_preds).numpy()
# Take ids from the very frame the loader iterated over, instead of computing
# offsets into a separately loaded frame with i * batch_size + idx.
test_ids = test_dataset.df_data['id'].to_numpy()
assert len(pred) == len(test_ids), "prediction count does not match number of test rows"

df = {"id": test_ids, "target": pred}
pd.DataFrame(df).to_csv('/home/koushurui/Documents/Code/Kaggle/disaster-tweets/submission.csv', index=False)

In [18]:
# Re-read the submission from the absolute path the prediction cell writes to,
# so this works from any working directory, and show its last 100 rows.
# tail(100) includes the final row, which the slice [-100:-1] left out.
submission = pd.read_csv('/home/koushurui/Documents/Code/Kaggle/disaster-tweets/submission.csv')
submission.tail(100)

Unnamed: 0,id,target
3163,10495,1
3164,10497,1
3165,10501,1
3166,10504,0
3167,10507,1
...,...,...
3257,10858,1
3258,10861,1
3259,10865,1
3260,10868,1


In [19]:
# Display the sample submission frame (id / target columns) for comparison with the generated file.
sample_csv

Unnamed: 0,id,target
0,0,0
1,2,0
2,3,0
3,9,0
4,11,0
...,...,...
3258,10861,0
3259,10865,0
3260,10868,0
3261,10874,0
