In [1]:
import os
import gc
import re
import sys
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.model_selection import train_test_split
import torch
from torch import nn
from torch.nn import functional as F
import transformers
from torch.utils.data import DataLoader, Dataset
from torch.utils.data import Subset

from sklearn.model_selection import train_test_split, StratifiedKFold, KFold

In [2]:
class CONFIG:
    """Namespace of notebook-wide settings, accessed as ``CONFIG.<NAME>``.

    The class is never instantiated; it only groups constants.
    """
    # Directory path for model files (not referenced by the cells visible here).
    MODEL_PATH = './model/'
    # Presumably a checkpoint interval; no visible cell reads it — TODO confirm.
    SAVE_EVERY = 10
    # Original (misspelled) name, kept as an alias so existing references keep working.
    SAVE_EVERT = SAVE_EVERY
    # Number of training epochs.
    EPOCHS = 20
    # Mini-batch size handed to the DataLoader helpers.
    BATCH_SIZE = 32
    # Optimizer learning rate.
    LEARNING_RATE = 1e-5
    # Hold-out fraction; only referenced by a commented-out train_test_split call.
    TRAIN_TEST_SPLIT = 0.3

In [3]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
print("Device:", device)

# Read the competition CSVs: `df` holds the labelled tweets, `test` the unlabelled ones.
print("loading data")
df = pd.read_csv("../input/nlp-getting-started/train.csv")
test = pd.read_csv('../input/nlp-getting-started/test.csv')

# Earlier hold-out split / submission loading, left disabled:
# train_df, valid_df = train_test_split(df, test_size=CONFIG.TRAIN_TEST_SPLIT, random_state=42)
# test_df = pd.read_csv("../input/nlp-getting-started/test.csv")
# sub = pd.read_csv("../input/nlp-getting-started/sample_submission.csv")

Device: cuda
loading data


In [4]:
# Peek at the first rows of the raw training frame (head() defaults to five rows).
df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [5]:
# Emoji patterns
# Character class matching one or more consecutive emoji / pictographic symbols.
# Every entry is an explicit code-point range. The former catch-all range
# U+24C2..U+1F251 is intentionally NOT used: it spans most of the Basic
# Multilingual Plane (CJK ideographs, Hangul, kana, ...) and therefore deleted
# ordinary non-emoji text.
emoji_pattern = re.compile("["
         u"\U0001F600-\U0001F64F"  # emoticons
         u"\U0001F300-\U0001F5FF"  # symbols & pictographs
         u"\U0001F680-\U0001F6FF"  # transport & map symbols
         u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
         u"\U00002702-\U000027B0"  # dingbats
         u"\U00002600-\U000026FF"  # miscellaneous symbols
         u"\U000024C2"             # circled latin capital letter M
         u"\U0001F200-\U0001F251"  # enclosed ideographic supplement
         "]+", flags=re.UNICODE)

In [6]:
from nltk.tokenize import WordPunctTokenizer
import re
import emoji
from bs4 import BeautifulSoup
import itertools

# Shared tokenizer instance; tweet_cleaner() calls tok.tokenize() on the lower-cased text.
tok = WordPunctTokenizer()
# @-mention: handles may contain underscores, so "_" is part of the character
# class (otherwise "@some_user" would leave "_user" behind).
pat1 = r'@[A-Za-z0-9_]+'
# URL: consume everything up to the next whitespace so that paths and query
# strings containing "-", "_", "?", "=", "%", ... are removed in full.
pat2 = r'https?://\S+'

# ref: https://towardsdatascience.com/another-twitter-sentiment-analysis-bb5b01ebad90
    # removing UTF-8 BOM (Byte Order Mark)

def tweet_cleaner(text):
    """Normalise one raw tweet into a lower-cased, space-separated token string.

    Steps, in order:
      1. Coerce the input to ``str`` (bytes are decoded as UTF-8 with any BOM
         stripped; undecodable bytes become ``?``).
      2. Replace every run of non-ASCII characters with a single space.
      3. Apply ``emoji_pattern`` (see the note in the body).
      4. Let BeautifulSoup turn HTML markup/entities into plain text.
      5. Remove @-mentions (``pat1``) and URLs (``pat2``).
      6. Shorten runs of three or more identical non-digit characters to two.
      7. Lower-case, tokenize with ``tok`` and re-join with single spaces.

    Args:
        text: The tweet as ``str`` or UTF-8 ``bytes``. ``None`` and NaN are
            treated as an empty tweet; any other object is passed through
            ``str()``.

    Returns:
        str: The cleaned tweet.
    """
    # Dispatch on the actual input type instead of a bare ``except:``, which
    # used to hide every error and let undecodable bytes reach the regexes.
    if isinstance(text, (bytes, bytearray)):
        # "utf-8-sig" drops a leading BOM; errors="replace" yields U+FFFD for
        # invalid bytes, which is then mapped to "?".
        decoded = text.decode("utf-8-sig", errors="replace").replace(u"\ufffd", "?")
    elif isinstance(text, str):
        decoded = text
    elif text is None or (isinstance(text, float) and text != text):
        # Missing value (None or NaN): treat as an empty tweet.
        decoded = ""
    else:
        decoded = str(text)

    # Replace consecutive non-ASCII characters with a space.
    ascii_only = re.sub(r'[^\x00-\x7F]+', ' ', decoded)

    # Remove emojis. Every range in emoji_pattern is non-ASCII, so after the
    # substitution above this cannot match; kept to preserve the pipeline order.
    no_emoji = emoji_pattern.sub(r'', ascii_only)

    # HTML handling: strip markup and decode entities such as "&amp;".
    plain_text = BeautifulSoup(no_emoji, 'lxml').get_text()

    # Remove @-mentions, then URLs.
    no_mentions = re.sub(pat1, '', plain_text)
    no_urls = re.sub(pat2, '', no_mentions)

    # Limit character repetition ("looooool" -> "lool"). Digit runs are left
    # alone: squeezing them corrupted numbers ("13,000" became "13,00").
    squeezed_runs = []
    for char, run in itertools.groupby(no_urls):
        run = ''.join(run)
        squeezed_runs.append(run if char.isdigit() else run[:2])
    squeezed = ''.join(squeezed_runs)

    # Tokenize the lower-cased text and re-join to normalise whitespace.
    tokens = tok.tokenize(squeezed.lower())
    return " ".join(tokens).strip()

In [7]:
# Run every raw tweet through tweet_cleaner and store the result in a new column.
df['text_cleaned'] = [tweet_cleaner(raw_tweet) for raw_tweet in df['text']]

In [13]:
df.sample(10)

Unnamed: 0,id,keyword,location,text,target,text_cleaned
7386,10570,windstorm,Houston,newroofandhardyupwindstorminspectiontomorrow,0,new roof and hardy up .. windstorm inspection ...
867,1252,blood,,scotto519happybirthdayyoungblood,0,happy birthday young blood
7210,10329,weapon,,weaponscatalogue,0,weapon ' s catalogue ~
4005,5688,floods,"Ogba, Lagos, Nigeria",apcchieftaintasksdicksononn15bfloodsdonationto...,1,apc chieftain tasks dickson on n15b floods don...
24,36,,,looooool,0,lool
1198,1724,buildings%20burning,Quincy MA,dougmartin17firemanedrunsintoburningbuildingsw...,1,fireman ed runs into burning buildings while o...
7084,10146,upheaval,INDIA,lyfneedsqualityandacertainsenseofsecuritybeing...,0,lyf needs quality and a certain sense of secur...
6285,8978,storm,NC || OR,icecreamcupcakewarsstormcontentsara,0,ice cream + cupcake wars + storm = content sara
7357,10534,wildfire,"Bakersfield, California",#california#wildfiredestroysmorehomesbutcrewsa...,1,# california # wildfire destroys more homes bu...
2406,3464,derailed,"Kwara, Nigeria",ofwhatuseexactlyisthenationalassembly?honestly...,0,of what use exactly is the national assembly ?...


In [None]:
class BERT(nn.Module):
    """Pre-trained BERT encoder followed by dropout and a linear classification head.

    Args:
        bert_model_name: Name or path handed to
            ``transformers.BertModel.from_pretrained``.
        num_labels: Output width of the linear head.
        dropout: Dropout probability applied to the pooled encoder output.
        freeze_bert: NOTE(review): accepted but currently ignored — the encoder
            weights are never frozen. Left as is so existing training runs keep
            their behaviour; confirm whether freezing was intended.
    """

    def __init__(self, bert_model_name, num_labels, dropout=0.1, freeze_bert=True):
        super().__init__()
        self.bert = transformers.BertModel.from_pretrained(bert_model_name)
        self.dropout = nn.Dropout(dropout)
        # Take the head's input width from the loaded checkpoint instead of
        # hard-coding 768, so checkpoints with another hidden size also work.
        self.classifier = nn.Linear(self.bert.config.hidden_size, num_labels)
        self.num_labels = num_labels

    def forward(self, input_ids, attention_mask=None, token_type_ids=None):
        """Return the classification logits for a batch of encoded inputs.

        The three arguments are forwarded unchanged to the wrapped BertModel.
        """
        # With return_dict=False the encoder returns a tuple; only its second
        # element (the pooled output) feeds the classifier.
        _, pooled_output = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            return_dict=False,
        )
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)
        return logits

In [None]:
class DisasterTweetsDataset(Dataset):
    """Dataset that tokenizes the ``text_cleaned`` column of a DataFrame on access.

    Args:
        df: DataFrame with a ``text_cleaned`` column and, when ``train`` is
            true, a ``target`` column.
        tokenizer: Object exposing ``encode_plus`` (a Hugging Face tokenizer).
        max_len: Length every encoded sequence is padded / truncated to.
        train: When true, each item also carries the row's label under
            ``"targets"``.
    """

    def __init__(self, df, tokenizer, max_len=512, train=True):
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.df = df
        self.train = train

    def __len__(self):
        """Return the number of rows in the wrapped DataFrame."""
        return len(self.df)

    def __getitem__(self, idx):
        """Encode the row at position ``idx``.

        Returns:
            dict with ``input_ids``, ``attention_mask`` and ``token_type_ids``
            exactly as returned by the tokenizer (``return_tensors="pt"``),
            plus ``targets`` (a ``torch.long`` scalar tensor) when
            ``self.train`` is true.
        """
        # Fetch the row once instead of repeating the positional lookup.
        row = self.df.iloc[idx]
        text = str(row["text_cleaned"])

        # padding="max_length" + truncation=True is the explicit replacement
        # for the deprecated pad_to_max_length=True: every sequence comes back
        # with exactly max_len tokens.
        inputs = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding="max_length",
            truncation=True,
            return_attention_mask=True,
            return_token_type_ids=True,
            return_tensors="pt",
        )
        item = {
            "input_ids": inputs["input_ids"],
            "attention_mask": inputs["attention_mask"],
            "token_type_ids": inputs["token_type_ids"],
        }
        if self.train:
            item["targets"] = torch.tensor(row["target"], dtype=torch.long)
        return item

In [None]:
# Show the first rows again, now including the text_cleaned column (head() defaults to five rows).
df.head()

In [None]:
# Count the missing values in each column.
df.isna().sum()

In [None]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
# Helper that builds the train and validation data loaders for one fold.
def get_data_loaders(dataset,train_indexes,val_indexes):
    """Create the training and validation DataLoaders for one split.

    Args:
        dataset: Dataset that both index collections refer to.
        train_indexes: Positions of the training samples within ``dataset``.
        val_indexes: Positions of the validation samples within ``dataset``.

    Returns:
        ``(train_dataloader, val_dataloader)``. The training loader draws its
        subset through a ``RandomSampler``, the validation loader through a
        ``SequentialSampler``; both use ``CONFIG.BATCH_SIZE``.
    """
    def _loader_for(indexes, sampler_cls):
        # Restrict the dataset to the given positions, then batch it.
        subset = Subset(dataset, indexes)
        return DataLoader(
            subset,
            sampler=sampler_cls(subset),
            batch_size=CONFIG.BATCH_SIZE,
        )

    train_dataloader = _loader_for(train_indexes, RandomSampler)
    val_dataloader = _loader_for(val_indexes, SequentialSampler)
    return train_dataloader, val_dataloader

In [None]:
# Cross-validation setup: stratified, shuffled splits with a fixed seed.
total_folds = 6
fold = StratifiedKFold(
    n_splits=total_folds,
    shuffle=True,
    random_state=42,
)

# Running fold counter; starts at -1 and is advanced inside the training loop.
current_fold = -1

# Bookkeeping names that no other cell visible here reads or fills.
epochs = 1
all_folds_preds = []
training_stats = []

In [None]:
import time
import datetime

In [None]:
def format_time(elapsed):
    '''
    Convert a duration in seconds into the string form of a
    ``datetime.timedelta``, e.g. ``'0:01:05'`` (h:mm:ss).

    The value is rounded to whole seconds first.
    '''
    whole_seconds = int(round(elapsed))
    duration = datetime.timedelta(seconds=whole_seconds)
    return str(duration)

In [None]:
def train_one_epoch(model, optimizer, scheduler, train_dataloader, device):
    """Run one training pass over ``train_dataloader``.

    For every batch: zero the gradients, run the model, compute the loss,
    back-propagate, then step the optimizer and the scheduler.

    NOTE: the loss function is not a parameter; it is read from the
    module-level name ``criterion``, which must exist before this is called.

    Args:
        model: Module called as ``model(input_ids, attention_mask, token_type_ids)``.
        optimizer: Optimizer stepped once per batch.
        scheduler: Learning-rate scheduler, also stepped once per batch.
        train_dataloader: Iterable of dict batches with the keys ``input_ids``,
            ``attention_mask``, ``token_type_ids`` and ``targets``.
        device: Device every batch tensor is moved to.

    Returns:
        ``(mean_loss, elapsed)``: the per-batch average loss and the elapsed
        wall time formatted by ``format_time``.
    """
    started = time.time()
    model.train()
    running_loss = 0.0
    for batch in train_dataloader:
        optimizer.zero_grad()
        # Drop axis 1 of each encoded tensor — presumably the singleton axis
        # added by the tokenizer's return_tensors="pt"; TODO confirm.
        input_ids, attention_mask, token_type_ids = (
            batch[key].to(device).squeeze(1)
            for key in ("input_ids", "attention_mask", "token_type_ids")
        )
        # Add a trailing axis to the labels — presumably so they line up with
        # the model output; TODO confirm.
        labels = batch["targets"].to(device).unsqueeze(1)
        logits = model(input_ids, attention_mask, token_type_ids)

        batch_loss = criterion(logits, labels.float())
        batch_loss.backward()
        optimizer.step()
        scheduler.step()
        running_loss += batch_loss.item()
    elapsed = format_time(time.time() - started)
    mean_loss = running_loss / len(train_dataloader)
    return mean_loss, elapsed

In [None]:
def valid_one_epoch(model, valid_dataloader, device):
    """Evaluate ``model`` on ``valid_dataloader`` without computing gradients.

    NOTE: the loss function is not a parameter; it is read from the
    module-level name ``criterion``, which must exist before this is called.

    Args:
        model: Module called as ``model(input_ids, attention_mask, token_type_ids)``.
        valid_dataloader: Iterable of dict batches with the keys ``input_ids``,
            ``attention_mask``, ``token_type_ids`` and ``targets``.
        device: Device every batch tensor is moved to.

    Returns:
        ``(mean_loss, elapsed)``: the per-batch average loss and the elapsed
        wall time formatted by ``format_time``.
    """
    started = time.time()
    model.eval()
    running_loss = 0.0
    with torch.no_grad():
        for batch in valid_dataloader:
            # Same tensor preparation as in the training pass.
            input_ids, attention_mask, token_type_ids = (
                batch[key].to(device).squeeze(1)
                for key in ("input_ids", "attention_mask", "token_type_ids")
            )
            labels = batch["targets"].to(device).unsqueeze(1)
            logits = model(input_ids, attention_mask, token_type_ids)
            batch_loss = criterion(logits, labels.float())
            running_loss += batch_loss.item()

    # Measure how long the validation pass took.
    elapsed = format_time(time.time() - started)
    mean_loss = running_loss / len(valid_dataloader)
    return mean_loss, elapsed

In [None]:
def get_dataloader(train_dataset, valid_dataset, batch_size=CONFIG.BATCH_SIZE):
    """Wrap two datasets in DataLoaders.

    Args:
        train_dataset: Dataset for training; its loader shuffles.
        valid_dataset: Dataset for validation; its loader keeps dataset order.
        batch_size: Batch size for both loaders (defaults to ``CONFIG.BATCH_SIZE``).

    Returns:
        ``(train_dataloader, valid_dataloader)``.
    """
    train_dataloader, valid_dataloader = (
        DataLoader(split, batch_size=batch_size, shuffle=should_shuffle)
        for split, should_shuffle in ((train_dataset, True), (valid_dataset, False))
    )
    return train_dataloader, valid_dataloader

In [None]:
# Stratified k-fold fine-tuning: a freshly initialised model is trained on every fold.
bert_path = "../input/huggingface-bert/bert-base-cased"
# NOTE(review): tweet_cleaner lower-cases the text while this checkpoint is the
# cased variant — confirm that the combination is intended.

# The tokenizer and the dataset do not depend on the fold, so build them once
# instead of reloading them on every iteration.
tokenizer = transformers.BertTokenizer.from_pretrained(bert_path)
dataset = DisasterTweetsDataset(df, tokenizer, max_len=64, train=True)

# train_one_epoch / valid_one_epoch read the loss from this module-level name.
criterion = nn.BCEWithLogitsLoss()

# enumerate() advances the fold counter exactly once per fold (it used to be
# incremented twice) and gives the same numbering when the cell is re-run.
for current_fold, (train_index, test_index) in enumerate(fold.split(df, df['target'])):
    print(f'Fold {current_fold + 1}/{total_folds}')

    model = BERT(bert_path, num_labels=1, dropout=0.1).to(device)

    # torch.optim.AdamW replaces the deprecated transformers.AdamW.
    # weight_decay=0.0 is passed explicitly because the torch default is 0.01,
    # whereas the transformers implementation defaulted to 0.0.
    optimizer = torch.optim.AdamW(
        model.parameters(),
        lr=CONFIG.LEARNING_RATE,
        eps=1e-8,
        weight_decay=0.0,
    )

    train_dataloader, validation_dataloader = get_data_loaders(dataset, train_index, test_index)

    # The scheduler is stepped once per batch, so its horizon is the number of
    # batches per epoch times the number of epochs — not the number of samples
    # in the dataset, which stretched the decay far beyond the actual run.
    scheduler = transformers.get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,
        num_training_steps=len(train_dataloader) * CONFIG.EPOCHS,
    )

    best_valid_loss = float("inf")
    for epoch in range(CONFIG.EPOCHS):
        print(f'Epoch {epoch + 1}/{CONFIG.EPOCHS}')
        print('-' * 10)

        train_loss, training_time = train_one_epoch(model, optimizer, scheduler, train_dataloader, device)
        print(f'Train loss {train_loss} Training time {training_time}')
        valid_loss, validation_time = valid_one_epoch(model, validation_dataloader, device)
        print(f'Val loss {valid_loss} Validation time {validation_time}')

        # Keep the weights of the best epoch seen so far in this fold.
        # NOTE(review): every fold writes to the same file, so a later fold
        # overwrites the checkpoint of an earlier one. The name is kept because
        # other code may load "best_model.bin".
        if valid_loss < best_valid_loss:
            best_valid_loss = valid_loss
            torch.save(model.state_dict(), "best_model.bin")