# Import

In [None]:
import os
import gc
import copy
import re
import time
import random
import string
import warnings

warnings.filterwarnings("ignore")
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim 
from torch.optim import lr_scheduler
from torch.utils.data import Dataset, DataLoader


from sklearn.metrics import mean_squared_error
from sklearn.model_selection import StratifiedKFold, KFold

from bs4 import BeautifulSoup
from tqdm import tqdm
from collections import defaultdict
import copy
from copy import deepcopy

import nltk
# from nltk.stem import SnowballStemmer, WordNetLemmatizer
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer

from transformers import AutoTokenizer, AutoModel, AdamW

from colorama import Fore, Back, Style
b_ = Fore.BLUE
y_ = Fore.YELLOW
sr_ = Style.RESET_ALL

# id_generator

In [None]:
def id_generator(size = 12, chars = string.ascii_lowercase + string.digits):
    """Return a random identifier made of ``size`` characters taken from ``chars``.

    Characters are drawn with ``random.SystemRandom`` (OS entropy), so the
    result is not affected by seeding the ``random`` module.

    Args:
        size: Number of characters in the identifier.
        chars: Alphabet to draw from (lowercase letters and digits by default).

    Returns:
        The generated identifier as a ``str``.
    """
    rng = random.SystemRandom()
    picked = [rng.choice(chars) for _ in range(size)]
    return ''.join(picked)

# Unique tag for this run; stored in CONFIG further down.
HASH_NAME = id_generator(size = 12)
print(HASH_NAME)

# Config

In [None]:
# Central run configuration, read as a module-level global by the data,
# model, training and scheduler code below.
# NOTE(review): fetch_scheduler reads CONFIG['T_0'] when "scheduler" is
# 'CosineAnnealingWarmRestarts', but no "T_0" key is defined here.
CONFIG = {"seed": 2021,                 # passed to set_seed and to KFold(random_state=...)
          "epochs": 10,                 # number of epochs run per fold
          "model_name": "cardiffnlp/twitter-roberta-base-hate",  # checkpoint for AutoTokenizer / AutoModel
#           "model_name": "roberta-base",
          "train_batch_size": 32,       # batch size of the training DataLoader
          "valid_batch_size": 64,       # batch size intended for the validation DataLoader
          "max_length": 128,            # tokenizer truncation / padding length
          "learning_rate": 1e-4,        # initial optimizer learning rate
          "scheduler": 'CosineAnnealingLR',  # scheduler name dispatched on in fetch_scheduler
          "min_lr": 1e-6,               # eta_min of the cosine schedulers
          "T_max": 500,                 # CosineAnnealingLR period; the scheduler is stepped per optimizer step
          "weight_decay": 1e-6,         # optimizer weight decay
          "n_fold": 5,                  # number of cross-validation folds
          "n_accumulate": 1,            # gradient-accumulation steps in train_one_epoch
          "num_classes": 1,             # output width of the regression head
          "margin": 0.5,                # NOTE(review): not read anywhere in the visible code
          "device": torch.device("cuda:0" if torch.cuda.is_available() else "cpu"),  # first GPU if available, else CPU
          "hash_name": HASH_NAME        # random identifier generated above
          }

# Load the tokenizer once and share it through CONFIG (used when building the datasets).
CONFIG["tokenizer"] = AutoTokenizer.from_pretrained(CONFIG['model_name'])
# Group label built from the run hash; not read elsewhere in the visible code.
CONFIG['group'] = f'{HASH_NAME}-Baseline'

# Random Seed

In [None]:
def set_seed(seed = 42):
    """Seed the random number generators used by this notebook.

    Seeds Python's ``random``, NumPy and PyTorch (CPU and every CUDA device),
    and switches cuDNN to deterministic, non-benchmarking mode.

    Args:
        seed: Integer seed applied to all generators.
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # manual_seed_all covers every visible GPU and is a no-op without CUDA.
    torch.cuda.manual_seed_all(seed)
    # The original assigned to a misspelled attribute ("deteministic"), which
    # silently created a new attribute and left the real flag untouched.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    # Setting this at runtime does not change hashing in the running
    # interpreter; it is exported for processes started afterwards.
    os.environ['PYTHONHASHSEED'] = str(seed)

set_seed(CONFIG['seed'])

# Data

In [None]:
# Read the Ruddit CSV (with comment text) from the relative ../input path,
# then show its shape and first ten rows.
df = pd.read_csv('../input/ruddit-jigsaw-dataset/Dataset/ruddit_with_text.csv')
print(df.shape)
df.head(10)

In [None]:
# # Give more weight to severe toxic 
# df['severe_toxic'] = df.severe_toxic * 10
# df['toxic'] = df.toxic * 6 
# df['y'] = (df[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']].sum(axis=1) ).astype(int)
# df['y'] = df['y']/df['y'].max()

# df = df[['comment_text', 'y']].rename(columns={'comment_text': 'text'})
# df.sample(5)

In [None]:
# Drop rows that contain any missing value, keep only the comment text and its
# offensiveness score, and rename them to the 'text' / 'score' columns that the
# dataset class reads.
# NOTE(review): the index is not reset after dropna(), so row labels may have gaps.
df = df.dropna()
df = df[['txt', 'offensiveness_score']]
df.columns = ['text', 'score']
df.head()

In [None]:
df.score.value_counts()

In [None]:
# import numpy as np
# import pandas as pd

# df1 = pd.read_csv('../input/ruddit-jigsaw-dataset-combined-cleaned/toxic_train.csv')
# df1 = df1[['txt', 'offensiveness_score']]
# df1.columns = ['text', 'score']
# print(df1.shape)
# df1.head()

In [None]:
# from copy import deepcopy
# print(df1.loc[df1.score!=0.0].shape)
# df2 = deepcopy(df1.loc[df1.score!=0.0])
# df2.head()

In [None]:
# from copy import deepcopy
# df3 = deepcopy(df1.loc[df1.score == 0])
# print(df3.shape)
# df3.head()

In [None]:
# df = pd.concat([df, df2],axis = 0)
# df = df.reset_index(drop=True)
# print(df.shape)
# df.tail()

# Preprocessing

In [None]:
# nltk.download('stopwords')
# STOPWORDS = nltk.corpus.stopwords.words('english')

# ## kesha_mandal's code
# def washing(comment):

#     comment = re.sub('[^a-zA-Z]', ' ', comment)
#     comment = comment.lower()
#     comment = comment.split()
#     stemmer = SnowballStemmer('english')
#     lemmatizer = WordNetLemmatizer()
#     comment = [stemmer.stem(word) for word in comment if not word in set(STOPWORDS)]
#     comment = [lemmatizer.lemmatize(word) for word in comment]
#     comment = ' '.join(comment)
#     # corpus.append(comment)
#     # return corpus
#     return comment


In [None]:
# ##  https://www.kaggle.com/manabendrarout/pytorch-roberta-ranking-baseline-jrstc-infer/notebook
# def text_cleaning(text):
    
#     template = re.compile(r'https?://\S+|www\.\S+') #Removes website links
#     text = template.sub(r'', text)
    
#     soup = BeautifulSoup(text, 'lxml') #Removes HTML tags
#     only_text = soup.get_text()
#     text = only_text
    
#     emoji_pattern = re.compile("["
#                                u"\U0001F600-\U0001F64F"  # emoticons
#                                u"\U0001F300-\U0001F5FF"  # symbols & pictographs
#                                u"\U0001F680-\U0001F6FF"  # transport & map symbols
#                                u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
#                                u"\U00002702-\U000027B0"
#                                u"\U000024C2-\U0001F251"
#                                "]+", flags=re.UNICODE)
#     text = emoji_pattern.sub(r'', text)
    
#     text = re.sub(r"[^a-zA-Z\d]", " ", text) #Remove special Charecters
#     text = re.sub(' +', ' ', text) #Remove Extra Spaces
#     text = text.strip() # remove spaces at the beginning and at the end of string

#     return text

In [None]:
## https://www.kaggle.com/kishalmandal/most-detailed-eda-tf-idf-and-logistic-reg

# df["less_toxic"] = df["less_toxic"].str.replace('fk', 'fuck')
# df["less_toxic"] = df["less_toxic"].str.replace('fuk', 'fuck')

# df.head(10)

In [None]:
# df['less_toxic'] = df['less_toxic'].apply(text_cleaning)
# df['more_toxic'] = df['more_toxic'].apply(text_cleaning)

# df.head(10)

In [None]:
# df['less_toxic'] = df['less_toxic'].apply(washing)
# df['more_toxic'] = df['more_toxic'].apply(washing)

# df.head(10)

# KFold

In [None]:
# Assign every row of df to exactly one validation fold (column 'kfold').
k = CONFIG['n_fold']
skf = KFold(n_splits = k, shuffle = True, random_state = CONFIG['seed'])

# KFold.split yields *positional* row indices. df was filtered with dropna()
# without resetting its index, so label-based df.loc[...] could address the
# wrong rows (or missing labels). Fill a positional array instead and attach
# it as a column in one step.
fold_of_row = np.full(len(df), -1, dtype = int)
for fold, (_, valid_idx) in enumerate(skf.split(X = df)):
    fold_of_row[valid_idx] = fold

df['kfold'] = fold_of_row
df.head()

# Dataset

In [None]:
class JDataset(Dataset):
    """Torch dataset yielding tokenized comments with their regression target.

    Args:
        df: DataFrame with a 'text' column and a 'score' column.
        tokenizer: Object exposing ``encode_plus`` that returns a mapping with
            'input_ids' and 'attention_mask'.
        max_length: Length every sample is truncated / padded to.
    """

    def __init__(self, df, tokenizer, max_length):
        self.df = df
        self.max_len = max_length
        self.tokenizer = tokenizer
        # Both columns are stored as plain arrays so that __getitem__ indexes
        # by position. The original kept 'score' as a Series, whose [] lookup
        # is label-based and fails when df's index is not 0..n-1.
        self.text = df['text'].values
        self.score = df['score'].values

    def __len__(self):
        """Return the number of rows in the underlying DataFrame."""
        return len(self.df)

    def __getitem__(self, index):
        """Tokenize the sample at position ``index``.

        Returns:
            Dict with 'ids' and 'mask' (long tensors) and 'target'
            (float tensor holding the score).
        """
        encoded = self.tokenizer.encode_plus(self.text[index],
                                             truncation = True,
                                             add_special_tokens = True,
                                             max_length = self.max_len,
                                             padding = 'max_length')

        return {'ids' : torch.tensor(encoded['input_ids'], dtype = torch.long),
                'mask' : torch.tensor(encoded['attention_mask'], dtype = torch.long),
                'target' : torch.tensor(self.score[index], dtype = torch.float)
               }


# prepare_loaders Function

In [None]:
def prepare_loaders(fold):
    """Build the training and validation DataLoaders for one fold.

    Reads the module-level ``df`` (with its 'kfold' column) and ``CONFIG``.

    Args:
        fold: Fold number whose rows form the validation set; all other rows
            are used for training.

    Returns:
        Tuple ``(train_loader, valid_loader)``.
    """
    df_train = df[df.kfold != fold].reset_index(drop = True)
    df_valid = df[df.kfold == fold].reset_index(drop = True)

    train_dataset = JDataset(df_train, tokenizer = CONFIG['tokenizer'], max_length = CONFIG['max_length'])
    valid_dataset = JDataset(df_valid, tokenizer = CONFIG['tokenizer'], max_length = CONFIG['max_length'])

    # os.cpu_count() may return None; fall back to loading in the main process.
    n_workers = os.cpu_count() or 0

    train_loader = DataLoader(train_dataset,
                              batch_size = CONFIG['train_batch_size'],
                              num_workers = n_workers,
                              shuffle = True,
                              pin_memory = True,
                              drop_last = True)

    # Validation uses its own batch size (the original reused
    # 'train_batch_size', leaving 'valid_batch_size' unused).
    valid_loader = DataLoader(valid_dataset,
                              batch_size = CONFIG['valid_batch_size'],
                              num_workers = n_workers,
                              shuffle = False,
                              pin_memory = True)

    return train_loader, valid_loader

# Model

In [None]:
class Model(nn.Module):
    """Pretrained transformer backbone with a dropout + linear regression head.

    The head outputs ``CONFIG['num_classes']`` raw values per sample; no
    activation is applied to them.

    Args:
        model_name: Checkpoint name or path handed to
            ``AutoModel.from_pretrained``.
    """

    def __init__(self, model_name):
        super().__init__()
        self.model = AutoModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(p = 0.2)
        # Size the head from the backbone config instead of hard-coding 768,
        # so checkpoints with a different hidden width also work.
        self.linear = nn.Linear(self.model.config.hidden_size, CONFIG['num_classes'])

    def forward(self, ids, mask):
        """Run the backbone and the head.

        Args:
            ids: Token id tensor passed as ``input_ids``.
            mask: Attention mask tensor passed as ``attention_mask``.

        Returns:
            Output of the linear head.
        """
        model_out = self.model(input_ids = ids,
                               attention_mask = mask,
                               output_hidden_states = False)

        # Element 1 of the backbone output; for BERT/RoBERTa-style models this
        # is presumably the pooled representation - TODO confirm for other backbones.
        pooled = self.dropout(model_out[1])
        return self.linear(pooled)

# Loss Function

In [None]:
# Regression objective: mean squared error between the model output and the
# target score. Read as a module-level global by train_one_epoch and
# valid_one_epoch.
loss_fn = nn.MSELoss()
# loss_fn = nn.BCELoss()
# loss_fn= nn.BCEWithLogitsLoss()

# Train one Epoch Function

In [None]:
def train_one_epoch(model, optimizer, scheduler, dataloader, device, epoch):
    """Train ``model`` for one pass over ``dataloader``.

    Uses the module-level ``loss_fn`` and ``CONFIG['n_accumulate']``.

    Args:
        model: Network called as ``model(ids, mask)``.
        optimizer: Optimizer stepped every ``n_accumulate`` batches.
        scheduler: LR scheduler stepped after each optimizer step, or None.
        dataloader: Yields dicts with 'ids', 'mask' and 'target' tensors.
        device: Device the batch tensors are moved to.
        epoch: Epoch number, shown in the progress bar only.

    Returns:
        Sample-weighted mean training loss over the epoch (0.0 when the
        dataloader yields no batches).
    """
    model.train()

    n_accumulate = CONFIG['n_accumulate']
    n_steps = len(dataloader)

    dataset_size = 0
    running_loss = 0.
    epoch_loss = 0.  # defined up front so an empty dataloader does not raise

    bar = tqdm(enumerate(dataloader), total = n_steps)

    for step, data in bar:

        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        targets = data['target'].to(device, dtype = torch.float).reshape(-1, 1)

        batch_size = ids.size(0)

        outputs = model(ids, mask)
        loss = loss_fn(outputs, targets)

        # Only the back-propagated value is scaled for accumulation; the
        # reported loss below stays unscaled (the original logged loss / n_accumulate).
        (loss / n_accumulate).backward()

        # Also step on the final batch so trailing gradients are applied
        # instead of leaking into the next epoch.
        is_last_step = (step + 1) == n_steps
        if (step + 1) % n_accumulate == 0 or is_last_step:
            optimizer.step()
            optimizer.zero_grad()

            if scheduler is not None:
                scheduler.step()

        running_loss += (loss.item() * batch_size)
        dataset_size += batch_size

        epoch_loss = running_loss / dataset_size
        bar.set_postfix(Epoch = epoch, Train_Loss = epoch_loss, LR = optimizer.param_groups[0]['lr'])

    gc.collect()

    return epoch_loss

# Validation Function

In [None]:
@torch.no_grad()
def valid_one_epoch(model, dataloader, device, epoch):
    """Evaluate ``model`` on ``dataloader`` without computing gradients.

    Uses the module-level ``loss_fn``.

    Args:
        model: Network called as ``model(ids, mask)``.
        dataloader: Yields dicts with 'ids', 'mask' and 'target' tensors.
        device: Device the batch tensors are moved to.
        epoch: Epoch number, shown in the progress bar only.

    Returns:
        Sample-weighted mean validation loss (0.0 when the dataloader yields
        no batches).
    """
    model.eval()

    dataset_size = 0
    running_loss = 0.
    epoch_loss = 0.  # defined up front so an empty dataloader does not raise

    bar = tqdm(enumerate(dataloader), total = len(dataloader))

    for step, data in bar:

        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        targets = data['target'].to(device, dtype = torch.float).reshape(-1, 1)

        batch_size = ids.size(0)

        outputs = model(ids, mask)
        loss = loss_fn(outputs, targets)

        running_loss += (loss.item() * batch_size)
        dataset_size += batch_size

        epoch_loss = running_loss / dataset_size
        # The original labelled this 'Train_Loss' and read the learning rate
        # from a global `optimizer` that is not a parameter of this function.
        bar.set_postfix(Epoch = epoch, Valid_Loss = epoch_loss)

    gc.collect()

    return epoch_loss

# Run Training Function

In [None]:
def run_training(model, optimizer, scheduler, device, num_epochs, fold):
    """Train for ``num_epochs`` epochs and keep the weights with the best validation loss.

    NOTE: the data comes from the module-level ``train_loader`` and
    ``valid_loader``; they must be set before this function is called.

    Args:
        model: Network to train.
        optimizer: Optimizer forwarded to ``train_one_epoch``.
        scheduler: LR scheduler forwarded to ``train_one_epoch`` (may be None).
        device: Device forwarded to the train / validation loops.
        num_epochs: Number of epochs to run.
        fold: Fold number, used in the checkpoint file name.

    Returns:
        Tuple ``(model, history)``: the model reloaded with its best weights
        and a dict holding the 'Train Loss' and 'Valid Loss' lists.
    """
    if torch.cuda.is_available():
        print(f"Using GPU: {torch.cuda.get_device_name()}")
        print()

    start = time.time()
    best_model_wts = copy.deepcopy(model.state_dict())
    best_epoch_loss = np.inf
    history = defaultdict(list)

    for epoch in range(1, num_epochs + 1):
        gc.collect()
        # Use the `device` argument (the original ignored it and read CONFIG['device']).
        train_epoch_loss = train_one_epoch(model, optimizer, scheduler,
                                           dataloader = train_loader,
                                           device = device, epoch = epoch)
        valid_epoch_loss = valid_one_epoch(model, dataloader = valid_loader,
                                           device = device, epoch = epoch)

        history['Train Loss'].append(train_epoch_loss)
        history['Valid Loss'].append(valid_epoch_loss)

        if valid_epoch_loss <= best_epoch_loss:
            print(f"{b_} Validation Loss Improved: [{best_epoch_loss} ---> {valid_epoch_loss}]")
            best_epoch_loss = valid_epoch_loss
            # Snapshot in memory (restored after training) and on disk.
            best_model_wts = copy.deepcopy(model.state_dict())
            PATH = f"Loss-Fold-{fold}.bin"
            torch.save(model.state_dict(), PATH)
            print(f"Model Saved{sr_}")

        print()

    end = time.time()
    time_elapsed = end - start
    print('Training complete in {:.0f}h {:.0f}m {:.0f}s'.format(
        time_elapsed // 3600, (time_elapsed % 3600) // 60, (time_elapsed % 3600) % 60))
    print("Best Loss: {:.4f}".format(best_epoch_loss))

    # load best model weights
    model.load_state_dict(best_model_wts)

    return model, history

# fetch_scheduler function

In [None]:
def fetch_scheduler(optimizer):
    """Create the LR scheduler selected by ``CONFIG['scheduler']``.

    Args:
        optimizer: Optimizer the scheduler is attached to.

    Returns:
        A ``CosineAnnealingLR`` or ``CosineAnnealingWarmRestarts`` instance,
        or None when ``CONFIG['scheduler']`` is None.

    Raises:
        ValueError: If the configured scheduler name is not recognised.
        KeyError: If a key needed by the chosen scheduler (e.g. 'T_0' for
            warm restarts) is missing from CONFIG.
    """
    name = CONFIG['scheduler']

    if name is None:
        return None

    if name == 'CosineAnnealingLR':
        return lr_scheduler.CosineAnnealingLR(optimizer,
                                              T_max = CONFIG['T_max'],
                                              eta_min = CONFIG['min_lr'])

    if name == 'CosineAnnealingWarmRestarts':
        return lr_scheduler.CosineAnnealingWarmRestarts(optimizer,
                                                        T_0 = CONFIG['T_0'],
                                                        eta_min = CONFIG['min_lr'])

    # The original fell through to `return scheduler` with the name unbound.
    raise ValueError(f"Unknown scheduler: {name!r}")

# Run Training on Every Fold

In [None]:
# Train one fresh model per fold; run_training writes the best checkpoint of
# each fold to disk.
foldss = CONFIG['n_fold']

for fold in range(foldss):

    print(f"{y_}===== Fold: {fold} ====={sr_}")

    # run_training reads these two loaders as module-level globals.
    train_loader, valid_loader = prepare_loaders(fold = fold)

    model = Model(CONFIG['model_name'])
    model.to(CONFIG['device'])

    # torch.optim.AdamW replaces the deprecated transformers.AdamW;
    # eps = 1e-6 keeps the default that implementation used.
    optimizer = optim.AdamW(model.parameters(),
                            lr = CONFIG['learning_rate'],
                            weight_decay = CONFIG['weight_decay'],
                            eps = 1e-6)
    scheduler = fetch_scheduler(optimizer)

    model, history = run_training(model,
                                  optimizer,
                                  scheduler,
                                  device = CONFIG['device'],
                                  num_epochs = CONFIG['epochs'],
                                  fold = fold)

    # The optimizer (and the scheduler through it) references the model
    # parameters, so drop them too before the next fold allocates a new
    # model, then hand cached GPU memory back.
    del model, optimizer, scheduler, train_loader, valid_loader
    gc.collect()
    torch.cuda.empty_cache()
    print()

In [None]:
print("Training Over")