In [None]:
import os
import gc
import copy
import re
import time
import random
import string
import warnings
warnings.filterwarnings("ignore")
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim 
from torch.optim import lr_scheduler
from torch.utils.data import Dataset, DataLoader

import copy
from copy import deepcopy


from tqdm import tqdm
from collections import defaultdict

from bs4 import BeautifulSoup

import nltk
# from nltk.stem import SnowballStemmer, WordNetLemmatizer
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import StratifiedKFold, KFold

from transformers import AutoTokenizer, AutoModel, AdamW

from colorama import Fore, Back, Style
b_ = Fore.BLUE
y_ = Fore.YELLOW
sr_ = Style.RESET_ALL

# CONFIG

In [None]:
# Settings shared by the dataset, the model and the prediction loop.
CONFIG = {
    'seed': 42,
    # Alternative checkpoint path (disabled):
    # 'model_name': '../input/dislike-friends-part3/Hate-speech-CNERG/bert-base-uncased-hatexplain',
    'model_name': '../input/dislike-friends-eng-all/cardiffnlp/twitter-roberta-base-hate',
    'test_batch_size': 64,
    'max_length': 128,
    'num_classes': 1,
    # First GPU when CUDA is available, otherwise the CPU.
    'device': torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu"),
}

# The tokenizer is loaded from the same path as the backbone weights.
CONFIG['tokenizer'] = AutoTokenizer.from_pretrained(CONFIG['model_name'])

# Random Seed

In [None]:
def set_seed(seed = 42):
    """Seed every random number generator used by this notebook.

    Seeds Python's ``random`` module, NumPy and PyTorch (CPU and all CUDA
    devices), switches cuDNN to deterministic mode with benchmarking off,
    and sets the ``PYTHONHASHSEED`` environment variable.

    Args:
        seed: Integer seed applied to every generator. Defaults to 42.
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seeds every CUDA device, not only the current one.
    torch.cuda.manual_seed_all(seed)
    # The attribute is spelled "deterministic"; assigning to a misspelled name
    # just creates a new attribute and leaves cuDNN non-deterministic.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    os.environ['PYTHONHASHSEED'] = str(seed)

# Seed the RNGs with the configured seed before the data is loaded and the models are built.
set_seed(CONFIG['seed'])

# Data

In [None]:
# Load the comments that have to be scored.
df = pd.read_csv(
    "../input/jigsaw-toxic-severity-rating/comments_to_score.csv"
)
# Shape, a blank line, then a preview of the first ten rows.
print(df.shape, end="\n\n")
df.head(10)

# Preprocessing

In [None]:
# nltk.download('stopwords')
# STOPWORDS = nltk.corpus.stopwords.words('english')
# set(STOPWORDS)

In [None]:
# STOPWORDS = ['a','about', 'above','after', 'again', 'against', 'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren', "aren't", 'as', 'at',\
#              'be', 'because', 'been', 'before','being', 'below', 'between', 'both', 'but', 'by', 'can', 'couldn', 'd', 'did','didn',"didn't",\
#              'do', 'does', 'doesn', "doesn't", 'doing', 'don', "don't", 'down', 'during', 'each', 'few','for', 'from', 'further', 'had', 'hadn',\
#              "hadn't", 'has', 'hasn',"hasn't",'have', 'haven', "haven't", 'having', 'he', 'her', 'here', 'hers', 'herself', 'him', 'himself', 'his', \
#              'how', 'i', 'if', 'in', 'into', 'is', 'isn', "isn't", 'it', "it's", 'its', 'itself', 'just', 'll', 'm', 'ma', 'me', 'mightn', "mightn't", \
#              'more', 'most', 'mustn', "mustn't", 'my', 'myself', 'needn', "needn't", 'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on', 'once', 'only', 'or', 'other',\
#              'our', 'ours', 'ourselves', 'out', 'over', 'own', 're', 's', 'same', 'shan', "shan't", 'she', "she's", 'should', "should've", 'shouldn', \
#              "shouldn't", 'so', 'some', 'such', 't', 'than','that', "that'll", 'the', 'their', 'theirs','them','themselves', 'then', 'there', 'these', 'they', \
#              'this', 'those', 'through', 'to', 'too', 'under', 'until', 'up', 've', 'very', 'was', 'wasn', "wasn't",\
#              'we', 'were', 'weren', "weren't", 'what', 'when', 'where', 'which', 'while', 'who', 'whom', \
#              'why', 'will', 'with', 'won', "won't", 'wouldn', "wouldn't", 'y', 'you', "you'd", "you'll", \
#              "you're", "you've", 'your', 'yours', 'yourself','yourselves']

# print(len(STOPWORDS))
# set(STOPWORDS)

In [None]:
## kesha_mandal's code
# def washing(comment):

#     comment = re.sub('[^a-zA-Z]', ' ', comment)
#     comment = comment.lower()
#     comment = comment.split()
#     stemmer = SnowballStemmer('english')
#     lemmatizer = WordNetLemmatizer()
#     comment = [stemmer.stem(word) for word in comment if not word in set(STOPWORDS)]
#     comment = [lemmatizer.lemmatize(word) for word in comment]
#     comment = ' '.join(comment)
    # corpus.append(comment)
    # return corpus
#     return comment



##  https://www.kaggle.com/manabendrarout/pytorch-roberta-ranking-baseline-jrstc-infer/notebook
# def text_cleaning(text):
    
#     template = re.compile(r'https?://\S+|www\.\S+') #Removes website links
#     text = template.sub(r'', text)
    
#     soup = BeautifulSoup(text, 'lxml') #Removes HTML tags
#     only_text = soup.get_text()
#     text = only_text
    
#     emoji_pattern = re.compile("["
#                                u"\U0001F600-\U0001F64F"  # emoticons
#                                u"\U0001F300-\U0001F5FF"  # symbols & pictographs
#                                u"\U0001F680-\U0001F6FF"  # transport & map symbols
#                                u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
#                                u"\U00002702-\U000027B0"
#                                u"\U000024C2-\U0001F251"
#                                "]+", flags=re.UNICODE)
#     text = emoji_pattern.sub(r'', text)
#     text = re.sub(r"[^a-zA-Z\d]", " ", text) #Remove special Characters
#     text = re.sub(' +', ' ', text) #Remove Extra Spaces
#     text = text.strip() # remove spaces at the beginning and at the end of string

#     return text

In [None]:
## https://www.kaggle.com/kishalmandal/most-detailed-eda-tf-idf-and-logistic-reg

# df["text"] = df["text"].str.replace('fk', 'fuck')

# df.head(10)

In [None]:
# df["text"] = df["text"].apply(text_cleaning)

# df.head(10)

In [None]:
# df["text"] = df["text"].apply(washing)

# df.head(10)

# Test Dataset Class

In [None]:
class JDataset(Dataset):
    """Dataset that tokenizes the ``text`` column of a DataFrame on access.

    Every item is a dict holding two ``torch.long`` tensors: ``ids`` (the
    tokenizer's ``input_ids``) and ``mask`` (its ``attention_mask``).
    No target is returned.
    """

    def __init__(self, df, tokenizer, max_length):
        """Keep references to the frame, the tokenizer and the length limit.

        Args:
            df: DataFrame with a ``text`` column.
            tokenizer: Object exposing an ``encode_plus`` method.
            max_length: Length handed to the tokenizer for truncation/padding.
        """
        self.df = df
        self.tokenizer = tokenizer
        self.max_len = max_length
        self.text = df['text'].values

    def __len__(self):
        """Number of rows in the wrapped DataFrame."""
        return len(self.df)

    def __getitem__(self, index):
        """Tokenize the text at ``index`` and return its ids and mask tensors."""
        encoded = self.tokenizer.encode_plus(
            self.text[index],
            truncation = True,
            add_special_tokens = True,
            max_length = self.max_len,
            padding = 'max_length',
        )
        # Map the tokenizer's field names onto the keys the model loop expects.
        return {
            out_key: torch.tensor(encoded[tok_key], dtype = torch.long)
            for out_key, tok_key in (('ids', 'input_ids'), ('mask', 'attention_mask'))
        }


# Test Dataset, Test DataLoader

In [None]:
## Actual TestDataset
test_dataset = JDataset(df, CONFIG['tokenizer'], max_length = CONFIG['max_length'])

# shuffle stays False: the predictions are later assigned back to `df` by position.
test_loader = DataLoader(test_dataset,
                         batch_size = CONFIG['test_batch_size'],
                         # os.cpu_count() can return None; fall back to loading
                         # in the main process (0 workers) in that case.
                         num_workers = os.cpu_count() or 0,
                         shuffle = False,
                         pin_memory = True)

# Model

In [None]:
class Model(nn.Module):
    """Pretrained transformer backbone followed by dropout, a linear layer and a sigmoid.

    The linear layer maps the backbone's hidden size to ``CONFIG['num_classes']``
    outputs, and the sigmoid squashes them into (0, 1).
    """

    def __init__(self, model_name):
        """Build the backbone and the head.

        Args:
            model_name: Name or path passed to ``AutoModel.from_pretrained``.
        """
        super().__init__()
        self.model = AutoModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(p = 0.2)
        # Take the input width from the backbone's config instead of
        # hard-coding 768, so backbones with another hidden size also work.
        self.linear = nn.Linear(self.model.config.hidden_size, CONFIG['num_classes'])
        self.sigmoid = nn.Sigmoid()

    def forward(self, ids, mask):
        """Return sigmoid-activated head outputs for a batch.

        Args:
            ids: Token ids, passed to the backbone as ``input_ids``.
            mask: Attention mask, passed to the backbone as ``attention_mask``.

        Returns:
            Tensor of sigmoid outputs produced by the linear head.
        """
        model_out = self.model(input_ids = ids,
                               attention_mask = mask,
                               output_hidden_states = False)

        # Element 1 of the backbone output feeds the head.
        # NOTE(review): presumably the pooled output for BERT/RoBERTa-style
        # backbones; confirm before switching to another architecture.
        out = self.dropout(model_out[1])
        output = self.linear(out)
        outputs = self.sigmoid(output)

        return outputs

# Model Paths

In [None]:
## Model paths
# One checkpoint file per fold index (0 through 4), all in the same input directory.
MODEL_PATHS = [
    f'../input/twitterhate-mseloss-train-10epoch-rudditdata/Loss-Fold-{fold}.bin'
    for fold in range(5)
]

# Validation Function

In [None]:
@torch.no_grad()
def valid_fn(model, dataloader, device):
    """Run ``model`` over ``dataloader`` and return all outputs as one flat array.

    Args:
        model: Module called as ``model(ids, mask)``; it is put in eval mode.
        dataloader: Sized iterable yielding dicts with ``'ids'`` and ``'mask'``
            tensors.
        device: Device the batch tensors are moved to before the forward pass.

    Returns:
        1-D NumPy array with the flattened outputs of every batch in iteration
        order; an empty array when the dataloader yields no batches.
    """
    model.eval()

    batch_preds = []
    for data in tqdm(dataloader, total = len(dataloader)):
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)

        outputs = model(ids, mask)
        batch_preds.append(outputs.view(-1).cpu().detach().numpy())

    gc.collect()

    # np.concatenate raises on an empty list, so handle "no batches" explicitly.
    if not batch_preds:
        return np.empty(0, dtype = np.float32)
    return np.concatenate(batch_preds)

# Inference Function

In [None]:
def inference(model_paths, dataloader, device):
    """Average the predictions of several checkpoints over one dataloader.

    For every path a fresh ``Model`` is built, its state dict is loaded, and
    ``valid_fn`` is run on ``dataloader``.

    Args:
        model_paths: Iterable of checkpoint files containing ``Model`` state dicts.
        dataloader: Loader handed to ``valid_fn`` for every checkpoint.
        device: Device the model is placed on and inference runs on.

    Returns:
        NumPy array holding the element-wise mean of the per-checkpoint
        predictions.

    Raises:
        ValueError: If ``model_paths`` is empty (the mean would otherwise be NaN).
    """
    model_paths = list(model_paths)
    if not model_paths:
        raise ValueError("model_paths must contain at least one checkpoint path")

    fold_preds = []
    for i, path in enumerate(model_paths):
        model = Model(CONFIG['model_name'])
        # Use the `device` argument throughout so the model and the batches
        # moved by valid_fn always end up on the same device.
        model.to(device)
        # map_location lets checkpoints saved on a GPU load on a CPU-only machine.
        # NOTE: torch.load unpickles the file; only load trusted checkpoints.
        model.load_state_dict(torch.load(path, map_location = device))

        print(f"Getting prediction for model {i + 1}")
        fold_preds.append(valid_fn(model, dataloader, device))

        # Release this fold's model before the next one is built so that
        # memory use does not grow with the number of checkpoints.
        del model
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

    return np.mean(np.array(fold_preds), axis = 0)

# Prediction

In [None]:
# Average the predictions of every checkpoint in MODEL_PATHS over the test loader.
preds = inference(MODEL_PATHS, test_loader, CONFIG['device'])

In [None]:
# Number of predictions returned (length of the first axis of `preds`).
print(f"Total Predictions: {preds.shape[0]}")

In [None]:
# Number of distinct prediction values.
print(f"Total Unique Predictions: {np.unique(preds).size}")

In [None]:
# Store the averaged predictions next to the comments, row for row.
df['score'] = preds

# Shape, a blank line, then a preview of the first ten rows.
print(df.shape, end="\n\n")
df.head(10)

# Submission

In [None]:
# Replace each score by its rank; method='first' breaks ties by order of
# appearance, so no two rows share a rank.
df['score'] = df['score'].rank(method = 'first')

# Shape, a blank line, then a preview of the first ten rows.
print(df.shape, end="\n\n")
df.head(10)

In [None]:
# Remove the raw text so it is not written to the submission file.
# Rebinding with errors='ignore' (instead of an in-place drop) keeps this cell
# re-runnable: a second run no longer raises KeyError because 'text' is gone.
df = df.drop(columns = ['text'], errors = 'ignore')

print(df.shape)
print()

df.to_csv("submission.csv", index=False)
df.head(10)