In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from transformers import BertTokenizer, BertModel, logging
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.model_selection import KFold

from bs4 import BeautifulSoup
from collections import defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

import re 
import scipy
from scipy import sparse
from scipy.stats import rankdata

from sklearn.model_selection import train_test_split
from nltk.tokenize import word_tokenize
from sklearn.linear_model import Ridge, Lasso, BayesianRidge
from sklearn.svm import SVR

import nltk
import re
from tqdm.auto import tqdm
from datasets import Dataset
from transformers import PreTrainedTokenizerFast
from tokenizers import (
    decoders,
    models,
    normalizers,
    pre_tokenizers,
    processors,
    trainers,
    Tokenizer,
)

import time
import scipy.optimize as optimize
import warnings

# Notebook-wide settings: hide Python warnings, widen the pandas display limits,
# and restrict the transformers logger (imported above as `logging`) to errors.
warnings.filterwarnings(action="ignore")
pd.set_option("display.max_colwidth", 300)
pd.set_option("display.max_columns", 100)
logging.set_verbosity_error()

In [None]:
# Training comments; later cells read `comment_text` and the six toxicity label columns.
df_train = pd.read_csv(
    filepath_or_buffer='../input/jigsaw-toxic-comment-classification-challenge/train.csv'
)
df_train

In [None]:
# Comments to be scored; later cells read its `comment_id` and `text` columns.
df_test = pd.read_csv(
    filepath_or_buffer='../input/jigsaw-toxic-severity-rating/comments_to_score.csv'
)
df_test

In [None]:
# Sample submission file, loaded for inspection only.
df_sample_submission = pd.read_csv(
    filepath_or_buffer='../input/jigsaw-toxic-severity-rating/sample_submission.csv'
)
df_sample_submission

In [None]:
class WeakLearner1:
    """Two character-level TF-IDF + Ridge regressors whose outputs are converted to ranks.

    The first pair is trained on the regression-based Jigsaw CSV, the second on
    the Ruddit CSV. Both files are read inside `fit()`; the constructor takes no
    data. `predict()` reports ordinal ranks, so scores depend on the batch of
    rows passed in.
    """

    def __init__(self):
        # Everything stays None until fit() has been called.
        self.vectorizer1 = None
        self.model1 = None
        self.vectorizer2 = None
        self.model2 = None

    def fit(self):
        """Fit both vectorizer/regressor pairs from their CSV files."""
        # First pair: regression-based Jigsaw data, targets rounded to 2 decimals.
        jigsaw = pd.read_csv("../input/jigsaw-regression-based-data/train_data_version2.csv")
        self.vectorizer1 = TfidfVectorizer(analyzer='char_wb', max_df=0.7, min_df=1, ngram_range=(2, 5))
        jigsaw_features = self.vectorizer1.fit_transform(jigsaw['text'])
        jigsaw_targets = np.around(jigsaw['y'].values, decimals=2)
        self.model1 = Ridge(alpha=0.5)
        self.model1.fit(jigsaw_features, jigsaw_targets)

        # Second pair: Ruddit offensiveness scores, targets rounded to 1 decimal.
        ruddit = pd.read_csv("../input/ruddit-jigsaw-dataset/Dataset/ruddit_with_text.csv")
        self.vectorizer2 = TfidfVectorizer(analyzer='char_wb', max_df=0.7, min_df=3, ngram_range=(3, 4))
        ruddit_features = self.vectorizer2.fit_transform(ruddit['txt'])
        ruddit_targets = np.around(ruddit['offensiveness_score'].values, decimals=1)
        self.model2 = Ridge(alpha=0.5)
        self.model2.fit(ruddit_features, ruddit_targets)

    def predict(self, x):
        """Score the `text_to_transform` column of `x`.

        Returns a DataFrame indexed like `x` with four columns:
        score1 / score2 - ordinal rank of each model's prediction within `x`,
        score3 - sum of the two ranks,
        score4 - ordinal rank of score3.
        """
        jigsaw_ranks = rankdata(
            self.model1.predict(self.vectorizer1.transform(x['text_to_transform'])),
            method='ordinal',
        )
        ruddit_ranks = rankdata(
            self.model2.predict(self.vectorizer2.transform(x['text_to_transform'])),
            method='ordinal',
        )
        rank_sum = jigsaw_ranks + ruddit_ranks

        return pd.DataFrame(
            {
                'score1': jigsaw_ranks,
                'score2': ruddit_ranks,
                'score3': rank_sum,
                'score4': rankdata(rank_sum, method='ordinal'),
            },
            index=x.index,
        )

In [None]:
def dummy_fun(doc):
    """Return `doc` unchanged.

    WeakLearner2 passes this to TfidfVectorizer as both `tokenizer` and
    `preprocessor`, so the already-tokenized input reaches the vectorizer as is.
    """
    return doc


class WeakLearner2:
    """Ridge regressor over TF-IDF of WordPiece token ids.

    The WordPiece tokenizer is trained inside `fit()` on the comments of the
    frame given to the constructor. `fit()` writes to that frame in place: it
    rescales the label columns and adds `score` and `y`.
    """

    def __init__(self, data):
        self.data = data
        # Populated by fit().
        self.tokenizer = None
        self.vectorizer = None
        self.regressor = None

    def fit(self):
        """Build the target, balance the data, train the tokenizer, vectorizer and regressor."""
        cat_mtpl = {'obscene': 0.16, 'toxic': 0.32, 'threat': 1.5,
                    'insult': 0.64, 'severe_toxic': 1.5, 'identity_hate': 1.5}
        for category, multiplier in cat_mtpl.items():
            self.data[category] = self.data[category] * multiplier

        # Target: mean of every column in the label slice 'toxic'..'identity_hate'.
        self.data['score'] = self.data.loc[:, 'toxic':'identity_hate'].mean(axis=1)
        self.data['y'] = self.data['score']

        # Keep all rows with y > 0 and an equally sized sample of rows with y == 0.
        toxic_count = (self.data['y'] > 0).sum()
        clean_sample = self.data[self.data['y'] == 0].sample(n=toxic_count, random_state=0)
        balanced = pd.concat([self.data[self.data['y'] > 0], clean_sample])

        # Train a WordPiece tokenizer on the balanced comments, 1000 rows per batch.
        wordpiece = Tokenizer(models.WordPiece(unk_token="[UNK]"))
        wordpiece.normalizer = normalizers.BertNormalizer(lowercase=True)
        wordpiece.pre_tokenizer = pre_tokenizers.BertPreTokenizer()
        wordpiece_trainer = trainers.WordPieceTrainer(
            vocab_size=25000,
            special_tokens=["[UNK]", "[PAD]", "[CLS]", "[SEP]", "[MASK]"],
        )
        corpus = Dataset.from_pandas(balanced[['comment_text']])
        corpus_batches = (
            corpus[start : start + 1000]["comment_text"]
            for start in range(0, len(corpus), 1000)
        )
        wordpiece.train_from_iterator(corpus_batches, trainer=wordpiece_trainer)

        self.tokenizer = PreTrainedTokenizerFast(
            tokenizer_object=wordpiece,
            unk_token="[UNK]",
            pad_token="[PAD]",
            cls_token="[CLS]",
            sep_token="[SEP]",
            mask_token="[MASK]",
        )

        token_ids = self.tokenizer(balanced['comment_text'].to_list())['input_ids']

        # Input is already tokenized, so tokenizer and preprocessor are identity functions.
        self.vectorizer = TfidfVectorizer(
            analyzer='word',
            tokenizer=dummy_fun,
            preprocessor=dummy_fun,
            token_pattern=None)
        features = self.vectorizer.fit_transform(token_ids)

        self.regressor = Ridge(random_state=42, alpha=0.8)
        self.regressor.fit(features, balanced['y'])

    def predict(self, x):
        """Return a DataFrame indexed like `x` with one column, `score5`."""
        token_ids = self.tokenizer(x['text_to_transform'].to_list())['input_ids']
        features = self.vectorizer.transform(token_ids)
        return pd.DataFrame({'score5': self.regressor.predict(features)}, index=x.index)

In [None]:
def text_cleaning(text):
    '''
    Reduce a piece of text to space-separated ASCII letters and digits.

    Steps, in order:
    1. Strip URLs (http://, https:// and www. forms).
    2. Strip HTML tags by parsing with BeautifulSoup and keeping the text.
    3. Strip characters in the emoji ranges listed below.
    4. Replace every character other than a-z, A-Z or a digit with a space.
    5. Collapse runs of spaces and trim both ends.

    text - the string to clean.
    Returns the cleaned string.
    '''
    # 1. URLs
    text = re.sub(r'https?://\S+|www\.\S+', r'', text)

    # 2. HTML markup
    text = BeautifulSoup(text, 'lxml').get_text()

    # 3. Emoji and pictographs
    emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                               u"\U00002702-\U000027B0"
                               u"\U000024C2-\U0001F251"
                               "]+", flags=re.UNICODE)
    text = emoji_pattern.sub(r'', text)

    # 4. Special characters become spaces; 5. squeeze and trim whitespace
    text = re.sub(r"[^a-zA-Z\d]", " ", text)
    text = re.sub(' +', ' ', text)
    return text.strip()


class WeakLearner3:
    """Three Ridge regressors (alpha 0.5, 1.0, 2.0) over one char-level TF-IDF of cleaned text.

    `fit()` writes to the frame given to the constructor: it rescales the label
    columns in place, adds `score` and `y`, and then rebinds `self.data` to a
    renamed frame whose `text` column holds the cleaned comments.
    """

    def __init__(self, data):
        self.data = data
        # Populated by fit().
        self.vectorizer = None
        self.model = None
        self.l_model = None
        self.s_model = None

    def fit(self):
        """Build the target, clean the text, undersample and train the three regressors."""
        cat_mtpl = {'obscene': 0.16, 'toxic': 0.32, 'threat': 1.5,
                    'insult': 0.64, 'severe_toxic': 1.5, 'identity_hate': 1.5}

        for category in cat_mtpl:
            self.data[category] = self.data[category] * cat_mtpl[category]

        # Target: sum of every column in the label slice 'toxic'..'identity_hate'.
        self.data['score'] = self.data.loc[:, 'toxic':'identity_hate'].sum(axis=1)
        self.data['y'] = self.data['score']
        self.data = self.data.rename(columns={'comment_text': 'text'})

        tqdm.pandas()  # registers Series.progress_apply
        self.data['text'] = self.data['text'].progress_apply(text_cleaning)

        # Training set: every row with y >= 0.1 plus twice as many rows with y == 0.
        toxic_rows = self.data[self.data['y'] >= 0.1]
        clean_rows = self.data[self.data['y'] == 0].sample(n=len(toxic_rows) * 2, random_state=402)
        df = pd.concat([toxic_rows, clean_rows])

        self.vectorizer = TfidfVectorizer(min_df=3, max_df=0.5, analyzer='char_wb', ngram_range=(3, 5))
        X = self.vectorizer.fit_transform(df['text'])

        self.model = Ridge(alpha=0.5)
        self.model.fit(X, df['y'])
        self.l_model = Ridge(alpha=1.)
        self.l_model.fit(X, df['y'])
        self.s_model = Ridge(alpha=2.)
        self.s_model.fit(X, df['y'])

    def predict(self, x):
        """Score the `text_to_transform` column of `x`.

        Returns a DataFrame indexed like `x` with score6 / score7 / score8
        (one per regressor) and score9 (their mean). `x` is not modified.
        """
        # Register progress_apply here as well, so predict() does not depend on
        # fit() having been run earlier in the same kernel session.
        tqdm.pandas()
        cleaned = x['text_to_transform'].progress_apply(text_cleaning)
        X_test = self.vectorizer.transform(cleaned)

        p1 = self.model.predict(X_test)
        p2 = self.l_model.predict(X_test)
        p3 = self.s_model.predict(X_test)

        return pd.DataFrame(
            {
                'score6': p1,
                'score7': p2,
                'score8': p3,
                'score9': (p1 + p2 + p3) / 3.,
            },
            index=x.index,
        )

In [None]:
class WeakLearner4:
    """LinearRegression on Ruddit plus Ridge on the regression-based Jigsaw data.

    Both models use character-level TF-IDF features. The CSV files are read
    inside `fit()`; the constructor takes no data.
    """

    def __init__(self):
        # Everything stays None until fit() has been called.
        self.vectorizer1 = None
        self.regressor1 = None
        self.vectorizer2 = None
        self.regressor2 = None

    def fit(self):
        """Fit both vectorizer/regressor pairs from their CSV files."""
        # Pair 1: Ruddit text -> offensiveness score, ordinary least squares.
        ruddit = pd.read_csv("../input/ruddit-jigsaw-dataset/Dataset/ruddit_with_text.csv")
        self.vectorizer1 = TfidfVectorizer(analyzer='char_wb', ngram_range=(3, 5))
        ruddit_features = self.vectorizer1.fit_transform(ruddit["txt"])
        self.regressor1 = LinearRegression().fit(ruddit_features, ruddit['offensiveness_score'])

        # Pair 2: Jigsaw regression data, targets rounded to 2 decimals, Ridge.
        jigsaw = pd.read_csv("../input/jigsaw-regression-based-data/train_data_version2.csv")
        self.vectorizer2 = TfidfVectorizer(analyzer='char_wb', ngram_range=(2, 5))
        jigsaw_features = self.vectorizer2.fit_transform(jigsaw['text'])
        jigsaw_targets = np.around(jigsaw["y"].values, decimals=2)
        self.regressor2 = Ridge(alpha=0.3)
        self.regressor2.fit(jigsaw_features, jigsaw_targets)

    def predict(self, x):
        """Score the `text_to_transform` column of `x`.

        Returns a DataFrame indexed like `x` with score10 (first model),
        score11 (second model) and score12 (their sum).
        """
        ruddit_pred = self.regressor1.predict(self.vectorizer1.transform(x["text_to_transform"]))
        jigsaw_pred = self.regressor2.predict(self.vectorizer2.transform(x['text_to_transform']))

        return pd.DataFrame(
            {
                "score10": ruddit_pred,
                "score11": jigsaw_pred,
                "score12": ruddit_pred + jigsaw_pred,
            },
            index=x.index,
        )

In [None]:
# Out-of-fold predictions for all twelve scores, plus the fitted learners of
# every fold (reused later to score the test set).
weak_learners_list1 = []
weak_learners_list2 = []
weak_learners_list3 = []
weak_learners_list4 = []

preds = pd.DataFrame()
preds.index = df_train.index

# WeakLearner1 and WeakLearner4 take no data in their constructors and read
# fixed CSV files in fit(), so their training does not depend on the fold.
# Fit each once and reuse the same instance for every fold instead of
# re-reading and re-fitting them five times.
weak_learner1 = WeakLearner1()
weak_learner1.fit()
weak_learner4 = WeakLearner4()
weak_learner4.fit()

kfold = KFold(n_splits=5, random_state=0, shuffle=True)
for trn_ind, val_ind in kfold.split(df_train):
    train = df_train.loc[trn_ind].copy()
    val = df_train.loc[val_ind].copy().rename({'comment_text': 'text_to_transform'}, axis=1)

    # Fold-dependent learners each get their own copy, because fit() writes to its frame.
    weak_learner2 = WeakLearner2(train.copy())
    weak_learner3 = WeakLearner3(train.copy())

    # WeakLearner1 scores are ranks within `val`, so they are still computed per fold.
    preds.loc[val_ind, ['score1', 'score2', 'score3', 'score4']] = weak_learner1.predict(val)
    weak_learners_list1.append(weak_learner1)

    weak_learner2.fit()
    preds.loc[val_ind, ['score5']] = weak_learner2.predict(val)
    weak_learners_list2.append(weak_learner2)

    weak_learner3.fit()
    preds.loc[val_ind, ['score6', 'score7', 'score8', 'score9']] = weak_learner3.predict(val)
    weak_learners_list3.append(weak_learner3)

    preds.loc[val_ind, ['score10', 'score11', 'score12']] = weak_learner4.predict(val)
    weak_learners_list4.append(weak_learner4)

preds

In [None]:
# Earlier weight set, kept for reference only:
# p = {
#     'weight1': 0.00021541577371536526,
#     'weight2': 5.4136806364041875e-05,
#     'weight3': 7.054474871811384e-05,
#     'weight4': 0.0004950044846158598,
#     'weight5': 0.7095369414928109,
#     'weight6': 0.960770202502568,
#     'weight7': 0.981466744436168,
#     'weight8': 0.668985311802379,
#     'weight9': 0.5862471473960014,
#     'weight10': 0.8412784043394306,
#     'weight11': 0.9543552087526861,
#     'weight12': 0.9100233856490589
# }

# Weight set currently in use (raw, not yet normalised).
p = {
    'weight1': 0.00000021541577371536,
    'weight2': 0.00000054136806364041,
    'weight3': 0.00000070544748718113,
    'weight4': 0.00000049500448461585,
    'weight5': 0.7095369414928109,
    'weight6': 0.960770202502568,
    'weight7': 0.981466744436168,
    'weight8': 0.668985311802379,
    'weight9': 0.5862471473960014,
    'weight10': 0.8412784043394306,
    'weight11': 0.9543552087526861,
    'weight12': 0.9100233856490589
}

# Normalise so that weight1..weight12 sum to 1.
all_weights = sum(p[f'weight{i}'] for i in range(1, 13))
(weight1, weight2, weight3, weight4, weight5, weight6,
 weight7, weight8, weight9, weight10, weight11, weight12) = (
    p[f'weight{i}'] / all_weights for i in range(1, 13)
)

# Weighted out-of-fold scores go into a separate frame:
#  * each score is multiplied by its normalised weight exactly once;
#  * `preds` keeps the raw scores, so this cell can be re-run safely and the
#    weight search below sees scores on the same scale as the test-time blend.
preds_weighted = pd.DataFrame(
    {f'score{i}': preds[f'score{i}'] * (p[f'weight{i}'] / all_weights) for i in range(1, 13)},
    index=preds.index,
)

In [None]:
import optuna
from sklearn.metrics import mean_squared_error


def objective(trial, data, targets):
    """Optuna objective: MSE of a normalised weighted blend of score1..score12.

    trial   - object whose `suggest_float` supplies weight1..weight12 in [0.0, 1.0].
    data    - frame holding the columns score1..score12.
    targets - frame whose `score` column is the regression target.

    Returns mean_squared_error(targets['score'], blend).
    """
    weights = [trial.suggest_float(f"weight{i}", 0.0, 1.0) for i in range(1, 13)]

    # Accumulate left to right, in weight order.
    all_weights = weights[0]
    for weight in weights[1:]:
        all_weights = all_weights + weight

    blended = data['score1'] * weights[0] / all_weights
    for position, weight in enumerate(weights[1:], start=2):
        blended = blended + data[f'score{position}'] * weight / all_weights

    return mean_squared_error(targets['score'], blended)


cat_mtpl = {'obscene': 0.16, 'toxic': 0.32, 'threat': 1.5,
            'insult': 0.64, 'severe_toxic': 1.5, 'identity_hate': 1.5}

# Target = weighted sum of the six label columns. It is built in its own frame
# so the label columns of df_train are never rescaled: re-running this cell, or
# the K-fold cell after it, always starts from the same labels.
df_targets = pd.DataFrame(
    {'score': df_train[list(cat_mtpl)].mul(pd.Series(cat_mtpl), axis=1).sum(axis=1)},
    index=df_train.index,
)

objective_func = lambda trials: objective(trials, preds, df_targets)
# Seeded sampler so the weight search is repeatable from one run to the next.
study = optuna.create_study(direction="minimize", sampler=optuna.samplers.TPESampler(seed=0))
study.optimize(objective_func, n_trials=1000)

p = study.best_trial.params
p

In [None]:
"""
{'weight1': 0.00021541577371536526,
 'weight2': 5.4136806364041875e-05,
 'weight3': 7.054474871811384e-05,
 'weight4': 0.0004950044846158598,
 'weight5': 0.7095369414928109,
 'weight6': 0.960770202502568,
 'weight7': 0.981466744436168,
 'weight8': 0.668985311802379,
 'weight9': 0.5862471473960014,
 'weight10': 0.8412784043394306,
 'weight11': 0.9543552087526861,
 'weight12': 0.9100233856490589}
"""

In [None]:
"""
{'weight1': 0.002541796479236246,
 'weight2': 0.0037674690476265297,
 'weight3': 0.0001769536717097578,
 'weight4': 0.0005714484767582791,
 'weight5': 0.7417162064087187,
 'weight6': 0.6526050251204994,
 'weight7': 0.8562280212139448,
 'weight8': 0.46216647764090896,
 'weight9': 0.9585122662444765,
 'weight10': 0.8489960257790308,
 'weight11': 0.08578885440269987,
 'weight12': 0.9076543670790261}
"""

In [None]:
# Normalise the blend weights in `p` so that weight1..weight12 sum to 1.
all_weights = sum(p[f'weight{i}'] for i in range(1, 13))
(weight1, weight2, weight3, weight4, weight5, weight6,
 weight7, weight8, weight9, weight10, weight11, weight12) = (
    p[f'weight{i}'] / all_weights for i in range(1, 13)
)

# The learners read a `text_to_transform` column; rename once and reuse.
df_test_input = df_test.rename({'text': 'text_to_transform'}, axis=1)

# Each score on the test set is the mean over that learner family's fold models.
test_preds = pd.DataFrame(
    0.0,
    index=df_test.index,
    columns=[f'score{i}' for i in range(1, 13)],
)
for learners in (weak_learners_list1, weak_learners_list2,
                 weak_learners_list3, weak_learners_list4):
    for learner in learners:
        df_scores = learner.predict(df_test_input)
        for column in df_scores.columns:
            test_preds[column] += df_scores[column] / len(learners)

# .copy() gives the submission its own frame instead of a slice of df_test,
# so adding the `score` column is not a chained assignment.
submission_data = df_test[['comment_id']].copy()
submission_data['score'] = sum(
    test_preds[f'score{i}'] * (p[f'weight{i}'] / all_weights) for i in range(1, 13)
)
submission_data.to_csv('submission.csv', index=False)

In [None]:
"""
def check_imbalance(row):
    toxity = row[2:].sum()
    if toxity > 0:
        return 1
    else:
        return 0


class TextDataset(Dataset):
    def __init__(self, data, tokenizer, max_length, is_test=False):
        super(TextDataset, self).__init__()
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.is_test = is_test
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, index):
        x = self.data.iloc[index, 1]
        if self.is_test:
            targets = torch.tensor(self.data.iloc[index, 0])
        else:
            targets = torch.tensor(self.data.iloc[index, -1])
        
        encoded = self.tokenizer(x, add_special_tokens=True, max_length=self.max_length,
                                return_token_type_ids=False, padding='max_length',
                                truncation=True, return_attention_mask=True,
                                return_tensors='pt')
        
        input_ids = encoded['input_ids'].squeeze()
        attention_mask = encoded['attention_mask'].squeeze()
        
        return input_ids, attention_mask, targets


class TextNet(nn.Module):
    def __init__(self, bert_model):
        super(TextNet, self).__init__()
        self.bert_model = bert_model
        self.fc = nn.Linear(768, 1)
    
    def forward(self, input_ids, attention_mask):
        out = self.bert_model(input_ids, attention_mask, return_dict=True)['pooler_output']
        return self.fc(out)


def train_epoch(model, train_loader, criterion, optimizer, DEVICE):
    model.train()
    
    losses = []
    
    for data in tqdm(train_loader):
        input_ids, attention_mask, targets = data
        input_ids = input_ids.to(DEVICE)
        attention_mask = attention_mask.to(DEVICE)
        targets = targets.to(DEVICE)

        output = model(input_ids, attention_mask)

        loss = criterion(output.squeeze().float(), targets.float())
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        losses.append(loss.item())

    return np.mean(losses)


def val_epoch(model, val_loader, criterion, DEVICE):
    model.eval()
    
    losses = []
    
    with torch.no_grad():
        for data in tqdm(val_loader):
            input_ids, attention_mask, targets = data
            input_ids = input_ids.to(DEVICE)
            attention_mask = attention_mask.to(DEVICE)
            targets = targets.to(DEVICE)

            output = model(input_ids, attention_mask)

            loss = criterion(output.squeeze().float(), targets.float())

            losses.append(loss.item())

    return np.mean(losses)


def make_submission(model, test_loader, DEVICE, submission_data):
    model.eval()
    
    current_ind = 0
    
    for data in tqdm(test_loader):
        input_ids, attention_mask, _ = data
        input_ids = input_ids.to(DEVICE)
        attention_mask = attention_mask.to(DEVICE)
        
        preds = model(input_ids, attention_mask).cpu().tolist()
        submission_data.loc[current_ind:current_ind + len(preds) - 1, 'score'] = preds
        current_ind += len(preds)
    
    print(f'submission_data: {submission_data}')
    
    submission_data.to_csv('submission.csv', index=False)


df_train['is_toxic'] = df_train.apply(check_imbalance, axis=1)
sample_numb = len(df_train.loc[df_train['is_toxic'] == 0]) - len(df_train.loc[df_train['is_toxic'] == 1])
not_toxic_df = df_train.loc[df_train['is_toxic'] == 0].drop('is_toxic', axis=1).reset_index(drop=True)
toxic_df = df_train.loc[df_train['is_toxic'] == 1].sample(n=sample_numb, replace=True, random_state=0, axis=0).drop('is_toxic', axis=1).reset_index(drop=True)
oversampled_df = pd.concat([not_toxic_df, toxic_df], axis=0)
oversampled_df.index = range(len(oversampled_df))
oversampled_df

category_weights = {
    'toxic': 0.32, 
    'severe_toxic': 1.5, 
    'obscene': 0.16, 
    'threat': 1.5, 
    'insult': 0.64, 
    'identity_hate': 1.5
}

for category, weight in category_weights.items():
    oversampled_df[category] = oversampled_df[category] * weight

oversampled_df['score'] = oversampled_df.drop(['id', 'comment_text'], axis=1).mean(axis=1)

#tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
tokenizer = BertTokenizer.from_pretrained('../input/bert-uncased')
train_df, val_df = train_test_split(oversampled_df, test_size=0.2, random_state=0, shuffle=True)
train_df.index = range(len(train_df))
val_df.index = range(len(val_df))
print(f'train_len: {len(train_df)}, val_len: {len(val_df)}')

train_dataset = TextDataset(train_df, tokenizer, max_length=256)
val_dataset = TextDataset(val_df, tokenizer, max_length=256)
test_dataset = TextDataset(df_test, tokenizer, max_length=256, is_test=True)

BATCH_SIZE = 16
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
train_loader = DataLoader(dataset=train_dataset, batch_size=BATCH_SIZE, shuffle=True, pin_memory=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=BATCH_SIZE, pin_memory=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=BATCH_SIZE, pin_memory=True)

#bert_model = BertModel.from_pretrained('bert-base-uncased')
bert_model = BertModel.from_pretrained('../input/bert-base-uncased')

EPOCHS = 1
LEARNING_RATE = 2e-5

criterion = nn.MSELoss()

model = TextNet(bert_model).to(DEVICE)

submission_data = df_test[['comment_id']]
submission_data['score'] = 0.0

optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

best_val_loss = np.inf

for epoch in range(EPOCHS):
    print(f'Epoch: {epoch+1}/{EPOCHS}')
    print('-' * 10)

    print('Training')
    train_loss = train_epoch(model, train_loader, criterion, optimizer, DEVICE)

    print('Validating')
    val_loss = val_epoch(model, val_loader, criterion, DEVICE)

    print(f'Train Loss: {train_loss}\t Val Loss: {val_loss}')

    if val_loss < best_val_loss:
        best_val_loss = val_loss
        torch.save(model.state_dict(), 'toxicity_best_model.pth.tar')

print('Make submission')
make_submission(model, test_loader, DEVICE, submission_data)
"""