In [None]:
# Insert code here.
import pandas as pd
import numpy as np
import random
import re
import time
import datetime
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm, neighbors
from sklearn.preprocessing import LabelEncoder
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW, BertConfig, AutoModel
import torch
from torch.utils.data import TensorDataset, random_split, DataLoader, RandomSampler, SequentialSampler
from transformers import BertForSequenceClassification
from transformers import get_linear_schedule_with_warmup
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import classification_report
from torch.utils.data import Dataset
from tqdm import tqdm
# from sentence_transformers import SentenceTransformer
# sent_encoder = SentenceTransformer('bert-base-nli-mean-tokens')

In [None]:
# Choose the device that the model and every batch tensor are moved to.
if not torch.cuda.is_available():
    # CPU fallback.
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
else:
    # Tell PyTorch to use the GPU and report what was detected.
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
# Ask the CUDA caching allocator to release memory it is holding but not using.
torch.cuda.empty_cache()

In [None]:

# Label names, and the one this run builds a binary classifier for.
labels = ['fake', 'hate', 'defamation', 'offensive', 'non-hostile']
lab_num = 4      # index into `labels`
EPOCH_NUM = 0    # epoch whose saved checkpoint the inference cell loads
lab = labels[lab_num]
# Path of the checkpoint written by train() for (lab, EPOCH_NUM).
epoch_name = f'../temp/finetuned/{lab}_epoch_{EPOCH_NUM}'

In [None]:
# Candidate checkpoints: hub identifiers plus what look like local /scratch checkpoints.
models = [
    'ai4bharat/indic-bert',
    'distilbert-base-uncased-finetuned-sst-2-english',
    'textattack/roberta-base-SST-2',
    'roberta-base',
    'google/electra-base-discriminator',
    'xlnet-base-cased',
    'xlm-roberta-base',
    '/scratch/indic-tapt',
    '/scratch/indic-tapt2',
    '/scratch/indic-tapt/checkpoint-500',
]
model_num = 0  # index into `models`; the same entry is used for the tokenizer and both encoders
src = '../temp/preprocessed/'  # directory holding the pickled train/valid/test splits
tokenizer = AutoTokenizer.from_pretrained(models[model_num])

In [None]:
# train = pd.read_csv('../datasets/covid/Constraint_English_Train - Sheet1.csv')
# test = pd.read_csv('../datasets/covid/Constraint_English_Val - Sheet1.csv')

In [None]:
import pickle


def _load_split(filename, drop_first_row):
    """Unpickle `src + filename` into a DataFrame, optionally discarding its first row."""
    # NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
    with open(src + filename, 'rb') as f:
        frame = pd.DataFrame.from_dict(pickle.load(f))
    if drop_first_row:
        # Same rows the original in-place drop removed: those labelled like the first row.
        frame = frame.drop(frame.head(1).index)
    return frame


# train/valid lose their first row; the test split is kept in full.
train = _load_split('train.pickle', True)
valid = _load_split('valid.pickle', True)
test = _load_split('test.pickle', False)

In [None]:
# Fold the validation split into the training frame; a hold-out set is re-sampled from it later.
# NOTE: this cell is not idempotent — re-running it appends `valid` again and duplicates rows.
train = pd.concat([train, valid])
# Preview of the unlabeled test split.
test.head(10)

In [None]:
lab = labels[lab_num]
# def label_encode(val):
#     return labels.index(val)
def label_encode(val):
    """Turn a comma-separated label string into a 0/1 target for the active label `lab`.

    For every label except index 4 the result is 1 when `lab` is one of the listed
    labels and 0 otherwise. For index 4 ('non-hostile') the encoding is flipped:
    0 when `lab` is present, 1 when it is absent.
    """
    present = lab in val.split(',')
    if lab_num == 4:
        return 0 if present else 1
    return 1 if present else 0

In [None]:
# Binary target for the active label, plus a common `tweet` text column on both frames.
train['label'] = train['task_1'].map(label_encode)
train['tweet'] = train['full_tweet']
test['tweet'] = test['full_tweet']

In [None]:
train.label.sample(10)

In [None]:
# Renumber rows 0..n-1: after the concat above the index labels are no longer unique,
# and later cells drop rows by index label.
train = train.reset_index(drop=True)
# Raw strings: the previous non-raw literal contained invalid escape sequences (\[ \] \|),
# which newer Python versions warn about. The compiled pattern is unchanged.
REPLACE_BY_SPACE_RE = re.compile(r'[/(){}\[\]\|@,;]')
# Everything that is not a digit, a lowercase ASCII letter, space, '#', '+' or '_'.
BAD_SYMBOLS_RE = re.compile(r'[^0-9a-z #+_]')
# Words to drop; empty, so no stop-word filtering currently happens.
STOPWORDS = []

def clean_text(text):
    """Normalise a string for ASCII-only text models.

    Steps: lowercase, turn the punctuation matched by REPLACE_BY_SPACE_RE into spaces,
    delete every character matched by BAD_SYMBOLS_RE, drop words listed in STOPWORDS
    and collapse runs of whitespace.

    NOTE: BAD_SYMBOLS_RE removes every non-ASCII character, so text written in a
    non-Latin script is erased completely.

    Args:
        text: a string.

    Returns:
        The cleaned string.
    """
    text = text.lower()
    text = REPLACE_BY_SPACE_RE.sub(' ', text)
    text = BAD_SYMBOLS_RE.sub('', text)
    # split()/join also collapses repeated whitespace left by the substitutions
    return ' '.join(word for word in text.split() if word not in STOPWORDS)
# train.tweet = train.tweet.apply(clean_text)
# train.tweet = train.tweet.str.replace('\d+', '')

In [None]:
# The test split has no labels to encode and its text is left uncleaned;
# only the row index is renumbered here.
# test.label = test.label.apply(label_encode)
test = test.reset_index(drop=True)
# test.tweet = test.tweet.apply(clean_text)
# test.tweet = test.tweet.str.replace('\d+', '')

In [None]:
train.tweet.sample(10)

In [None]:
# Split the dataset into training and validation parts. In the visible cells only
# `train_x` is used afterwards, for the tweet-length statistics.
from sklearn.model_selection import train_test_split
# Seeded so the statistics are reproducible across runs; 200 matches the seed used
# for the hold-out sample later in the notebook.
train_x, valid_x, train_y, valid_y = train_test_split(
    train['tweet'], train['label'], test_size=0.2, random_state=200
)

In [None]:
def count_words(text):
    """Return the number of whitespace-separated tokens in `text`.

    Values without a `split` method (e.g. None or a float NaN from a missing cell)
    are printed for inspection and counted as 0 words, so callers that sum or
    compare the counts keep working instead of failing on None.

    Args:
        text: normally a string.

    Returns:
        int: the token count, or 0 for non-text input.
    """
    try:
        return len(text.split())
    except AttributeError:
        # Only "not a string" is expected here; any other error should surface.
        print(text)
        return 0

In [None]:
# Tweet-length statistics on the training texts:
# (mean words per tweet, longest tweet, tweets longer than 120 words, number of tweets).
word_counts = [count_words(tweet) for tweet in train_x]
total = sum(word_counts)
maxw = max(word_counts, default=0)
large_count = sum(1 for n_words in word_counts if n_words > 120)
total/len(train_x), maxw, large_count, len(train_x)

In [None]:
# MAX_LENGTH = 50
# Both names hold the same array view of the full training frame.
# NOTE(review): neither is referenced by any later cell visible here — likely leftovers.
posts = train.values
categories = train.values

In [None]:
# Sections of config

# Defining some key variables that will be used later on in the training
# Length every tokenized sequence is truncated/padded to by the dataset.
MAX_LEN = 128
# Batch size of the training DataLoader.
TRAIN_BATCH_SIZE = 16
# Batch size of the validation and test DataLoaders.
VALID_BATCH_SIZE = 32
# Number of epochs for the training loop (that loop is currently commented out).
EPOCHS = 10
# Learning rate passed to the Adam optimizer.
LEARNING_RATE = 1e-05


In [None]:
import gensim.models as gsm
e2v = gsm.KeyedVectors.load_word2vec_format('emoji2vec.bin', binary=True)
# happy_vector = e2v['😂']    # Produces an embedding vector of length 300

# Download the bin file from here https://github.com/uclnlp/emoji2vec/blob/master/pre-trained/emoji2vec.bin

def getEmojiEmbeddings(emojiList, dim=300, verbose=False, vectors=None):
  """Average the emoji2vec vectors of the emojis in `emojiList`.

  Args:
    emojiList: iterable of emoji strings; may be empty.
    dim: length of the returned vector, at least 300. Positions past 300 stay zero.
    verbose: print every emoji that has no embedding.
    vectors: mapping-like lookup supporting `key in vectors` and `vectors[key]`.
      Defaults to the module-level `e2v` model.

  Returns:
    np.ndarray of shape (dim,): the mean embedding in the first 300 slots, or all
    zeros when the list is empty or none of its emojis has an embedding.

  Raises:
    IndexError: if `dim` is smaller than 300.
  """
  if dim < 300:
    raise IndexError("Dim has to be at least 300")
  result = np.zeros(dim)
  if len(emojiList) == 0:
    return result
  if vectors is None:
    vectors = e2v
  known = []
  for emoji in emojiList:
    # `in` works on gensim KeyedVectors both before and after 4.0, unlike `.vocab`.
    if emoji in vectors:
      known.append(vectors[emoji])
    elif verbose:
      print(emoji)
  if not known:
    # Avoids np.mean([]), which yields NaN and emits a RuntimeWarning.
    return result
  embs = np.mean(known, axis=0)
  if np.any(np.isnan(embs)):
    return result
  result[:300] = embs
  return result
getEmojiEmbeddings(valid.emoji.values[0])

In [None]:
# Sanity check: shapes of the token ids and the emoji vector for the first validation tweet.
ids = tokenizer.encode_plus(
            valid.full_tweet.values[0],
            None,
            truncation=True,
            add_special_tokens=True,
            max_length=MAX_LEN,          # same limit the dataset uses
            padding='max_length',        # replaces the deprecated pad_to_max_length=True
            return_attention_mask = True,
            return_token_type_ids=True
        )['input_ids']
# The emoji vector is real-valued, so it is checked as a float tensor (a long cast truncates it).
torch.tensor(ids, dtype=torch.long).shape, torch.tensor(getEmojiEmbeddings(valid.emoji.values[0]), dtype=torch.float).shape

In [None]:
class MultiLabelDataset(Dataset):
    """Dataset yielding tokenized tweet text, tokenized hashtags and an emoji vector.

    Args:
        dataframe: frame with `tweet`, `emoji` and `segmented_hash` columns, plus a
            `label` column unless `t` is true.
        tokenizer: object exposing a Hugging Face style `encode_plus`.
        max_len: length every token sequence is truncated/padded to.
        t: true for unlabeled (test) data; items then carry no 'targets' entry.
    """

    def __init__(self, dataframe, tokenizer, max_len, t = False):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.text = dataframe.tweet
        self.emoji = dataframe.emoji
        self.hash = dataframe.segmented_hash
        self.t = t
        if not self.t:
            self.targets = self.data.label
        self.max_len = max_len

    def __len__(self):
        return len(self.text)

    def _encode(self, text):
        """Tokenize `text` to `max_len` and return (input_ids, attention_mask, token_type_ids)."""
        encoded = self.tokenizer.encode_plus(
            text,
            None,
            truncation=True,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',  # replaces the deprecated pad_to_max_length=True
            return_attention_mask=True,
            return_token_type_ids=True
        )
        return encoded['input_ids'], encoded['attention_mask'], encoded['token_type_ids']

    def __getitem__(self, index):
        """Return the tensors for the row at position `index`.

        Rows are read by position (`.iloc`), so the frame does not need a freshly
        reset index.
        """
        # Collapse all whitespace runs in the tweet before tokenizing.
        text = " ".join(str(self.text.iloc[index]).split())
        ids, mask, token_type_ids = self._encode(text)

        # assumes segmented_hash holds a sequence of hashtag tokens — TODO confirm
        h_text = " ".join(self.hash.iloc[index])
        h_ids, h_mask, h_token_type_ids = self._encode(h_text)

        emoji = getEmojiEmbeddings(self.emoji.iloc[index])

        item = {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'h_ids': torch.tensor(h_ids, dtype=torch.long),
            'h_mask': torch.tensor(h_mask, dtype=torch.long),
            'h_token_type_ids': torch.tensor(h_token_type_ids, dtype=torch.long),
            # Real-valued embedding: a long cast would truncate every component toward zero.
            'emoji': torch.tensor(emoji, dtype=torch.float),
        }
        if not self.t:
            item['targets'] = torch.tensor(self.targets.iloc[index], dtype=torch.long)
        return item

In [None]:
# Creating the dataset and dataloader for the neural network

# Hold out 15% of the labeled frame for per-epoch evaluation.
# `test_data` here is that labeled hold-out, not the unlabeled test split.
train_size = 0.85
train_data = train.sample(frac=train_size, random_state=200)
test_data = train.drop(train_data.index).reset_index(drop=True)
train_data = train_data.reset_index(drop=True)

for tag, frame in (("FULL", train), ("TRAIN", train_data), ("TEST", test_data)):
    print(f"{tag} Dataset: {frame.shape}")

training_set = MultiLabelDataset(train_data, tokenizer, MAX_LEN)
testing_set = MultiLabelDataset(test_data, tokenizer, MAX_LEN)

In [None]:
# DataLoader settings for the training set and the labeled hold-out set.
train_params = dict(batch_size=TRAIN_BATCH_SIZE, shuffle=True, num_workers=0)
test_params = dict(batch_size=VALID_BATCH_SIZE, shuffle=True, num_workers=0)

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

In [None]:
class BERTClass(torch.nn.Module):
    """Two-encoder classifier: tweet text, hashtag text and an emoji vector -> 2 logits.

    Each encoder's first-token hidden state goes through its own Linear + tanh +
    dropout head. The two results are concatenated with the emoji vector, passed
    through one more Linear + tanh + dropout and then the 2-way classifier.
    """

    # Width of the emoji feature vector concatenated onto the two pooled outputs.
    EMOJI_DIM = 300

    def __init__(self):
        super().__init__()
        self.l1 = AutoModel.from_pretrained(models[model_num])
        self.l2 = AutoModel.from_pretrained(models[model_num])

        # Use the encoder's own hidden size, falling back to the previous constant 768.
        hidden = getattr(getattr(self.l1, 'config', None), 'hidden_size', 768)
        fused = 2 * hidden + self.EMOJI_DIM  # 1836 for a 768-wide encoder

        self.pre_classifier_1 = torch.nn.Linear(hidden, hidden)
        self.pre_classifier_2 = torch.nn.Linear(hidden, hidden)
        self.dropout = torch.nn.Dropout(0.1)
        self.pre_classifier_3 = torch.nn.Linear(fused, fused)
        self.classifier = torch.nn.Linear(fused, 2)

    def _pool(self, encoder, head, ids, mask):
        """Encode, take the first token's hidden state, then apply head -> tanh -> dropout."""
        hidden_state = encoder(input_ids=ids, attention_mask=mask)[0]
        return self.dropout(torch.tanh(head(hidden_state[:, 0])))

    def forward(self, input_ids, attention_mask, token_type_ids, h_ids, h_mask, h_token_type_ids, emoji):
        """Return logits with 2 columns, one row per example.

        `token_type_ids` and `h_token_type_ids` are accepted for interface
        compatibility but are not passed to the encoders.
        """
        pooler_1 = self._pool(self.l1, self.pre_classifier_1, input_ids, attention_mask)
        pooler_2 = self._pool(self.l2, self.pre_classifier_2, h_ids, h_mask)
        # Match the activations' dtype so the concatenation never mixes integer and float tensors.
        emoji = emoji.to(dtype=pooler_1.dtype)
        pooler_3 = torch.cat((pooler_1, pooler_2, emoji), 1)
        pooler_3 = self.dropout(torch.tanh(self.pre_classifier_3(pooler_3)))
        return self.classifier(pooler_3)

# Build the classifier (this loads both pretrained encoders) and move it to the selected device.
model = BERTClass()
model.to(device)

In [None]:
# from torchsummary import summary
# print(repr(model))


In [None]:
# Print the names and shapes of selected slices of the model's parameter list.
# The slice boundaries and section titles are fixed positions; whether they match
# actual layer boundaries depends on the chosen encoder.
params = list(model.named_parameters())

print('The BERT model has {:} different named parameters.\n'.format(len(params)))

sections = (
    ('==== Embedding Layer ====\n', params[0:5]),
    ('\n==== First Transformer ====\n', params[5:21]),
    ('\n==== Output Layer ====\n', params[-4:]),
)
for heading, chunk in sections:
    print(heading)
    for p in chunk:
        name, tensor = p
        print("{:<55} {:>12}".format(name, str(tuple(tensor.size()))))


In [None]:
def loss_fn(outputs, targets):
    """Mean cross-entropy between the logits `outputs` and the integer class ids `targets`."""
    return torch.nn.functional.cross_entropy(outputs, targets)

In [None]:
# Plain Adam over every model parameter (both encoders and all heads) at LEARNING_RATE.
optimizer = torch.optim.Adam(params =  model.parameters(), lr=LEARNING_RATE)

In [None]:
def _model_inputs(data):
    """Move one batch dict to `device` and return the positional arguments for `model`."""
    long_keys = ('ids', 'mask', 'token_type_ids', 'h_ids', 'h_mask', 'h_token_type_ids')
    inputs = [data[key].to(device, dtype=torch.long) for key in long_keys]
    # The emoji embedding is real-valued; casting it to long would truncate it toward zero.
    inputs.append(data['emoji'].to(device, dtype=torch.float))
    return inputs


def train(epoch):
    """Run one training epoch, evaluate on `testing_loader` and save the model.

    NOTE: defining this function rebinds the name `train`, which earlier cells use
    for the training DataFrame; those cells cannot be re-run afterwards without
    reloading the data.

    Args:
        epoch: epoch number, used in the progress message and the checkpoint name.

    Returns:
        (fin_outputs, fin_targets): predicted class ids and true class ids for the
        evaluation set, as lists.
    """
    total_train_loss = 0
    count = 0
    model.train()
    for data in tqdm(training_loader):
        targets = data['targets'].to(device, dtype=torch.long)
        outputs = model(*_model_inputs(data))
        loss = loss_fn(outputs, targets)
        total_train_loss += loss.item()
        count += 1
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    model.eval()
    fin_targets = []
    fin_outputs = []
    # max(count, 1) avoids a ZeroDivisionError when the training loader is empty
    print(f'Epoch: {epoch}, Loss:  {total_train_loss/max(count, 1)}')
    with torch.no_grad():
        for data in tqdm(testing_loader):
            outputs = model(*_model_inputs(data))
            fin_targets.extend(data['targets'].tolist())
            # argmax over raw logits; a sigmoid first cannot change the ranking
            fin_outputs.extend(outputs.cpu().numpy().tolist())
    fin_outputs = list(np.argmax(np.array(fin_outputs), axis=1).flatten())
    print(classification_report(fin_targets, fin_outputs))
    # The whole module object is saved; the inference cell loads it back by this path.
    torch.save(model, '../temp/finetuned/'+lab+'_epoch_'+str(epoch))
    return fin_outputs, fin_targets
#     final_outputs = np.array(fin_outputs) >=0.5
#     final = []
#     final_t = []
#     final_fine = [[],[],[],[]]
#     final_fine_t = [[],[],[],[]]
#     for (i,j) in zip(final_outputs, fin_targets):
#         output_sum = sum(i)
#         target_sum = sum(j)
#         if output_sum == 0:
#             final.append(0)
#         else:
#             final.append(1)
#         if target_sum == 0:
#             final_t.append(0)
#         else:
#             final_t.append(1)
#         for p in range(4):
#             final_fine[p].append(int(i[p]))
#             final_fine_t[p].append(int(j[p]))
#     print("Coarse:")
#     print(classification_report(final, final_t))
#     for i in range(4):
#         print("Fine", i)
    
#     return fin_outputs, fin_targets

In [None]:
# for epoch in range(EPOCHS):
#     out, tar = train(epoch)
# #     break

In [None]:
# out[0:10], tar[0:10]

In [None]:
# Creating the dataset and dataloader for the neural network

# train_size = 0.8
# test_data=test.sample(frac=1,random_state=200)
# test_data=train.drop(train_data.index).reset_index(drop=True)
# Unlabeled test split, served in its original order so predictions line up with rows.
# NOTE: this rebinds `test_data`, `test_params` and `testing_loader`; train() evaluates
# on `testing_loader` and expects 'targets', so it cannot be run after this cell.
test_data = test.reset_index(drop=True)
testing = MultiLabelDataset(test_data, tokenizer, MAX_LEN, t=True)
test_params = dict(batch_size=VALID_BATCH_SIZE, shuffle=False, num_workers=0)
testing_loader = DataLoader(testing, **test_params)

In [None]:
!nvidia-smi

In [None]:

# Predict a class id for every row of the unlabeled test loader using a saved checkpoint.
fin_targets = []   # kept for parity with train(); the test split has no targets
fin_outputs = []
# NOTE(review): the checkpoint is a fully pickled module. If the installed PyTorch makes
# torch.load default to weights_only=True, this call needs weights_only=False — confirm.
model = torch.load(epoch_name, map_location=device)
model.eval()
with torch.no_grad():
    for data in tqdm(testing_loader):
        ids = data['ids'].to(device, dtype=torch.long)
        mask = data['mask'].to(device, dtype=torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
        h_ids = data['h_ids'].to(device, dtype=torch.long)
        h_mask = data['h_mask'].to(device, dtype=torch.long)
        h_token_type_ids = data['h_token_type_ids'].to(device, dtype=torch.long)
        # Real-valued embedding: keep it float (a long cast truncates it toward zero).
        emoji = data['emoji'].to(device, dtype=torch.float)
        outputs = model(ids, mask, token_type_ids, h_ids, h_mask, h_token_type_ids, emoji)
        # argmax over raw logits; a sigmoid first cannot change the ranking
        fin_outputs.extend(outputs.cpu().numpy().tolist())
fin_outputs = list(np.argmax(np.array(fin_outputs), axis=1).flatten())
# print(classification_report(fin_outputs, fin_targets))

In [None]:
fin_outputs[0:20]

In [None]:
test_data.head(10)

In [None]:
# Attach the predicted class ids to the test frame. Assignment is positional, so this
# relies on the test loader having been built with shuffle=False.
test['label'] = np.array(fin_outputs)

In [None]:
len(fin_outputs)

In [None]:
len(test.full_tweet.values)

In [None]:
test.sample(10)

In [None]:
# def label_decode(val):
#     return labels[val]
# test.label = test.label.apply(label_decode)

In [None]:

# Write the (tweet_id, label) pairs for the active label to ../temp/labels/<lab>.txt as CSV.
test.to_csv(path_or_buf='../temp/labels/'+lab+'.txt', index=False, columns = ['tweet_id', 'label'] )