In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.optim

from transformers import BertTokenizer, BertModel
from tqdm.notebook import tqdm

import os
# Echo the full path of every file found under the Kaggle input directory.
for dirname, _, filenames in os.walk('/kaggle/input'):
    full_paths = (os.path.join(dirname, entry) for entry in filenames)
    for path in full_paths:
        print(path)

In [None]:
# Ordered label vocabulary (the entries look like arXiv category codes).
# The order matters: position i of every one-hot / prediction vector built in this
# notebook corresponds to NAMELIST[i], and the list has 57 entries, which matches the
# output width of the classifier head.
NAMELIST = ['math.AT', 'stat.AP', 'cs.AR', 'math.QA', 'q-bio.MN', 'eess.AS','eess.IV', 'stat.ME', 'econ.GN',
            'eess.SP', 'q-fin.RM', 'cs.LG', 'cs.CR', 'q-bio.BM', 'q-fin.GN', 'q-fin.MF', 'q-fin.PR', 'math.CV',
            'cs.LO', 'econ.TH', 'math.CO', 'cs.AI', 'math.AC', 'q-bio.CB','q-fin.CP', 'cs.CL', 'cs.DC', 'math.LO', 
            'math.NT', 'cs.SD', 'q-fin.TR','cs.CV', 'stat.ML', 'q-fin.EC', 'econ.EM', 'cs.CE', 'stat.CO','math.PR', 
            'q-bio.NC', 'math.AP', 'cs.OS', 'cs.NI', 'cs.IT', 'cs.PL', 'cs.GT', 'cs.DM', 'math.IT', 'cs.SE', 'cs.RO', 
            'stat.TH', 'cs.DB','math.ST', 'q-bio.GN', 'q-fin.PM', 'q-bio.TO', 'math.GR', 'cs.IR']

# Run on the first CUDA device when torch reports one, otherwise on the CPU.
DEVICE = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

# Learning rate handed to the AdamW optimizer.
LEARNING_RATE = 1e-5
# Number of full passes over the training loader.
EPOCHS = 6
# A label is predicted as present when its sigmoid probability is >= THRESHOLD.
THRESHOLD = 0.1

In [None]:
# Load the labelled training set; later cells read its 'Id', 'Title', 'Abstract'
# and 'Categories' columns.
data = pd.read_csv("/kaggle/input/kriti-dataset/train.csv")

In [None]:
# Shuffle all rows (frac = 1) with a fixed seed so the positional split below is reproducible.
# NOTE(review): this rebinds `data`, so re-running only this cell shuffles the already
# shuffled frame again and yields a different order than a fresh Run All.
data = data.sample(frac = 1, random_state = 42)

In [None]:
# Positional split of the shuffled frame: the first 48650 rows train, the rest validate.
# NOTE(review): 48650 is hard-coded; it is a 95-5 split only if the CSV has roughly
# 51.2k rows — confirm against len(data) whenever the dataset changes.
train_data = data[:48650]
val_data = data[48650:] # 95-5 Split

In [None]:
def get_list_from_string(text):
    """Parse a stringified list such as "['cs.LG', 'stat.ML']" into a list of strings.

    Parameters
    ----------
    text : str
        Textual form of a list: items separated by commas, optionally quoted with
        single or double quotes, optionally wrapped in square brackets.

    Returns
    -------
    list of str
        The items with surrounding whitespace and quotes removed. Empty items are
        dropped, so "[]" yields [] rather than [''].
    """
    inner = text.strip()

    # Only drop the brackets when they are really there, instead of blindly
    # chopping the first and last character.
    if inner.startswith('[') and inner.endswith(']'):
        inner = inner[1:-1]

    items = []
    for piece in inner.split(','):
        # Strip whitespace first, then either quote style, then any whitespace
        # that was inside the quotes.
        cleaned = piece.strip().strip("'\"").strip()
        if cleaned:
            items.append(cleaned)

    return items

In [None]:
def get_one_hot_vec(text):
    """Turn a stringified category list into a 0/1 vector aligned with NAMELIST.

    Element i of the result is 1 when NAMELIST[i] appears among the categories
    parsed from `text` by get_list_from_string, otherwise 0.
    """
    present = get_list_from_string(text)
    return [int(label in present) for label in NAMELIST]

In [None]:
!pip install pylatexenc

In [None]:
from pylatexenc.latex2text import LatexNodes2Text

In [None]:
import re

def remove_space(text):
    """Collapse every run of whitespace to a single space and trim both ends."""
    # str.split() with no separator already ignores leading/trailing whitespace.
    return " ".join(text.split())

def lowerall(text):
    """Lower-case every whitespace-separated word and re-join them with single spaces."""
    return " ".join(map(str.lower, text.split()))

def remove_links(input_string):
    """Delete LaTeX hyperlink commands from a string.

    Patterns are applied in order: two-argument \\href{...}{...} first (the link
    text is removed together with the target), then any remaining one-argument
    \\href{...}, then \\url{...}.
    """
    link_patterns = (
        r'\\href\{.*?\}\{.*?\}',
        r'\\href\{.*?\}',
        r'\\url\{.*?\}',
    )

    for pattern in link_patterns:
        input_string = re.sub(pattern, '', input_string)

    return input_string

def clean_text(text):
    """Normalise a raw title/abstract string before tokenisation.

    Steps, in order: collapse whitespace, strip LaTeX link commands, convert the
    remaining LaTeX to plain text with pylatexenc, drop every character outside
    the allowed set, then lower-case and re-collapse whitespace.
    """
    without_links = remove_links(remove_space(text))

    plain = LatexNodes2Text().latex_to_text(without_links)

    # Inside the class, `+-/` is a range covering + , - . / (all kept).
    filtered = re.sub(r'[^a-zA-Z0-9\s.,;:!?(){}\[\]<>+-/*=%$&@#~≥\\_~`]', '', plain)

    return lowerall(filtered)


In [None]:
# Tokenizer for the same 'allenai/scibert_scivocab_uncased' checkpoint that NeuralNet loads;
# TheDataset.__getitem__ uses this module-level object to encode each text.
bert_tokenizer = BertTokenizer.from_pretrained('allenai/scibert_scivocab_uncased')

In [None]:
class TheDataset(Dataset):
    """Torch dataset yielding tokenised title+abstract texts and, for labelled data, targets.

    Parameters
    ----------
    df : pandas.DataFrame
        Must have 'Id', 'Title' and 'Abstract' columns, plus 'Categories'
        (stringified list of labels) when `test` is False.
    test : bool
        True for unlabelled data: no targets are built or returned.

    Each item is a dict with 'textID', 'ids', 'mask' and 'token_type_ids', plus
    'targets' (float one-hot vector aligned with NAMELIST) when `test` is False.
    """

    def __init__(self, df, test):
        self.IDs = df['Id'].values
        self.test = test

        # pandas reads empty cells as float NaN, which would make the string
        # concatenation below raise; treat a missing title/abstract as empty text.
        titles = df['Title'].fillna('').astype(str).values
        abstracts = df['Abstract'].fillna('').astype(str).values

        # Cleaning is done once up front; tokenisation happens lazily per item.
        self.Texts = [
            clean_text(title + ' ' + abstract)
            for title, abstract in zip(titles, abstracts)
        ]

        if not self.test:
            self.Vectors = [get_one_hot_vec(cat) for cat in df['Categories'].values]

    def __len__(self):
        return len(self.IDs)

    def __getitem__(self, idx):
        # Pad/truncate every sample to the same length so the default collate
        # function can stack them into a batch.
        inputs = bert_tokenizer.encode_plus(
            self.Texts[idx],
            truncation=True,
            add_special_tokens=True,
            max_length=510,
            padding='max_length',
            return_token_type_ids=True
        )

        item = {
            'textID': self.IDs[idx],
            'ids': torch.tensor(inputs['input_ids'], dtype=torch.long),
            'mask': torch.tensor(inputs['attention_mask'], dtype=torch.long),
            'token_type_ids': torch.tensor(inputs['token_type_ids'], dtype=torch.long),
        }

        if not self.test:
            item['targets'] = torch.tensor(self.Vectors[idx], dtype=torch.float)

        return item


In [None]:
class NeuralNet(nn.Module):
    """Pretrained BERT encoder followed by a two-layer classification head.

    Parameters
    ----------
    num_labels : int, default 57
        Width of the output layer (one logit per label; 57 == len(NAMELIST)).
    hidden_dim : int, default 1024
        Width of the intermediate fully connected layer.
    pretrained_name : str, default 'allenai/scibert_scivocab_uncased'
        Checkpoint name passed to BertModel.from_pretrained.

    The defaults reproduce the original fixed architecture, and the sub-module names
    (bert, fc1, relu, fc2) are unchanged so existing state dicts still load.
    """

    def __init__(self, num_labels=57, hidden_dim=1024,
                 pretrained_name='allenai/scibert_scivocab_uncased'):
        super().__init__()
        self.bert = BertModel.from_pretrained(pretrained_name)
        # Take the encoder width from its config instead of hard-coding 768.
        self.fc1 = nn.Linear(self.bert.config.hidden_size, hidden_dim)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_dim, num_labels)

    def forward(self, ids, mask, token_type_ids):
        """Return raw logits of shape (batch, num_labels); no sigmoid is applied here."""
        # With return_dict=False the model returns a tuple whose second element is
        # the pooled output; index it rather than unpacking so extra tuple entries
        # (if the config enables them) do not break the call.
        encoded = self.bert(ids, attention_mask=mask, token_type_ids=token_type_ids, return_dict=False)
        features = encoded[1]

        hidden = self.relu(self.fc1(features))
        return self.fc2(hidden)

In [None]:
# Build the classifier (this loads the pretrained encoder weights) and move it to DEVICE.
# As the last expression of the cell, model.to(DEVICE) is also echoed as cell output.
model = NeuralNet()
model.to(DEVICE)

In [None]:
def loss_fn(outputs, targets):
    """Mean binary cross-entropy between raw logits `outputs` and 0/1 float `targets`."""
    # Functional form of BCEWithLogitsLoss with its default settings
    # (no weights, reduction='mean').
    return torch.nn.functional.binary_cross_entropy_with_logits(outputs, targets)

# AdamW over every parameter of the model (encoder and head alike), with a small weight decay.
optimizer = torch.optim.AdamW(params =  model.parameters(), lr=LEARNING_RATE, weight_decay = 1e-6)

In [None]:
# Labelled datasets (test=False), so each item also carries a 'targets' vector.
# All texts are cleaned eagerly inside the constructor.
train_dataset = TheDataset(train_data, False)
val_dataset = TheDataset(val_data, False)

In [None]:
# Reshuffle the training samples every epoch; without shuffle=True the model sees the
# exact same batches in the exact same order on every pass.
train_loader = DataLoader(train_dataset, batch_size = 8, shuffle = True, num_workers = 4)
# Validation order does not affect the metric, so it stays sequential.
val_loader = DataLoader(val_dataset, batch_size = 8, num_workers = 4)

In [None]:
def train_fn(train_loader, model, criterion, optimizer):
    """Run one training epoch and return the mean batch loss.

    Parameters
    ----------
    train_loader : iterable of dict
        Batches with 'ids', 'mask', 'token_type_ids' and 'targets' tensors.
    model : torch.nn.Module
        Called as model(ids, mask, token_type_ids); must return logits.
    criterion : callable
        Loss function called as criterion(outputs, targets).
    optimizer : torch.optim.Optimizer
        Optimizer stepping the model's parameters.

    Returns
    -------
    float
        Mean of the per-batch loss values.
    """
    losses = []

    model.train()

    progress = tqdm(train_loader, total=len(train_loader))

    for data in progress:

        ids = data['ids'].to(DEVICE, dtype = torch.long)
        mask = data['mask'].to(DEVICE, dtype = torch.long)
        token_type_ids = data['token_type_ids'].to(DEVICE, dtype = torch.long)

        targets = data['targets'].to(DEVICE, dtype = torch.float)

        # Clear gradients before the backward pass so that anything left over from
        # earlier work on the model cannot leak into this step.
        optimizer.zero_grad()

        outputs = model(ids, mask, token_type_ids)

        # Use the loss that was passed in rather than reaching for a global.
        loss = criterion(outputs, targets)

        loss.backward()
        optimizer.step()

        losses.append(loss.item())

    return np.mean(losses)

In [None]:
def validation(val_loader, model):
    """Predict on a labelled loader and return (predictions, targets).

    Both values are lists with one entry per sample: predictions hold 0/1 ints
    obtained by thresholding the sigmoid of the logits at THRESHOLD, targets hold
    the ground-truth vectors as floats.
    """
    model.eval()

    predicted = []
    expected = []

    batches = tqdm(val_loader, total=len(val_loader))

    with torch.no_grad():

        for batch in batches:

            ids = batch['ids'].to(DEVICE, dtype = torch.long)
            mask = batch['mask'].to(DEVICE, dtype = torch.long)
            token_type_ids = batch['token_type_ids'].to(DEVICE, dtype = torch.long)
            targets = batch['targets'].to(DEVICE, dtype = torch.float)

            logits = model(ids, mask, token_type_ids)

            expected += targets.cpu().detach().numpy().tolist()

            probabilities = torch.sigmoid(logits).cpu().detach().numpy().tolist()

            # Binarise each probability row against the global threshold.
            predicted += [
                [1 if prob >= THRESHOLD else 0 for prob in row]
                for row in probabilities
            ]

    return predicted, expected

In [None]:
# Release cached, currently unused GPU memory held by PyTorch's allocator before training starts.
torch.cuda.empty_cache()

In [None]:
# model.load_state_dict(torch.load('/kaggle/working/checkpoint.pth')['state_dict'])

In [None]:
# Mean training loss of every epoch, in order.
train_losses = []

# Snapshot of the weights from the epoch with the lowest *training* loss so far.
best_dict = None
best_loss = np.inf

for ep in range(EPOCHS):

    print('='*5 + f" Epoch {ep+1} " + '='*5)

    tr_loss = train_fn(train_loader, model, loss_fn, optimizer)

    if tr_loss < best_loss:
        best_loss = tr_loss

        # model.state_dict() returns references to the live parameters, which keep
        # changing as training continues. Clone them (onto the CPU, to avoid
        # doubling GPU memory) so best_dict really is a frozen snapshot.
        best_dict = {
            key: value.detach().cpu().clone()
            for key, value in model.state_dict().items()
        }

        # The file is written immediately, so it captures the weights of this epoch.
        # NOTE(review): pickling the whole module under 'model' ties the file to this
        # class definition; 'state_dict' is the portable part.
        checkpoint = {'model': model, 'state_dict': model.state_dict()}
        torch.save(checkpoint, 'checkpoint.pth')

    train_losses.append(tr_loss)

    print(f"Epoch {ep + 1} - Train Loss {tr_loss:.4f}\n")

In [None]:
import sklearn.metrics as metrics

In [None]:
# Score the model as it stands after the last training epoch on the held-out split.
outputs, targets = validation(val_loader, model)

# Macro average: F1 is computed per label and then averaged with equal weight per label.
f1_score_macro = metrics.f1_score(targets, outputs, average='macro')

print(f"F1 Score (Macro) = {f1_score_macro}")

In [None]:
def get_preds(test_loader, model):
    """Predict 0/1 label vectors for every batch of an unlabelled loader.

    Parameters
    ----------
    test_loader : iterable of dict
        Batches with 'textID', 'ids', 'mask' and 'token_type_ids'.
    model : torch.nn.Module
        Called as model(ids, mask, token_type_ids); must return logits.

    Returns
    -------
    dict
        One entry per *batch*, in loader order. The key is the batch's collated
        'textID' value exactly as the loader produced it; the value is a list with
        one 0/1 vector per sample of that batch. Callers have to flatten this to
        get per-sample predictions.
    """
    model.eval()

    pred_dict = {}

    progress = tqdm(test_loader, total=len(test_loader))

    with torch.no_grad():

        for data in progress:

            textid = data['textID']

            ids = data['ids'].to(DEVICE, dtype = torch.long)
            mask = data['mask'].to(DEVICE, dtype = torch.long)
            token_type_ids = data['token_type_ids'].to(DEVICE, dtype = torch.long)

            outputs = model(ids, mask, token_type_ids)

            outlist = torch.sigmoid(outputs).cpu().detach().numpy().tolist()

            # Binarise each probability row against the global threshold.
            pred_dict[textid] = [
                [1 if term >= THRESHOLD else 0 for term in outl]
                for outl in outlist
            ]

    return pred_dict

In [None]:
# Load the unlabelled test set; TheDataset reads its 'Id', 'Title' and 'Abstract' columns.
testdf = pd.read_csv("/kaggle/input/kriti-dataset/test.csv")

In [None]:
# test = True: items carry no 'targets' entry.
test_dataset = TheDataset(testdf, test = True)
# No shuffling, so batches follow the row order of testdf.
test_loader  = DataLoader(test_dataset, batch_size = 8, num_workers = 4)

In [None]:
# One dict entry per batch: collated batch IDs -> list of 0/1 vectors for that batch.
pred_dict = get_preds(test_loader, model)

In [None]:
# One row per batch (not per sample): 'Id' holds the batch's collated IDs and
# 'OneHotVec' the matching list of prediction vectors.
pred_df = pd.DataFrame(list(pred_dict.items()), columns=['Id', 'OneHotVec'])

In [None]:
# Per-sample mapping {Id: one-hot prediction vector}, filled by the next cell.
final_pred_dict = {}

In [None]:
# Flatten the per-batch rows of pred_df into one entry per sample.
# Iterate over however many batches there actually are instead of a hard-coded
# count, which breaks (or silently drops rows) as soon as the test set or the
# batch size changes.
for batch_idx in range(len(pred_df)):

    batch_ids = pred_df['Id'][batch_idx]
    batch_vectors = pred_df['OneHotVec'][batch_idx]

    # Names chosen so the builtin `id` is not shadowed for the rest of the notebook.
    for sample_id, one_hot in zip(batch_ids, batch_vectors):

        # .item() turns the 0-dim tensor into a plain Python scalar usable as a key.
        final_pred_dict[sample_id.item()] = one_hot

In [None]:
# One row per test sample: its Id and the full 0/1 prediction vector.
final_pred_df = pd.DataFrame(list(final_pred_dict.items()), columns=['Id', 'OneHotVec'])

In [None]:
# Expand the prediction vector into one 0/1 column per label, named after NAMELIST[i].
# The lambda reads `i` immediately because apply() runs inside the same loop iteration.
for i, name in enumerate(NAMELIST):
    
    final_pred_df[name] = final_pred_df['OneHotVec'].apply(lambda x : x[i])
    
    

In [None]:
# The packed vector is redundant once the per-label columns exist.
# NOTE(review): rebinding final_pred_df makes this cell fail with a KeyError if it is re-run on its own.
final_pred_df = final_pred_df.drop(['OneHotVec'], axis = 1)

In [None]:
# Display the submission frame ('Id' plus one column per label) as the cell output.
final_pred_df

In [None]:
# Write the predictions to 'manas.csv' in the working directory, without the DataFrame index column.
final_pred_df.to_csv('manas.csv', index = False)