In [None]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
import torch.nn as nn
import transformers

import tokenizers
from tqdm import tqdm

import random

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

from sklearn.metrics import accuracy_score, log_loss

from scipy.special import softmax

import warnings
warnings.filterwarnings('ignore')

In [2]:
import os 
os.getcwd()

'/home/thanish/Competition/Zindi/Tech4MentalHealth/Notebook'

In [3]:
config = {'train_path' : '../data/train_corrected.csv',
          'test_path' : '../data/test_corrected.csv',
          'train_batch_size' : 4,
          'valid_batch_size' : 8,
          'test_batch_size' : 64,
          'MAX_LEN' : 196,
          'EPOCH' : 3, 
          'XLNET_PATH_Azure': '/home/thanish/transformer_models/xlnet_base_cased'
          }

TOKENIZER = transformers.XLNetTokenizer.from_pretrained('xlnet-large-cased', lowercase = True)

In [4]:
train_DF = pd.read_csv(config['train_path'])
test_DF = pd.read_csv(config['test_path'])

train_DF.head()

Unnamed: 0,ID,text,label
0,SUAVK39Z,i feel that it was better i dream happy,Depression
1,9JDAGUV3,why do i get hallucinations,Drugs
2,419WR1LQ,i am stressed due to lack of financial support...,Depression
3,6UY7DX6Q,why is life important,Suicide
4,FYC0FTFB,how could i be helped to go through the depres...,Depression


In [5]:
# Convert the label to OHE
train_DF = pd.concat([train_DF[['ID', 'text']], pd.get_dummies(train_DF.label)], axis = 1)
train_DF

Unnamed: 0,ID,text,Alcohol,Depression,Drugs,Suicide
0,SUAVK39Z,i feel that it was better i dream happy,0,1,0,0
1,9JDAGUV3,why do i get hallucinations,0,0,1,0
2,419WR1LQ,i am stressed due to lack of financial support...,0,1,0,0
3,6UY7DX6Q,why is life important,0,0,0,1
4,FYC0FTFB,how could i be helped to go through the depres...,0,1,0,0
...,...,...,...,...,...,...
611,BOHSNXCN,what should i do to stop alcoholism,1,0,0,0
612,GVDXRQPY,how to become my oneself again,0,0,0,1
613,IO4JHIQS,how can someone stop it,1,0,0,0
614,1DS3P1XO,i feel unworthy,0,1,0,0


In [6]:
np.random.seed(100)
train_local, valid_local = train_test_split(train_DF,
                                            test_size = 0.2,
                                            random_state = 100)

train_local = train_local.reset_index(drop = True)
valid_local = valid_local.reset_index(drop = True)

print(train_local.shape, valid_local.shape)

(492, 6) (124, 6)


In [7]:
class form_input():
    
    def __init__(self, text_id, text, label, data_type = 'test'):
        self.data_type = data_type
        self.text_id = text_id
        self.text = text
        self.label = label
        self.max_len = config['MAX_LEN']
        self.tokenizer = TOKENIZER

        
    def __len__(self):
        return len(self.text)
    
    def __getitem__(self, item):
        inputs =  TOKENIZER.encode_plus(self.text[item])
        
        sub_id = self.text_id[item]
        ids = inputs['input_ids']
        tok_type_id = inputs['token_type_ids']
        att_mask = inputs['attention_mask']
        pad_len = self.max_len - len(ids)

        ids = ids + [0]*pad_len
        tok_type_id = tok_type_id + [0]*pad_len
        att_mask = att_mask + [0]*pad_len
        
        if self.data_type != 'test':
            label = self.label[item]
        else:
            label = 1
        
        return {'sub_id': sub_id,
                #'Actual_text': self.text[item],
                'ids': torch.tensor(ids, dtype = torch.long),
                'mask': torch.tensor(att_mask, dtype = torch.long),
                'token_type_ids': torch.tensor(tok_type_id, dtype = torch.long),
                'targets': torch.tensor(label, dtype = torch.long)}
    

In [8]:
lab_columns = ['Alcohol', 'Depression', 'Drugs', 'Suicide']

train_local_data = form_input(train_local.ID, train_local.text, train_local[lab_columns].values, 'train')
valid_local_data = form_input(valid_local.ID, valid_local.text, valid_local[lab_columns].values, 'train')
train_prod_data = form_input(train_DF.ID, train_DF.text, train_DF[lab_columns].values, 'train')
test_prod_data = form_input(test_DF.ID, test_DF.text, None, 'test')

train_local_data[10]

{'sub_id': 'BOHSNXCN',
 'ids': tensor([  113,   170,    17,   150,   112,    22,   829, 28793,     4,     3,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,

In [9]:
train_local_data_loader = DataLoader(train_local_data, 
                                     #shuffle=True,
                                     sampler = RandomSampler(train_local_data),
                                     batch_size=config['train_batch_size'])
valid_local_data_loader = DataLoader(valid_local_data,
                                     #shuffle=True,
                                     sampler = RandomSampler(valid_local_data),
                                     batch_size=config['valid_batch_size'])

train_prod_data_loader = DataLoader(train_prod_data, 
                                    #shuffle=True,
                                    sampler = RandomSampler(train_prod_data),
                                    batch_size=config['train_batch_size'])

test_prod_data_loader = DataLoader(test_prod_data,
                                   #shuffle=False,
                                   sampler = SequentialSampler(test_prod_data),
                                   batch_size=config['test_batch_size'])


In [10]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cuda'

In [37]:
class XLNETMultiLabelSequenceClassification(torch.nn.Module):
    def __init__(self, num_labels):
        super(XLNETMultiLabelSequenceClassification, self).__init__()
        self.freeze_bert = True
        self.num_labels = num_labels
        self.xlnet = transformers.XLNetModel.from_pretrained(config['XLNET_PATH_Azure'])
        self.drop = nn.Dropout(0.2)
        self.classifier = torch.nn.Linear(768, self.num_labels)
        
        freez_parm = ['classifier', 
                      '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11',
                      '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', '23']
        if self.freeze_bert:
            for n, param in self.xlnet.named_parameters():
                if not any(nd in n for nd in freez_parm):
                    param.requires_grad = False
                    
    def pool_hidden_state(self, last_hidden_state):
        "Pool the hidden output into a single mean vector"
        last_hidden_state = last_hidden_state[0]
        mean_last_hidden_state = torch.mean(last_hidden_state, 1)
        return mean_last_hidden_state
        
    def forward(self, input_ids, token_type_ids = None, attention_mask = None, labels = None):
        # Last layer
        last_hidden_state = self.xlnet(input_ids = input_ids, 
                                       token_type_ids = token_type_ids,
                                       attention_mask = attention_mask
                                       )
        # Pooled the outputs in a mean vector
        mean_last_hidden_state = self.pool_hidden_state(last_hidden_state)
        mean_last_hidden_state = self.drop(mean_last_hidden_state)
        logits = self.classifier(mean_last_hidden_state)
        
        if labels is not None:
            loss_fct = nn.BCEWithLogitsLoss()
            loss = loss_fct(logits.view(-1, self.num_labels), 
                            labels.view(-1, self.num_labels))
            return loss, logits
        else:
            return logits
    
        

In [38]:
def setting_seed(seed_no = 100):
    random.seed(seed_no)
    np.random.seed(seed_no)
    torch.manual_seed(seed_no)
    torch.cuda.manual_seed_all(seed_no)
    

In [39]:
# if torch.cuda.device_count()>1:
#     print("It has {} GPUs".format(torch.cuda.device_count()))
    
#     setting_seed(seed_no = 50)
#     model = XLNETMultiLabelSequenceClassification(num_labels = len(lab_columns))
#     model = nn.DataParallel(model)
#     model.to(device)


In [40]:
def params_2_tune(model):
    FULL_FINETUNING = True
    if FULL_FINETUNING:
        param_optimizer = list(model.named_parameters())
        no_decay = ['bias', 'gamma', 'beta']
        optimizer_grouped_parameters = [
            {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
             'weight_decay': 0.01},
            {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
             'weight_decay': 0.0}
        ]
    else:
        param_optimizer = list(model.named_parameters()) 
        optimizer_grouped_parameters = [{"params": [p for n, p in param_optimizer]}]
    
    return optimizer_grouped_parameters


In [41]:
def train_fn(data_loader, model, optimizer, scheduler, params):
    model.train()

    setting_seed(seed_no = seed)
    
    train_loss  = 0
    for index, dataset in tqdm(enumerate(data_loader), total = len(data_loader)):
        ids = dataset['ids'].to(device, dtype = torch.long)
        mask = dataset['mask'].to(device, dtype = torch.long)
        token_type_ids = dataset['token_type_ids'].to(device, dtype = torch.long)
        target = dataset['targets'].to(device, dtype = torch.float)
    
        output = model(input_ids = ids,
                       token_type_ids = token_type_ids,
                       attention_mask = mask,
                       labels = target
                      )
        
        step_loss = output[0]
        prediction = output[1]

        step_loss.sum().backward()
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()
        
        torch.nn.utils. clip_grad_norm(model.parameters(), 1.0)
        
        train_loss += step_loss
     
    print("Saving the model")
    torch.save(model, '../output/best_XLnet_model.bin')
    
    print("Avg Train loss" , (train_loss/len(data_loader)))

In [42]:
def eval_fn(data_loader, model):
    model.eval()
    
    eval_loss = 0
    actual_output = []
    predicted_output = []
    with torch.no_grad():
        for index, dataset in tqdm(enumerate(data_loader), total = len(data_loader)):
            ids = dataset['ids'].to(device)
            token_type_ids = dataset['token_type_ids'].to(device)
            mask = dataset['mask'].to(device)
            target = dataset['targets'].to(device, dtype = torch.float)
            
            output = model(input_ids = ids,
                       token_type_ids = token_type_ids,
                       attention_mask = mask,
                       labels = target
                      )
            
            step_loss = output[0]
            prediction = output[1]
            
            eval_loss += step_loss
            
            actual_output.extend(target.detach().cpu().numpy().tolist())
            predicted_output.extend(prediction.detach().cpu().numpy().tolist())
        
        print("Avg Eval loss" , (eval_loss/len(data_loader)))
        
        return actual_output, predicted_output
            

In [43]:
def training_engine(epoc, train_data):

    #seed = 50
    
    setting_seed(seed_no = seed)
    model = XLNETMultiLabelSequenceClassification(num_labels = len(lab_columns))
    model = nn.DataParallel(model)
    model.to(device)
    
    optimizer_grouped_parameters = params_2_tune(model)
    optimizer = torch.optim.Adam(optimizer_grouped_parameters, lr = 5e-5)
    
    EPOCHS = epoc
    total_steps = len(train_data) * EPOCHS
    
    # Set up the learning rate scheduler
    scheduler = transformers.get_linear_schedule_with_warmup(optimizer,
                                                             num_warmup_steps=0, # Default value
                                                             num_training_steps=total_steps)
    
    for epoch in range(EPOCHS):

        # Training
        train_fn(data_loader = train_data,
                 model = model,
                 optimizer = optimizer, 
                 scheduler = scheduler, 
                 params = optimizer_grouped_parameters)

        # Evaluation
        actual, predicted = eval_fn(data_loader = valid_local_data_loader,
                                    model = model)

        actual = np.array(actual)
        #predicted_prob = np.array(predicted)
        predicted_prob = predicted
        predicted_class = np.argmax(np.array(predicted), axis = 1)

    #    acc = accuracy_score(actual, predicted_class)
        log_ls = log_loss(actual, torch.tensor(predicted_prob).sigmoid())    

    #    print("Epoch {}/{} Eval Accuracy: {}, Logloss: {}".format(epoch, EPOCHS, acc, log_ls))
        print("Epoch {}/{} Logloss: {}".format(epoch, EPOCHS, log_ls))
    return model, actual, predicted_prob, predicted_class

In [None]:
seed = 50

model, actual, predicted_prob, predicted_class = training_engine(epoc = 5, train_data = train_local_data_loader)

actual, predicted_class, torch.tensor(predicted_prob).sigmoid()

In [45]:
model, actual, predicted_prob, predicted_class = training_engine(epoc = 2, train_data = train_prod_data_loader)

actual, predicted_class, torch.tensor(predicted_prob).sigmoid()

100%|██████████| 154/154 [01:15<00:00,  2.03it/s]


Saving the model
Avg Train loss tensor([0.4486, 0.4154], device='cuda:0', grad_fn=<DivBackward0>)


100%|██████████| 16/16 [00:04<00:00,  3.96it/s]
  0%|          | 0/154 [00:00<?, ?it/s]

Avg Eval loss tensor([0.3172, 0.2508], device='cuda:0')
Epoch 0/2 Logloss: 0.6489418936412661


100%|██████████| 154/154 [01:15<00:00,  2.03it/s]


Saving the model
Avg Train loss tensor([0.2186, 0.1923], device='cuda:0', grad_fn=<DivBackward0>)


100%|██████████| 16/16 [00:04<00:00,  3.96it/s]


Avg Eval loss tensor([0.1903, 0.1392], device='cuda:0')
Epoch 1/2 Logloss: 0.3571754060244383


(array([[1., 0., 0., 0.],
        [0., 1., 0., 0.],
        [0., 1., 0., 0.],
        [0., 1., 0., 0.],
        [0., 0., 1., 0.],
        [0., 1., 0., 0.],
        [0., 0., 0., 1.],
        [0., 1., 0., 0.],
        [0., 1., 0., 0.],
        [0., 0., 0., 1.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [0., 1., 0., 0.],
        [0., 1., 0., 0.],
        [0., 1., 0., 0.],
        [0., 1., 0., 0.],
        [0., 0., 0., 1.],
        [0., 1., 0., 0.],
        [0., 0., 1., 0.],
        [0., 0., 1., 0.],
        [0., 1., 0., 0.],
        [0., 1., 0., 0.],
        [0., 1., 0., 0.],
        [1., 0., 0., 0.],
        [0., 0., 1., 0.],
        [0., 1., 0., 0.],
        [0., 0., 1., 0.],
        [1., 0., 0., 0.],
        [0., 1., 0., 0.],
        [1., 0., 0., 0.],
        [0., 1., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [0., 1., 0., 0.],
        [0., 1., 0., 0.],
        [0., 0., 0., 1.],
        [0., 0., 1., 0.],
        [0., 1., 0., 0.],
        [1.,

# Test evaluation

In [46]:
# model = torch.load('../output/best_XLnet_model.bin')

In [47]:
model.eval()
actual_output = []
predicted_output = []

submission_ID = []
with torch.no_grad():
    for index, dataset in tqdm(enumerate(test_prod_data_loader), total = len(test_prod_data_loader)):
        sub_id = dataset['sub_id']
        ids = dataset['ids'].to(device)
        #token_type_ids = dataset['token_type_ids'].to(device)
        mask = dataset['mask'].to(device)

        output = model(input_ids = ids,
                       #token_type_ids = token_type_ids,
                       attention_mask = mask)
        
        submission_ID.extend(sub_id)
        predicted_output.extend(output.sigmoid().detach().cpu().numpy().tolist())
    predicted_output = np.array(predicted_output)
            
        #predicted_output.extend(output.sigmoid().detach().cpu().numpy().tolist())

100%|██████████| 5/5 [00:06<00:00,  1.24s/it]


In [48]:
final_output = pd.DataFrame(predicted_output)
final_output.columns = ['Alcohol', 'Depression', 'Drugs', 'Suicide']
final_output['ID'] = submission_ID

final_output = final_output[['ID', 'Depression', 'Alcohol', 'Suicide', 'Drugs']]
final_output

Unnamed: 0,ID,Depression,Alcohol,Suicide,Drugs
0,02V56KMO,0.984825,0.002539,0.022060,0.002483
1,03BMGTOK,0.998472,0.001058,0.002150,0.000452
2,03LZVFM6,0.997337,0.002758,0.003937,0.001869
3,0EPULUM5,0.997436,0.001118,0.003053,0.000780
4,0GM4C5GD,0.092169,0.531902,0.018045,0.040550
...,...,...,...,...,...
304,Z9A6ACLK,0.995849,0.002250,0.003045,0.001412
305,ZDUOIGKN,0.997849,0.000799,0.010419,0.000308
306,ZHQ60CCH,0.658384,0.020317,0.568046,0.014261
307,ZVIJMA4O,0.008752,0.651517,0.056192,0.344829


In [49]:
final_output.to_csv('../output/sub_45_XLnet.csv', index = False)
