### Import Packages

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
# Print the full path of every file found below the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in names):
        print(path)

In [None]:
import warnings 

import os
import wget
import time
import datetime
import random

import numpy as np
import pandas as pd

from collections import Counter

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset, random_split

from transformers import BertTokenizer, BertForSequenceClassification, AdamW, BertConfig, get_linear_schedule_with_warmup

from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import matthews_corrcoef, accuracy_score

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

### Download Dataset

In [None]:
# Load the raw training split from the Kaggle input directory.
train_csv_path = "/kaggle/input/kuc-hackathon-winter-2018/drugsComTrain_raw.csv"
df_train = pd.read_csv(train_csv_path)

print("The shape of the train set given is : ", df_train.shape)

# Last expression of the cell: displays the first rows of the frame.
df_train.head()

In [None]:
# Remove rows whose 'condition' value is missing; inplace=True mutates df_train itself
# rather than returning a new frame.
df_train.dropna(subset=['condition'], inplace=True)

In [None]:
# Discard the identifier and date columns; the visible cells never read them from df_train.
df_train = df_train.drop(columns=['uniqueID', 'date'])

In [None]:
# Notebook shell escape: extract the WordNet corpus archive into the NLTK corpora directory.
# NOTE(review): nothing in the visible cells uses WordNet — confirm this step is still needed.
!unzip /usr/share/nltk_data/corpora/wordnet.zip -d /usr/share/nltk_data/corpora/

In [None]:
import functools
import string

# Translation table that deletes every ASCII punctuation character; built once
# here instead of on every call.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


@functools.lru_cache(maxsize=1)
def _english_stop_words():
    """Return NLTK's English stop-word list as a frozenset, loaded only once."""
    # Imported here so this helper works regardless of which cell imported
    # nltk.corpus.stopwords, and in which order the cells were run.
    from nltk.corpus import stopwords

    return frozenset(stopwords.words('english'))


def clean_review(review):
    """Normalise one review string before it is handed to the tokenizer.

    The text is lower-cased, stripped of ASCII punctuation, split into word
    tokens with ``nltk.word_tokenize`` and filtered against NLTK's English
    stop-word list.

    Args:
        review: The raw review text (a ``str``).

    Returns:
        The remaining tokens joined by single spaces.
    """
    # Function-scope import: no visible cell performs ``import nltk``.
    import nltk

    lowered = review.lower()
    without_punctuation = lowered.translate(_PUNCTUATION_TABLE)
    tokens = nltk.word_tokenize(without_punctuation)

    # Look the stop words up in a cached set rather than re-reading the word
    # list from the corpus for every single token.
    stop_words = _english_stop_words()
    kept = [word for word in tokens if word not in stop_words]

    return ' '.join(kept)

In [None]:
from nltk.corpus import stopwords
# English stop words held in a set for constant-time membership tests.
stop_words = {*stopwords.words('english')}

In [None]:
# Overwrite each raw review with its cleaned form.
df_train['review'] = df_train['review'].apply(clean_review)

In [None]:
print("Retrieving Features Dataset")

# All features come from df_train, the frame that was loaded and cleaned in
# the cells above (no frame named `df` exists in this notebook).
sentences = df_train.review.values
labels = df_train['rating']
drug_name = df_train.drugName.values
condition = df_train.condition.values

### Label Encoding

In [None]:
# Binarise the ratings: 5.0 and above becomes class 1, everything else class 0.
is_positive = labels >= 5.0
labels = np.asarray(is_positive.astype(np.int64))

print("Encoding Labels")

### Bert Tokenizer

In [None]:
print('Downloading BERT tokenizer...')

# Tokenizer for the uncased BERT base checkpoint; input text is lower-cased.
tokenizer = BertTokenizer.from_pretrained(
    'bert-base-uncased',
    do_lower_case=True,
)

### Tokenization

In [None]:
warnings.filterwarnings("ignore")

# Per-review tensors, each of shape (1, 64); concatenated after the loop.
input_ids = []
attention_mask = []

for s in sentences:
    # Encode one review to exactly 64 token ids: special tokens are added,
    # longer texts are truncated and shorter ones padded.
    # padding='max_length' is the current spelling of the deprecated
    # pad_to_max_length=True flag.
    input_encoded = tokenizer.encode_plus(
        s,
        add_special_tokens=True,
        max_length=64,
        truncation=True,
        padding='max_length',
        return_attention_mask=True,
        return_tensors='pt',
    )

    input_ids.append(input_encoded['input_ids'])
    attention_mask.append(input_encoded['attention_mask'])

# Stack the per-review tensors along the batch axis.
input_ids = torch.cat(input_ids, dim=0)
attention_mask = torch.cat(attention_mask, dim=0)
labels = torch.tensor(labels)

print('Tokenization Done')

### Dataset Split

In [None]:
# Bundle ids, masks and labels so that one index yields one complete example.
dfs = TensorDataset(input_ids, attention_mask, labels)

# 90 % of the examples go to training, the remainder to validation.
size_train = int(0.9 * len(dfs))
size_val = len(dfs) - size_train

# Give the split its own seeded generator so the same examples land in each
# part on every run; the global seeds are only set later, in the training cell.
split_generator = torch.Generator().manual_seed(42)
train_data, val_data = random_split(
    dfs, [size_train, size_val], generator=split_generator
)

print("{:,} is the training dataset size".format(size_train))
print("{:,} is the validation dataset size".format(size_val))

### Batch Sampling

In [None]:
# Mini-batch size shared by both loaders (the training loop also reads it as
# its progress-logging interval).
bs = 100

# Training batches are drawn in random order.
train_dl = DataLoader(train_data, batch_size=bs, sampler=RandomSampler(train_data))

# Validation batches are read in dataset order.
valid_dl = DataLoader(val_data, batch_size=bs, sampler=SequentialSampler(val_data))

print("Batch Sampling Done")

### BERT Model

In [None]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
dev = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Pre-trained uncased BERT base with a two-class classification head;
# attention maps and hidden states are not returned.
mod = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=2,
    output_attentions=False,
    output_hidden_states=False,
)

# Move the weights to the chosen device. As the last expression of the cell
# this also displays the model.
mod.to(dev)

### Optimizer

In [None]:
# AdamW over every model parameter: learning rate 3e-5, epsilon 1e-8.
# NOTE(review): AdamW here is the class imported from `transformers` at the
# top of the notebook — confirm it is still available in the installed version.
opt = AdamW(mod.parameters(), lr=3e-5, eps=1e-8)

print("Optimizer Initialized")

### Scheduler

In [None]:
# Number of passes over the training set.
epoch = 3
# The training loop steps the scheduler once per batch, so the schedule spans
# (batches per epoch) x (number of epochs) steps.
ts = epoch * len(train_dl)

# Linear learning-rate schedule over `ts` steps with no warm-up phase.
sch = get_linear_schedule_with_warmup(opt, num_warmup_steps=0, num_training_steps=ts)

print("Scheduler Initialized")

### Accuracy

In [None]:
def acc(preds, labels):
    """Return the fraction of rows in ``preds`` whose arg-max equals the label.

    Args:
        preds: Array-like of per-class scores; the class axis is the last one.
        labels: Array-like of integer class ids, one per row of ``preds``.
            Any shape holding that many values is accepted (for example
            ``(n,)`` or ``(n, 1)``) because it is flattened before comparing.

    Returns:
        The share of correct predictions, between 0 and 1.
    """
    preds_flat = np.argmax(preds, axis=-1).flatten()
    # Flatten the labels too: comparing an (n,) array with an (n, 1) array
    # would broadcast to (n, n) and over-count the matches.
    labels_flat = np.asarray(labels).flatten()

    return np.sum(preds_flat == labels_flat) / len(preds_flat)


print("Accuracy Function Defined")

### Time

In [None]:
def t(s):
    """Format a duration given in seconds as a clock-style string.

    Args:
        s: Elapsed time in seconds; rounded to a whole number of seconds.

    Returns:
        ``str()`` of the matching ``datetime.timedelta``, e.g. ``'0:01:05'``.
    """
    whole_seconds = int(round(s))
    elapsed = datetime.timedelta(seconds=whole_seconds)
    return str(elapsed)


print("Time Function Defined")

### Training and Validation

In [None]:
seed = 42

# Seed the Python, NumPy and PyTorch (CPU and all CUDA devices) generators.
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)

# One dict of summary metrics per epoch; converted to a DataFrame afterwards.
stats = []

tot_t0 = time.time()

for i in range(0, epoch):

    print("\n")
    print("Training epoch {} / {}".format(i + 1, epoch))
    print("Training.......................")
    print("\n")

    t0 = time.time()

    tot_train_loss = 0

    # Training mode (enables dropout).
    mod.train()

    for s, b in enumerate(train_dl):

        # Progress line every `bs` batches (the batch size doubles as the
        # logging interval), skipping the very first batch.
        if s % bs == 0 and not s == 0:

            timef = t(time.time() - t0)

            print("Batch {} of {} has elasped in {}".format(s, len(train_dl), timef))

        # Each batch is (input ids, attention mask, labels); move it to the
        # same device as the model.
        input_ids_d = b[0].to(dev)
        attention_mask_d = b[1].to(dev)
        labels_d = b[2].to(dev)

        # Clear gradients left over from the previous step.
        mod.zero_grad()

        # With labels supplied and return_dict=False the model returns
        # (loss, logits).
        cost, logits = mod(
            input_ids=input_ids_d,
            attention_mask=attention_mask_d,
            labels=labels_d,
            token_type_ids=None,
            return_dict=False,
        )

        tot_train_loss += cost.item()

        cost.backward()

        # Clip the gradient norm to 1.0 before the parameter update.
        torch.nn.utils.clip_grad_norm_(mod.parameters(), 1.0)

        opt.step()

        # The learning-rate schedule advances once per batch.
        sch.step()

    avg_tloss = tot_train_loss / len(train_dl)

    train_time = t(time.time() - t0)

    print("\n")
    print("Average Training Loss is {}".format(avg_tloss))
    print("Training Time per epoch is {}".format(train_time))

    print("\n")
    print("Validating.......................")

    t0 = time.time()

    # Evaluation mode (disables dropout).
    mod.eval()

    tot_eval_loss = 0
    # Running counts for an exact validation accuracy, independent of how the
    # examples happen to be split into batches.
    val_correct = 0
    val_seen = 0

    for b in valid_dl:

        input_ids_d = b[0].to(dev)
        attention_mask_d = b[1].to(dev)
        labels_d = b[2].to(dev)

        # No gradients are needed for evaluation.
        with torch.no_grad():

            cost, logits = mod(
                input_ids=input_ids_d,
                attention_mask=attention_mask_d,
                labels=labels_d,
                token_type_ids=None,
                return_dict=False,
            )

        tot_eval_loss += cost.item()

        # Predicted class = index of the largest logit in each row.
        val_correct += (logits.argmax(dim=-1) == labels_d).sum().item()
        val_seen += labels_d.size(0)

    avg_vloss = tot_eval_loss / len(valid_dl)
    # Percentage, on the same 0-100 scale as test_acc. After the loop this
    # holds the value of the last epoch, which the accuracy bar plot reads.
    val_acc = 100 * val_correct / val_seen
    valid_time = t(time.time() - t0)

    print("\n")
    print("Average Validation Loss is {}".format(avg_vloss))
    print("Validation Accuracy is {}%".format(val_acc))
    print("Validation Time per epoch is {}".format(valid_time))

    stats.append({
        'epoch': i + 1,
        'Training Loss': avg_tloss,
        'Training Time': train_time,
        'Validation Loss': avg_vloss,
        'Validation Accuracy': val_acc,
        'Validation Time': valid_time,
    })

print("\n")
print("Completed!!!")
print("Total Time Taken for Training and Validation is {:}".format(t(time.time() - tot_t0)))

### Save Model Weights

In [None]:
# Persist only the learned parameters (the state dict), not the model object.
# NOTE(review): this path looks like a Google Drive mount while the training
# data is read from /kaggle/input — confirm the directory exists at runtime.
weights_path = "/content/drive/MyDrive/BERT Models/BERT_Weights.pt"
torch.save(mod.state_dict(), weights_path)
print("Model Saved")

### Statistics

In [None]:
# One row per epoch, indexed by the 1-based epoch number recorded in `stats`.
statistics = pd.DataFrame(stats).set_index('epoch')

# Last expression of the cell: renders the table.
statistics

### Test Data

In [None]:
# Column names assigned to the test file. header=0 tells pandas that the first
# row of the file is a header, which these names then replace. The first
# column is given the name None.
test_columns = [None, "drugName", "condition", "review", "rating", "date", "usefulCount"]
df_test = pd.read_csv(
    "./drugsComTest_raw.tsv",
    delimiter="\t",
    header=0,
    names=test_columns,
)
df_test

In [None]:
warnings.filterwarnings("ignore")

print("Retrieving Features Dataset\n")
# Run the test reviews through the same clean_review step that is applied to
# the training reviews, so both sets reach the tokenizer in the same form.
sentences = df_test.review.apply(clean_review).values
labels = df_test['rating']
drug_name = df_test.drugName.values
condition = df_test.condition.values

# Same binarisation rule as for the training labels: rating >= 5.0 is class 1.
labels = labels.apply(lambda x: 1 if x >= 5.0 else 0)
labels = np.asarray(labels)

print("Encoding Labels\n")

# Per-review tensors, each of shape (1, 64); concatenated after the loop.
input_ids = []
attention_mask = []

for s in sentences:
    # Same encoding settings as for the training data. padding='max_length'
    # is the current spelling of the deprecated pad_to_max_length=True flag.
    input_encoded = tokenizer.encode_plus(
        s,
        add_special_tokens=True,
        max_length=64,
        truncation=True,
        padding='max_length',
        return_attention_mask=True,
        return_tensors='pt',
    )

    input_ids.append(input_encoded['input_ids'])
    attention_mask.append(input_encoded['attention_mask'])

input_ids = torch.cat(input_ids, dim=0)
attention_mask = torch.cat(attention_mask, dim=0)
labels = torch.tensor(labels)

bs = 100

# Sequential loader: predictions come back in the row order of df_test.
pred_data = TensorDataset(input_ids, attention_mask, labels)
pred_sampler = SequentialSampler(pred_data)
pred_dl = DataLoader(pred_data, sampler=pred_sampler, batch_size=bs)

print('Test Data prepared')


### Prediction

In [None]:
# Running counts for an exact test accuracy. Averaging per-batch accuracies
# would give a final, smaller batch the same weight as a full one.
test_correct = 0
test_seen = 0

# Evaluation mode (disables dropout).
mod.eval()

# Per-batch logits and true labels, kept for the MCC cells that follow.
preds, tl = [], []
for b in pred_dl:

    # Each batch is (input ids, attention mask, labels); move it to the
    # model's device.
    input_idsx, attention_maskx, labelsx = (tensor.to(dev) for tensor in b)

    with torch.no_grad():

        outs = mod(
            input_ids=input_idsx,
            attention_mask=attention_maskx,
            token_type_ids=None,
        )

    # No labels were passed, so the first output element holds the logits.
    logits = outs[0].detach().cpu().numpy()
    labels = labelsx.to('cpu').numpy()

    # Predicted class = index of the largest logit in each row.
    test_correct += int(np.sum(np.argmax(logits, axis=-1) == labels))
    test_seen += len(labels)

    preds.append(logits)
    tl.append(labels)

# Expressed as a percentage.
test_acc = (test_correct / test_seen) * 100

print('Prediction Done!')
print('Test Accuracy is {}%'.format(test_acc))

### Performance Visualization

### Matthews Correlation Coefficient

In [None]:
# One Matthews correlation coefficient per prediction batch: the true labels
# of the batch against the arg-max of its logits.
mat = [
    matthews_corrcoef(batch_labels, np.argmax(batch_logits, axis=1).flatten())
    for batch_labels, batch_logits in zip(tl, preds)
]

print("Matthews's Correlation Coefficients Obtained")

### MCC Graph

In [None]:
# Plot only the first 50 per-batch scores.
mat_50 = mat[:50]
batch_positions = list(range(len(mat_50)))

fig = sns.barplot(x=batch_positions, y=mat_50)

plt.title('MCC Graph')
plt.xlabel('#Batch')
plt.ylabel('MCC Score')
# Vertical tick labels.
plt.xticks(rotation=90)

plt.show()

### Final MCC Score

In [None]:
# Stack the per-batch arrays into single arrays covering the whole test set.
labelm = np.concatenate(tl, axis=0)
predx = np.concatenate(preds, axis=0)

# Predicted class = index of the largest logit in each row.
predm = predx.argmax(axis=1).flatten()

final_mcc = matthews_corrcoef(labelm, predm)
print('The Final MCC Score is {}'.format(final_mcc))

### Line Graph for Loss

In [None]:
# Dark grid theme with enlarged fonts, on a wide figure.
sns.set(style='darkgrid', font_scale=1.5)
plt.rcParams['figure.figsize'] = (12, 6)

# Per-epoch training and validation loss, plotted against the frame's index.
plt.plot(statistics['Training Loss'], 'b-o', label='Training Loss')
plt.plot(statistics['Validation Loss'], 'g-o', label='Validation Loss')

plt.title('Performance Visualization - Loss')
plt.xlabel('#Epochs')
plt.ylabel('Loss')
# The index already holds the 1-based epoch numbers stored during training,
# so it is used as the tick labels unchanged; adding 1 would label the first
# epoch as 2.
plt.xticks(statistics.index)
plt.legend()

plt.show()

### Bar Plot for Accuracy

In [None]:
# Two-row frame: one bar per accuracy figure.
# NOTE(review): val_acc is presumably on the same 0-100 scale as test_acc —
# confirm where it is computed.
db = pd.DataFrame({
    'Accuracy': ['Validation Accuracy', 'Test Accuracy'],
    'Percentage': [val_acc, test_acc],
})

sns.barplot(data=db, x='Accuracy', y='Percentage', hue='Accuracy', palette="husl", dodge=False)
plt.title('Performance Visualization - Accuracy')
# Legend placed below the axes, centred.
plt.legend(loc='upper center', bbox_to_anchor=(0.5, -0.15), shadow=True, ncol=3)
plt.show()

### Multiple Pie Charts for Conditions Distributions per Drug

In [None]:
print("Drug <-- Conditions")

# NOTE(review): `drugs` is expected to be an iterable of drug names defined in
# another cell — confirm it exists before running this one.
for d in drugs:
    # Test rows for this drug, then how often each condition occurs in them.
    rows_for_drug = df_test[df_test["drugName"] == d]
    condition_counts = rows_for_drug["condition"].value_counts()
    df1 = condition_counts.rename_axis("condition").reset_index(name="count")

    # One pie per drug, each slice labelled with its condition and share.
    plt.pie(x=df1["count"], labels=df1["condition"], autopct="%1.1f%%")
    plt.title(d)
    plt.show()

### Multiple Heatmaps for Drug Recommendation based on Conditions

In [None]:
print("Drug <-- Conditions <-- Reccomend (Yes / No)")

# NOTE(review): `drugs` is expected to be an iterable of drug names defined in
# another cell — confirm it exists before running this one.
for d in drugs:
    df2 = df_test.loc[df_test["drugName"] == d]
    conditions = df2["condition"].unique()

    # Per-condition counts of recommending / non-recommending reviews.
    Yes = []
    No = []

    for c in conditions:
        ratings = df2.loc[df2["condition"] == c, "rating"]
        # df_test['rating'] still holds the raw ratings, so apply the same
        # rule as for the labels: a rating of 5.0 or more counts as
        # "recommend", anything else as "do not recommend".
        recommended = ratings >= 5.0
        Yes.append(int(recommended.sum()))
        No.append(int((~recommended).sum()))

    classes = ['Reccomend', 'Do Not Reccomend']
    # Rows = conditions, columns = (recommend, do not recommend).
    vals = np.column_stack((Yes, No))

    fig, ax = plt.subplots()
    ax.imshow(vals, cmap="Dark2")
    ax.grid(False)

    ax.set_xticks(np.arange(len(classes)))
    ax.set_yticks(np.arange(len(conditions)))

    ax.set_xticklabels(classes)
    ax.set_yticklabels(conditions)

    plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
             rotation_mode="anchor")

    # Write the count into the centre of every cell.
    for i in range(len(conditions)):
        for j in range(len(classes)):
            ax.text(j, i,
                    vals[i, j], ha="center",
                    va="center", color="w",
                    fontweight="bold", fontsize=15)

    ax.set_title(d)
    fig.tight_layout()
    plt.show()