In [66]:
%matplotlib inline

import numpy as np
import pandas as pd

import torch 
import torch.autograd as autograd 
import torch.nn as nn 
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

from pathlib import Path
from sklearn import metrics

## Data

In [5]:
def download_dataset():
    ! wget https://archive.ics.uci.edu/ml/machine-learning-databases/00462/drugsCom_raw.zip
    ! unzip drugsCom_raw.zip
    ! mkdir -p data
    ! mv drugsComTest_raw.tsv drugsComTrain_raw.tsv data

In [6]:
#download_dataset()

--2019-11-13 09:30:39--  https://archive.ics.uci.edu/ml/machine-learning-databases/00462/drugsCom_raw.zip
Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.252
Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 42989872 (41M) [application/x-httpd-php]
Saving to: ‘drugsCom_raw.zip’


2019-11-13 09:30:44 (8.51 MB/s) - ‘drugsCom_raw.zip’ saved [42989872/42989872]

Archive:  drugsCom_raw.zip
  inflating: drugsComTest_raw.tsv    
  inflating: drugsComTrain_raw.tsv   


In [7]:
! ls data

drugsComTest_raw.tsv  drugsComTrain_raw.tsv


In [15]:
#! head data/drugsComTrain_raw.tsv

In [11]:
PATH = Path("data")

Variable descriptions
1. drugName (categorical): name of drug
2. condition (categorical): name of condition
3. review (text): patient review
4. rating (numerical): 10 star patient rating
5. date (date): date of review entry
6. usefulCount (numerical): number of users who found review useful

In [12]:
train_df = pd.read_csv(PATH/"drugsComTrain_raw.tsv", sep='\t')
train_df.shape

(161297, 7)

In [13]:
valid_df = pd.read_csv(PATH/"drugsComTest_raw.tsv", sep='\t')
valid_df.shape

(53766, 7)

In [14]:
train_df.head()

Unnamed: 0.1,Unnamed: 0,drugName,condition,review,rating,date,usefulCount
0,206461,Valsartan,Left Ventricular Dysfunction,"""It has no side effect, I take it in combinati...",9.0,"May 20, 2012",27
1,95260,Guanfacine,ADHD,"""My son is halfway through his fourth week of ...",8.0,"April 27, 2010",192
2,92703,Lybrel,Birth Control,"""I used to take another oral contraceptive, wh...",5.0,"December 14, 2009",17
3,138000,Ortho Evra,Birth Control,"""This is my first time using any form of birth...",8.0,"November 3, 2015",10
4,35696,Buprenorphine / naloxone,Opiate Dependence,"""Suboxone has completely turned my life around...",9.0,"November 27, 2016",37


## Pre-processing text

In [16]:
import unicodedata
import string
import re

def unicodeToAscii(s):
    """Turn a Unicode string to plain ASCII
    
    https://stackoverflow.com/a/518232/2809427
    """
    return ''.join(c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
    )

def normalizeString(s):
    """Lowercase, trim, and remove non-letter characters"""
    s = unicodeToAscii(s.lower().strip())
    s = re.sub(r"([.!?])", r" \1", s)
    s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
    return s

In [17]:
train_df.review[1]

'"My son is halfway through his fourth week of Intuniv. We became concerned when he began this last week, when he started taking the highest dose he will be on. For two days, he could hardly get out of bed, was very cranky, and slept for nearly 8 hours on a drive home from school vacation (very unusual for him.) I called his doctor on Monday morning and she said to stick it out a few days. See how he did at school, and with getting up in the morning. The last two days have been problem free. He is MUCH more agreeable than ever. He is less emotional (a good thing), less cranky. He is remembering all the things he should. Overall his behavior is better. \r\nWe have tried many different medications and so far this is the most effective."'

In [18]:
normalizeString(train_df.review[1])

' my son is halfway through his fourth week of intuniv . we became concerned when he began this last week when he started taking the highest dose he will be on . for two days he could hardly get out of bed was very cranky and slept for nearly hours on a drive home from school vacation very unusual for him . i called his doctor on monday morning and she said to stick it out a few days . see how he did at school and with getting up in the morning . the last two days have been problem free . he is much more agreeable than ever . he is less emotional a good thing less cranky . he is remembering all the things he should . overall his behavior is better . we have tried many different medications and so far this is the most effective . '

In [31]:
train_df.review = train_df.review.apply(normalizeString)

In [33]:
train_df.review[0]

' it has no side effect i take it in combination of bystolic mg and fish oil '

In [34]:
def compute_word_counts(col):
    word_counts = {}
    for i in range(len(col)):
        line = col[i]
        for word in line.split(" "):
            word_counts[word] = word_counts.get(word, 0) + 1
    return word_counts

In [35]:
word_counts = compute_word_counts(train_df.review)
len(word_counts.keys())

47497

In [36]:
for word in list(word_counts):
    if word_counts[word] < 5:
        del word_counts[word]

In [37]:
len(word_counts.keys())

18717

In [38]:
vocab2index = {"<PAD>":0, "UNK":1}
words = ["<PAD>", "UNK"]
for word in word_counts:
    vocab2index[word] = len(words)
    words.append(word)

In [39]:
len(words)

18719

In [40]:
## counting how long are reviews
review_lens = [len(rev.split(" ")) for rev in train_df.review.values]

In [42]:
np.quantile(review_lens, [0, 0.5, 0.95, 0.99])

array([  2.,  93., 159., 168.])

## Dataset

In [45]:
def encode_sentence(x, vocab2index, N=160, padding_start=False):
    x = x.split(" ")
    enc = np.zeros(N, dtype=np.int32)
    enc1 = np.array([vocab2index.get(w, vocab2index["UNK"]) for w in x])
    l = min(N, len(enc1))
    if padding_start:
        enc[:l] = enc1[:l]
    else:
        enc[N-l:] = enc1[:l]
    return enc

In [46]:
encode_sentence(train_df.review[0], vocab2index)

array([ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  2,  3,  4,  5,  6,  7,  8,  9,  3, 10, 11,
       12, 13, 14, 15, 16, 17,  2], dtype=int32)

In [51]:
class DrugDataset(Dataset):
    def __init__(self, df):
        self.x1 = [encode_sentence(x, vocab2index) for x in df.review.values]
        self.y1 = df.rating.values
        self.y2 = df.usefulCount.values
        
    def __len__(self):
        return len(self.y1)
    
    def __getitem__(self, idx):
        x1 = self.x1[idx]
        return x1, self.y1[idx], self.y2[idx]

In [52]:
train_ds = DrugDataset(train_df)
val_ds = DrugDataset(valid_df)

In [53]:
train_ds[0]

(array([ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  2,  3,  4,  5,  6,  7,  8,  9,  3, 10, 11,
        12, 13, 14, 15, 16, 17,  2], dtype=int32), 9.0, 27)

In [58]:
len(val_ds)

53766

In [59]:
batch_size = 1000
train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
val_dl = DataLoader(val_ds, batch_size=len(val_ds))

## Model

In [56]:
vocab_size = len(words)

In [57]:
class ReviewModel(nn.Module):
    def __init__(self, vocab_size=vocab_size, emb_size=50, hidden_size=100):
        super(ReviewModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, emb_size, padding_idx=0)
        self.gru = nn.GRU(emb_size, hidden_size, batch_first=True)
        self.linear1 = nn.Linear(hidden_size, 1)
        self.linear2 = nn.Linear(hidden_size, 1)
        self.dropout = nn.Dropout(0.3)
        
    def forward(self, x):
        x = self.embedding(x)
        x = self.dropout(x)
        _, ht = self.gru(x)
        return self.linear1(ht[-1]), self.linear2(ht[-1])

In [61]:
model = ReviewModel()

## Training

In [68]:
# assumes one batch with all the data
def val_metrics(model, valid_dl):
    model.eval()
    for x, y1, y2 in valid_dl:
        batch = y1.shape[0]
        y1 = y1.float().unsqueeze(-1)
        y2 = y2.float().unsqueeze(-1)
        y1_hat, y2_hat = model(x.long())
        loss = F.mse_loss(y1_hat, y1) + F.mse_loss(y2_hat, y2)
    
    r2_1 = metrics.r2_score(y1.numpy(), y1_hat.detach().numpy())
    r2_2 = metrics.r2_score(y2.numpy(), y2_hat.detach().numpy())
    return loss.item(), r2_1, r2_2

In [67]:
val_metrics(model, val_dl)

(tensor(2155.1726, grad_fn=<AddBackward0>),
 -4.665369780144622,
 -0.6003902591869441)

In [72]:
def train(model, lr = 0.01, epochs = 30):
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay= 1e-5)
    for i in range(epochs):
        sum_loss = 0
        total = 0
        for x, y1, y2 in train_dl:
            model.train()
            batch = y1.shape[0]
            y1 = y1.float().unsqueeze(-1)
            y2 = y2.float().unsqueeze(-1)
            y1_hat, y2_hat = model(x.long())
            loss = F.mse_loss(y1_hat, y1) + F.mse_loss(y2_hat, y2)
        
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            sum_loss += batch * loss.item()
            total += batch
            
            val_loss, val_r2_1,  val_r2_2 = val_metrics(model, val_dl)  # val_dl
        
        print("\tTrain loss: {:.3f} \t Valid loss: {:.3f} \t Valid R2:  {:.3f} {:.3f}".format(
            sum_loss/total, val_loss, val_r2_1, val_r2_2))


In [None]:
model = ReviewModel()
train(model, lr = 0.01, epochs=10)