In [1]:
%matplotlib inline

import numpy as np
import pandas as pd

import torch 
import torch.autograd as autograd 
import torch.nn as nn 
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

from pathlib import Path

In [2]:
def download_dataset():
    ! wget https://archive.ics.uci.edu/ml/machine-learning-databases/00461/drugLib_raw.zip
    ! unzip drugLib_raw.zip
    ! mkdir -p data
    ! mv drugLibTest_raw.tsv drugLibTrain_raw.tsv data

In [3]:
download_dataset()

--2020-06-15 11:00:59--  https://archive.ics.uci.edu/ml/machine-learning-databases/00461/drugLib_raw.zip
Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.252
Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1133354 (1.1M) [application/x-httpd-php]
Saving to: ‘drugLib_raw.zip’


2020-06-15 11:01:00 (5.10 MB/s) - ‘drugLib_raw.zip’ saved [1133354/1133354]

Archive:  drugLib_raw.zip
  inflating: drugLibTest_raw.tsv     
  inflating: drugLibTrain_raw.tsv    


In [4]:
PATH = Path("data")

In [5]:
!ls data

drugLibTest_raw.tsv  plot.tok.gt9.5000    quote.tok.gt9.5000
drugLibTrain_raw.tsv [34mpmlb[m[m                 subjdata.README.1.0


# Load data in

In [6]:
path = PATH/"drugLibTrain_raw.tsv"
!head -3 $path

	urlDrugName	rating	effectiveness	sideEffects	condition	benefitsReview	sideEffectsReview	commentsReview
2202	enalapril	4	Highly Effective	Mild Side Effects	management of congestive heart failure	"slowed the progression of left ventricular dysfunction into overt heart failure 
alone or with other agents in the managment of hypertension 


Data Description
Here is a descrition of the data
* `urlDrugName` (categorical): name of drug 
* `condition` (categorical): name of condition 
* `benefitsReview`(text): patient on benefits 
* `sideEffectsReview` (text): patient on side effects 
* `commentsReview` (text): overall patient comment 
* `rating` (numerical): 10 star patient rating 
* `sideEffects` (categorical): 5 step side effect rating 
* `effectiveness` (categorical): 5 step effectiveness rating

Notes:
* `condition` should be treated as text
* `urlDrugName` is probably not that useful because they are too many drugs.

In [7]:
train_df = pd.read_csv(PATH/"drugLibTrain_raw.tsv", sep='\t')
train_df.shape

(3107, 9)

In [8]:
valid_df = pd.read_csv(PATH/"drugLibTest_raw.tsv", sep='\t')
valid_df.shape

(1036, 9)

In [9]:
train_df.head()

Unnamed: 0.1,Unnamed: 0,urlDrugName,rating,effectiveness,sideEffects,condition,benefitsReview,sideEffectsReview,commentsReview
0,2202,enalapril,4,Highly Effective,Mild Side Effects,management of congestive heart failure,slowed the progression of left ventricular dys...,"cough, hypotension , proteinuria, impotence , ...","monitor blood pressure , weight and asses for ..."
1,3117,ortho-tri-cyclen,1,Highly Effective,Severe Side Effects,birth prevention,Although this type of birth control has more c...,"Heavy Cycle, Cramps, Hot Flashes, Fatigue, Lon...","I Hate This Birth Control, I Would Not Suggest..."
2,1146,ponstel,10,Highly Effective,No Side Effects,menstrual cramps,I was used to having cramps so badly that they...,Heavier bleeding and clotting than normal.,I took 2 pills at the onset of my menstrual cr...
3,3947,prilosec,3,Marginally Effective,Mild Side Effects,acid reflux,The acid reflux went away for a few months aft...,"Constipation, dry mouth and some mild dizzines...",I was given Prilosec prescription at a dose of...
4,1951,lyrica,2,Marginally Effective,Severe Side Effects,fibromyalgia,I think that the Lyrica was starting to help w...,I felt extremely drugged and dopey. Could not...,See above


## Cleaning text

In [10]:
import unicodedata
import string
import re

def unicodeToAscii(s):
    """Turn a Unicode string to plain ASCII
    
    https://stackoverflow.com/a/518232/2809427
    """
    return ''.join(c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
    )

def normalizeString(s):
    """Lowercase, trim, and remove non-letter characters"""
    s = unicodeToAscii(s.lower().strip())
    s = re.sub(r"([.!?])", r" \1", s)
    s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
    return s

In [11]:
train_df.commentsReview[1]

'I Hate This Birth Control, I Would Not Suggest This To Anyone.'

In [12]:
normalizeString(train_df.commentsReview[1])

'i hate this birth control i would not suggest this to anyone .'

In [13]:
def clean_columns(col):
    train_df[col] = train_df[col].fillna("NA")
    valid_df[col] = valid_df[col].fillna("NA")
    train_df[col] = train_df[col].apply(normalizeString)
    valid_df[col] = valid_df[col].apply(normalizeString)

In [14]:
for col in ["commentsReview", "sideEffectsReview", "benefitsReview"]:
    clean_columns(col)

In [15]:
train_df.benefitsReview[127]

'the nuvaring has proven to be very effective for me . i have had a lot of problems with birth control throughout my life . having tried many if not all possible birth control treatments i believe this is the one that works the best for me . it is practical and very few side effects when compared to the oral contraceptives iuds or injections .'

In [16]:
def update_word_counts(col, word_counts):
    for i in range(train_df.shape[0]):
        line = col[i]
        for word in line.split(" "):
            word_counts[word] = word_counts.get(word, 0) + 1

In [17]:
word_counts = {}
update_word_counts(train_df.benefitsReview.values, word_counts)
update_word_counts(train_df.sideEffectsReview.values, word_counts)
update_word_counts(train_df.commentsReview.values, word_counts)

In [18]:
#word_counts

In [19]:
for word in list(word_counts):
    if word_counts[word] < 5:
        del word_counts[word]

In [20]:
vocab2index = {"<PAD>":0, "UNK":1}
words = ["<PAD>", "UNK"]
for word in word_counts:
    vocab2index[word] = len(words)
    words.append(word)

In [21]:
len(words)

3909

In [22]:
def encode_sentence(x, vocab2index, N=40, padding_start=False):
    x = x.split(" ")
    enc = np.zeros(N, dtype=np.int32)
    enc1 = np.array([vocab2index.get(w, vocab2index["UNK"]) for w in x])
    l = min(N, len(enc1))
    if padding_start:
        enc[:l] = enc1[:l]
    else:
        enc[N-l:] = enc1[:l]
    return enc, l

In [23]:
encode_sentence(train_df.commentsReview[0], vocab2index)

(array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0, 3499,  662,  642,
         747,   99,    1,   56, 3133,    5, 1328], dtype=int32), 10)

## Dataset

In [24]:
eff2index = {v: i for i,v in enumerate(train_df.effectiveness.unique())}
eff2index 

{'Highly Effective': 0,
 'Marginally Effective': 1,
 'Ineffective': 2,
 'Considerably Effective': 3,
 'Moderately Effective': 4}

In [25]:
sideEff2index = {v: i for i,v in enumerate(train_df.sideEffects.unique())}
sideEff2index

{'Mild Side Effects': 0,
 'Severe Side Effects': 1,
 'No Side Effects': 2,
 'Extremely Severe Side Effects': 3,
 'Moderate Side Effects': 4}

In [26]:
class DrugDataset(Dataset):
    def __init__(self, df):
        self.x1 = [encode_sentence(x, vocab2index) for x in df.commentsReview.values]
        self.y = df.rating.values
        
    def __len__(self):
        return len(self.y)
    
    def __getitem__(self, idx):
        x1, s = self.x1[idx]
        return x1, self.y[idx]

In [27]:
train_ds = DrugDataset(train_df)
val_ds = DrugDataset(valid_df)

In [28]:
train_ds[0]

(array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0, 3499,  662,  642,
         747,   99,    1,   56, 3133,    5, 1328], dtype=int32), 4)

In [29]:
val_ds[0]

(array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,  344,    3,  596,   56,   59,   31, 1414, 1201,
          43,  785,   85,    3, 1741,  456,   31], dtype=int32), 9)

In [30]:
batch_size = 1000
train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
val_dl = DataLoader(val_ds, batch_size=batch_size)

## Task 1:  (12 points)
Use an RNN model (GRU or LSTM) to predict `rating (numerical)` based on `commentsReview (text)` (Either regression or classification). <br>

Compute either accuracy or $R^2$. I am not interested in the quality of the model. There is very little data. 

# Task 2:  (3 points)
Expand previous model adding additional text and categorical features to predict `rating (numerical)`. You may need to modify the origional dataset.