## Predicting Drug to Drug Interactions

by Vikram Reddy, Bhuvana Bellala, Sameer Bajaj, and Nic Mon

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
#from sklearn.model_selection import cross_val_score
from sklearn.model_selection  import GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
import gensim
import spacy
from spacy.symbols import nsubj, VERB
from spacy.symbols import det

nlp = spacy.load('en')



In [2]:
train_data_f = "train_binary_offsets.csv"
test_data_f = "test_binary_offsets.csv"
df_train = pd.read_csv(train_data_f, low_memory=False)
df_test = pd.read_csv(test_data_f, low_memory=False)

df_train = df_train[df_train['drug1'] != df_train['drug2']]
df_test = df_test[df_test['drug1'] != df_test['drug2']]

In [3]:
df_train.head()

Unnamed: 0,sentenceid,sentence_text,drug_pair,ddi_label,drug1,drug2,drug1offset,drug2offset,drugsinsent
0,DDI-DrugBank.d519.s3,Laboratory Tests Response to Plenaxis should b...,DDI-DrugBank.d519.s3.p0,0,Plenaxis,testosterone,29-36,83-94,"Plenaxis,testosterone,"
1,DDI-DrugBank.d297.s1,Population pharmacokinetic analyses revealed t...,DDI-DrugBank.d297.s1.p0,0,MTX,NSAIDs,50-52,55-60,"MTX,NSAIDs,corticosteroids,TNF blocking agents..."
2,DDI-DrugBank.d297.s1,Population pharmacokinetic analyses revealed t...,DDI-DrugBank.d297.s1.p1,0,MTX,corticosteroids,50-52,63-77,"MTX,NSAIDs,corticosteroids,TNF blocking agents..."
3,DDI-DrugBank.d297.s1,Population pharmacokinetic analyses revealed t...,DDI-DrugBank.d297.s1.p2,0,MTX,TNF blocking agents,50-52,84-102,"MTX,NSAIDs,corticosteroids,TNF blocking agents..."
4,DDI-DrugBank.d297.s1,Population pharmacokinetic analyses revealed t...,DDI-DrugBank.d297.s1.p3,0,MTX,abatacept,50-52,122-130,"MTX,NSAIDs,corticosteroids,TNF blocking agents..."


####Let's learn more about our dataset

In [4]:
print("Length of train dataset: ", df_train.shape)
print("Number of drug paris with no affect on each other: ", df_train.loc[df_train['ddi_label'] == 0].shape)
print("Number of drug pairs that do affect each other: ", df_train.loc[df_train['ddi_label'] == 1].shape)

print("\nLength of test dataset: ", df_test.shape)
print("Number of drug paris with no affect on each other: ", df_test.loc[df_test['ddi_label'] == 0].shape)
print("Number of drug pairs that do affect each other: ", df_test.loc[df_test['ddi_label'] == 1].shape)

Length of train dataset:  (23114, 9)
Number of drug paris with no affect on each other:  (19410, 9)
Number of drug pairs that do affect each other:  (3704, 9)

Length of test dataset:  (5039, 9)
Number of drug paris with no affect on each other:  (4155, 9)
Number of drug pairs that do affect each other:  (884, 9)


The dataset is very uneven. We might be overfitting. Undersample the data.

In [5]:
def create_train_dev_test(df):
    random_index = np.random.permutation(df.index)
    df_shuffled = df.ix[random_index]
    df_shuffled.reset_index(drop=True, inplace=True)
    rows, columns =  df_shuffled.shape
    train_size = round(rows*.2)
    dev_size   = round(rows*.4)
    df_train = df_shuffled.loc[:train_size]
    df_dev = df_shuffled.loc[train_size:dev_size+train_size].reset_index(drop=True)
    return df_train, df_dev

# df_train.loc[df_train['ddi_label'] == 0].head()
df_train_no_interact, df_dev = create_train_dev_test(df_train.loc[df_train['ddi_label'] == 0])

In [6]:
frames = [df_train_no_interact, df_train.loc[df_train['ddi_label'] == 1]]
df_train = pd.concat(frames)
df_train.shape

(7587, 9)

###Create training and development sets

In [19]:
triggers = []
with open("triggers.txt") as f:
    for line in f:
        triggers.append(line.split()[1])
trigger_words = set(triggers)

negative_words = ['No', 'not', 'neither', 'without', 'lack', 'fail', 
                  'unable', 'abrogate', 'absence', 'prevent', 
                  'unlikely', 'unchanged', 'rarely']

negative_words = set(negative_words)

def tokenize(sentence_df):
    sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    sentences = [sentence for sentence in sentence_df]
    words_in_sents = [nltk.word_tokenize(sent) for sent in sentences]
    return words_in_sents
words_in_sents = tokenize(df_train['sentence_text'])
ddi_embedding_model = gensim.models.Word2Vec(words_in_sents)
ddi_embedding_model.save('ddi.embedding')
new_model = gensim.models.Word2Vec.load('ddi.embedding')

# Define Feature Functions

In [41]:
def negation_count(x, row):
    """counts total no, not and n't in the sentence """ 
    sent = row['sentence_text']
    count = 0
    for word in nltk.word_tokenize(sent):
        if word == 'not' or word == 'no' or word[-3:] == "n't":
            count += 1
    return count

def negation_count2(x, row):
    """ gives how many negative words in each segment"""
    count = 0
    drug1_i = row['drug1offset'].split('-')
    drug2_i = row['drug2offset'].split('-')
    sentence = row['sentence_text']
    first_phrase = sentence[0:int(drug1_i[0])].split()
    mid_phrase = sentence[int(drug1_i[-1])+1:int(drug2_i[0])].split()
    end_phrase = sentence[int(drug2_i[-1])+1:].split()
    before_count = len(set(first_phrase) & negative_words)
    middle_count = len(set(mid_phrase) & negative_words)
    end_count = len(set(end_phrase) & negative_words)
    return before_count, middle_count, end_count

def trigger_count(x, row):
    """ total number of trigger words """
    sent = row['sentence_text']
    count = 0
    for word in nltk.word_tokenize(sent.lower()):
        if word in triggers:
            count += 1
    return count

def trigger_count2(x, row):
    ''' counts number of trigger words in each segment '''
    count = 0
    drug1_i = row['drug1offset'].split('-')
    drug2_i = row['drug2offset'].split('-')
    sentence = row['sentence_text']
    first_phrase = sentence[0:int(drug1_i[0])].split()
    mid_phrase = sentence[int(drug1_i[-1])+1:int(drug2_i[0])].split()
    end_phrase = sentence[int(drug2_i[-1])+1:].split()
    before_count = len(set(first_phrase) & trigger_words)
    middle_count = len(set(mid_phrase) & trigger_words)
    end_count = len(set(end_phrase) & trigger_words)
    return before_count, middle_count, end_count

def ultra_trigger(x, row):
    ''' counts ultra-trigger words '''
    sent = row['sentence_text']
    words = ['coadministration', 'concomitant', 'concomitantly']
    count = 0
    for word in words:
        if word in nltk.word_tokenize(sent.lower()):
            count += 1
    return count
        
def words_sep_by_and(x, row):
    ''' binary feature if drugs are seperated only by and, not used '''
    sent = row['sentence_text']
    d1 = row['drug1']
    d2 = row['drug2']
    sent = sent.replace(d1, 'drug1')
    sent = sent.replace(d2, 'drug2')
    words = nltk.word_tokenize(sent)
    if abs(words.index('drug1') - words.index('drug2')) < 3:
        if words[words.index('drug1') + words.index('drug2')/2] == 'and':
            return 1
    return 0

def key_phrase(x, row):
    ''' binary feature that checks for key phrases, listed below'''
    sent = row['sentence_text']
    d1 = row['drug1']
    d2 = row['drug2']
    sent = sent.replace(d1, 'drug1')
    sent = sent.replace(d2, 'drug2')
    phrases = ['concurrent administration of drug1 and drug2', 'drug1 concurrently with drug2',
            'co-administration of drug1 and drug2', 'coadministration of drug1 and drug2', 
               'concurrent use of drug1 and drug2',]
    for phrase in phrases:
        if phrase in sent.lower():
            return 1
    return 0
    
def num_words_between_drugs(x,row):
    ''' counts number of words between drug mentions '''
    drug1_i = row['drug1offset'].split('-')
    drug2_i = row['drug2offset'].split('-')
    sentence = row['sentence_text']
    pre_phrase = sentence[int(drug1_i[-1])+1:int(drug2_i[0])].split()
    return len(pre_phrase)

def num_drugs_between_drugs(x, row):
    ''' counts number of drugs between drug mentions '''
    drug1_i = row['drug1offset'].split('-')
    drug2_i = row['drug2offset'].split('-')
    sentence = row['sentence_text']
    pre_phrase = sentence[int(drug1_i[-1])+1:int(drug2_i[0])]
    list_of_drugs = row['drugsinsent'].split(",")[0:-1]
    drugs_in_bet = 0
    for drug in list_of_drugs:
        drug_split = drug.split()
        if(pre_phrase.find(drug_split[0]) != -1):
            drugs_in_bet += 1

    return drugs_in_bet

def verb_count(x, row):
    '''counts number of verbs in each segment '''
    pattern = r'''(?x)     
    ([A-Z]\.)+                       # abbrevations
    |\w+'\w+                         # contractions
    | \w+(?:(-|(\.\s))(\n)?\w+)*     # words w/ internal hyphens, extend to next line, and with periods like Mrs. Reed
    '''
    count = 0
    drug1_i = row['drug1offset'].split('-')
    drug2_i = row['drug2offset'].split('-')
    sentence = row['sentence_text']
    first_phrase = nltk.pos_tag(nltk.word_tokenize(sentence[0:int(drug1_i[0])]))
    mid_phrase = nltk.pos_tag(nltk.word_tokenize(sentence[int(drug1_i[-1])+1:int(drug2_i[0])]))
    end_phrase = nltk.pos_tag(nltk.word_tokenize(sentence[int(drug2_i[-1])+1:]))

    before_count, middle_count, end_count = 0,0,0
    for tag in first_phrase:
        if tag[1].startswith('V'):
            before_count += 1
    for tag in mid_phrase:
        if tag[1].startswith('V'):
            middle_count += 1
    for tag in end_phrase:
        if tag[1].startswith('V'):
            end_count += 1
    return before_count, middle_count, end_count

def dist(idx, enum_list):
    '''helper function for trig_dist '''
    dif = [abs(idx-num) for num,word in enum_list]
    if len(dif) == 0:
        return 0
    else:
        return min(dif)
    
def trig_dist(x, row):
    '''counts distance from both drug mentions to closest trigger'''
    sentence = row['sentence_text']
    sentence = sentence.replace(row['drug1'], 'drug1')
    sentence = sentence.replace(row['drug2'], 'drug2')
    words = nltk.word_tokenize(sentence)
    
    try:
        drug1_i = words.index('drug1')
    except:
        drug1_i = 0
    try:
        drug2_i = words.index('drug2')
    except:
        drug2_i = 0
    enum_words = list(enumerate(words))
    enum_trigs = [(num,word) for num, word in enum_words if word in trigger_words]
    
    return dist(drug1_i, enum_trigs), dist(drug2_i, enum_trigs)

def subj_obj(x, row):
    '''binary feature that checks if drug1 is the nsubj or in its subtree 
    and if drug2 is dobj or in its subtree, returns 1 if both, 0 otherwise'''
    d1, d2 = row['drug1'],row['drug2']
    sent = row['sentence_text'].replace(d1,'drug1').replace(d2,'drug2')
    doc = nlp(sent)
    roots = [r for r in doc if r.head is r and r.pos == VERB] #and r.text in trigger_words]
    d1 = [d for d in doc if d.text == 'drug1']
    d2 = [d for d in doc if d.text == 'drug2']

    flag1 = False
    flag2 = False
    for r in roots:
        subj = [s for s in r.children if s.dep_[:5] == 'nsubj']
        obj = [o for o in r.children if o.dep_[:4] == 'dobj']
        for s in subj:
            if 'drug1' == s.text:
                flag1 = True
            for s1 in s.subtree:
                if s1.text == 'drug1':
                    flag1 = True
        for o in obj:
            if 'drug2' == o.text:
                flag2 = True
            for o1 in o.subtree:
                if o1.text == 'drug2':
                    flag2 = True
    if flag1 and flag2:
        return 1
    else:
        return 0
    
def trig_in_path(x, row):
    ''' binary feature if theres a trigger word in dep path to root '''
    d1, d2 = row['drug1'],row['drug2']
    sent = row['sentence_text'].replace(d1,'drug1').replace(d2,'drug2')
    doc = nlp(sent)
    drugs = [d for d in doc if d.text == 'drug1' or d.text == 'drug2']
    for d in drugs:
        d = d.head
        while d.head is not d:
            if d.text in trigger_words:
                return 1
            else:
                d = d.head
    return 0
    
def both_head(x, row):
    ''' checks if the head of both drug mentions is the same '''
    d1, d2 = row['drug1'],row['drug2']
    sent = row['sentence_text'].replace(d1,'drug1').replace(d2,'drug2')
    doc = nlp(sent)
    #roots = [r for r in doc if r.head is r and r.pos == VERB and r.text in trigger_words]
    drugs = [d for d in doc if d.text == 'drug1' or d.text == 'drug2']
    if  len(drugs) < 2:
        return 0
    if drugs[0].head == drugs[1].head: #and drugs[0].head.pos ==VERB:
        return 1
    return 0

def verb_vect(x, row):
    '''counts number of verbs in each segment '''
    
    sentence = row['sentence_text']
    words = nltk.word_tokenize(sentence)
    pos_words = nltk.pos_tag(words)
    
    try:
        drug1_i = words.index(row['drug1'])
        drug2_i = words.index(row['drug2'])
    except:
        return 0.0, 0.0, 0.0
   
    bef_verb = 0.0
    try:
        for i in range(drug1_i,0, -1):
            if pos_words[i][1].startswith('V') or pos_words[i][1].startswith('N'):
                bef_verb = np.mean(ddi_embedding_model[pos_words[i][0]])
                break
    except:
        bef_verb = 0.0
            
    bet_verb = 0.0
    try:
        for i in range(drug2_i,drug1_i, -1):
            if pos_words[i][1].startswith('V') or pos_words[i][1].startswith('N'):
                bet_verb = np.mean(ddi_embedding_model[pos_words[i][0]])
                break
    except:
        bet_verb = 0.0
            
            
    aft_verb = 0.0   
    try:
        for i in range(drug2_i, len(pos_words)):
            if pos_words[i][1].startswith('V') or pos_words[i][1].startswith('N'):
                aft_verb = np.mean(ddi_embedding_model[pos_words[i][0]])
                break
    except:
        aft_verb = 0.0

    return bef_verb, bet_verb, aft_verb
    

In [42]:
def build_features(df):
    list_of_features = []
    negation_count_l = []
    trigger_count_l = []
    ultra_trigger_l = []
    subj_obj_l = []
    both_head_l = []
    trig_in_path_l = []
    for index, row in df.iterrows():
        negation_count_l.append(negation_count(index,row))
        trigger_count_l.append(trigger_count(index,row))
        ultra_trigger_l.append(ultra_trigger(index,row))
        subj_obj_l.append(subj_obj(index,row))
        both_head_l.append(both_head(index,row))
        trig_in_path_l.append(trig_in_path(index,row))
    list_of_features.append(negation_count_l)
    list_of_features.append(trigger_count_l)
    list_of_features.append(ultra_trigger_l)
    list_of_features.append(subj_obj_l)
    list_of_features.append(both_head_l)
    list_of_features.append(trig_in_path_l)

    drugs_between_drugs = []
    words_between_drugs = []
    for index, row in df.iterrows():
        drugs_between_drugs.append(num_drugs_between_drugs(index,row))
        words_between_drugs.append(num_words_between_drugs(index,row))
    list_of_features.append(drugs_between_drugs)
    list_of_features.append(words_between_drugs)
    
    #words_sep_by_and_l = []
    #key_phrase_l = []
    #for index, row in df.iterrows():
        #words_sep_by_and_l.append(words_sep_by_and(index,row))
        #key_phrase_l.append(key_phrase(index,row))
    #list_of_features.append(words_sep_by_and_l)
    #list_of_features.append(key_phrase_l)
    
    btg, betg, atg = [], [], []
    for index, row in df.iterrows():
        before, between, after = trigger_count2(index,row)
        btg.append(before)
        betg.append(between)
        atg.append(after)
    list_of_features.append(btg)
    list_of_features.append(betg)
    list_of_features.append(atg)
    
    n_btg, n_betg, n_atg = [], [], []
    for index, row in df.iterrows():
        before, between, after = negation_count2(index,row)
        n_btg.append(before)
        n_betg.append(between)
        n_atg.append(after)
    list_of_features.append(n_btg)
    list_of_features.append(n_betg)
    list_of_features.append(n_atg)
    
    v_btg, v_betg, v_atg = [], [], []
    for index, row in df.iterrows():
        before, between, after = verb_count(index,row)
        v_btg.append(before)
        v_betg.append(between)
        v_atg.append(after)
    list_of_features.append(v_btg)
    list_of_features.append(v_betg)
    list_of_features.append(v_atg)
    
    word2vec_b, word2vec_bet, word2vec_a = [], [], []
    for index, row in df.iterrows():
        verb_bef, verb_bet, verb_aft = verb_vect(index, row)
        word2vec_b.append(verb_bef)
        word2vec_bet.append(verb_bet)
        word2vec_a.append(verb_aft)
        
    list_of_features.append(word2vec_b)
    list_of_features.append(word2vec_bet)
    list_of_features.append(word2vec_a)

        
    return list_of_features

In [43]:
#df_train, df_dev = create_train_dev_test();

In [11]:
vec = TfidfVectorizer(ngram_range=(1, 2), max_df=0.8, max_features=1000)
arr_train_feature_sparse = vec.fit_transform(df_train['sentence_text'])
arr_train_feature = arr_train_feature_sparse.toarray()

arr_test_feature_sparse = vec.transform(df_test["sentence_text"])
arr_test_feature = arr_test_feature_sparse.toarray()

In [44]:
def convert_to_numpy(list_):
    x = np.asarray(list_)
    y = x.reshape(-1, 1)
    return y

def create_feature_set(mainset,features):
    for list_f in features:
        mainset = np.append(mainset, convert_to_numpy(list_f), axis=1)
    return mainset

In [45]:
train_feature_list = build_features(df_train)
train_feature_set = create_feature_set(arr_train_feature, train_feature_list)

test_feature_list = build_features(df_test)
test_feature_set = create_feature_set(arr_test_feature, test_feature_list)

##Baseline accuracy

In [46]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import f1_score
mn = MultinomialNB()
print("Baseline with only vectorized bag of words:")

print("starting to fit")
mn_model = mn.fit(arr_train_feature, df_train.ddi_label)

print("starting to predict")
pred_test = mn_model.predict(arr_test_feature)

print('accuracy: ', accuracy_score(df_test.ddi_label, pred_test))
print(f1_score(df_test.ddi_label, pred_test, average=None, pos_label = 1))

Baseline with only vectorized bag of words:
starting to fit
starting to predict
accuracy:  0.429648739829
[ 0.48531519  0.36048064]


# Lets Try Different Models 

In [17]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV

def run_model(model):
    print("starting to fit")
    fit_model = model.fit(train_feature_set, df_train.ddi_label)
    print("starting to predict")
    pred_test = fit_model.predict(test_feature_set)
    print('accuracy: ', accuracy_score(df_test.ddi_label, pred_test))
    print(f1_score(df_test.ddi_label, pred_test, average=None, pos_label = 1))
    
def gridsearch(model, parameters):
    clf = GridSearchCV(model, parameters)
    clf.fit(train_feature_set, df_train.ddi_label)
    clf_pred_test = clf.predict(test_feature_set)
    print('accuracy: ', accuracy_score(df_test.ddi_label, clf_pred_test))
    print(f1_score(df_test.ddi_label, clf_pred_test, average=None, pos_label = 1))

## KNN

In [117]:
knn = KNeighborsClassifier(n_neighbors=3)
run_model(knn)

starting to fit
starting to predict
accuracy:  0.835681682874
[ 0.90098063  0.51748252]


###Run GridSearchCv

In [77]:
from sklearn.model_selection import GridSearchCV

knn_cv = KNeighborsClassifier()
param = {'n_neighbors': [3, 5, 8],
             'weights': ['uniform', 'distance'],
             'leaf_size': [30, 50, 75, 100]}
gridsearch(knn_cv, param)

accuracy:  0.725739233975
[ 0.81145975  0.49708879]


##AdaBoost

In [80]:
ada = AdaBoostClassifier(n_estimators=100)
run_model(ada)

starting to fit
starting to predict
accuracy:  0.755903949196
[ 0.83608742  0.52214452]


In [81]:
ada_cv = AdaBoostClassifier()
param = {'n_estimators': [50, 75, 100, 150, 200],
        'learning_rate': [0.5, 1.0, 1.15]}
gridsearch(ada_cv, param)

accuracy:  0.72494542568
[ 0.80851064  0.51197183]


# Multinomial NB

In [82]:
nb = MultinomialNB()
run_model(nb)

starting to fit
starting to predict
accuracy:  0.604485016868
[ 0.69989459  0.42013384]


In [83]:
multi_cv = MultinomialNB()
param = {'alpha': [0.8,0.9,1.0,1.1,1.2]}
gridsearch(multi_cv, param)

accuracy:  0.604683468942
[ 0.70009033  0.42025611]


# Log Reg

In [88]:
log_reg = LogisticRegression()
run_model(log_reg)

starting to fit
starting to predict
accuracy:  0.682278229808
[ 0.77479252  0.4607612 ]


In [90]:
log_cv = LogisticRegression()
param = {'C': [0.5, 0.75, 1.0],
        'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag']}
gridsearch(log_cv, param)



accuracy:  0.695177614606
[ 0.78666667  0.46629604]


# SVM

In [92]:
svm_model = svm.SVC()
run_model(svm_model)

starting to fit
starting to predict
accuracy:  0.734471125223
[ 0.82183755  0.47897196]


##RandomForestClassifer

In [47]:
rfc = RandomForestClassifier()
run_model(rfc)

starting to fit
starting to predict
accuracy:  0.79241913078
[ 0.86362451  0.56561462]


In [53]:
rfc_cv = RandomForestClassifier()
params = {'n_estimators': [5, 10, 15, 20],
         'max_depth': [100, 200, 300]}
gridsearch(rfc_cv, params)

accuracy:  0.805318515578
[ 0.87161366  0.59745589]
