In [1]:
# importing data science libraries
import pandas as pd
import seaborn as sns
import numpy as np
from sklearn.neighbors import KNeighborsRegressor

In [2]:
# importing the patient data
data = pd.read_csv('raw data.csv')
data.head()

Unnamed: 0,Pat_ID,Age,Sex,Name_InitialMed,Date_InitialRx,sBP_t-6,Date_t-6,sBP_t-5,Date_t-5,sBP_t-4,...,Date_t+3,sBP_t+4,Date_t+4,sBP_t+5,Date_t+5,sBP_t+6,Date_t+6,Before_BP_Ave,After_BP_Ave,Delta_Before-After
0,3009000000002127,78,Female,ACEBUTOLOL,2007-12-18,,,,,,...,2008-02-05,164.0,2008-02-14,144.0,2008-08-21,140.0,2009-03-18,184.142857,171.666667,-12.47619
1,2001000000033249,60,Male,ACEBUTOLOL,2010-11-01,150.0,2008-01-14,130.0,2008-02-26,150.0,...,2012-01-17,153.0,2012-05-16,128.0,2012-06-05,118.0,2012-06-12,210.0,190.0,-20.0
2,1003000000071385,84,Female,ACEBUTOLOL,2006-10-18,,,,,,...,2007-08-01,162.0,2007-10-19,170.0,2008-01-11,172.0,2008-04-04,210.0,154.666667,-55.333333
3,10001000000008854,103,Female,ACEBUTOLOL,2002-04-30,,,,,,...,2003-07-08,,,,,,,210.0,154.666667,-55.333333
4,1003000000139736,69,Male,ACEBUTOLOL,2006-07-18,,,,,188.0,...,2006-11-07,200.0,2006-12-05,188.0,2007-01-16,174.0,2007-03-13,183.0,123.0,-60.0


In [3]:
# importing the drugs table
drugs = pd.read_csv('drug_info2.csv', encoding = "ISO-8859-1")

In [5]:
# extracting important features from the original data and renaming columns
important_features = data[['Age', 'Sex', 'Delta_Before-After', 'Name_InitialMed']]
important_features.columns = ['Age', 'Sex', 'Delta_Before-After', 'Drug Name']
drugs.columns = ['No.', 'Drug Name','ATCb','ATCa','CLASS1b','CLASS2b','CLASS3b']
important_features['Drug Name'] = important_features['Drug Name'].str.lower()
important_features.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


Unnamed: 0,Age,Sex,Delta_Before-After,Drug Name
0,78,Female,-12.47619,acebutolol
1,60,Male,-20.0,acebutolol
2,84,Female,-55.333333,acebutolol
3,103,Female,-55.333333,acebutolol
4,69,Male,-60.0,acebutolol


In [6]:
# doing an inner join on Drug Names, removing unnessary columns afterwards
merged = important_features.merge(drugs, how= 'inner', on='Drug Name' )
merged = merged.drop(['No.', 'ATCb','ATCa', 'CLASS2b', 'CLASS3b'], axis = 1)
merged.columns = ['Age', 'Sex', 'Delta', 'Drug Name', 'Drug Class']

In [7]:
# data cleaning
# converting incorrect versions of the sex to proper variables
boolean_list = merged['Sex'] == 'FEMALE'
merged['Sex'][boolean_list] = 'Female'

boolean_list = merged['Sex'] == 'MALE'
merged['Sex'][boolean_list] = 'Male'

list_boolean = merged['Sex'] == 'nan'
merged['Sex'][list_boolean] = np.nan

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [8]:
# dropping all rows that doesn't have the correct category
list_boolean = (merged['Sex'] != 'Female') & (merged['Sex'] !=  'Male')
drop_rows = merged['Sex'][list_boolean].index
merged = merged.drop(drop_rows)

In [9]:
from sklearn.preprocessing import LabelEncoder

In [10]:
# initalizing the label encoder
le = LabelEncoder() 
merged = merged.drop(['Drug Name'], axis = 1)
merged['Sex']= le.fit_transform(merged['Sex']) 
merged['Drug Class']= le.fit_transform(merged['Drug Class'])


In [11]:
# creating dummy variables from labeled categories
dummy_variables = pd.get_dummies(merged['Sex'])
meds = pd.get_dummies(merged['Drug Class'])
# attaching the sex dummy variable to the merged dataframe
merged = pd.concat([merged, dummy_variables], axis='columns')

In [12]:
from sklearn.preprocessing import StandardScaler

In [13]:
# initalizing the Standard Scaler
# normalizing the age and delta pressure variables
st_x= StandardScaler()
contin_variables = merged[['Age', 'Delta']]
contin_variables = st_x.fit_transform(contin_variables)

In [16]:
# attaching the normalized variables to the original dataframe
merged[['Age','Delta']] = contin_variables
merged.head()

Unnamed: 0,Age,Sex,Delta,Drug Class,0,1
0,0.625926,0,-0.166815,3,1,0
1,-0.598697,1,-0.362644,3,0,1
2,1.034133,0,-1.282296,3,1,0
3,2.326791,0,-1.282296,3,1,0
4,0.013614,1,-1.403759,3,0,1


In [17]:
X = merged.drop(['Sex','Drug Class'], axis = 'columns')
X.columns = ['Age','Delta', 'Female', 'Male']
# dropping one of the dummy variables, to avoid multicollinearity
X =  X.drop(['Female'], axis = 'columns')

In [18]:
#changing column name on the medicine list
meds.columns = ['0','1','two','3','4','5']
Y = meds.drop(['two'], axis = 'columns')

In [19]:
from sklearn.model_selection import train_test_split

In [20]:
# doing a train test split on 67/33 split
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.33, random_state=42)

In [21]:
# training the knn model with the data and predicting the X-test values
knn = KNeighborsRegressor(n_neighbors=2)
knn.fit(X_train, y_train)
prediction_knn = knn.predict(X_test)

In [22]:
# convert prediction array into a Pandas DataFrame and assign it columns
predictDF_knn = pd.DataFrame(prediction_knn)
predictDF_knn.columns = ['1','2','3','4','5']

In [23]:
# convert dummy variable of prediction back to a label
predictDF_knn = predictDF_knn.idxmax(axis = 1)

In [24]:
# converting label back to category of drug (inverse label encoding)
predict_knn_list = []

for num in predictDF_knn:
    predict_knn_list.append(le.inverse_transform(int(num)))

In [31]:
# showing top 5 rows of predicted values
predict_knn_list[0:5]

['ANGIOTENSIN II ANTAGONISTS, PLAIN',
 'ANGIOTENSIN II ANTAGONISTS, PLAIN',
 'BETA BLOCKING AGENTS',
 'BETA BLOCKING AGENTS',
 'CALCIUM CHANNEL BLOCKERS']

In [26]:
# converting dummy to label encoding for Y-test array
y_test = y_test.idxmax(axis = 1)

In [27]:
# converting label back to category of drug (inverse label encoding)
y_test_list = []

for result in y_test:
    y_test_list.append(le.inverse_transform(int(result)))

In [32]:
y_test_list[0:5]

['DIURETICS',
 'ACE INHIBITORS, PLAIN',
 'ANGIOTENSIN II ANTAGONISTS, PLAIN',
 'CALCIUM CHANNEL BLOCKERS',
 'ACE INHIBITORS, PLAIN']

In [28]:
# calculating how many predictions were correct according to the acutal values
i = 0
correct = 0
while i < len(y_test_list):
    if predict_knn_list[i] == y_test_list[i]:
        correct += 1
    i += 1
correct

6164

In [29]:
# calculating accuracy
correct/len(y_test_list)

0.16392309124272