<a href="https://colab.research.google.com/github/varunsmhatre/Drug-Classification/blob/main/DrugClassification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# DrugClassification

## Load Dataset directly from Kaggle

In [2]:
! pip install kaggle
! mkdir ~/.kaggle
! cp kaggle.json ~/.kaggle/
! chmod 600 ~/.kaggle/kaggle.json

# For more details on how to load dataset from Kaggle please check URL Below
# https://www.analyticsvidhya.com/blog/2021/06/how-to-load-kaggle-datasets-directly-into-google-colab/

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [3]:
# Download Dataset as Zip
# ! kaggle competitions download -c 'playground-series-s3e9'
! kaggle datasets download prathamtripathi/drug-classification

Downloading drug-classification.zip to /content
  0% 0.00/1.68k [00:00<?, ?B/s]
100% 1.68k/1.68k [00:00<00:00, 1.58MB/s]


In [4]:
! mkdir data

In [5]:
! unzip /content/drug-classification.zip -d data



Archive:  /content/drug-classification.zip
  inflating: data/drug200.csv        


## Importing the libraries

In [29]:
!pip install xgboost

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [6]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

## Importing the dataset

In [36]:
df = pd.read_csv('/content/data/drug200.csv')

In [8]:
df.head()

Unnamed: 0,Age,Sex,BP,Cholesterol,Na_to_K,Drug
0,23,F,HIGH,HIGH,25.355,DrugY
1,47,M,LOW,HIGH,13.093,drugC
2,47,M,LOW,HIGH,10.114,drugC
3,28,F,NORMAL,HIGH,7.798,drugX
4,61,F,LOW,HIGH,18.043,DrugY


In [53]:
df

Unnamed: 0,Age,Sex,BP,Cholesterol,Na_to_K,Drug
0,23,F,HIGH,HIGH,25.355,DrugY
1,47,M,LOW,HIGH,13.093,drugC
2,47,M,LOW,HIGH,10.114,drugC
3,28,F,NORMAL,HIGH,7.798,drugX
4,61,F,LOW,HIGH,18.043,DrugY
...,...,...,...,...,...,...
195,56,F,LOW,HIGH,11.567,drugC
196,16,M,LOW,HIGH,12.006,drugC
197,52,M,NORMAL,HIGH,9.894,drugX
198,23,M,NORMAL,NORMAL,14.020,drugX


## Preprocessing

In [10]:
for x in df.columns:
  print(x)
  print(df[x].unique())
  print('')

Age
[23 47 28 61 22 49 41 60 43 34 74 50 16 69 32 57 63 48 33 31 39 45 18 65
 53 46 15 73 58 66 37 68 67 62 24 26 40 38 29 17 54 70 36 19 64 59 51 42
 56 20 72 35 52 55 30 21 25]

Sex
['F' 'M']

BP
['HIGH' 'LOW' 'NORMAL']

Cholesterol
['HIGH' 'NORMAL']

Na_to_K
[25.355 13.093 10.114  7.798 18.043  8.607 16.275 11.037 15.171 19.368
 11.767 19.199 15.376 20.942 12.703 15.516 11.455 13.972  7.298 25.974
 19.128 25.917 30.568 15.036 33.486 18.809 30.366  9.381 22.697 17.951
  8.75   9.567 11.014 31.876 14.133  7.285  9.445 13.938  9.709  9.084
 19.221 14.239 15.79  12.26  12.295  8.107 13.091 10.291 31.686 19.796
 19.416 10.898 27.183 18.457 10.189 14.16  11.34  27.826 10.091 18.703
 29.875  9.475 20.693  8.37  13.303 27.05  12.856 10.832 24.658 24.276
 13.967 19.675 10.605 22.905 17.069 20.909 11.198 19.161 13.313 10.84
 13.934  7.761  9.712 11.326 10.067 13.935 13.597 15.478 23.091 17.211
 16.594 15.156 29.45  29.271 15.015 11.424 38.247 25.395 35.639 16.725
 11.871 12.854 13.127  8.966 

In [54]:
df_encoded = pd.get_dummies(df, columns = ['Sex', 'BP', 'Cholesterol'])
print(df_encoded.head(1))

   Age  Na_to_K   Drug  Sex_F  Sex_M  BP_HIGH  BP_LOW  BP_NORMAL  \
0   23   25.355  DrugY      1      0        1       0          0   

   Cholesterol_HIGH  Cholesterol_NORMAL  
0                 1                   0  


In [55]:
# Import label encoder
from sklearn import preprocessing

# label_encoder object knows how to understand word labels.
label_encoder = preprocessing.LabelEncoder()

# Encode labels in column 'species'.
df_encoded['Drug'] = label_encoder.fit_transform(df_encoded['Drug'])

## Splitting into Test & Train

In [39]:
from sklearn.model_selection import train_test_split

train, test = train_test_split(df_encoded, test_size=0.2)

In [56]:
# Feature Scaling
robust_scaler = preprocessing.RobustScaler()
train[['Age', 'Na_to_K']] = robust_scaler.fit_transform(train[['Age', 'Na_to_K']])
test[['Age', 'Na_to_K']] = robust_scaler.transform(test[['Age', 'Na_to_K']])

In [57]:
train['Drug'].value_counts()

0    70
4    46
1    16
2    15
3    13
Name: Drug, dtype: int64

In [58]:
test['Drug'].value_counts()

0    21
4     8
1     7
3     3
2     1
Name: Drug, dtype: int64

In [59]:
y_col = 'Drug'
x_cols = list(df_encoded.columns)
x_cols.remove(y_col)

In [60]:
x_cols

['Age',
 'Na_to_K',
 'Sex_F',
 'Sex_M',
 'BP_HIGH',
 'BP_LOW',
 'BP_NORMAL',
 'Cholesterol_HIGH',
 'Cholesterol_NORMAL']

In [61]:
X_train = train.loc[:, x_cols].values
X_test = test.loc[:, x_cols].values

In [63]:
y_train = train.loc[:, y_col].values
# y_train = y_train.reshape(len(y_train),1)

In [64]:
y_test = test.loc[:, y_col].values
# y_test = y_test.reshape(len(y_test),1)

## Train

In [62]:
from sklearn import metrics
from xgboost import XGBClassifier

In [65]:
xgb_clf = XGBClassifier(objective='multi:softmax', 
                            num_class=5, 
                            early_stopping_rounds=10, 
                            eval_metric=['merror','mlogloss'], 
                            seed=42)
xgb_clf.fit(X_train, 
            y_train,
            verbose=0,
            eval_set=[(X_train, y_train), (X_test, y_test)])

# Making the Confusion Matrix

In [70]:
y_pred = xgb_clf.predict(X_test)

In [72]:
y_test

array([1, 1, 0, 4, 1, 3, 0, 0, 0, 0, 0, 1, 4, 4, 2, 0, 0, 0, 0, 0, 0, 0,
       0, 4, 1, 0, 3, 0, 0, 4, 4, 0, 1, 1, 0, 4, 0, 3, 0, 4])

In [71]:
from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, y_pred)
print(cm)
accuracy_score(y_test, y_pred)

[[21  0  0  0  0]
 [ 0  7  0  0  0]
 [ 0  0  1  0  0]
 [ 0  0  0  3  0]
 [ 0  0  0  0  8]]


1.0

In [75]:
cm = confusion_matrix(y_train, xgb_clf.predict(X_train))
print(cm)

accuracy_score(y_train, xgb_clf.predict(X_train))

[[70  0  0  0  0]
 [ 0 16  0  0  0]
 [ 0  0 15  0  0]
 [ 0  0  0 13  0]
 [ 0  0  0  0 46]]


1.0