In [1]:
import pandas as pd
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from joblib import dump, load
from sklearn.model_selection import train_test_split

In [2]:
# Load the training features and labels from the CSV files next to the notebook.
df_X = pd.read_csv("drugs_x_train.csv")
df_y = pd.read_csv("drugs_y_train.csv")

# Plain arrays of the frames; the raw `x` is also used directly by later cells.
x = df_X.values
y = df_y.values

# Min-max scale every feature (column) independently to [0, 1].
# Taking the min/max of the whole matrix instead would let one wide-ranged
# column squash all the other features into a tiny interval.
x_min = x.min(axis=0)
x_span = x.max(axis=0) - x_min
# A constant column has zero span; dividing by 1 instead maps it to 0 and
# avoids the NaN produced by a 0/0 division.
x_span[x_span == 0] = 1
x_normalized = (x - x_min) / x_span

### Determine the best classifier to use based on cross-validation

In [3]:
# Model comparison: mean 10-fold cross-validated accuracy of four classifiers
# (MLP, random forest, k-NN, decision tree) on the normalized training data.
# NOTE(review): the whole cell is commented out — presumably to avoid re-running
# the comparison on every execution; uncomment to reproduce the scores.
# cv_score_MLP = (cross_val_score(MLPClassifier(random_state=2, max_iter=2000), x_normalized, y, cv=10, scoring="accuracy")).mean()
# cv_score_RF = (cross_val_score(RandomForestClassifier(random_state=1), x_normalized, y, cv=10, scoring="accuracy")).mean()
# cv_score_KNN = (cross_val_score(KNeighborsClassifier(n_neighbors=12), x_normalized, y, cv=10, scoring="accuracy")).mean()
# cv_score_DCT = (cross_val_score(DecisionTreeClassifier(random_state=1), x_normalized, y, cv=10, scoring="accuracy")).mean()

# print("MLP: ", cv_score_MLP)
# print("RF: ", cv_score_RF)
# print("KNN: ", cv_score_KNN)
# print("DCT: ", cv_score_DCT)


In [4]:
# Hold-out evaluation: set 20% of the normalized training rows aside and
# score a depth-limited decision tree on them.
x_train, x_test, y_train, y_test = train_test_split(
    x_normalized,
    y,
    test_size=0.2,
    random_state=1,
)

clf = DecisionTreeClassifier(max_depth=4, random_state=1).fit(x_train, y_train)

# Per-class precision / recall / F1 on the held-out rows.
y_pred = clf.predict(x_test)
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       1.00      1.00      1.00        10
           1       0.75      1.00      0.86         3
           2       1.00      0.80      0.89         5
           3       1.00      1.00      1.00         3

   micro avg       0.95      0.95      0.95        21
   macro avg       0.94      0.95      0.94        21
weighted avg       0.96      0.95      0.95        21
 samples avg       0.74      0.74      0.74        21



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### Fit the model to the training data

In [5]:
# Baseline decision tree (fixed seed, otherwise default settings) trained on
# every normalized training row.
clf = DecisionTreeClassifier(random_state=1)
clf.fit(X=x_normalized, y=y)

### Hyper-parameter tuning using GridSearchCV

In [6]:
# Search space: split criterion x maximum depth (None = no depth limit) x splitter.
param_dist = dict(
    criterion=["gini", "entropy"],
    max_depth=[*range(1, 8), None],
    splitter=["best", "random"],
)

# Exhaustive 10-fold cross-validated search over the grid.
# Note: the search is fitted on the raw `x`, not on `x_normalized`.
grid = GridSearchCV(estimator=clf, param_grid=param_dist, cv=10, n_jobs=-1)
grid.fit(x, y)

print("best estimator: ", grid.best_estimator_)
print("best score: ", grid.best_score_)
print("best params: ", grid.best_params_)

best estimator:  DecisionTreeClassifier(max_depth=4, random_state=1)
best score:  0.9851648351648352
best params:  {'criterion': 'gini', 'max_depth': 4, 'splitter': 'best'}


### Fit the model after parameter tuning and evaluate it on the test set


In [9]:
# Separate test set read from its own CSV files.
# NOTE: this rebinds df_X / df_y, which held the training frames until now.
df_X = pd.read_csv("drugs_x_test.csv")
df_y = pd.read_csv("drugs_y_test.csv")
x_test, y_test = df_X.values, df_y.values

# Hyper-parameters reported by the grid search, shared by both estimators below.
tuned_params = dict(random_state=1, max_depth=4, criterion="gini")

# Final model: trained on the raw (un-normalized) training features.
clf = DecisionTreeClassifier(**tuned_params).fit(x, y)
# clf = load("ml_drugsdata_v3.joblib")  # alternative: reuse the persisted model
y_pred = clf.predict(x_test)

print(classification_report(y_test, y_pred))

# Mean 10-fold CV accuracy with the same settings on the normalized training
# data; kept as the last expression so the notebook displays the value.
cv_score_DCT_best_param = cross_val_score(
    DecisionTreeClassifier(**tuned_params),
    x_normalized,
    y,
    cv=10,
    scoring="accuracy",
).mean()
cv_score_DCT_best_param


              precision    recall  f1-score   support

           0       1.00      1.00      1.00        29
           1       1.00      1.00      1.00         8
           2       1.00      1.00      1.00         4
           3       1.00      1.00      1.00         6

   micro avg       1.00      1.00      1.00        47
   macro avg       1.00      1.00      1.00        47
weighted avg       1.00      1.00      1.00        47
 samples avg       0.71      0.71      0.71        47



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


0.9851648351648352

### Dump model file

In [8]:
# Persist the fitted classifier to disk; the value returned by dump() is the
# last expression, so the notebook shows it as the cell output.
model_file = "ml_drugsdata_v3.joblib"
dump(clf, filename=model_file)

['ml_drugsdata_v3.joblib']