# Import the libraries required

In [36]:
import pandas as pd


from hyperopt import STATUS_OK,tpe,fmin,hp,Trials,space_eval

import numpy as np

from sklearn.tree import DecisionTreeClassifier

from sklearn.preprocessing import LabelEncoder

from sklearn.metrics import accuracy_score,recall_score,confusion_matrix,f1_score

df = pd.read_csv("/home/harshit/Desktop/PythonDA/Day7/drug-classification.csv")

df

Unnamed: 0,Sex,BP,Cholesterol,Age_binned,Na_to_K_binned,DrugType
0,Male,Low,High,20s,<10,drugX
1,Female,Normal,High,20s,10-20,drugX
2,Male,Low,High,40s,20-30,DrugY
3,Male,High,Normal,20s,>30s,DrugY
4,Male,Normal,High,20s,20-30,DrugY
5,Female,Low,Normal,20s,10-20,drugX
6,Female,Normal,High,20s,<10,drugX
7,Female,Low,Normal,30s,10-20,drugX
8,Male,Low,High,40s,10-20,drugC
9,Male,Normal,Normal,50s,<10,drugX


# Check the data type of each column

In [37]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 60 entries, 0 to 59
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Sex             60 non-null     object
 1   BP              60 non-null     object
 2   Cholesterol     60 non-null     object
 3   Age_binned      60 non-null     object
 4   Na_to_K_binned  60 non-null     object
 5   DrugType        60 non-null     object
dtypes: object(6)
memory usage: 2.9+ KB


# Count of missing values

In [38]:
df.isna().sum()

Sex               0
BP                0
Cholesterol       0
Age_binned        0
Na_to_K_binned    0
DrugType          0
dtype: int64

# Distribution of categories in label

In [40]:
df['DrugType'].value_counts()

drugX    24
DrugY    21
drugA     7
drugC     4
drugB     4
Name: DrugType, dtype: int64

# Separating features & label

In [41]:
features = [        'Sex', 'BP', 'Cholesterol', 'Age_binned', 'Na_to_K_binned'] 

label =    ['DrugType']

# Encoding the categorical features

In [42]:

le = LabelEncoder() 

for col in features:
    df[col]   =  le.fit_transform(  df[col]  )


df

Unnamed: 0,Sex,BP,Cholesterol,Age_binned,Na_to_K_binned,DrugType
0,1,1,0,0,2,drugX
1,0,2,0,0,0,drugX
2,1,1,0,2,1,DrugY
3,1,0,1,0,3,DrugY
4,1,2,0,0,1,DrugY
5,0,1,1,0,0,drugX
6,0,2,0,0,2,drugX
7,0,1,1,1,0,drugX
8,1,1,0,2,0,drugC
9,1,2,1,3,2,drugX


# Over Sample minority class

In [43]:
X_org = df[features].copy()

y_org=df[label].copy()


from imblearn.over_sampling import SMOTE

sm=SMOTE(random_state=42,k_neighbors=3)


X,y = sm.fit_resample(X_org,y_org)

# Split training & testing data set from new oversampled dataset

In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train,y_test = train_test_split( X, y,random_state=42, stratify=y )

In [44]:
#training and evaluating the model
from sklearn.metrics import f1_score, precision_score


clf = DecisionTreeClassifier()
clf.fit(X_train,y_train)

#test on original set
X_train, X_test, y_train, y_test = train_test_split(X_org, y_org, test_size=0.2, stratify=y_org, random_state=42)

y_predicted=clf.predict(X_org)

score=f1_score(y_org,y_predicted,average='weighted')

print(score)



0.9165007466401195


# Install HyperOpt

In [45]:
!pip install hyperopt

Defaulting to user installation because normal site-packages is not writeable


# Create search space & minimize objective function to implement HyperOpt

In [46]:
search_space=hp.choice(
    'classifier_type',
    [
        {
            'type':'DTREE',
            
            'criterion': hp.choice('CRITERIA SELECTED', ['gini', 'entropy']), 

            'splitter': hp.choice('splitter SELECTED', ['best', 'random']), 
         
            'max_depth': hp.choice('MAX DEPTH OF TREE', [3,4,7]) # 6,7,8,9,10,11


        }
    ]
)
#real values 0.1 to 1.0: hp.uniform


#objective: To always minimize something! (always selects models which gives minimum of something)(MSE)
def objective(params):
    
    classifier_type=params['type']
    
    del params['type']
    
    if classifier_type == 'DTREE':
        
        #training and evaluating the model
        X_train, X_test, y_train,y_test = train_test_split( X, y,random_state=42, stratify=y,test_size=0.2 )
        clf = DecisionTreeClassifier(**params)
        clf.fit(X_train,y_train)
        
        #test on original set
        X_train, X_test, y_train, y_test = train_test_split(X_org, y_org, test_size=0.2, stratify=y_org, random_state=42)
        y_predicted=clf.predict(X_org)
        score=f1_score(y_org,y_predicted,average='weighted')
    

    return({'loss':-score ,'status':STATUS_OK,'type':classifier_type,'model':clf})

    
trials=Trials()## temporary Database

best = fmin(
    fn = objective,
    space=search_space,
    algo = tpe.suggest,
    trials = trials, #database!
    max_evals=100
)

data=space_eval(search_space,best)

print(data)

100%|██████████| 100/100 [00:03<00:00, 31.75trial/s, best loss: -1.0]
{'criterion': 'entropy', 'max_depth': 7, 'splitter': 'best', 'type': 'DTREE'}


In [47]:
for entry in trials:
  print(entry['result']['loss'])

-0.8340054390054391
-1.0
-0.9828861061419202
-0.7947158532524387
-0.7882980293127352
-0.7854948152622572
-0.9665000000000001
-0.6359920634920635
-0.9828861061419202
-0.6209271284271284
-0.9217585515259933
-0.9665000000000001
-0.9659396051103369
-0.8561701474201475
-0.9665000000000001
-0.9828861061419202
-1.0
-0.8340054390054391
-0.5481253649731911
-0.8340054390054391
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-0.8340054390054391
-1.0
-1.0
-0.9828861061419202
-1.0
-0.8340054390054391
-0.9828861061419202
-1.0
-1.0
-0.9828861061419202
-0.8340054390054391
-0.9665000000000001
-0.9828861061419202
-0.9665000000000001
-0.8340054390054391
-0.9665000000000001
-0.9828861061419202
-1.0
-0.6359920634920635
-1.0
-1.0
-0.8181411496305114
-1.0
-0.8340054390054391
-0.9833001493280238
-1.0
-1.0
-0.9828861061419202
-0.9665000000000001
-0.8340054390054391
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-0.9828861061419202
-0.8340054390054391
-0.9665000000000001
