Thomas Dagier, Quentin Rod

In [15]:
import numpy as np
import pandas as pd

from sklearn import metrics
from sklearn import model_selection
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import Binarizer, LabelEncoder, MaxAbsScaler, MinMaxScaler, OneHotEncoder, \
    OrdinalEncoder, RobustScaler, StandardScaler

# Exercise 2 - Debugging - Drugs

The dataset contains a set of patients, all of whom suffered from the same illness.
During their course of treatment, each patient responded to one of 5 medications: Drug A, Drug B, Drug c, Drug x or y.

This complex dataset proposes a set of challenges that you'll try to overcome. A basic ML pipeline is already in place. You have to optimize the performance of the model by applying good practices, debugging pre-processing errors, etc.

**Rules**:
- Do not use other modules than those already imported *(or do it only if your own code is not working. In this case, keep your own code in the notebook, commented, so that we know what you tried)*.
- Explain **all** of your choices. For every task, choose the most appropriate option for this problem and describe your choice.
- You can modify any parts of the code or replace the model by one already used in previous PWs.

**Work to do**:
- [ ] Apply a type of normalization.
- [ ] Encode categorical data.
- [ ] Use all columns in the dataset (or choose the most meaningful features).
- [ ] Choose a more appropriate metric.
- [ ] Optimize hyper-parameters.
- [ ] Test the model performance correctly using a separated test set.
- Apply **two** of those techniques:
  - [ ] Keep relative class frequencies in the train/test sets (check train_test_split docs)
  - [ ] Show which feature(s) are the most correlated to the target.
  - [ ] Use cross-validation.

In [16]:
df = pd.read_csv("drug.csv")
df.head()

Unnamed: 0,age,sex,bp,cholesterol,Na_to_K,drug
0,23,F,HIGH,HIGH,25.355,drugY
1,47,M,LOW,HIGH,13.093,drugC
2,47,M,LOW,HIGH,10.114,drugC
3,28,F,NORMAL,HIGH,7.798,drugX
4,61,F,LOW,HIGH,18.043,drugY


In [17]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   age          200 non-null    int64  
 1   sex          200 non-null    object 
 2   bp           200 non-null    object 
 3   cholesterol  200 non-null    object 
 4   Na_to_K      200 non-null    float64
 5   drug         200 non-null    object 
dtypes: float64(1), int64(1), object(4)
memory usage: 9.5+ KB


In [18]:
#Return the pandas frame with the normalized column name
def normalize_column(df, column_name):
    column = df.loc[:, column_name]
    val_min = column.min()
    val_max = column.max()
    df.loc[:, column_name] = (column - val_min) / (val_max - val_min)
    
#Encode column and preserving order
def order_label_encode_column(df, column_name):
    df.loc[df[column_name] == 'HIGH', column_name] = 2
    df.loc[df[column_name] == 'NORMAL', column_name] = 1
    df.loc[df[column_name] == 'LOW', column_name] = 0

#Encode column without preserving order
def label_encode_column(df, column_name):
    column = df.loc[:, column_name]
    df[column_name] = LabelEncoder().fit_transform(column)

#Encode column with one-hot encoding
def one_hot_encode_column(df, column_name):
    column = df.loc[:, column_name].to_numpy()
    column = column.reshape((len(column), 1))
    column = OneHotEncoder().fit_transform(column).toarray()    
    df[column_name] = column

categorical_columns = ["sex", "bp", "cholesterol", "drug"]
normalize_column(df, "age") #Normalize between 0 and 1. There is no outlier.
normalize_column(df, "Na_to_K") #Normalize between 0 and 1. There is no outlier.
order_label_encode_column(df, "bp") #Encode and preserve order
order_label_encode_column(df, "cholesterol") #Encode and preserve order

label_encode_column(df, "sex")  #Encode without preserving order
label_encode_column(df, "drug") #Encode without preserving order

df = df.astype(np.float64) #Change all types to float64

normalize_column(df, "sex") #Normalize between 0 and 1. There is no outlier.
normalize_column(df, "bp") #Normalize between 0 and 1. There is no outlier.
normalize_column(df, "cholesterol") #Normalize between 0 and 1. There is no outlier.

# TODO: you will replace this by a more adapted preprocessing step
df = df.drop(columns=["sex"]) #By testing we saw that this column reduces performance of our model.

In [19]:
X = df.drop(columns="drug")
y = df["drug"]

In [20]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [21]:
# TODO: a preprocessing step is likely missing here...

In [22]:
#knn = KNeighborsClassifier(100)
knn = KNeighborsClassifier(1) # We change the number of neighbours because 100 is too much. 
knn.fit(X_train, y_train)

In [23]:
def predict(estimator, X, y):
    # predict
    y_pred = knn.predict(X)
    
    # show confusion matrix
    cm = confusion_matrix(y, y_pred)
    disp=ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=knn.classes_)
    #disp.plot();
    
    # check metric (this one is probably not the best choice...)
    metric = metrics.f1_score(y, y_pred, average=None) #f1 allows to take into account precision and recall
    #print(f"f1: {metric}")
    return np.average(metric)

In [24]:
predict(knn, X_test, y_test)

0.975366568914956

In [26]:
# TODO: any of the 3 proposed task...

# cross validation
num_folds = 5
k_choices = [1, 3, 5, 8, 10, 12, 15, 20, 50]

X_train_folds = []
y_train_folds = []

X_train_folds = np.array_split(X_train, num_folds)
y_train_folds = np.array_split(y_train, num_folds)

k_to_f1 = {}

for k in k_choices:
    f1_for_k = []
    for i in range(num_folds):
        classifier = KNeighborsClassifier(k)
        X_train_f = np.concatenate(X_train_folds[:i] + X_train_folds[i+1:])
        y_train_f = np.concatenate(y_train_folds[:i] + y_train_folds[i+1:])
        classifier.fit(X_train_f, y_train_f)
        f1 = predict(classifier, X_test, y_test)
        f1_for_k.append(f1)
    k_to_f1[k] = f1_for_k
        
for k in sorted(k_to_f1):
    print('k = %d, f1 average over folds = %f' % (k, np.average(k_to_f1[k])))



k = 1, f1 average over folds = 0.975367
k = 3, f1 average over folds = 0.975367
k = 5, f1 average over folds = 0.975367
k = 8, f1 average over folds = 0.975367
k = 10, f1 average over folds = 0.975367
k = 12, f1 average over folds = 0.975367
k = 15, f1 average over folds = 0.975367
k = 20, f1 average over folds = 0.975367
k = 50, f1 average over folds = 0.975367


  sub_arys.append(_nx.swapaxes(sary[st:end], axis, 0))
