# Import & Set Variables

In [1]:
import pandas as pd
import numpy as np
import re

from sklearn.model_selection import train_test_split
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix, classification_report

from keras.utils import to_categorical


from keras.models import Sequential
from keras.layers import Dense, Dropout

import keras

from sklearn.metrics import precision_score, accuracy_score, f1_score, recall_score

from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, BaggingClassifier
from sklearn.svm import SVC

import matplotlib.pyplot as plt

%matplotlib inline

# dataset file name
fname = "cow.csv"

# normal values
NORMAL_TEMPERATURE = 37.8
NORMAL_PULSE = (30, 40)
NORMAL_RESPIRATORY_RATE = (8, 10)
NORMAL_NASOGASTRIC_REFLUX_PH = (3, 4)
NORMAL_PACKED_CELL_VOLUME = (30, 50)
NORMAL_TOTAL_PROTEIN = (6, 7.5)
NORMAL_ABDOMO_APPEARANCE = "clear"

---

# Load Data

In [2]:
df_cow = pd.read_csv(fname)
df_cow.columns

Index(['surgery', 'age', 'hospital_number', 'temperature', 'pulse',
       'respiratory_rate', 'temp_of_extremities', 'peripheral_pulse',
       'mucous_membrane', 'capillary_refill_time', 'pain', 'peristalsis',
       'abdominal_distention', 'nasogastric_tube', 'nasogastric_reflux',
       'nasogastric_reflux_ph', 'rectal_exam_feces', 'abdomen',
       'packed_cell_volume', 'total_protein', 'abdomo_appearance',
       'abdomo_protein', 'outcome Class', 'surgical_lesion', 'lesion'],
      dtype='object')

In [3]:
df_cow.head(5)

Unnamed: 0,surgery,age,hospital_number,temperature,pulse,respiratory_rate,temp_of_extremities,peripheral_pulse,mucous_membrane,capillary_refill_time,...,nasogastric_reflux_ph,rectal_exam_feces,abdomen,packed_cell_volume,total_protein,abdomo_appearance,abdomo_protein,outcome Class,surgical_lesion,lesion
0,no,adult,530101,38.5,66.0,28.0,cool,reduced,,more_3_sec,...,,decreased,distend_large,45.0,8.4,,,died,no,11300
1,yes,adult,534817,39.2,88.0,20.0,,,pale_cyanotic,less_3_sec,...,,absent,other,50.0,85.0,cloudy,2.0,euthanized,no,2208
2,no,adult,530334,38.3,40.0,24.0,normal,normal,pale_pink,less_3_sec,...,,normal,normal,33.0,6.7,,,lived,no,0
3,yes,young,5290409,39.1,164.0,84.0,cold,normal,dark_cyanotic,more_3_sec,...,5.0,decreased,,48.0,7.2,serosanguious,5.3,died,yes,2208
4,no,adult,530255,37.3,104.0,35.0,,,dark_cyanotic,more_3_sec,...,,,,74.0,7.4,,,died,no,4300


---

# Funtion

## visualization

In [4]:
def plot_loss_accuracy(history):
    print_score(history)
    
    fig = plt.figure(figsize=(12, 6))
    ax = fig.add_subplot(1, 2, 1)
    ax.plot(history.history["loss"],'r-x', label="Train Loss")
    ax.plot(history.history["val_loss"],'b-x', label="Validation Loss")
    ax.legend()
    ax.set_title('cross_entropy loss')
    ax.grid(True)


    ax = fig.add_subplot(1, 2, 2)
    ax.plot(history.history["accuracy"],'r-x', label="Train Accuracy")
    ax.plot(history.history["val_accuracy"],'b-x', label="Validation Accuracy")
    ax.legend()
    ax.set_title('accuracy')
    ax.grid(True)

def print_score(history):
    print("lastes accuracy and loss")
    print("train_loss : {:.4f}, val_loss : {:.4f}\ntrain_acc : {:.4f}, val_acc : {:.4f}".format(history.history['loss'][-1],
                                                                                               history.history['val_loss'][-1],
                                                                                               history.history['accuracy'][-1],
                                                                                               history.history['val_accuracy'][-1]))

In [5]:
def create_confusion_matrix(model,x_val,y_val,label_names):
    y_probs = model.predict(x_val)
    if(y_probs[0].size > 1):
        y_preds = y_probs.argmax(axis=1)
    else:
        y_preds = y_probs
        
    if(y_val[0].size > 1):
        y_val = y_val.argmax(axis=1)
    else:
        y_val = y_val
    
    #Confusion matrix
    cm= confusion_matrix(y_val,y_preds)

    #Plot
    disp=ConfusionMatrixDisplay(confusion_matrix=cm,display_labels=label_names)
    fig, ax = plt.subplots(figsize=(10,10))
    disp.plot(ax=ax,xticks_rotation=45)

## preprocessing

In [6]:
def extract_lesion_column(lesion):
    site, type_l, subtype, code = 'None', 'None', 'None', 'None'
    if lesion != 0:
        types = re.findall('^([1-9]|11|00)([1-4])([0-2])([1-9]|10|0)',str(lesion))
        if types:
            site, type_l, subtype, code = types[0]
    return pd.Series({'site': site, 'type': type_l, 'subtype': subtype, 'code': code}, dtype='category')


# Preprocess Data

In [7]:
n_name = ["lived","died","euthanized"]

In [8]:
df_cow.loc[:, ['site', 'type', 'subtype', 'code']] = df_cow.lesion.apply(extract_lesion_column).astype('category')

In [9]:
df_cow.head()

Unnamed: 0,surgery,age,hospital_number,temperature,pulse,respiratory_rate,temp_of_extremities,peripheral_pulse,mucous_membrane,capillary_refill_time,...,total_protein,abdomo_appearance,abdomo_protein,outcome Class,surgical_lesion,lesion,site,type,subtype,code
0,no,adult,530101,38.5,66.0,28.0,cool,reduced,,more_3_sec,...,8.4,,,died,no,11300,11.0,3.0,0.0,0.0
1,yes,adult,534817,39.2,88.0,20.0,,,pale_cyanotic,less_3_sec,...,85.0,cloudy,2.0,euthanized,no,2208,2.0,2.0,0.0,8.0
2,no,adult,530334,38.3,40.0,24.0,normal,normal,pale_pink,less_3_sec,...,6.7,,,lived,no,0,,,,
3,yes,young,5290409,39.1,164.0,84.0,cold,normal,dark_cyanotic,more_3_sec,...,7.2,serosanguious,5.3,died,yes,2208,2.0,2.0,0.0,8.0
4,no,adult,530255,37.3,104.0,35.0,,,dark_cyanotic,more_3_sec,...,7.4,,,died,no,4300,4.0,3.0,0.0,0.0


In [10]:
#drop column ที่ไม่จำเป็น
if "outcome Class" in df_cow.columns:
    y = df_cow['outcome Class']
    df_cow.drop(columns=["outcome Class"], inplace=True)
if "lesion" in df_cow.columns:
    df_cow.drop(columns=['hospital_number',"lesion"], inplace=True)
df_cow.head()

Unnamed: 0,surgery,age,temperature,pulse,respiratory_rate,temp_of_extremities,peripheral_pulse,mucous_membrane,capillary_refill_time,pain,...,abdomen,packed_cell_volume,total_protein,abdomo_appearance,abdomo_protein,surgical_lesion,site,type,subtype,code
0,no,adult,38.5,66.0,28.0,cool,reduced,,more_3_sec,extreme_pain,...,distend_large,45.0,8.4,,,no,11.0,3.0,0.0,0.0
1,yes,adult,39.2,88.0,20.0,,,pale_cyanotic,less_3_sec,mild_pain,...,other,50.0,85.0,cloudy,2.0,no,2.0,2.0,0.0,8.0
2,no,adult,38.3,40.0,24.0,normal,normal,pale_pink,less_3_sec,mild_pain,...,normal,33.0,6.7,,,no,,,,
3,yes,young,39.1,164.0,84.0,cold,normal,dark_cyanotic,more_3_sec,depressed,...,,48.0,7.2,serosanguious,5.3,yes,2.0,2.0,0.0,8.0
4,no,adult,37.3,104.0,35.0,,,dark_cyanotic,more_3_sec,,...,,74.0,7.4,,,no,4.0,3.0,0.0,0.0


In [11]:
numerical = list(df_cow._get_numeric_data().columns)
categorical = list(set(df_cow.columns) - set(numerical))

In [12]:
df_cow.fillna(df_cow.mean(numeric_only=True), inplace=True)
df_cow.mean(numeric_only=True)

temperature              38.168619
pulse                    72.000000
respiratory_rate         30.460581
nasogastric_reflux_ph     4.707547
packed_cell_volume       46.307407
total_protein            24.274436
abdomo_protein            3.039604
dtype: float64

In [13]:
df_cow_numerical = df_cow[numerical].copy()
df_cow_numerical.head()

Unnamed: 0,temperature,pulse,respiratory_rate,nasogastric_reflux_ph,packed_cell_volume,total_protein,abdomo_protein
0,38.5,66.0,28.0,4.707547,45.0,8.4,3.039604
1,39.2,88.0,20.0,4.707547,50.0,85.0,2.0
2,38.3,40.0,24.0,4.707547,33.0,6.7,3.039604
3,39.1,164.0,84.0,5.0,48.0,7.2,5.3
4,37.3,104.0,35.0,4.707547,74.0,7.4,3.039604


In [14]:
df_cow_categorical = pd.DataFrame()

for c in categorical:
    encoded = pd.get_dummies(df_cow[c], dummy_na=True, prefix=c)
    df_cow_categorical = pd.concat([df_cow_categorical, encoded], axis=1)

df_cow_categorical.head()

Unnamed: 0,peristalsis_absent,peristalsis_hypermotile,peristalsis_hypomotile,peristalsis_normal,peristalsis_nan,surgery_no,surgery_yes,surgery_nan,peripheral_pulse_absent,peripheral_pulse_increased,...,pain_alert,pain_depressed,pain_extreme_pain,pain_mild_pain,pain_severe_pain,pain_nan,abdomo_appearance_clear,abdomo_appearance_cloudy,abdomo_appearance_serosanguious,abdomo_appearance_nan
0,1,0,0,0,0,1,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1
1,1,0,0,0,0,0,1,0,0,0,...,0,0,0,1,0,0,0,1,0,0
2,0,0,1,0,0,1,0,0,0,0,...,0,0,0,1,0,0,0,0,0,1
3,1,0,0,0,0,0,1,0,0,0,...,0,1,0,0,0,0,0,0,1,0
4,0,0,0,0,1,1,0,0,0,0,...,0,0,0,0,0,1,0,0,0,1


In [15]:
df_cow_data = pd.concat([df_cow_numerical,df_cow_categorical],axis=1)
df_cow_data.head()

Unnamed: 0,temperature,pulse,respiratory_rate,nasogastric_reflux_ph,packed_cell_volume,total_protein,abdomo_protein,peristalsis_absent,peristalsis_hypermotile,peristalsis_hypomotile,...,pain_alert,pain_depressed,pain_extreme_pain,pain_mild_pain,pain_severe_pain,pain_nan,abdomo_appearance_clear,abdomo_appearance_cloudy,abdomo_appearance_serosanguious,abdomo_appearance_nan
0,38.5,66.0,28.0,4.707547,45.0,8.4,3.039604,1,0,0,...,0,0,1,0,0,0,0,0,0,1
1,39.2,88.0,20.0,4.707547,50.0,85.0,2.0,1,0,0,...,0,0,0,1,0,0,0,1,0,0
2,38.3,40.0,24.0,4.707547,33.0,6.7,3.039604,0,0,1,...,0,0,0,1,0,0,0,0,0,1
3,39.1,164.0,84.0,5.0,48.0,7.2,5.3,1,0,0,...,0,1,0,0,0,0,0,0,1,0
4,37.3,104.0,35.0,4.707547,74.0,7.4,3.039604,0,0,0,...,0,0,0,0,0,1,0,0,0,1


In [16]:
y = y.map({"lived": 0, "died": 1, "euthanized": 2})
y.head()

0    1
1    2
2    0
3    1
4    1
Name: outcome Class, dtype: int64

In [17]:
X = df_cow_data.values

In [18]:
X.shape, y.shape

((299, 111), (299,))

In [19]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, stratify=y,random_state=1234)

In [20]:
print(np.unique(y_train,return_counts=True))

(array([0, 1, 2], dtype=int64), array([142,  62,  35], dtype=int64))


In [21]:
print(np.unique(y_test,return_counts=True))

(array([0, 1, 2], dtype=int64), array([36, 15,  9], dtype=int64))


In [22]:
from imblearn.over_sampling import SMOTE
smote = SMOTE(random_state=1234)
X_train, y_train = smote.fit_resample(X_train, y_train)
print(np.unique(y_train,return_counts=True))

(array([0, 1, 2], dtype=int64), array([142, 142, 142], dtype=int64))


In [23]:
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size =81, stratify=y_train,random_state=1234)

In [24]:
print(np.unique(y_train,return_counts=True))

(array([0, 1, 2], dtype=int64), array([115, 115, 115], dtype=int64))


In [25]:
print(np.unique(y_val,return_counts=True))

(array([0, 1, 2], dtype=int64), array([27, 27, 27], dtype=int64))


In [26]:
y_train = keras.utils.to_categorical(y_train,3)
y_val = keras.utils.to_categorical(y_val,3)
# y_test = keras.utils.to_categorical(y_test,3)

---

# Training Models

## Non-ensemble

### Neural Network

In [27]:
nn_model = Sequential([
    Dense(64, activation="relu", input_shape=X_train[0].shape),
    Dense(64, activation="relu"),
    Dropout(0.5),
    
    Dense(128, activation="relu"),
    Dense(128, activation="relu"),
    Dropout(0.8),
    
    Dense(3, activation="softmax"),
])

nn_model.compile(optimizer='adam',
                 loss="categorical_crossentropy", 
                 metrics=[
                     "accuracy",
                     keras.metrics.Precision()
                     ])

nn_model.fit(X_train, y_train, epochs=100, verbose=1
            ,validation_data=(X_val, y_val))

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100


Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100


Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<keras.callbacks.History at 0x221002213a0>

In [28]:
y_pred_nn = np.argmax(nn_model.predict(X_test),axis=1)
nn_model_acc = accuracy_score(y_test, y_pred_nn)
nn_model_prec = precision_score(y_test, y_pred_nn, average=None)
nn_model_f1 = f1_score(y_test, y_pred_nn, average=None)
nn_model_recall = recall_score(y_test, y_pred_nn, average=None)



In [30]:
plot_loss_accuracy(nn_model.history)

lastes accuracy and loss


KeyError: 'loss'

### KNN

In [None]:
knn_model = KNeighborsClassifier(n_neighbors=5)
knn_model.fit(X_train, y_train)

In [None]:
y_pred_knn = np.argmax(knn_model.predict(X_test), axis=1)
knn_model_acc = accuracy_score(y_test, y_pred_knn)
knn_model_prec = precision_score(y_test, y_pred_knn, average=None)
knn_model_f1 = f1_score(y_test, y_pred_knn, average=None)
knn_model_recall = recall_score(y_test, y_pred_knn, average=None)

## Ensemble

### Random Forest

In [None]:
rfc_model = RandomForestClassifier(n_estimators=1000)
rfc_model.fit(X_train, y_train)


In [None]:
y_pred_rf = np.argmax(rfc_model.predict(X_test), axis=1)
rfc_model_acc = accuracy_score(y_test, y_pred_rf)
rfc_model_prec = precision_score(y_test, y_pred_rf, average=None)
rfc_model_f1 = f1_score(y_test, y_pred_rf, average=None)
rfc_model_recall = recall_score(y_test, y_pred_rf, average=None)

### AdaBoost

In [None]:
ada_model = AdaBoostClassifier(DecisionTreeClassifier(), n_estimators=1000)
ada_model.fit(X_train, np.argmax(y_train, axis=1))

In [None]:
y_pred_ada =  ada_model.predict(X_test)
ada_model_acc = accuracy_score(y_test, y_pred_ada)
ada_model_prec = precision_score(y_test, y_pred_ada, average=None)
ada_model_f1 = f1_score(y_test, y_pred_ada, average=None)
ada_model_recall = recall_score(y_test, y_pred_ada, average=None)

### Bagging

In [None]:
bag_model = BaggingClassifier(SVC(), n_estimators=1000)
bag_model.fit(X_train,  np.argmax(y_train,axis=1))

In [None]:
y_pred_bag =  bag_model.predict(X_test)
bag_model_acc = accuracy_score(y_test, y_pred_bag)
bag_model_prec = precision_score(y_test, y_pred_bag, average=None)
bag_model_f1 = f1_score(y_test, y_pred_bag, average=None)
bag_model_recall = recall_score(y_test, y_pred_bag, average=None)

---

# Visualisation

## Confusion Matrix

In [None]:
create_confusion_matrix(nn_model,X_test,keras.utils.to_categorical(y_test,3),n_name)

In [None]:
create_confusion_matrix(knn_model,X_test,keras.utils.to_categorical(y_test,3),n_name)

In [None]:
create_confusion_matrix(rfc_model,X_test,keras.utils.to_categorical(y_test,3),n_name)

In [None]:
create_confusion_matrix(ada_model,X_test,y_test,n_name)

In [None]:
create_confusion_matrix(bag_model,X_test,y_test,n_name)

## Accuracy

In [None]:
df = pd.DataFrame({'Models':['Neural Network', 'KNN', 'Random Forest', "AdaBoost", "Bagging"], 'Accuracy':[nn_model_acc, knn_model_acc, rfc_model_acc, ada_model_acc, bag_model_acc]})
ax = df.plot.bar(x='Models', y='Accuracy', rot=0, ylim=(0, 1), title="Accuracy Comparison Between Models")

## Precision

In [None]:
precs = [nn_model_prec, knn_model_prec, rfc_model_prec, ada_model_prec, bag_model_prec]
ind = np.arange(len(precs))
width = 0.25

class_0 = [nn_model_prec[0], knn_model_prec[0], rfc_model_prec[0], ada_model_prec[0], bag_model_prec[0]]
bar1 = plt.bar(ind, class_0, width, color = 'blue')
  
class_1 = [nn_model_prec[1], knn_model_prec[1], rfc_model_prec[1], ada_model_prec[1], bag_model_prec[1]]
bar2 = plt.bar(ind+width, class_1, width, color='g')
  
class_2 = [nn_model_prec[2], knn_model_prec[2], rfc_model_prec[2], ada_model_prec[2], bag_model_prec[2]]
bar3 = plt.bar(ind+width*2, class_2, width, color = 'orange')
  
plt.xlabel("Models")
plt.ylabel('Precision')
plt.title("Precision Comparison between Models")
plt.ylim((0, 1))
  
plt.xticks(ind+width,['Neural Network', 'KNN', 'Random Forest', "AdaBoost", "Bagging"])
plt.legend( (bar1, bar2, bar3), ("lived", "died", "euthanized") )
plt.show()

## F1

In [None]:
f1 = [nn_model_f1, knn_model_f1, rfc_model_f1, ada_model_f1, bag_model_f1]
ind = np.arange(len(f1))
width = 0.25

class_0 = [nn_model_f1[0], knn_model_f1[0], rfc_model_f1[0], ada_model_f1[0], bag_model_f1[0]]
bar1 = plt.bar(ind, class_0, width, color = 'blue')
  
class_1 = [nn_model_f1[1], knn_model_f1[1], rfc_model_f1[1], ada_model_f1[1], bag_model_f1[1]]
bar2 = plt.bar(ind+width, class_1, width, color='g')
  
class_2 = [nn_model_f1[2], knn_model_f1[2], rfc_model_f1[2], ada_model_f1[2], bag_model_f1[2]]
bar3 = plt.bar(ind+width*2, class_2, width, color = 'orange')
  
plt.xlabel("Models")
plt.ylabel('F1')
plt.title("F1 Comparison between Models")
plt.ylim((0, 1))
  
plt.xticks(ind+width,['Neural Network', 'KNN', 'Random Forest', "AdaBoost", "Bagging"])
plt.legend( (bar1, bar2, bar3), ("lived", "died", "euthanized") )
plt.show()

## Recall

In [None]:
recall = [nn_model_recall, knn_model_recall, rfc_model_recall, ada_model_recall, bag_model_recall]
ind = np.arange(len(recall))
width = 0.25

class_0 = [nn_model_recall[0], knn_model_recall[0], rfc_model_recall[0], ada_model_recall[0], bag_model_recall[0]]
bar1 = plt.bar(ind, class_0, width, color = 'blue')
  
class_1 = [nn_model_recall[1], knn_model_recall[1], rfc_model_recall[1], ada_model_recall[1], bag_model_recall[1]]
bar2 = plt.bar(ind+width, class_1, width, color='g')
  
class_2 = [nn_model_recall[2], knn_model_recall[2], rfc_model_recall[2], ada_model_recall[2], bag_model_recall[2]]
bar3 = plt.bar(ind+width*2, class_2, width, color = 'orange')
  
plt.xlabel("Models")
plt.ylabel('Recall')
plt.title("Recall Comparison between Models")
plt.ylim((0, 1))
  
plt.xticks(ind+width,['Neural Network', 'KNN', 'Random Forest', "AdaBoost", "Bagging"])
plt.legend( (bar1, bar2, bar3), ("lived", "died", "euthanized") )
plt.show()

---