In [180]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import Imputer
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from statistics import mode
from sklearn import metrics
from sklearn.ensemble import RandomForestRegressor

__Load Dataset__

In [91]:
df = pd.read_csv('horse.csv')
df.head()

Unnamed: 0,surgery,age,hospital_number,rectal_temp,pulse,respiratory_rate,temp_of_extremities,peripheral_pulse,mucous_membrane,capillary_refill_time,...,packed_cell_volume,total_protein,abdomo_appearance,abdomo_protein,outcome,surgical_lesion,lesion_1,lesion_2,lesion_3,cp_data
0,no,adult,530101,38.5,66.0,28.0,cool,reduced,,more_3_sec,...,45.0,8.4,,,died,no,11300,0,0,no
1,yes,adult,534817,39.2,88.0,20.0,,,pale_cyanotic,less_3_sec,...,50.0,85.0,cloudy,2.0,euthanized,no,2208,0,0,no
2,no,adult,530334,38.3,40.0,24.0,normal,normal,pale_pink,less_3_sec,...,33.0,6.7,,,lived,no,0,0,0,yes
3,yes,young,5290409,39.1,164.0,84.0,cold,normal,dark_cyanotic,more_3_sec,...,48.0,7.2,serosanguious,5.3,died,yes,2208,0,0,yes
4,no,adult,530255,37.3,104.0,35.0,,,dark_cyanotic,more_3_sec,...,74.0,7.4,,,died,no,4300,0,0,no


In [92]:
df.isnull().sum()

surgery                    0
age                        0
hospital_number            0
rectal_temp               60
pulse                     24
respiratory_rate          58
temp_of_extremities       56
peripheral_pulse          69
mucous_membrane           47
capillary_refill_time     32
pain                      55
peristalsis               44
abdominal_distention      56
nasogastric_tube         104
nasogastric_reflux       106
nasogastric_reflux_ph    246
rectal_exam_feces        102
abdomen                  118
packed_cell_volume        29
total_protein             33
abdomo_appearance        165
abdomo_protein           198
outcome                    0
surgical_lesion            0
lesion_1                   0
lesion_2                   0
lesion_3                   0
cp_data                    0
dtype: int64

In [121]:
df.head()

Unnamed: 0,surgery,age,hospital_number,rectal_temp,pulse,respiratory_rate,temp_of_extremities,peripheral_pulse,mucous_membrane,capillary_refill_time,...,packed_cell_volume,total_protein,abdomo_appearance,abdomo_protein,outcome,surgical_lesion,lesion_1,lesion_2,lesion_3,cp_data
0,0,0,530101,38.5,66.0,28.0,0,4,0,3,...,45.0,8.4,0,,0,0,11300,0,0,0
1,1,0,534817,39.2,88.0,20.0,0,0,5,2,...,50.0,85.0,2,2.0,1,0,2208,0,0,0
2,0,0,530334,38.3,40.0,24.0,0,3,6,2,...,33.0,6.7,0,,2,0,0,0,0,1
3,1,1,5290409,39.1,164.0,84.0,0,3,3,3,...,48.0,7.2,3,5.3,0,1,2208,0,0,1
4,0,0,530255,37.3,104.0,35.0,0,0,3,3,...,74.0,7.4,0,,0,0,4300,0,0,0


In [94]:
df.nunique()

surgery                    2
age                        2
hospital_number          283
rectal_temp               40
pulse                     52
respiratory_rate          40
temp_of_extremities        4
peripheral_pulse           4
mucous_membrane            6
capillary_refill_time      3
pain                       5
peristalsis                4
abdominal_distention       4
nasogastric_tube           3
nasogastric_reflux         3
nasogastric_reflux_ph     20
rectal_exam_feces          4
abdomen                    5
packed_cell_volume        50
total_protein             80
abdomo_appearance          3
abdomo_protein            37
outcome                    3
surgical_lesion            2
lesion_1                  61
lesion_2                   6
lesion_3                   2
cp_data                    2
dtype: int64

__This dataset contains many categorical features, replace them with label encoding__

In [95]:
df.surgery = LabelEncoder().fit_transform(df.surgery)
df.age = LabelEncoder().fit_transform(df.age)
df.outcome = LabelEncoder().fit_transform(df.outcome)
df.surgical_lesion = LabelEncoder().fit_transform(df.surgical_lesion)

In [112]:
df.temp_of_extremities = pd.Categorical(df.temp_of_extremities).codes
df.peripheral_pulse = pd.Categorical(df.peripheral_pulse).codes
df.mucous_membrane = pd.Categorical(df.mucous_membrane).codes
df.capillary_refill_time = pd.Categorical(df.capillary_refill_time).codes
df.pain = pd.Categorical(df.pain).codes
df.peristalsis = pd.Categorical(df.peristalsis).codes
df.abdominal_distention =  pd.Categorical(df.abdominal_distention).codes
df.nasogastric_tube =  pd.Categorical(df.nasogastric_tube).codes
df.nasogastric_reflux = pd.Categorical(df.nasogastric_reflux).codes
df.rectal_exam_feces = pd.Categorical(df.rectal_exam_feces).codes
df.abdomen =  pd.Categorical(df.abdomen).codes
df.abdomo_appearance =  pd.Categorical(df.abdomo_appearance).codes
df.cp_data =  pd.Categorical(df.cp_data).codes

In [133]:
df.isnull().sum()

surgery                  0
age                      0
hospital_number          0
rectal_temp              0
pulse                    0
respiratory_rate         0
temp_of_extremities      0
peripheral_pulse         0
mucous_membrane          0
capillary_refill_time    0
pain                     0
peristalsis              0
abdominal_distention     0
nasogastric_tube         0
nasogastric_reflux       0
nasogastric_reflux_ph    0
rectal_exam_feces        0
abdomen                  0
packed_cell_volume       0
total_protein            0
abdomo_appearance        0
abdomo_protein           0
outcome                  0
surgical_lesion          0
lesion_1                 0
lesion_2                 0
lesion_3                 0
cp_data                  0
dtype: int64

__Replace the missing values by the most frequent value in each column__

In [132]:
df.rectal_temp=df.rectal_temp.fillna(mode(df.rectal_temp))
df.pulse=df.pulse.fillna(mode(df.pulse))
df.respiratory_rate=df.respiratory_rate.fillna(np.mean(df.respiratory_rate))
df.nasogastric_reflux_ph=df.nasogastric_reflux_ph.fillna(mode(df.nasogastric_reflux_ph))
df.packed_cell_volume=df.packed_cell_volume.fillna(np.mean(df.packed_cell_volume))
df.total_protein=df.total_protein.fillna(np.mean(df.total_protein))
df.abdomo_protein=df.abdomo_protein.fillna(mode(df.abdomo_protein))

__Create Training and Test Dataset__

In [170]:
x = df.drop('outcome',axis=1)
y = df.outcome

# x = StandardScaler().fit_transform(x)

x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=.3,random_state=1)

### Create Decision Tree Model ###

In [173]:
dt = DecisionTreeClassifier(criterion="entropy")

dt.fit(x_train,y_train)

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

__Evaluating Model__

In [174]:
y_pred = dt.predict(x_test)
print("Accuracy:", round(metrics.accuracy_score(y_test, y_pred),2)*100,'%')

Accuracy: 66.0 %


In [185]:
dt.score(x_test,y_test)

0.6555555555555556

### Using Random Forest ###

In [191]:
rf =RandomForestRegressor(n_estimators = 1000, random_state = 42)
rf.fit(x_train,y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=1,
           oob_score=False, random_state=42, verbose=0, warm_start=False)

In [192]:
y_pred = rf.predict(x_test)
rf.score(x_test,y_test)
# print("Accuracy:", round(metrics.accuracy_score(y_test, y_pred),2)*100,'%')

0.4330670982745686