In [1]:
import pandas as pd
import numpy as np

data = pd.read_csv('horse.csv')
data.head()

Unnamed: 0,surgery,age,hospital_number,rectal_temp,pulse,respiratory_rate,temp_of_extremities,peripheral_pulse,mucous_membrane,capillary_refill_time,...,packed_cell_volume,total_protein,abdomo_appearance,abdomo_protein,outcome,surgical_lesion,lesion_1,lesion_2,lesion_3,cp_data
0,no,adult,530101,38.5,66.0,28.0,cool,reduced,,more_3_sec,...,45.0,8.4,,,died,no,11300,0,0,no
1,yes,adult,534817,39.2,88.0,20.0,,,pale_cyanotic,less_3_sec,...,50.0,85.0,cloudy,2.0,euthanized,no,2208,0,0,no
2,no,adult,530334,38.3,40.0,24.0,normal,normal,pale_pink,less_3_sec,...,33.0,6.7,,,lived,no,0,0,0,yes
3,yes,young,5290409,39.1,164.0,84.0,cold,normal,dark_cyanotic,more_3_sec,...,48.0,7.2,serosanguious,5.3,died,yes,2208,0,0,yes
4,no,adult,530255,37.3,104.0,35.0,,,dark_cyanotic,more_3_sec,...,74.0,7.4,,,died,no,4300,0,0,no


In [2]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 299 entries, 0 to 298
Data columns (total 28 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   surgery                299 non-null    object 
 1   age                    299 non-null    object 
 2   hospital_number        299 non-null    int64  
 3   rectal_temp            239 non-null    float64
 4   pulse                  275 non-null    float64
 5   respiratory_rate       241 non-null    float64
 6   temp_of_extremities    243 non-null    object 
 7   peripheral_pulse       230 non-null    object 
 8   mucous_membrane        252 non-null    object 
 9   capillary_refill_time  267 non-null    object 
 10  pain                   244 non-null    object 
 11  peristalsis            255 non-null    object 
 12  abdominal_distention   243 non-null    object 
 13  nasogastric_tube       195 non-null    object 
 14  nasogastric_reflux     193 non-null    object 
 15  nasoga

In [3]:
data.isna().sum()

surgery                    0
age                        0
hospital_number            0
rectal_temp               60
pulse                     24
respiratory_rate          58
temp_of_extremities       56
peripheral_pulse          69
mucous_membrane           47
capillary_refill_time     32
pain                      55
peristalsis               44
abdominal_distention      56
nasogastric_tube         104
nasogastric_reflux       106
nasogastric_reflux_ph    246
rectal_exam_feces        102
abdomen                  118
packed_cell_volume        29
total_protein             33
abdomo_appearance        165
abdomo_protein           198
outcome                    0
surgical_lesion            0
lesion_1                   0
lesion_2                   0
lesion_3                   0
cp_data                    0
dtype: int64

In [4]:
#Target Class
data.outcome.value_counts()

lived         178
died           77
euthanized     44
Name: outcome, dtype: int64

In [5]:
features = data.drop(['outcome'], axis = 1)
target = data[['outcome']]

In [6]:
features.shape,target.shape

((299, 27), (299, 1))

In [7]:
features.dtypes

surgery                   object
age                       object
hospital_number            int64
rectal_temp              float64
pulse                    float64
respiratory_rate         float64
temp_of_extremities       object
peripheral_pulse          object
mucous_membrane           object
capillary_refill_time     object
pain                      object
peristalsis               object
abdominal_distention      object
nasogastric_tube          object
nasogastric_reflux        object
nasogastric_reflux_ph    float64
rectal_exam_feces         object
abdomen                   object
packed_cell_volume       float64
total_protein            float64
abdomo_appearance         object
abdomo_protein           float64
surgical_lesion           object
lesion_1                   int64
lesion_2                   int64
lesion_3                   int64
cp_data                   object
dtype: object

In [8]:
features_transformed = pd.get_dummies(features)

In [9]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [10]:
X_train , X_test, y_train, y_test = train_test_split(features_transformed, target, random_state = 10)

In [11]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(224, 67)
(75, 67)
(224, 1)
(75, 1)


In [12]:
from sklearn.impute import SimpleImputer

In [13]:
imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent')

In [14]:
X_train = imputer.fit_transform(X_train)
X_test = imputer.fit_transform(X_test)

In [15]:
my_DT_model = DecisionTreeClassifier(criterion='entropy', random_state=2, max_depth = 1)

In [16]:
my_DT_model.fit(X_train,y_train)

DecisionTreeClassifier(criterion='entropy', max_depth=1, random_state=2)

### Using GridSearchCV to find best params

In [17]:
from sklearn.model_selection import GridSearchCV

In [18]:
params = {'criterion':['gini', 'entropy'], 'max_depth':[1,2,3,10], 'splitter' :['best', 'random']}

In [19]:
grid_search = GridSearchCV(my_DT_model, params, cv = 3, n_jobs = -1)

In [20]:
grid_search.fit(X_train, y_train)

GridSearchCV(cv=3,
             estimator=DecisionTreeClassifier(criterion='entropy', max_depth=1,
                                              random_state=2),
             n_jobs=-1,
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': [1, 2, 3, 10],
                         'splitter': ['best', 'random']})

In [21]:
grid_search.best_params_

{'criterion': 'gini', 'max_depth': 3, 'splitter': 'best'}

In [22]:
my_DT_model = DecisionTreeClassifier(criterion='gini', random_state=2, max_depth = 3, splitter = 'best')

In [23]:
my_DT_model.fit(X_train, y_train)

DecisionTreeClassifier(max_depth=3, random_state=2)

In [24]:
my_preds = my_DT_model.predict(X_test)

In [25]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [26]:
accuracy_score(y_test, my_preds)

0.6666666666666666

In [27]:
print(confusion_matrix(y_test, my_preds, ))

[[ 3  0 12]
 [ 1  1  9]
 [ 3  0 46]]


In [28]:
print(classification_report(y_test, my_preds))

              precision    recall  f1-score   support

        died       0.43      0.20      0.27        15
  euthanized       1.00      0.09      0.17        11
       lived       0.69      0.94      0.79        49

    accuracy                           0.67        75
   macro avg       0.71      0.41      0.41        75
weighted avg       0.68      0.67      0.60        75



# Voting Classifiers

In [29]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier

In [30]:
rf_clf = RandomForestClassifier()
log_clf = LogisticRegression()
svm_clf = SVC()

In [31]:
voting_clf = VotingClassifier(estimators=[('lr', log_clf), ('rf', rf_clf), ('svc', svm_clf)])

In [32]:
voting_clf.fit(X_train, y_train)

  return f(*args, **kwargs)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


VotingClassifier(estimators=[('lr', LogisticRegression()),
                             ('rf', RandomForestClassifier()), ('svc', SVC())])

In [33]:
from sklearn.metrics import accuracy_score

In [34]:
for clf in (log_clf, rf_clf, svm_clf, voting_clf):
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print(clf.__class__.__name__, accuracy_score(y_test, y_pred))


  return f(*args, **kwargs)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  


LogisticRegression 0.6266666666666667
RandomForestClassifier 0.7333333333333333
SVC 0.6533333333333333


  return f(*args, **kwargs)
  return f(*args, **kwargs)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


VotingClassifier 0.64


In [35]:
from sklearn.ensemble import BaggingClassifier

In [36]:
bag_clf = BaggingClassifier(DecisionTreeClassifier(), n_estimators=100)

In [37]:
bag_clf.fit(X_train,y_train)

  return f(*args, **kwargs)


BaggingClassifier(base_estimator=DecisionTreeClassifier(), n_estimators=100)

In [38]:
y_pred = bag_clf.predict(X_test)
accuracy_score(y_pred, y_test)

0.7333333333333333

In [39]:
my_rf_classifier = RandomForestClassifier()

In [40]:
my_rf_classifier.fit(X_train, y_train)

  """Entry point for launching an IPython kernel.


RandomForestClassifier()

In [41]:
my_predictions = my_rf_classifier.predict(X_test)

In [42]:
print(accuracy_score(y_test, my_predictions))

0.7466666666666667


In [43]:
print(confusion_matrix(y_test, my_predictions))

[[ 8  0  7]
 [ 1  2  8]
 [ 3  0 46]]


In [44]:
print(classification_report(y_test, my_predictions))

              precision    recall  f1-score   support

        died       0.67      0.53      0.59        15
  euthanized       1.00      0.18      0.31        11
       lived       0.75      0.94      0.84        49

    accuracy                           0.75        75
   macro avg       0.81      0.55      0.58        75
weighted avg       0.77      0.75      0.71        75



In [45]:
from sklearn.ensemble import VotingClassifier, BaggingClassifier
from sklearn.linear_model import LogisticRegression

In [46]:
my_logreg_clf = LogisticRegression()

In [47]:
my_vt_clf = VotingClassifier(estimators=[('lr', my_logreg_clf), ('rf', my_rf_classifier)],
                            voting = 'hard')

In [48]:
my_vt_clf.fit(X_train, y_train)

  return f(*args, **kwargs)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


VotingClassifier(estimators=[('lr', LogisticRegression()),
                             ('rf', RandomForestClassifier())])

In [49]:
my_bagging_clf = BaggingClassifier(DecisionTreeClassifier(), n_estimators=100,
                                      max_samples=100, bootstrap=True)

my_bagging_clf.fit(X_train, y_train)

  return f(*args, **kwargs)


BaggingClassifier(base_estimator=DecisionTreeClassifier(), max_samples=100,
                  n_estimators=100)

In [50]:
from sklearn.ensemble import AdaBoostClassifier

ada_clf = AdaBoostClassifier(DecisionTreeClassifier(), n_estimators= 100)

In [51]:
ada_clf.fit(X_train, y_train)

  return f(*args, **kwargs)


AdaBoostClassifier(base_estimator=DecisionTreeClassifier(), n_estimators=100)

In [52]:
from sklearn.ensemble import GradientBoostingClassifier

In [53]:
#learning_rate = 0.3, max_depth=5, n_estimators=1100, n_iter_no_change=10
gbc_clf = GradientBoostingClassifier()

In [54]:
gbc_clf.fit(X_train, y_train)

  return f(*args, **kwargs)


GradientBoostingClassifier()

In [55]:
gbc_clf.n_estimators_

100

In [56]:
import xgboost, time

In [57]:
xgb_clf = xgboost.XGBClassifier()

In [58]:
start = time.time()
xgb_clf.fit(X_train, y_train)
end = time.time()

time_elapsed = end - start
print(time_elapsed)

  return f(*args, **kwargs)


88.5109612941742


In [59]:
y_pred = xgb_clf.predict(X_test)

In [60]:
accuracy_score(y_pred, y_test)

0.7466666666666667

In [None]:
params = {'n_estimators':[100, 200, 400, 800], 'max_depth':[1,2,3,6,10], 'learning_rate' :[0.1, 0.2, 0.3, 0.5], 'min_child_weight' : [1, 2, 3, 4, 5], 'subsample' : [0.5, 0.6, 0.7, 0.8, 1.0]}
grid_search = GridSearchCV(xgb_clf, params, cv = 3, n_jobs = -1)
grid_search.fit(X_train, y_train)

In [None]:
grid_search.best_params_

In [None]:
xgb_clf = xgboost.XGBClassifier(learning_rate = 0.1, max_depth = 3, min_child_weight = 5, n_estimators = 200, subsample = 0.6)

In [None]:
xgb_clf.fit(X_train, y_train)

In [None]:
y_pred = xgb_clf.predict(X_test)
accuracy_score(y_pred, y_test)