# Horse Survival Prediction

In [29]:
import numpy as np
#pandas 
import pandas as pd
#StandardScaler
from sklearn.preprocessing import StandardScaler
#train_test_split
from sklearn.model_selection import train_test_split
#DecisionTree
from sklearn.tree import DecisionTreeClassifier
#RandomForest
from sklearn.ensemble import RandomForestClassifier

# Loading the Dataset

In [30]:
df=pd.read_csv('/kaggle/input/horse-colic/horse.csv')
#showing the dataset
df

Unnamed: 0,surgery,age,hospital_number,rectal_temp,pulse,respiratory_rate,temp_of_extremities,peripheral_pulse,mucous_membrane,capillary_refill_time,...,packed_cell_volume,total_protein,abdomo_appearance,abdomo_protein,outcome,surgical_lesion,lesion_1,lesion_2,lesion_3,cp_data
0,no,adult,530101,38.5,66.0,28.0,cool,reduced,,more_3_sec,...,45.0,8.4,,,died,no,11300,0,0,no
1,yes,adult,534817,39.2,88.0,20.0,,,pale_cyanotic,less_3_sec,...,50.0,85.0,cloudy,2.0,euthanized,no,2208,0,0,no
2,no,adult,530334,38.3,40.0,24.0,normal,normal,pale_pink,less_3_sec,...,33.0,6.7,,,lived,no,0,0,0,yes
3,yes,young,5290409,39.1,164.0,84.0,cold,normal,dark_cyanotic,more_3_sec,...,48.0,7.2,serosanguious,5.3,died,yes,2208,0,0,yes
4,no,adult,530255,37.3,104.0,35.0,,,dark_cyanotic,more_3_sec,...,74.0,7.4,,,died,no,4300,0,0,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
294,yes,adult,533886,,120.0,70.0,cold,,pale_cyanotic,more_3_sec,...,55.0,65.0,,,euthanized,no,3205,0,0,no
295,no,adult,527702,37.2,72.0,24.0,cool,increased,pale_cyanotic,more_3_sec,...,44.0,,serosanguious,3.3,euthanized,yes,2208,0,0,yes
296,yes,adult,529386,37.5,72.0,30.0,cold,reduced,pale_cyanotic,less_3_sec,...,60.0,6.8,,,died,yes,3205,0,0,no
297,yes,adult,530612,36.5,100.0,24.0,cool,reduced,pale_pink,less_3_sec,...,50.0,6.0,serosanguious,3.4,lived,yes,2208,0,0,yes


# Getting the Basic Information about the Dataset

In [31]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 299 entries, 0 to 298
Data columns (total 28 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   surgery                299 non-null    object 
 1   age                    299 non-null    object 
 2   hospital_number        299 non-null    int64  
 3   rectal_temp            239 non-null    float64
 4   pulse                  275 non-null    float64
 5   respiratory_rate       241 non-null    float64
 6   temp_of_extremities    243 non-null    object 
 7   peripheral_pulse       230 non-null    object 
 8   mucous_membrane        252 non-null    object 
 9   capillary_refill_time  267 non-null    object 
 10  pain                   244 non-null    object 
 11  peristalsis            255 non-null    object 
 12  abdominal_distention   243 non-null    object 
 13  nasogastric_tube       195 non-null    object 
 14  nasogastric_reflux     193 non-null    object 
 15  nasoga

# Preprocessing the Dataset

In [32]:
def binary_encode(df,columns,positive_values):
    df=df.copy()
    for column,positive_value in zip(columns,positive_values):
        df[column]=df[column].apply(lambda x:1 if x==positive_value else 0)
    return df

def ordinal_encode(df,columns,orderings):
    df=df.copy()
    for column,ordering in zip(columns,orderings):
        df[column]=df[column].apply(lambda x:ordering.index(x))
        
    return df

def onehot_encode(df,columns,prefixes):
    df=df.copy()
    for column,prefix in zip(columns,prefixes):
        dummies=pd.get_dummies(df[column])
        df=pd.concat([df,dummies],axis=1)
        df=df.drop(column,axis=1)
    return df
    

In [33]:
def preprocess_inputs(df):
    df=df.copy()
    #Split df into x and y
    binary_features=['surgery','age',
                     'surgical_lesion',
                     'cp_data'
        
    ]
    positive_values=[
        'yes','adult','yes','yes'
    ]
    ordinal_features=['temp_of_extremities',
                      'peripheral_pulse',
                      'capillary_refill_time',
                      'pain',
                      'peristalsis',
                      'abdominal_distention',
                      'nasogastric_tube',
                      'nasogastric_reflux',
                      'rectal_exam_feces'
        
    ]
    orderings=[
        ['cold','cool','normal','warm'],
        ['absent','reduced','normal','increased'],
        ['less_3_sec','3','more_3_sec'],
        ['alert','depressed','mild_pain','severe_pain','extreme_pain'],
        ['absent','hypomotile','normal','hypermotile'],
        ['none','slight','moderate','severe'],
        ['none','slight','significant'],
        ['none','less_1_liter','more_1_liter'],
        ['absent','decreased','normal','increased']
        
    ]
    nominal_features=['mucous_membrane',
                      'hospital_number',
                      'abdomen',
                      'abdomo_appearance'
        
    ]
    prefixes=[
        "MM",
        "HN",
        'AB',
        'AA'
    ]
    for column in df.columns:
        if column in df.select_dtypes('object').columns:
            if column not in nominal_features:
                df[column]=df[column].fillna(df[column].mode()[0])
        else:
            df[column]=df[column].fillna(df[column].mean())
    df=binary_encode(df,columns=binary_features,
                    positive_values=positive_values)
    df=ordinal_encode(df,columns=ordinal_features,
                     orderings=orderings)
    df=onehot_encode(df,columns=nominal_features,
                     prefixes=prefixes)
    #Encode labels
    label_encoding={'lived':0,'died':1,'euthanized':2}
    df['outcome']=df['outcome'].replace(label_encoding)
    
    y=df['outcome'].copy()
    x=df.drop('outcome',axis=1)
    #scaling
    scaler=StandardScaler()
    x=pd.DataFrame(scaler.fit_transform(x),columns=x.columns)
    
    #train_test_split
    x_train,x_test,y_train,y_test=train_test_split(x,y,train_size=0.7)
    return x_train,x_test,y_train,y_test

In [34]:
print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)

NameError: name 'x_train' is not defined

In [35]:
x_train,x_test,y_train,y_test=preprocess_inputs(df)



In [36]:
models={
    'Decision Tree':DecisionTreeClassifier(),
    'Random Forest Classifier':RandomForestClassifier()
}

In [39]:
for name,model in models.items():
    model.fit(x_train,y_train)
    print(name)
    print(model.score(x_test,y_test))

Decision Tree
0.7555555555555555
Random Forest Classifier
0.7888888888888889




In [None]:
y.value_counts()

In [None]:
{column:list(x[column].unique()) for column in x.select_dtypes('object').columns}

In [None]:
x.isna().sum()

In [None]:
y.unique()