In [15]:
#importing library
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, f1_score

In [16]:
#importing data
df=pd.read_csv('patient.csv')
df

Unnamed: 0,HAEMATOCRIT,HAEMOGLOBINS,ERYTHROCYTE,LEUCOCYTE,THROMBOCYTE,MCH,MCHC,MCV,AGE,SEX,SOURCE
0,35.1,11.8,4.65,6.3,310,25.4,33.6,75.5,1,F,out
1,43.5,14.8,5.39,12.7,334,27.5,34.0,80.7,1,F,out
2,33.5,11.3,4.74,13.2,305,23.8,33.7,70.7,1,F,out
3,39.1,13.7,4.98,10.5,366,27.5,35.0,78.5,1,F,out
4,30.9,9.9,4.23,22.1,333,23.4,32.0,73.0,1,M,out
...,...,...,...,...,...,...,...,...,...,...,...
4407,32.8,10.4,3.49,8.1,72,29.8,31.7,94.0,92,F,in
4408,33.7,10.8,3.67,6.7,70,29.4,32.0,91.8,92,F,in
4409,33.2,11.2,3.47,7.2,235,32.3,33.7,95.7,93,F,out
4410,31.5,10.4,3.15,9.1,187,33.0,33.0,100.0,98,F,in


In [17]:
df=df.head(350)
df

Unnamed: 0,HAEMATOCRIT,HAEMOGLOBINS,ERYTHROCYTE,LEUCOCYTE,THROMBOCYTE,MCH,MCHC,MCV,AGE,SEX,SOURCE
0,35.1,11.8,4.65,6.3,310,25.4,33.6,75.5,1,F,out
1,43.5,14.8,5.39,12.7,334,27.5,34.0,80.7,1,F,out
2,33.5,11.3,4.74,13.2,305,23.8,33.7,70.7,1,F,out
3,39.1,13.7,4.98,10.5,366,27.5,35.0,78.5,1,F,out
4,30.9,9.9,4.23,22.1,333,23.4,32.0,73.0,1,M,out
...,...,...,...,...,...,...,...,...,...,...,...
345,40.7,14.0,5.12,2.1,148,27.3,34.4,79.5,14,M,in
346,42.5,13.7,5.37,4.8,152,25.5,32.2,79.1,14,M,in
347,41.8,14.2,5.08,2.4,130,28.0,34.0,82.3,14,M,in
348,46.2,15.2,5.79,3.2,214,26.3,32.9,79.8,14,M,out


In [10]:
df.info

<bound method DataFrame.info of      HAEMATOCRIT  HAEMOGLOBINS  ERYTHROCYTE  LEUCOCYTE  THROMBOCYTE   MCH  \
0           35.1          11.8         4.65        6.3          310  25.4   
1           43.5          14.8         5.39       12.7          334  27.5   
2           33.5          11.3         4.74       13.2          305  23.8   
3           39.1          13.7         4.98       10.5          366  27.5   
4           30.9           9.9         4.23       22.1          333  23.4   
..           ...           ...          ...        ...          ...   ...   
345         40.7          14.0         5.12        2.1          148  27.3   
346         42.5          13.7         5.37        4.8          152  25.5   
347         41.8          14.2         5.08        2.4          130  28.0   
348         46.2          15.2         5.79        3.2          214  26.3   
349         40.5          13.4         5.14       12.9          200  26.1   

     MCHC   MCV  AGE SEX SOURCE  
0    33.6

In [26]:
#replace string with numbers
def preprocess_input(df):
    df=df.copy()
    df['SEX']=df['SEX'].replace({'F':0,'M':1})
    #split data in x and y
    y=df['SOURCE']
    x=df.drop('SOURCE',axis=1)
    #train 
    x_train,x_test,y_train,y_test=train_test_split(x,y,train_size=0.7,shuffle=True,random_state=1)
    #Scale x
    scaler=StandardScaler()
    scaler.fit(x_train)
    x_train=pd.DataFrame(scaler.transform(x_train),index=x_train.index,columns=x_train.columns)
    x_test=pd.DataFrame(scaler.transform(x_test),index=x_test.index,columns=x_test.columns)
    return x_train,x_test,y_train,y_test


In [34]:
x_train,x_test,y_train,y_test=preprocess_input(df)
y_test

192    out
256    out
169    out
67     out
201     in
      ... 
199     in
70     out
228    out
122    out
41      in
Name: SOURCE, Length: 106, dtype: object

In [35]:
y_train.value_counts()

out    149
in      95
Name: SOURCE, dtype: int64

In [40]:
models={
    'Logistic Regresion': LogisticRegression(),
    'Decision Tree' : DecisionTreeClassifier(),
    'Neural Network': MLPClassifier(),
    'Gradient Boosting': GradientBoostingClassifier()
    
}

In [41]:
models

{'Logistic Regresion': LogisticRegression(),
 'Decision Tree': DecisionTreeClassifier(),
 'Neural Network': MLPClassifier(),
 'Gradient Boosting': GradientBoostingClassifier()}

In [42]:
for name,model in models.items():
    model.fit(x_train,y_train)
    print(name+'Trained')
    
    

Logistic RegresionTrained
Decision TreeTrained




Neural NetworkTrained
Gradient BoostingTrained


In [46]:
for name,model in models.items():
    y_pred=model.predict(x_test)
    acc=accuracy_score(y_test,y_pred)
    print(name+'Accuracy:{:2f}%'.format(acc*100))

Logistic RegresionAccuracy:80.188679%
Decision TreeAccuracy:80.188679%
Neural NetworkAccuracy:83.018868%
Gradient BoostingAccuracy:85.849057%


In [49]:
for name,model in models.items():
    y_pred=model.predict(x_test)
    f1=f1_score(y_test,y_pred,pos_label='in')
    print(name+'f1:{:2f}%'.format(f1))

Logistic Regresionf1:0.704225%
Decision Treef1:0.727273%
Neural Networkf1:0.742857%
Gradient Boostingf1:0.782609%
