In [1]:
#importing library
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from sklearn.metrics import accuracy_score,f1_score


In [2]:
#loading the data
df=pd.read_csv('../input/patient-treatment-classification/data-ori.csv')
df.head()

Unnamed: 0,HAEMATOCRIT,HAEMOGLOBINS,ERYTHROCYTE,LEUCOCYTE,THROMBOCYTE,MCH,MCHC,MCV,AGE,SEX,SOURCE
0,35.1,11.8,4.65,6.3,310,25.4,33.6,75.5,1,F,out
1,43.5,14.8,5.39,12.7,334,27.5,34.0,80.7,1,F,out
2,33.5,11.3,4.74,13.2,305,23.8,33.7,70.7,1,F,out
3,39.1,13.7,4.98,10.5,366,27.5,35.0,78.5,1,F,out
4,30.9,9.9,4.23,22.1,333,23.4,32.0,73.0,1,M,out


In [3]:
df.shape

(4412, 11)

In [4]:
#gettting column information
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4412 entries, 0 to 4411
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   HAEMATOCRIT   4412 non-null   float64
 1   HAEMOGLOBINS  4412 non-null   float64
 2   ERYTHROCYTE   4412 non-null   float64
 3   LEUCOCYTE     4412 non-null   float64
 4   THROMBOCYTE   4412 non-null   int64  
 5   MCH           4412 non-null   float64
 6   MCHC          4412 non-null   float64
 7   MCV           4412 non-null   float64
 8   AGE           4412 non-null   int64  
 9   SEX           4412 non-null   object 
 10  SOURCE        4412 non-null   object 
dtypes: float64(7), int64(2), object(2)
memory usage: 379.3+ KB


In [5]:
def preprocess_input(df):
    df=df.copy()
    df['SEX']=df['SEX'].replace({'F':0,'M':1})
    #Scaling the data
    y=df['SOURCE']
    x=df.drop('SOURCE',axis=1)
    x_train,x_test,y_train,y_test=train_test_split(x,y,train_size=0.7,shuffle=True,random_state=1)
    scaler=StandardScaler()
    scaler.fit(x_train)
    x_train=pd.DataFrame(scaler.transform(x_train),index=x_train.index,columns=x_train.columns)
    x_test=pd.DataFrame(scaler.transform(x_test),index=x_test.index,columns=x_train.columns)
    
    return x_train,x_test,y_train,y_test

In [6]:
x_train,x_test,y_train,y_test=preprocess_input(df)
x_train.shape

(3088, 10)

In [7]:
x_train

Unnamed: 0,HAEMATOCRIT,HAEMOGLOBINS,ERYTHROCYTE,LEUCOCYTE,THROMBOCYTE,MCH,MCHC,MCV,AGE,SEX
2275,1.521571,0.918324,4.205082,-0.507635,-0.368181,-3.543672,-1.886871,-3.523787,0.113088,-1.043023
4093,-0.590225,-0.613524,-0.673427,-0.468456,-0.184678,0.106741,-0.267275,0.263954,1.411455,-1.043023
1727,-1.512041,-1.618799,-1.463160,0.099634,1.195966,-0.228501,-0.996093,0.191392,-0.350615,-1.043023
615,0.817639,0.870454,0.765924,-0.488046,-0.140986,0.032243,0.380563,-0.127881,-1.138910,0.958752
1610,0.348351,0.391752,-0.036546,-0.311742,0.260973,0.665478,0.299583,0.670302,-0.443356,-1.043023
...,...,...,...,...,...,...,...,...,...,...
2895,0.029906,0.056660,-0.240348,-0.781885,0.531859,0.479233,0.137624,0.510665,0.576790,-1.043023
2763,-0.003615,-0.278432,-0.533314,-0.703528,0.182329,0.479233,-1.158053,1.149211,0.484050,-1.043023
905,1.320448,1.444897,0.753186,-0.488046,0.907604,0.926222,0.704482,0.742864,-0.953429,0.958752
3980,-0.539944,-0.565653,-0.558789,-0.703528,-0.420611,-0.005006,-0.267275,0.089805,1.318715,0.958752


In [8]:
y_train.value_counts()

out    1834
in     1254
Name: SOURCE, dtype: int64

In [9]:
#Training the data
models={
    'Logistic Regression': LogisticRegression(),
    'Decision Tree': DecisionTreeClassifier(),
    'Neural Network': MLPClassifier(),
    'Random Forest': RandomForestClassifier(),
    'Gradient Boosting': GradientBoostingClassifier()

    
}

In [10]:
for name,model in models.items():
    model.fit(x_train,y_train)
    print(name+'trained.')

Logistic Regressiontrained.
Decision Treetrained.




Neural Networktrained.
Random Foresttrained.
Gradient Boostingtrained.


In [11]:
for name,model in models.items():
    y_pred=model.predict(x_test)
    f1=f1_score(y_test,y_pred,pos_label='in')
    print(name+'F1 Score: {:5f}'.format(f1))

Logistic RegressionF1 Score: 0.597046
Decision TreeF1 Score: 0.594296
Neural NetworkF1 Score: 0.667957
Random ForestF1 Score: 0.666667
Gradient BoostingF1 Score: 0.647830
