# Importing Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Importing Dataset

In [2]:
# Read the dataset from a CSV file; the path is relative to the working directory.
DATA_PATH = 'data.csv'
df = pd.read_csv(DATA_PATH)


# Checking data types

In [3]:
# DataFrame.info() writes its report straight to stdout and returns None,
# so it must not be wrapped in print() (that would append a stray "None").
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 617 entries, 0 to 616
Data columns (total 31 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Infect            617 non-null    int64  
 1   qHctSD            617 non-null    float64
 2   qPotassSD         617 non-null    float64
 3   qAlbuminSD        617 non-null    float64
 4   qHgbSD            617 non-null    float64
 5   qCaTotalSD        617 non-null    float64
 6   qPhosphSD         617 non-null    float64
 7   qDiastolicSD      617 non-null    float64
 8   qSodiumSD         617 non-null    float64
 9   qWhBldCntLast     617 non-null    float64
 10  qBicarbSD         617 non-null    float64
 11  qBicarbTrend      617 non-null    float64
 12  qPotassLast       617 non-null    float64
 13  qSodiumTrend      617 non-null    float64
 14  qSystolicTrend    617 non-null    float64
 15  qSystolicLast     617 non-null    int64  
 16  qNeutrophilsLast  617 non-null    float64
 1

# Checking for NaN and Null Data

In [4]:
# Count missing entries per column. In pandas, isnull() is an alias of isna(),
# so the two reports are expected to be identical.
nan_per_column = df.isna().sum()
null_per_column = df.isnull().sum()

print(f"checking for NaN:\n{nan_per_column}")
print("##################################")
print(f"checking for null:\n{null_per_column}")


checking for NaN:
Infect              0
qHctSD              0
qPotassSD           0
qAlbuminSD          0
qHgbSD              0
qCaTotalSD          0
qPhosphSD           0
qDiastolicSD        0
qSodiumSD           0
qWhBldCntLast       0
qBicarbSD           0
qBicarbTrend        0
qPotassLast         0
qSodiumTrend        0
qSystolicTrend      0
qSystolicLast       0
qNeutrophilsLast    0
qPulseSD            0
qBicarbLast         0
qHctTrend           0
qDiastolicLast      0
qCaTotalTrend       0
qCaTotalLast        0
qWhBldCntTrend      0
qGlucoseSD          0
qSystolicSD         0
qLymphSD            0
qWeightVitalSD      0
qLymphLast          0
PatientAge          0
PDVintage           0
dtype: int64
##################################
checking for null:
Infect              0
qHctSD              0
qPotassSD           0
qAlbuminSD          0
qHgbSD              0
qCaTotalSD          0
qPhosphSD           0
qDiastolicSD        0
qSodiumSD           0
qWhBldCntLast       0
qBicarbSD    

# Showing all column names

In [5]:
# Print the column labels of the loaded frame.
column_labels = df.columns
print(column_labels)

Index(['Infect', 'qHctSD', 'qPotassSD', 'qAlbuminSD', 'qHgbSD', 'qCaTotalSD',
       'qPhosphSD', 'qDiastolicSD', 'qSodiumSD', 'qWhBldCntLast', 'qBicarbSD',
       'qBicarbTrend', 'qPotassLast', 'qSodiumTrend', 'qSystolicTrend',
       'qSystolicLast', 'qNeutrophilsLast', 'qPulseSD', 'qBicarbLast',
       'qHctTrend', 'qDiastolicLast', 'qCaTotalTrend', 'qCaTotalLast',
       'qWhBldCntTrend', 'qGlucoseSD', 'qSystolicSD', 'qLymphSD',
       'qWeightVitalSD', 'qLymphLast', 'PatientAge', 'PDVintage'],
      dtype='object')


# Separating the data into X and Y

In [6]:
# The first column (position 0) is the target; every remaining column is a feature.
feature_frame = df.iloc[:, 1:]
target_series = df.iloc[:, 0]

# Convert to plain NumPy arrays for scikit-learn.
X = feature_frame.values
Y = target_series.values

# Checking the "Infect" class distribution

In [7]:
# Number of rows per class of the target column.
df["Infect"].value_counts()

0    469
1    148
Name: Infect, dtype: int64

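# Dealing with imbalanced data

The target is imbalanced (469 rows with `Infect = 0` vs. 148 rows with `Infect = 1`). The imbalance is handled further below by oversampling the training set with SMOTE, after the train/test split and feature scaling.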


# Splitting Dataset Into Training Set And Test Set  

In [8]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. stratify=Y keeps the class ratio of
# the target the same in both splits; random_state makes the split repeatable.
xtrain, xtest, ytrain, ytest = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=0,
    stratify=Y,
)

# Feature Scaling

In [9]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training split only, then apply the
# same fitted transform to both splits so the test set does not influence them.
sc = StandardScaler()
sc.fit(xtrain)
xtrain_scaled = sc.transform(xtrain)
xtest_scaled = sc.transform(xtest)

In [10]:
# Show the raw and scaled arrays one after another, each on its own line(s),
# with the separator strings in between.
print(
    xtrain,
    "------------------------------------------------",
    xtrain_scaled,
    "######################################",
    xtest,
    "-------------------------------------------------",
    xtest_scaled,
    sep="\n",
)

[[1.69967317e-01 1.63299316e-01 4.78423340e-02 ... 3.00000000e+01
  6.00000000e+01 1.68377823e+00]
 [1.50000000e-01 5.95000000e-01 0.00000000e+00 ... 2.14000000e+01
  5.40000000e+01 5.47570200e-03]
 [3.19958493e+00 3.46184561e-01 3.73619909e-01 ... 1.13000000e+01
  6.90000000e+01 1.17453799e+00]
 ...
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 ... 1.52000000e+01
  4.40000000e+01 2.62012320e+00]
 [1.88679623e+00 5.55787729e-01 1.93907194e-01 ... 2.80000000e+01
  4.90000000e+01 4.00821355e+00]
 [2.55000000e+00 1.00000000e-01 4.10000000e-01 ... 6.00000000e+01
  5.10000000e+01 6.70773443e-01]]
------------------------------------------------
[[-1.18688064 -0.76599042 -0.91192369 ... -0.0251345   0.30201667
  -0.02527925]
 [-1.19728889  0.72034759 -1.23473673 ... -0.04905896 -0.13784808
  -0.88241113]
 [ 0.39235168 -0.13631967  1.28623934 ... -0.07715629  0.96181379
  -0.28535515]
 ...
 [-1.27547858 -1.32822714 -1.23473673 ... -0.06630682 -0.87095599
   0.45292483]
 [-0.29195856  0.58534

In [11]:
# Requires the third-party `imbalanced-learn` package, which provides the
# `imblearn` module; it must be installed in the kernel's environment.
from imblearn.over_sampling import SMOTE

# Fixed seed so the synthetic samples are reproducible.
sm = SMOTE(random_state=42)

# Resample the *scaled* training split only. The classifier fitted on Xsm is
# evaluated on xtest_scaled, so the training features must live in the same
# (standardised) feature space; resampling the unscaled xtrain would train and
# test the model on differently scaled inputs.
Xsm, Ysm = sm.fit_resample(xtrain_scaled, ytrain)


ModuleNotFoundError: No module named 'imblearn'

In [None]:
# Class labels present in the resampled target and how often each occurs.
resampled_labels, resampled_counts = np.unique(Ysm, return_counts=True)
(resampled_labels, resampled_counts)

# Creating Model (Logistic Regression)

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with a fixed seed, trained on the resampled training data.
classifier = LogisticRegression(random_state=0)
classifier.fit(X=Xsm, y=Ysm)

# Predicting The Test Set Result

In [None]:
# Predict on the scaled test features.
ypred = classifier.predict(xtest_scaled)

# Print predictions (left column) next to the true labels (right column).
predicted_vs_actual = np.column_stack((ypred, ytest))
print(predicted_vs_actual)

# Confusion Matrix

In [None]:
# Single consolidated import: accuracy_score was previously imported twice
# across two overlapping import lines.
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)

# Confusion matrix of true labels (rows) vs. predicted labels (columns).
cm = confusion_matrix(ytest, ypred)
print(cm)

# Scalar metrics, printed in this order: accuracy, precision, recall, F1.
print(accuracy_score(ytest, ypred))
print(precision_score(ytest, ypred))
print(recall_score(ytest, ypred))
print(f1_score(ytest, ypred))

# Per-class precision / recall / F1 / support summary.
print(classification_report(ytest, ypred))