In [1]:
import pandas as pd

# Reading Data

In [7]:
# Load the raw dataset. The path is relative, so Data.csv must be in the
# notebook's working directory.
df = pd.read_csv('Data.csv')

In [8]:
df.head()

Unnamed: 0,fever,bodyPain,age,runnyNose,diffBreath,infectionProb,Unnamed: 6
0,100.185952,0,55,1,1,1,
1,99.808454,1,2,0,-1,0,
2,100.604052,1,52,1,1,0,
3,99.260113,0,86,0,0,1,
4,99.098874,1,20,0,-1,1,


In [9]:
# Remove every column whose name contains "unnamed" (case-insensitive), e.g. the
# empty "Unnamed: 6" column visible in the first df.head() output.
# The result is rebound to `df` instead of using inplace=True, so the statement
# stays chainable and does not mutate the frame object behind the reader's back.
unnamed_columns = df.columns[df.columns.str.contains('unnamed', case=False)]
df = df.drop(columns=unnamed_columns)

In [10]:
df.head()

Unnamed: 0,fever,bodyPain,age,runnyNose,diffBreath,infectionProb
0,100.185952,0,55,1,1,1
1,99.808454,1,2,0,-1,0
2,100.604052,1,52,1,1,0
3,99.260113,0,86,0,0,1
4,99.098874,1,20,0,-1,1


In [11]:
df.tail()

Unnamed: 0,fever,bodyPain,age,runnyNose,diffBreath,infectionProb
2494,98.926377,1,52,1,1,1
2495,99.574632,1,10,1,-1,1
2496,99.090519,1,99,0,0,1
2497,101.845342,1,96,1,-1,1
2498,101.345204,0,35,0,-1,0


In [13]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2499 entries, 0 to 2498
Data columns (total 6 columns):
fever            2499 non-null float64
bodyPain         2499 non-null int64
age              2499 non-null int64
runnyNose        2499 non-null int64
diffBreath       2499 non-null int64
infectionProb    2499 non-null int64
dtypes: float64(1), int64(5)
memory usage: 117.3 KB


In [14]:
df['diffBreath'].value_counts()

 1    845
-1    829
 0    825
Name: diffBreath, dtype: int64

In [15]:
df.describe()

Unnamed: 0,fever,bodyPain,age,runnyNose,diffBreath,infectionProb
count,2499.0,2499.0,2499.0,2499.0,2499.0,2499.0
mean,100.006317,0.495798,50.261305,0.494198,0.006403,0.509804
std,1.14197,0.500082,28.947112,0.500066,0.818593,0.500004
min,98.000394,0.0,1.0,0.0,-1.0,0.0
25%,99.025201,0.0,25.0,0.0,-1.0,0.0
50%,99.992198,0.0,50.0,0.0,0.0,1.0
75%,101.002909,1.0,74.5,1.0,1.0,1.0
max,101.999689,1.0,100.0,1.0,1.0,1.0


## Train Test Split

In [16]:
import numpy as np

In [24]:
def data_split(data, ratio, seed=42):
    """Randomly split ``data`` into a training part and a testing part.

    Parameters
    ----------
    data : pandas.DataFrame
        Object to split. Rows are picked by position through ``.iloc``, so any
        pandas object supporting ``len()`` and ``.iloc`` works.
    ratio : float
        Fraction of the rows, in ``[0, 1]``, that goes to the *test* set.
        The test size is ``int(len(data) * ratio)``, i.e. truncated.
    seed : int, optional
        Seed for the shuffle. The default (42) reproduces exactly the split
        obtained with the previous ``np.random.seed(42)`` implementation.

    Returns
    -------
    tuple
        ``(train, test)``: two disjoint selections of ``data`` that together
        contain every row once, in shuffled order.

    Raises
    ------
    ValueError
        If ``ratio`` is not between 0 and 1 (a negative ratio would otherwise
        silently slice from the end of the permutation).
    """
    if not 0 <= ratio <= 1:
        raise ValueError("ratio must be between 0 and 1, got {!r}".format(ratio))

    # A private RandomState yields the same permutation as seeding the global
    # generator, but leaves np.random's global state untouched for other code.
    rng = np.random.RandomState(seed)
    shuffled = rng.permutation(len(data))  # shuffled row positions 0..len(data)-1

    test_set_size = int(len(data) * ratio)
    test_indices = shuffled[:test_set_size]   # first chunk of the shuffle -> test rows
    train_indices = shuffled[test_set_size:]  # everything else -> train rows
    return data.iloc[train_indices], data.iloc[test_indices]
    

In [25]:
# data_split sizes the *test* set from the ratio, so 0.2 holds out 20% of the
# rows as `test` and leaves the remaining rows in `train`.
train,test = data_split(df,0.2)

In [26]:
train

Unnamed: 0,fever,bodyPain,age,runnyNose,diffBreath,infectionProb
461,99.719627,0,19,1,0,0
109,101.542802,1,26,1,1,0
2296,101.339116,1,72,1,-1,0
354,101.913821,0,58,0,-1,0
266,98.304164,0,52,0,1,1
...,...,...,...,...,...,...
1638,99.479629,0,73,0,-1,1
1095,98.914481,1,24,1,0,1
1130,101.708958,1,81,1,-1,1
1294,98.777846,1,100,0,1,1


In [27]:
test

Unnamed: 0,fever,bodyPain,age,runnyNose,diffBreath,infectionProb
2319,101.051983,0,63,1,0,0
1865,101.708642,1,40,0,1,1
902,99.030740,1,18,1,1,0
2240,98.471943,0,72,1,-1,1
1285,99.935871,1,26,1,1,1
...,...,...,...,...,...,...
1037,100.809658,0,4,1,0,1
2054,98.932122,0,46,1,0,1
1860,99.534454,0,76,1,-1,1
1862,100.700169,1,25,1,-1,1


In [28]:
# Training inputs: the five columns other than infectionProb, which is split
# off separately as the target (Y_train).
X_train = train[['fever','bodyPain','age','runnyNose','diffBreath']]

In [30]:
X_train.to_numpy()

array([[ 99.71962652,   0.        ,  19.        ,   1.        ,
          0.        ],
       [101.5428017 ,   1.        ,  26.        ,   1.        ,
          1.        ],
       [101.3391155 ,   1.        ,  72.        ,   1.        ,
         -1.        ],
       ...,
       [101.708958  ,   1.        ,  81.        ,   1.        ,
         -1.        ],
       [ 98.77784614,   1.        , 100.        ,   0.        ,
          1.        ],
       [100.1503644 ,   0.        ,  80.        ,   0.        ,
         -1.        ]])

In [32]:
# Test inputs: the same five columns as X_train, taken from the held-out rows.
X_test = test[['fever','bodyPain','age','runnyNose','diffBreath']]

In [33]:
X_test.to_numpy()

array([[101.0519826 ,   0.        ,  63.        ,   1.        ,
          0.        ],
       [101.7086422 ,   1.        ,  40.        ,   0.        ,
          1.        ],
       [ 99.03074038,   1.        ,  18.        ,   1.        ,
          1.        ],
       ...,
       [ 99.53445429,   0.        ,  76.        ,   1.        ,
         -1.        ],
       [100.7001687 ,   1.        ,  25.        ,   1.        ,
         -1.        ],
       [ 98.08139358,   1.        ,  51.        ,   0.        ,
         -1.        ]])

In [36]:
# Targets taken from the infectionProb column of each split.
# reshape(1, -1) yields a 2-D array with a single row, i.e. shape (1, n_rows),
# not a flat vector.
# NOTE(review): if downstream code expects 1-D labels of shape (n_rows,),
# .ravel() / reshape(-1) would be needed instead -- confirm against the consumer.
Y_train = train[['infectionProb']].to_numpy().reshape(1,-1)
Y_test = test[['infectionProb']].to_numpy().reshape(1,-1)

In [37]:
Y_train

array([[0, 0, 0, ..., 1, 1, 0]], dtype=int64)