In [None]:
# Mount Google Drive at /gdrive so the project files under MyDrive are reachable.
# The google.colab module is only available inside a Colab runtime.
from google.colab import drive
drive.mount('/gdrive')
# %cd is an IPython magic: it switches the kernel's working directory to /gdrive.
%cd /gdrive

Mounted at /gdrive
/gdrive


In [None]:
# importing important libraries
import numpy as np
import pandas as pd
from imblearn.combine import SMOTETomek



In [None]:
# Load the cleaned COVID dataset from the mounted Drive folder and preview the first rows.
covid_csv_path = "/gdrive/MyDrive/Covid Prediction AI Project/covid_cleaned_data.csv"
covid_df = pd.read_csv(covid_csv_path)
covid_df.head()

Unnamed: 0.1,Unnamed: 0,cough,fever,sore_throat,shortness_of_breath,head_ache,Result,Female,Male,Age above 60,Abroad,Contact with confirmed
0,0,0,0,0,0,0,0,1,0,0,0,0
1,1,1,0,0,0,0,0,1,0,0,0,0
2,2,0,1,0,0,0,0,0,1,0,0,0
3,3,1,0,0,0,0,0,1,0,0,0,0
4,4,1,0,0,0,0,0,0,1,0,0,0


In [None]:
# Remove the stray 'Unnamed: 0' column (presumably a row index that an earlier
# export wrote into the CSV -- its values mirror the row numbers).
# Reassigning instead of using inplace=True, together with errors='ignore',
# keeps this cell safe to re-run: a second execution no longer raises KeyError
# once the column is already gone.
covid_df = covid_df.drop(columns=['Unnamed: 0'], errors='ignore')
covid_df.head()

Unnamed: 0,cough,fever,sore_throat,shortness_of_breath,head_ache,Result,Female,Male,Age above 60,Abroad,Contact with confirmed
0,0,0,0,0,0,0,1,0,0,0,0
1,1,0,0,0,0,0,1,0,0,0,0
2,0,1,0,0,0,0,0,1,0,0,0
3,1,0,0,0,0,0,1,0,0,0,0
4,1,0,0,0,0,0,0,1,0,0,0


# Handling Imbalanced Dataset

In [None]:
# Count how many rows fall into each class of the target column.
covid_df["Result"].value_counts()

0    260008
1     14694
Name: Result, dtype: int64

- Here you can see that the negative class has far more samples than the positive class.
- This is called an imbalanced dataset, i.e. the two classes differ greatly in size.
- The imbalance should be corrected to get reliable results; otherwise a model can be biased toward the majority class.

## Handling Imbalanced Data - Using SMOTETomek (SMOTE Over-Sampling Combined with Tomek-Link Cleaning)

In [None]:
# Separate the target Series (y) from the feature columns (x).
target_col = 'Result'
y = covid_df[target_col]
x = covid_df.drop(columns=[target_col])

In [None]:
# Display the feature frame (every column except 'Result'); pandas truncates the middle rows.
x

Unnamed: 0,cough,fever,sore_throat,shortness_of_breath,head_ache,Female,Male,Age above 60,Abroad,Contact with confirmed
0,0,0,0,0,0,1,0,0,0,0
1,1,0,0,0,0,1,0,0,0,0
2,0,1,0,0,0,0,1,0,0,0
3,1,0,0,0,0,1,0,0,0,0
4,1,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...
274697,0,0,0,0,0,0,0,0,0,0
274698,0,0,0,0,0,0,0,0,0,0
274699,0,0,0,0,0,0,0,0,0,0
274700,0,0,0,0,0,0,0,0,0,1


In [None]:
# Display the target Series taken from the 'Result' column.
y

0         0
1         0
2         0
3         0
4         0
         ..
274697    0
274698    0
274699    0
274700    1
274701    0
Name: Result, Length: 274702, dtype: int64

In [None]:
# Balance the classes with SMOTETomek, which combines SMOTE over-sampling of the
# minority class with Tomek-link cleaning. random_state fixes the seed so the
# resampled data is reproducible.
# `fit_resample` is the current API; the older `fit_sample` alias was deprecated
# and later removed from imbalanced-learn, so calling it fails on recent versions.
# NOTE(review): the whole dataset is resampled here. If a train/test split is
# done downstream, synthetic samples can end up in the evaluation data -- confirm.
smk = SMOTETomek(random_state=42)
X_res, y_res = smk.fit_resample(x, y)



In [None]:
# Shape of the resampled feature matrix. The stray trailing comma was removed so
# the cell shows the shape itself instead of a one-element tuple wrapping it.
X_res.shape

(520016, 10)

In [None]:
# Wrap the resampled features and labels in DataFrames.
X, Y = pd.DataFrame(data=X_res), pd.DataFrame(data=y_res)

In [None]:
# Give X the same column labels as the original feature frame x.
X.columns = list(x.columns)

In [None]:
# Label Y's single column with the target name.
Y.columns = pd.Index(["Result"])

In [None]:
# Display the resampled feature frame; pandas truncates the middle rows.
X

Unnamed: 0,cough,fever,sore_throat,shortness_of_breath,head_ache,Female,Male,Age above 60,Abroad,Contact with confirmed
0,0,0,0,0,0,1,0,0,0,0
1,1,0,0,0,0,1,0,0,0,0
2,0,1,0,0,0,0,1,0,0,0
3,1,0,0,0,0,1,0,0,0,0
4,1,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...
520011,1,0,0,0,0,0,1,0,0,1
520012,1,0,0,0,1,1,0,0,0,0
520013,0,0,0,0,0,0,1,0,0,0
520014,0,1,0,0,0,0,1,0,0,0


In [None]:
# Display the resampled labels frame; pandas truncates the middle rows.
Y

Unnamed: 0,Result
0,0
1,0
2,0
3,0
4,0
...,...
520011,1
520012,1
520013,1
520014,1


Result after handling the imbalanced data

In [None]:
# Count the rows per class after resampling.
Y["Result"].value_counts()

1    260008
0    260008
Name: Result, dtype: int64

Both classes now have the same number of records.

## Save the Balanced Dataset for future Use


In [None]:
# Write the balanced features and labels to the project folder on Drive.
# NOTE(review): to_csv also writes the row index by default, which shows up as an
# 'Unnamed: 0' column when the file is read back. Passing index=False would avoid
# that, but confirm how downstream readers load these files before changing it.
balanced_outputs = {
    "/gdrive/MyDrive/Covid Prediction AI Project/X.csv": X,
    "/gdrive/MyDrive/Covid Prediction AI Project/Y.csv": Y,
}
for csv_path, frame in balanced_outputs.items():
    frame.to_csv(csv_path)