# [CM1] Data Pre-processing and Preparation

Importing all necessary libraries.

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn import tree
import scipy

In [2]:
# Load the raw training data. The path is relative, so the CSV must be in the
# notebook's working directory.
train_csv = 'dkmacovid_train.csv'
df = pd.read_csv(train_csv)

FileNotFoundError: [Errno 2] No such file or directory: 'dkmacovid_train.csv'

In [None]:
# (rows, columns) of the freshly loaded frame.
df.shape

In [None]:
# Show the first 47 rows instead of the default 5.
# NOTE(review): presumably chosen so every state's row for the first day is visible
# and gaps in 'State ID' can be spotted — confirm against the row ordering.
df.head(47)

## We can see that State IDs 14, 16, 43, 44 and 48 are missing from the data. That is why the total number of states in the data is 46.

In [None]:
# Last five rows, to check how the file ends.
df.tail(5)

In [None]:
# Summary statistics for every column; include='all' also covers non-numeric columns.
df.describe(include = 'all')

### The columns 'Resident Population 2020 Census' and 'Population Density 2020 Census' contain commas, so pandas reads them as strings. We remove the commas and convert both columns to numeric types for computation.

In [None]:
# Remove the commas from the two census columns and convert them to numeric dtypes.
# Casting to str first keeps this cell safe to re-run: after the first run the columns
# are already numeric and calling `.str` on them directly would raise.
# regex=False makes the comma a literal match regardless of the pandas default.
comma_formatted_dtypes = {
    'Resident Population 2020 Census': int,
    'Population Density 2020 Census': float,
}
for census_column, target_dtype in comma_formatted_dtypes.items():
    df[census_column] = (
        df[census_column]
        .astype(str)
        .str.replace(',', '', regex=False)
        .astype(target_dtype)
    )

In [None]:
# For each state, print the distinct 'Lat' and 'Long_' values found in its rows.
# A single groupby pass replaces two full-frame boolean filters per state, and
# groupby iterates its keys in sorted order, so the printout is the same on every run
# (iterating a set of strings is not).
states_grouped = df.groupby("State")
for state_name, state_rows in states_grouped:
    lat = set(state_rows["Lat"])
    lon = set(state_rows["Long_"])
    print(state_name, "\t", lat, lon)
print("No of sets ", len(states_grouped))

### From the above, we can see that each state maps to a single pair of coordinates (latitude and longitude). We keep the 'Lat' and 'Long_' columns because they help in understanding the data.

#### 1. Check for Null values

In [None]:
# Number of missing values in each column.
df.isna().sum()

<b> No Null values in the dataset as mentioned in the question </b>

#### 2. Checking for negative values

In [None]:
# Count negative entries per column over the positional slice 3:-3
# (skips the first three and the last three columns).
# NOTE(review): presumably this selects the numeric measurement columns — confirm
# against df.columns, since the slice is position-based and breaks if columns move.
(df.iloc[:,3:-3]<0).sum()

<b> No negative values in the dataset. </b>

#### 3. Checking for outliers and capping them

In [None]:
# Group the rows by state once; the outlier-capping cell iterates over this object.
df_gstate = df.groupby('State')
# Box plot of one state's case-fatality ratio as a visual check for outliers.
z = df_gstate.get_group('Florida')['Case_Fatality_Ratio']
ax = sns.boxplot(x = z)
# Title the figure so it can be told apart from the post-capping plot of the same column.
ax.set_title('Florida Case_Fatality_Ratio (before outlier capping)');

In [None]:
# Cap outliers state by state using the 1.5 * IQR rule: within each state's rows,
# values above Q3 + 1.5*IQR or below Q1 - 1.5*IQR are replaced by that limit.
# Prints (state, column, number of capped values) for every column that had outliers.
for key, groups in df_gstate:
    # NOTE(review): positions 5:10 are presumably the five COVID measurement columns —
    # confirm against df.columns, since this selection is position-based.
    temp = groups.iloc[:, 5:10]
    for columns in temp:
        Q1, Q3 = np.percentile(temp[columns], [25, 75])
        IQR = Q3 - Q1
        right_limit = Q3 + 1.5 * IQR
        left_limit = Q1 - 1.5 * IQR
        outlier_right_index = groups.index[groups[columns] > right_limit]
        outlier_left_index = groups.index[groups[columns] < left_limit]
        n_outliers = len(outlier_right_index) + len(outlier_left_index)
        if n_outliers > 0:
            print(key, columns, n_outliers)
            # The limits are floats; writing them into an integer column relies on
            # implicit upcasting, which newer pandas deprecates. Cast explicitly first.
            if pd.api.types.is_integer_dtype(df[columns]):
                df[columns] = df[columns].astype(float)
            df.loc[outlier_right_index, columns] = right_limit
            df.loc[outlier_left_index, columns] = left_limit

In [None]:
# Re-draw the Florida box plot from the current `df` rather than from the groupby
# object created before the capping, so the plot does not depend on whether that
# object reflects the in-place edits.
z = df.loc[df['State'] == 'Florida', 'Case_Fatality_Ratio']
ax = sns.boxplot(x = z)
# Title the figure so it can be told apart from the pre-capping plot of the same column.
ax.set_title('Florida Case_Fatality_Ratio (after outlier capping)');

## What do you do with "Day", "State" and "State ID"?

In [None]:
# Number of distinct values in each of the identifier columns, one per line.
for id_column in ('State', 'State ID', 'Day'):
    print(df[id_column].nunique())

<b> The dataset covers days 2 to 31 (30 days in total) of COVID data for 46 unique states. </b>

## Since 'State ID' and 'State' both uniquely identify a state, the 'State' column is redundant and can be removed.

In [None]:
# Drop the 'State' name column; 'State ID' stays as the state identifier.
# errors='ignore' keeps this cell safe to re-run once the column is already gone.
df = df.drop(columns=['State'], errors='ignore')

In [None]:
# Remaining column labels after the drop.
df.columns

## Normalization

In [None]:
# Pull out the feature columns that will be standardised and inspect their dtypes.
features_to_scale = [
    'Lat',
    'Long_',
    'Active',
    'Incident_Rate',
    'Total_Test_Results',
    'Case_Fatality_Ratio',
    'Testing_Rate',
    'Resident Population 2020 Census',
    'Population Density 2020 Census',
    'Density Rank 2020 Census',
    'SexRatio',
]
normalization = df[features_to_scale]
normalization.dtypes

In [None]:
# Z-score normalization: subtract each column's mean and divide by its standard
# deviation (pandas' std() uses the sample standard deviation, ddof=1, by default).
column_means = normalization.mean()
column_stds = normalization.std()
normalization = (normalization - column_means) / column_stds
normalization

In [None]:
# Overwrite the original feature columns in `df` with their standardised versions,
# then display the resulting frame.
scaled_columns = [
    'Lat',
    'Long_',
    'Active',
    'Incident_Rate',
    'Total_Test_Results',
    'Case_Fatality_Ratio',
    'Testing_Rate',
    'Resident Population 2020 Census',
    'Population Density 2020 Census',
    'Density Rank 2020 Census',
    'SexRatio',
]
df[scaled_columns] = normalization
df

In [None]:
# Save the cleaned, normalized frame to the working directory; index=False leaves
# the row index out of the file.
df.to_csv("cleaned_normalized_coviddata.csv",index = False)

# Summary Report

#### 1) The imported dataset goes through a series of preprocessing steps. It is first checked for NaN and negative values; there were no NaN values in the dataset.
#### 2) The 'Resident Population 2020 Census' and 'Population Density 2020 Census' were of type object because there were commas between numbers and thus the data was stored as string type. The commas were removed and these columns were converted to numerical datatype. 
#### 3) The dataset is then checked for outliers. Outliers were identified within each 'State' group using the 1.5 × IQR rule. This resulted in 25 outliers, which were replaced by the corresponding upper or lower limit value.
#### 4) The 'State' and 'State ID' columns represented the same information, so the 'State' column was dropped. We have kept 'Day' and 'State ID' so that the data can be grouped by state for future reference in the following CMs. However, these values will not be used for calculation purposes.
#### 5) We used z-score normalization so that every column is rescaled to zero mean and unit standard deviation, which puts the columns on a comparable range of values; this matters because most of the classifiers calculate the distance between points for classification. Min-max scaling is not used because its result is determined by the minimum and maximum values, so the presence of outliers would distort it, and since the data is generated over a specific population there is a chance that outliers are present.