# ECE 884 Deep Learning - Final Project
*Notebook 2 of 3 - Sheet 2*<br>
**Name:** Syed Kashif Mujtaba Kamoonpuri<br>
**Email:** kamoonpu@msu.edu

**Objective:**<br>
Pre-process the data. This includes:
- Dealing with missing values
- Scaling numerical variables
- Building lagged features for the time-series model

**Importing required libraries**

In [12]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler

**Loading dataset**

In [34]:
# Both files are read with the 'Date' column parsed as dates rather than left as text.
read_opts = dict(parse_dates=['Date'])
train = pd.read_csv('train.csv', **read_opts)
test = pd.read_csv('test.csv', **read_opts)
# Report (rows, columns) for each split as a quick sanity check on the load.
print('Dimensions of training set:', train.shape)
print('Dimensions of testing set:', test.shape)

Dimensions of training set: (35995, 6)
Dimensions of testing set: (13459, 4)


In [35]:
# Preview the first 10 rows of the training set.
train.head(10)

Unnamed: 0,Id,Province_State,Country_Region,Date,ConfirmedCases,Fatalities
0,1,,Afghanistan,2020-01-22,0.0,0.0
1,2,,Afghanistan,2020-01-23,0.0,0.0
2,3,,Afghanistan,2020-01-24,0.0,0.0
3,4,,Afghanistan,2020-01-25,0.0,0.0
4,5,,Afghanistan,2020-01-26,0.0,0.0
5,6,,Afghanistan,2020-01-27,0.0,0.0
6,7,,Afghanistan,2020-01-28,0.0,0.0
7,8,,Afghanistan,2020-01-29,0.0,0.0
8,9,,Afghanistan,2020-01-30,0.0,0.0
9,10,,Afghanistan,2020-01-31,0.0,0.0


**Checking for missing values in train and test set**

In [36]:
# Per-column NaN counts for each split (isna is the same check as isnull).
train_missing = train.isna().sum()
test_missing = test.isna().sum()
print('# of missing values in training set:')
print(train_missing)
print('\n# of missing values in test set:')
print(test_missing)

# of missing values in training set:
Id                    0
Province_State    20700
Country_Region        0
Date                  0
ConfirmedCases        0
Fatalities            0
dtype: int64

# of missing values in test set:
ForecastId           0
Province_State    7740
Country_Region       0
Date                 0
dtype: int64


### Dealing with missing values in 'Province_State'

In [37]:
# Build a single 'Region' key per row: the country name alone when no
# province is given, otherwise "<country> <province>".
train = train.set_index('Id')
train_has_province = train['Province_State'].notna()
# Composed in one vectorised step so the column is created holding strings
# directly, rather than being seeded as float NaN and then overwritten with
# strings (an incompatible-dtype assignment that newer pandas deprecates).
# Series.where keeps the country where the condition holds (no province) and
# takes the combined label everywhere else.
train['Region'] = train['Country_Region'].where(
    ~train_has_province,
    train['Country_Region'] + ' ' + train['Province_State'].astype('str'),
)
# Keep only the columns used downstream; 'Region' replaces both
# 'Country_Region' and 'Province_State'.
train = train.loc[:, ['Date', 'Region', 'ConfirmedCases', 'Fatalities']]
# Show a label-based slice of Ids that spans more than one region.
train.loc[500:1000,:]

Unnamed: 0_level_0,Date,Region,ConfirmedCases,Fatalities
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
500,2020-03-01,Angola,0.0,0.0
501,2020-03-02,Angola,0.0,0.0
502,2020-03-03,Angola,0.0,0.0
503,2020-03-04,Angola,0.0,0.0
504,2020-03-05,Angola,0.0,0.0
...,...,...,...,...
996,2020-04-06,Australia Australian Capital Territory,96.0,2.0
997,2020-04-07,Australia Australian Capital Territory,96.0,2.0
998,2020-04-08,Australia Australian Capital Territory,99.0,2.0
999,2020-04-09,Australia Australian Capital Territory,100.0,2.0


In [38]:
# Build the same 'Region' key for the test set: the country name alone when no
# province is given, otherwise "<country> <province>".
test = test.set_index('ForecastId')
test_has_province = test['Province_State'].notna()
# Composed in one vectorised step so the column is created holding strings
# directly, rather than being seeded as float NaN and then overwritten with
# strings (an incompatible-dtype assignment that newer pandas deprecates).
# Series.where keeps the country where the condition holds (no province) and
# takes the combined label everywhere else.
test['Region'] = test['Country_Region'].where(
    ~test_has_province,
    test['Country_Region'] + ' ' + test['Province_State'].astype('str'),
)
# Keep only the columns used downstream.
test = test.loc[:, ['Date', 'Region']]
# Show a label-based slice of ForecastIds that spans more than one region.
test.loc[500:1000,:]

Unnamed: 0_level_0,Date,Region
ForecastId,Unnamed: 1_level_1,Unnamed: 2_level_1
500,2020-04-28,Australia Queensland
501,2020-04-29,Australia Queensland
502,2020-04-30,Australia Queensland
503,2020-05-01,Australia Queensland
504,2020-05-02,Australia Queensland
...,...,...
996,2020-04-08,Belgium
997,2020-04-09,Belgium
998,2020-04-10,Belgium
999,2020-04-11,Belgium


**Checking for missing values after merging 'Province_State' into 'Region'**

In [39]:
# Repeat the per-column NaN count on both frames in their current state.
for heading, frame in (
    ('# of missing values in training set:', train),
    ('\n# of missing values in test set:', test),
):
    print(heading)
    print(frame.isna().sum())

# of missing values in training set:
Date              0
Region            0
ConfirmedCases    0
Fatalities        0
dtype: int64

# of missing values in test set:
Date      0
Region    0
dtype: int64


### Using MinMaxScaler to scale values in 'ConfirmedCases' and 'Fatalities'

In [40]:
# Min-max scale the two target columns, fitting on the training values.
# The fitted object stays bound to `scaler` after this cell.
target_cols = ['ConfirmedCases', 'Fatalities']
scaler = MinMaxScaler()
train.loc[:, target_cols] = scaler.fit_transform(train.loc[:, target_cols])

In [41]:
# After scaling: the Id 500-1000 slice again, now with the scaled target values.
train.loc[500:1000,:]

Unnamed: 0_level_0,Date,Region,ConfirmedCases,Fatalities
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
500,2020-03-01,Angola,0.000000,0.000000
501,2020-03-02,Angola,0.000000,0.000000
502,2020-03-03,Angola,0.000000,0.000000
503,2020-03-04,Angola,0.000000,0.000000
504,2020-03-05,Angola,0.000000,0.000000
...,...,...,...,...
996,2020-04-06,Australia Australian Capital Territory,0.000278,0.000059
997,2020-04-07,Australia Australian Capital Territory,0.000278,0.000059
998,2020-04-08,Australia Australian Capital Territory,0.000286,0.000059
999,2020-04-09,Australia Australian Capital Territory,0.000289,0.000059


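### Next step:
Create a base time-series model using an RNN. The cells below prepare its inputs: for every region, the previous 15 days of 'ConfirmedCases' and 'Fatalities' are added as lagged feature columns.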

In [60]:
# Number of previous rows (days) exposed as lag features for each target.
N_LAGS = 15

regions = list(train.Region.unique())

# Build one lagged frame per region and concatenate once at the end;
# growing a DataFrame with pd.concat inside the loop re-copies all
# accumulated rows on every iteration.
lagged_frames = []
for region in regions:
    # .copy() makes an independent frame, so adding columns below does not
    # raise SettingWithCopyWarning against `train`.
    # NOTE(review): shift() is positional, so this assumes each region's rows
    # are already in chronological order -- confirm against the raw CSV.
    new_df = train.loc[train.Region == region, :].copy()
    for i in range(1, N_LAGS + 1):
        cc_col = 'ConfirmedCases_' + str(i)
        f_col = 'Fatalities_' + str(i)
        # Column *_i holds the value from i rows earlier within this region.
        new_df[cc_col] = new_df.ConfirmedCases.shift(i)
        new_df[f_col] = new_df.Fatalities.shift(i)
    # The first N_LAGS rows of a region lack a full lag history (NaN lags); drop them.
    lagged_frames.append(new_df.iloc[N_LAGS:, :])

# pd.concat rejects an empty list, so fall back to an empty frame when there are no regions.
train_df = pd.concat(lagged_frames) if lagged_frames else pd.DataFrame()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.


In [61]:
# Spot-check the lag columns on one region: each *_k column should hold the target value from k rows earlier.
train_df[train_df.Region == 'India'].tail(20)

Unnamed: 0_level_0,Date,Region,ConfirmedCases,Fatalities,ConfirmedCases_1,Fatalities_1,ConfirmedCases_2,Fatalities_2,ConfirmedCases_3,Fatalities_3,...,ConfirmedCases_11,Fatalities_11,ConfirmedCases_12,Fatalities_12,ConfirmedCases_13,Fatalities_13,ConfirmedCases_14,Fatalities_14,ConfirmedCases_15,Fatalities_15
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
16196,2020-04-26,India,0.080651,0.025913,0.076004,0.024266,0.070934,0.022943,0.066733,0.021207,...,0.035632,0.011912,0.033217,0.01156,0.030227,0.01053,0.026618,0.009736,0.024424,0.008471
16197,2020-04-27,India,0.085165,0.027619,0.080651,0.025913,0.076004,0.024266,0.070934,0.022943,...,0.038836,0.013177,0.035632,0.011912,0.033217,0.01156,0.030227,0.01053,0.026618,0.009736
16198,2020-04-28,India,0.090581,0.029649,0.085165,0.027619,0.080651,0.025913,0.076004,0.024266,...,0.041502,0.014295,0.038836,0.013177,0.035632,0.011912,0.033217,0.01156,0.030227,0.01053
16199,2020-04-29,India,0.095607,0.031737,0.090581,0.029649,0.085165,0.027619,0.080651,0.025913,...,0.045464,0.015324,0.041502,0.014295,0.038836,0.013177,0.035632,0.011912,0.033217,0.01156
16200,2020-04-30,India,0.100815,0.033943,0.095607,0.031737,0.090581,0.029649,0.085165,0.027619,...,0.050938,0.016442,0.045464,0.015324,0.041502,0.014295,0.038836,0.013177,0.035632,0.011912
16201,2020-05-01,India,0.107737,0.035973,0.100815,0.033943,0.095607,0.031737,0.090581,0.029649,...,0.05361,0.017413,0.050938,0.016442,0.045464,0.015324,0.041502,0.014295,0.038836,0.013177
16202,2020-05-02,India,0.114799,0.038914,0.107737,0.035973,0.100815,0.033943,0.095607,0.031737,...,0.058066,0.018972,0.05361,0.017413,0.050938,0.016442,0.045464,0.015324,0.041502,0.014295
16203,2020-05-03,India,0.122913,0.040914,0.114799,0.038914,0.107737,0.035973,0.100815,0.033943,...,0.061796,0.020031,0.058066,0.018972,0.05361,0.017413,0.050938,0.016442,0.045464,0.015324
16204,2020-05-04,India,0.134284,0.046062,0.122913,0.040914,0.114799,0.038914,0.107737,0.035973,...,0.066733,0.021207,0.061796,0.020031,0.058066,0.018972,0.05361,0.017413,0.050938,0.016442
16205,2020-05-05,India,0.142852,0.049797,0.134284,0.046062,0.122913,0.040914,0.114799,0.038914,...,0.070934,0.022943,0.066733,0.021207,0.061796,0.020031,0.058066,0.018972,0.05361,0.017413


In [64]:
# Carve the evaluation window out of the lagged training frame: every row whose
# Date falls inside the span covered by the test set (both ends inclusive).
# Series.min()/.max() are vectorised, unlike the Python builtins which iterate
# the column element by element.
test_start, test_end = test.Date.min(), test.Date.max()
# .copy() detaches the result from train_df, so relabelling or editing test_df
# later operates on an independent frame.
test_df = train_df.loc[train_df.Date.between(test_start, test_end), :].copy()
test_df.head()

Unnamed: 0_level_0,Date,Region,ConfirmedCases,Fatalities,ConfirmedCases_1,Fatalities_1,ConfirmedCases_2,Fatalities_2,ConfirmedCases_3,Fatalities_3,...,ConfirmedCases_11,Fatalities_11,ConfirmedCases_12,Fatalities_12,ConfirmedCases_13,Fatalities_13,ConfirmedCases_14,Fatalities_14,ConfirmedCases_15,Fatalities_15
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
72,2020-04-02,Afghanistan,0.000789,0.000176,0.000685,0.000118,0.000503,0.000118,0.000492,0.000118,...,0.000116,2.9e-05,6.9e-05,0.0,6.9e-05,0.0,6.4e-05,0.0,6.4e-05,0.0
73,2020-04-03,Afghanistan,0.000813,0.000176,0.000789,0.000176,0.000685,0.000118,0.000503,0.000118,...,0.000116,2.9e-05,0.000116,2.9e-05,6.9e-05,0.0,6.9e-05,0.0,6.4e-05,0.0
74,2020-04-04,Afghanistan,0.000865,0.000206,0.000813,0.000176,0.000789,0.000176,0.000685,0.000118,...,0.000214,2.9e-05,0.000116,2.9e-05,0.000116,2.9e-05,6.9e-05,0.0,6.9e-05,0.0
75,2020-04-05,Afghanistan,0.001009,0.000206,0.000865,0.000206,0.000813,0.000176,0.000789,0.000176,...,0.000243,5.9e-05,0.000214,2.9e-05,0.000116,2.9e-05,0.000116,2.9e-05,6.9e-05,0.0
76,2020-04-06,Afghanistan,0.001061,0.000324,0.001009,0.000206,0.000865,0.000206,0.000813,0.000176,...,0.000272,0.000118,0.000243,5.9e-05,0.000214,2.9e-05,0.000116,2.9e-05,0.000116,2.9e-05


In [67]:
# test_df was cut from train_df, so it still carries the training 'Id' labels.
# Re-label it with the ForecastId index of `test`. The assignment is purely
# positional, so first verify that the two frames line up row for row;
# otherwise rows would silently receive the wrong ForecastId.
assert len(test_df) == len(test), 'test_df and test must have the same number of rows'
assert (test_df['Date'].to_numpy() == test['Date'].to_numpy()).all(), \
    'Date order differs between test_df and test'
assert (test_df['Region'].to_numpy() == test['Region'].to_numpy()).all(), \
    'Region order differs between test_df and test'
test_df.index = test.index
test_df.head()

Unnamed: 0_level_0,Date,Region,ConfirmedCases,Fatalities,ConfirmedCases_1,Fatalities_1,ConfirmedCases_2,Fatalities_2,ConfirmedCases_3,Fatalities_3,...,ConfirmedCases_11,Fatalities_11,ConfirmedCases_12,Fatalities_12,ConfirmedCases_13,Fatalities_13,ConfirmedCases_14,Fatalities_14,ConfirmedCases_15,Fatalities_15
ForecastId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,2020-04-02,Afghanistan,0.000789,0.000176,0.000685,0.000118,0.000503,0.000118,0.000492,0.000118,...,0.000116,2.9e-05,6.9e-05,0.0,6.9e-05,0.0,6.4e-05,0.0,6.4e-05,0.0
2,2020-04-03,Afghanistan,0.000813,0.000176,0.000789,0.000176,0.000685,0.000118,0.000503,0.000118,...,0.000116,2.9e-05,0.000116,2.9e-05,6.9e-05,0.0,6.9e-05,0.0,6.4e-05,0.0
3,2020-04-04,Afghanistan,0.000865,0.000206,0.000813,0.000176,0.000789,0.000176,0.000685,0.000118,...,0.000214,2.9e-05,0.000116,2.9e-05,0.000116,2.9e-05,6.9e-05,0.0,6.9e-05,0.0
4,2020-04-05,Afghanistan,0.001009,0.000206,0.000865,0.000206,0.000813,0.000176,0.000789,0.000176,...,0.000243,5.9e-05,0.000214,2.9e-05,0.000116,2.9e-05,0.000116,2.9e-05,6.9e-05,0.0
5,2020-04-06,Afghanistan,0.001061,0.000324,0.001009,0.000206,0.000865,0.000206,0.000813,0.000176,...,0.000272,0.000118,0.000243,5.9e-05,0.000214,2.9e-05,0.000116,2.9e-05,0.000116,2.9e-05


In [68]:
# First rows of the test frame (indexed by ForecastId), for visual comparison with test_df.
test.head()

Unnamed: 0_level_0,Date,Region
ForecastId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,2020-04-02,Afghanistan
2,2020-04-03,Afghanistan
3,2020-04-04,Afghanistan
4,2020-04-05,Afghanistan
5,2020-04-06,Afghanistan
