In [90]:
import pandas as pd
import numpy as np
import statsmodels as sm
import matplotlib.pyplot as plt
import math
from sklearn.metrics import mean_squared_error

In [3]:
%matplotlib inline

In [4]:
from sklearn import preprocessing

In [119]:
df = pd.read_csv("us_counties.csv", parse_dates=['Date'])

In [120]:
df.dtypes


Date                     datetime64[ns]
Days Since 2019-12-31             int64
CountryName                      object
Region                           object
County                           object
Confirmed                         int64
Deaths                            int64
dtype: object

In [121]:
df["area"] = df["CountryName"] + df["Region"] + df["County"]

In [122]:
df.drop(['County', 'CountryName', 'Region'], axis=1, inplace=True)

In [123]:
df.head()

Unnamed: 0,Date,Days Since 2019-12-31,Confirmed,Deaths,area
0,2020-01-21,21,1,0,United States of AmericaWashingtonSnohomish
1,2020-01-22,22,1,0,United States of AmericaWashingtonSnohomish
2,2020-01-23,23,1,0,United States of AmericaWashingtonSnohomish
3,2020-01-24,24,1,0,United States of AmericaIllinoisCook
4,2020-01-24,24,1,0,United States of AmericaWashingtonSnohomish


In [129]:
le = preprocessing.LabelEncoder()
le.fit(df['area'])
le.transform(df['area']) 

array([1713, 1713, 1713, ..., 1794, 1795, 1796])

In [130]:
df["areaCode"] = le.transform(df['area'])

In [124]:
df['Date'] = pd.to_datetime(df.Date , format = '%Y/%m/%d')

In [110]:
data = df.drop(['Date'], axis=1)

In [131]:
df.drop('area', axis=1, inplace=True)

In [113]:
df.drop('Date', axis=1, inplace=True)

In [125]:
df.rename(columns = {'Days Since 2019-12-31':'daysSince'}, inplace = True) 

In [132]:
df.head()

Unnamed: 0,Date,daysSince,Confirmed,Deaths,areaCode
0,2020-01-21,21,1,0,1713
1,2020-01-22,22,1,0,1713
2,2020-01-23,23,1,0,1713
3,2020-01-24,24,1,0,407
4,2020-01-24,24,1,0,1713


In [133]:
df.to_csv('cleanedUSA.csv') 

In [67]:
df["daysSince"] = df["Days Since 2019-12-31"] 

In [71]:
data.drop('Days Since 2019-12-31', axis=1, inplace=True)

In [77]:

data

Unnamed: 0_level_0,Confirmed,Deaths,areaCode,daysSince
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2019-12-31,0,0,1,0
2019-12-31,0,0,2,0
2019-12-31,0,0,6,0
2019-12-31,0,0,10,0
2019-12-31,0,0,20,0
...,...,...,...,...
2020-03-28,16,0,316,88
2020-03-28,5,1,317,88
2020-03-29,560,4,57,89
2020-03-29,132,2,63,89


In [74]:
data.to_csv('cleanedComplete.csv') 

In [46]:
df = pd.read_csv("cleanedComplete.csv")

In [75]:
data.index = df.Date

In [76]:
data.drop('Date', axis=1, inplace=True)

In [78]:
train = data[:int(0.9*(len(data)))]
valid = data[int(0.9*(len(data))):]

In [79]:
from statsmodels.tsa.vector_ar.var_model import VAR

model = VAR(endog=train)
model_fit = model.fit()



In [80]:
prediction = model_fit.forecast(model_fit.y, steps=len(valid))

In [92]:
def rmsle(ytrue, ypred):
    return np.sqrt(mean_squared_log_error(ytrue, ypred))

In [91]:
cols = data.columns
pred = pd.DataFrame(index=range(0,len(prediction)),columns=[cols])
for j in range(0,3):
    for i in range(0, len(prediction)):
       pred.iloc[i][j] = prediction[i][j]

#check rmse
for i in cols:
    print('rmse value for', i, 'is : ', math.sqrt(mean_squared_error(pred[i], valid[i])))


TypeError: only integer scalar arrays can be converted to a scalar index

In [93]:
data


Unnamed: 0_level_0,Confirmed,Deaths,areaCode,daysSince
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2019-12-31,0,0,1,0
2019-12-31,0,0,2,0
2019-12-31,0,0,6,0
2019-12-31,0,0,10,0
2019-12-31,0,0,20,0
...,...,...,...,...
2020-03-28,16,0,316,88
2020-03-28,5,1,317,88
2020-03-29,560,4,57,89
2020-03-29,132,2,63,89


In [94]:
mean_error = []
for days in range(0,89):
    train = data[data['daysSince'] < days]
    val = data[data['daysSince'] == days]

    p = val['Deaths'].values

    error = rmsle(val['Deaths'].values, p)
    print('Days Since %d - Error %.5f' % (days, error))
    mean_error.append(error)
print('Mean Error = %.5f' % np.mean(mean_error))

NameError: name 'mean_squared_log_error' is not defined

In [95]:
data.reset_index()

Unnamed: 0,Date,Confirmed,Deaths,areaCode,daysSince
0,2019-12-31,0,0,1,0
1,2019-12-31,0,0,2,0
2,2019-12-31,0,0,6,0
3,2019-12-31,0,0,10,0
4,2019-12-31,0,0,20,0
...,...,...,...,...,...
10728,2020-03-28,16,0,316,88
10729,2020-03-28,5,1,317,88
10730,2020-03-29,560,4,57,89
10731,2020-03-29,132,2,63,89


In [100]:
data2 = data[['Confirmed', 'Deaths', 'areaCode', 'daysSince']].copy()


In [101]:
data2.index = data['daysSince']

In [102]:
data

Unnamed: 0_level_0,Confirmed,Deaths,areaCode,daysSince
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2019-12-31,0,0,1,0
2019-12-31,0,0,2,0
2019-12-31,0,0,6,0
2019-12-31,0,0,10,0
2019-12-31,0,0,20,0
...,...,...,...,...
2020-03-28,16,0,316,88
2020-03-28,5,1,317,88
2020-03-29,560,4,57,89
2020-03-29,132,2,63,89


In [134]:
df = pd.read_csv("complete_df.csv", parse_dates=['Date'])
df["area"] = df["CountryCode"] + df["CountryName"] + df["Region"]
le = preprocessing.LabelEncoder()
le.fit(df['area'])
le.transform(df['area']) 


array([ 1,  2,  6, ..., 57, 63, 64])

In [135]:
dfres = pd.read_csv('globalresultsDeathsraw.csv')

In [136]:
dfres['area'] = le.inverse_transform(dfres['areaCode'])

In [157]:
dfres.to_csv('globalresV2.csv') 

In [156]:
dfres.head()

Unnamed: 0,Time,Confirmed,Target,areaCode,predicted_Target,area,date
0,118,5,0,39,0.011729,CFCentral African RepublicCentral African Repu...,2020-04-27 00:00:00
1,85,1205,1,75,1.359782,CNChinaZhejiang,2020-03-25 00:00:00
2,81,0,0,192,0.001423,MKMacedoniaMacedonia,2020-03-21 00:00:00
3,74,222,1,51,2.28752,CNChinaGuangxi,2020-03-14 00:00:00
4,32,0,0,168,0.003343,JPJapanJapan,2020-02-01 00:00:00


In [155]:
import datetime
date_1 = datetime.datetime.strptime("12/31/19", "%m/%d/%y")
dfres['date'] = ""
end_date = date_1 + datetime.timedelta(days=10)
str(end_date)
for d in dfres.index:
    print(dfres['Time'][d])
    dfres['date'][d] = str(date_1 + datetime.timedelta(days=int(dfres['Time'][d])))


118
85
81
74
32
107
102
99
93
76
37
108
83
91
98
45
102
52
103
116
104
112
82
115
81
106
44
112
78
112
72
85
118
83
43
70
41
68
75
89
49
91
69
98
114
107
69
79
52
34
43
100
55
52
32
30
62
49
109
105
107
104
115
53
68
117
105
75
63
85
113
87
91
117
81
98
102
46
115
107
99
81
112
49
106
101
106
72
117
118
98
48
87
94
112
105
53
53
89
109
109
115
107
87
37
102
61
89
103
51
55
106
84
101
97
82
39
61
72
115
61
50
113
99
107
107
79
114
60
44
109
40
112
88
81
56
117
117
102
68
63
102
58
118
112
101
92
106
42
106
114
117
82
99
90
117
107
91
49
94
97
107
97
49
116
101
108
57
55
86
47
45
99
80
67
86
116
34
51
83
62
105
68
32
60
36
73
88
114
100
110
86
38
84
50
93
102
97
31
106
99
104
116
46
97
112
109
59
98
51
61
30
115
63
52
99
69
67
37
68
112
94
111
115
77
44
92
41
75
111
110
37
101
108
103
62
95
79
60
98
115
47
45
79
41
80
50
118
70
105
100
76
76
106
96
75
93
110
39
77
73
35
113
112
104
96
43
96
103
56
57
103
118
43
61
61
115
68
117
112
52
114
99
107
92
70
84
36
99
65
57
99
86
34
73
63
112
11

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  



111
72
112
95
98
39
44
96
110
33
86
107
69
71
111
110
80
39
81
101
103
114
95
34
107
111
73
64
70
33
102
108
106
80
53
97
116
69
97
76
59
98
114
107
108
48
114
112
97
66
86
115
99
109
57
90
61
115
87
82
40
95
82
98
44
115
76
116
54
33
44
71
104
110
105
118
71
75
110
82
74
108
59
62
107
111
55
110
39
89
44
90
109
108
101
85
112
110
57
106
76
49
117
114
109
43
63
61
103
39
69
109
112
60
87
110
97
41
31
107
77
117
92
115
116
109
31
89
52
115
118
106
52
36
105
55
30
84
117
107
69
87
95
97
70
73
111
95
90
96
108
110
85
112
110
108
30
104
97
63
45
70
110
99
111
92
32
111
107
102
48
110
49
61
58
85
103
101
37
79
110
48
114
115
81
95
75
90
59
71
45
103
66
108
116
117
114
95
118
49
90
111
40
55
86
93
77
115
90
115
116
64
108
119
41
91
80
96
110
108
102
107
85
107
32
117
99
87
115
104
57
92
111
113
113
91
69
41
66
118
85
75
100
99
97
102
55
101
77
90
80
107
117
30
112
56
36
113
62
84
117
98
54
115
77
106
50
42
116
101
32
113
99
84
91
86
110
87
100
30
93
58
85
77
108
61
37
108
80
113
118
64
109


105
85
40
73
95
57
57
112
46
100
95
63
118
108
72
73
61
54
74
109
76
116
98
111
41
63
67
99
92
102
110
79
99
117
77
54
112
96
114
51
97
53
69
72
95
90
106
76
90
95
106
86
115
93
97
32
48
78
47
117
57
89
109
92
85
118
35
61
80
113
60
35
111
113
100
56
88
109
111
74
70
102
90
100
63
83
116
35
53
108
113
85
84
115
113
93
107
117
101
31
46
98
85
98
92
41
63
102
53
101
107
104
118
111
105
92
76
98
116
114
99
60
45
49
112
32
109
101
111
103
84
109
81
68
54
114
115
74
55
65
107
110
75
113
85
112
75
114
102
77
80
32
98
66
86
41
105
117
92
39
106
102
102
48
97
113
84
85
99
107
34
99
39
32
112
83
70
56
104
30
64
92
112
64
118
59
85
117
108
40
102
34
102
91
85
116
68
114
35
65
79
112
117
109
55
116
117
104
113
107
87
112
108
67
41
104
104
38
54
32
108
97
52
40
115
105
112
97
109
97
89
115
67
38
82
105
56
115
71
115
95
47
100
100
115
103
91
58
103
101
71
75
46
58
105
88
110
74
34
83
113
52
102
38
97
93
72
117
106
85
117
103
81
117
114
107
108
37
94
110
53
89
115
115
113
62
89
102
59
114
76
114
38


90
114
96
118
115
97
101
55
113
63
101
112
110
48
40
62
117
117
94
94
92
63
68
90
77
73
100
112
110
58
103
81
107
115
50
62
111
53
99
61
86
55
116
108
111
35
115
56
94
65
115
76
113
87
52
55
52
105
97
107
115
59
97
115
117
114
117
79
116
77
62
82
66
73
87
118
71
33
80
93
94
113
56
72
49
99
109
112
41
62
108
109
104
92
31
116
100
41
115
42
59
118
100
58
113
37
30
82
76
81
114
67
40
84
111
84
100
65
105
44
107
80
51
67
65
77
104
101
118
71
119
96
69
114
97
53
104
113
109
74
97
92
106
106
68
92
80
107
111
104
89
111
61
70
117
88
108
108
103
56
87
54
113
117
34
88
106
89
52
106
34
103
42
103
63
55
73
43
90
76
58
109
109
83
52
98
43
113
71
113
88
37
91
74
35
77
107
101
112
84
112
89
45
108
112
69
107
101
110
90
111
74
82
50
52
116
113
67
95
116
74
68
32
31
36
95
76
96
42
62
103
32
67
68
30
73
81
108
76
90
46
110
118
99
110
99
115
38
58
84
100
81
118
82
57
74
55
107
90
111
68
34
90
95
73
102
82
83
50
84
101
105
59
113
95
113
91
95
91
107
63
67
109
63
68
57
113
72
49
114
115
73
105
107
116
10

108
99
113
92
62
111
67
92
109
34
107
100
112
78
112
109
85
98
81
71
111
96
107
113
99
45
88
116
86
93
73
46
118
100
100
107
106
53
107
105
117
91
98
103
72
109
102
103
111
51
62
94
115
108
66
92
87
101
107
108
108
114
100
33
82
114
87
114
103
95
106
118
116
112
105
106
35
68
84
90
115
118
116
64
67
82
45
45
111
32
78
103
103
112
108
115
106
56
83
114
47
101
109
52
86
79
85
60
36
113
96
115
73
116
99
118
67
66
64
117
76
66
115
99
52
118
102
65
110
105
108
89
115
71
115
79
114
52
110
117
46
106
79
117
93
94
79
71
108
61
106
109
74
105
72
113
80
89
116
72
115
70
92
85
111
53
88
109
109
94
30
102
63
109
91
106
115
110
62
101
108
97
58
75
44
106
74
82
35
54
106
113
81
39
108
40
72
43
109
33
110
117
111
118
86
95
55
104
85
112
103
58
75
99
105
101
75
54
112
80
107
78
51
102
60
54
36
104
108
90
55
99
34
92
116
97
74
52
99
58
59
55
93
112
69
109
87
67
62
88
97
87
65
69
100
99
36
95
66
96
85
82
110
111
96
116
75
105
110
103
84
53
82
57
65
109
96
105
38
68
115
118
111
115
44
61
100
111
72
114
3

81
112
108
109
118
99
70
30
47
107
118
109
34
85
65
113
116
56
99
114
103
96
113
51
66
113
51
40
100
95
99
73
112
62
111
65
108
58
101
89
113
114
102
80
86
47
60
64
102
65
87
58
108
115
111
112
109
98
35
118
69
37
116
111
86
102
108
105
56
87
46
60
91
41
37
64
98
83
103
113
118
118
67
109
114
114
100
93
113
54
84
89
103
50
39
112
108
117
89
54
103
114
88
111
77
98
83
104
118
59
111
59
87
117
98
107
109
89
63
63
109
85
31
87
60
49
88
78
83
57
102
113
108
98
96
94
40
59
93
96
88
114
102
54
117
102
103
69
62
59
48
41
87
34
62
87
105
95
42
104
54
87
111
114
99
76
115
71
44
101
111
76
99
100
99
68
92
115
105
65
117
87
89
114
103
113
103
112
111
107
97
40
52
71
84
112
107
118
91
105
112
100
75
64
89
87
74
115
97
78
76
110
92
103
98
85
30
111
87
108
42
118
44
38
81
33
55
80
83
54
78
111
114
72
62
78
76
117
103
113
107
114
108
66
81
91
64
91
40
98
118
60
75
63
112
109
113
116
43
115
48
50
76
73
47
113
86
111
75
107
86
103
113
96
112
102
100
41
43
51
43
103
114
110
104
116
110
99
111
83
108
57


In [169]:
for d in dfres.index:
    dfres['place'][d] = str(dfres['area'][d])[0:2] + "," +  str(dfres['area'][d])[2:] 
    

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [170]:
dfres.head()

Unnamed: 0,Time,Confirmed,Target,areaCode,predicted_Target,area,date,place
0,118,5,0,39,0.011729,CFCentral African RepublicCentral African Repu...,2020-04-27 00:00:00,"CF,Central African RepublicCentral African Rep..."
1,85,1205,1,75,1.359782,CNChinaZhejiang,2020-03-25 00:00:00,"CN,ChinaZhejiang"
2,81,0,0,192,0.001423,MKMacedoniaMacedonia,2020-03-21 00:00:00,"MK,MacedoniaMacedonia"
3,74,222,1,51,2.28752,CNChinaGuangxi,2020-03-14 00:00:00,"CN,ChinaGuangxi"
4,32,0,0,168,0.003343,JPJapanJapan,2020-02-01 00:00:00,"JP,JapanJapan"


In [171]:
dfres.to_csv('globalresV3.csv') 

In [172]:
df = pd.read_csv("cleanedUSAv2test.csv")

In [173]:
df['daysSince'] = df['daysSince'] + 30

In [174]:
df.head()


Unnamed: 0,daysSince,Confirmed,Deaths,areaCode
0,51,1,0,1713
1,52,1,0,1713
2,53,1,0,1713
3,54,1,0,407
4,54,1,0,1713


In [175]:
df.to_csv('usatestV1.csv') 

In [177]:
dfres2 = dfres
dfres2["CountryCode"] = ""
dfres2["CountryName"] = ""
dfres2["Region"] = ""
df = pd.read_csv("complete_df.csv", parse_dates=['Date'])
for d in dfres2.index:
    dfres2["CountryCode"][d] = df["CountryCode"][d]
    dfres2["CountryName"][d] = df["CountryName"][d]
    dfres2["Region"][d] = df["Region"][d]
    

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':


In [178]:
dfres2.head()

Unnamed: 0,Time,Confirmed,Target,areaCode,predicted_Target,area,date,place,CountryCode,CountryName,Region
0,118,5,0,39,0.011729,CFCentral African RepublicCentral African Repu...,2020-04-27 00:00:00,"CF,Central African RepublicCentral African Rep...",AE,United Arab Emirates,United Arab Emirates
1,85,1205,1,75,1.359782,CNChinaZhejiang,2020-03-25 00:00:00,"CN,ChinaZhejiang",AF,Afghanistan,Afghanistan
2,81,0,0,192,0.001423,MKMacedoniaMacedonia,2020-03-21 00:00:00,"MK,MacedoniaMacedonia",AM,Armenia,Armenia
3,74,222,1,51,2.28752,CNChinaGuangxi,2020-03-14 00:00:00,"CN,ChinaGuangxi",AT,Austria,Austria
4,32,0,0,168,0.003343,JPJapanJapan,2020-02-01 00:00:00,"JP,JapanJapan",AZ,Azerbaijan,Azerbaijan


In [180]:
df = pd.read_csv("us_counties.csv", parse_dates=['Date'])
df["area"] = df["CountryName"] + df["Region"] + df["County"]
le = preprocessing.LabelEncoder()
le.fit(df['area'])
le.transform(df['area'])

array([1713, 1713, 1713, ..., 1794, 1795, 1796])

In [182]:
dfres = pd.read_csv('usaresv1raw.csv')

In [183]:
dfres['area'] = le.inverse_transform(dfres['areaCode'])

In [187]:
dfres.head()

Unnamed: 0,daysSince,Confirmed,Deaths,areaCode,estimated_Confirmed,predicted_Confirmed,area,place
0,97,3,0,736,78.499924,78.499924,United States of AmericaMassachusettsNorfolk,MassachusettsNorfolk
1,92,1,0,1748,10.048667,10.048667,United States of AmericaWisconsinDane,WisconsinDane
2,111,1,0,118,5.144913,5.144913,United States of AmericaCaliforniaAmador,CaliforniaAmador
3,117,1,0,281,8.523064,8.523064,United States of AmericaGeorgiaCatoosa,GeorgiaCatoosa
4,92,15,6,1701,148.445267,148.445267,United States of AmericaWashingtonKing,WashingtonKing


In [186]:
dfres['place'] = ""
for d in dfres.index:
    dfres['place'][d] = str(dfres['area'][d])[24:]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [188]:
dfres.to_csv('usaresV2.csv') 