In [45]:
import pandas as pd
import numpy as np
import statsmodels as sm
import matplotlib.pyplot as plt
import math
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

In [3]:
%matplotlib inline

In [4]:
from sklearn import preprocessing

In [5]:
df = pd.read_csv("us_counties.csv", parse_dates=['Date'])

In [6]:
df.dtypes


Date                     datetime64[ns]
Days Since 2019-12-31             int64
CountryName                      object
Region                           object
County                           object
Confirmed                         int64
Deaths                            int64
dtype: object

In [7]:
df["area"] = df["CountryName"] + df["Region"] + df["County"]

In [8]:
df.drop(['County', 'CountryName', 'Region'], axis=1, inplace=True)

In [9]:
df.head()

Unnamed: 0,Date,Days Since 2019-12-31,Confirmed,Deaths,area
0,2020-01-21,21,1,0,United States of AmericaWashingtonSnohomish
1,2020-01-22,22,1,0,United States of AmericaWashingtonSnohomish
2,2020-01-23,23,1,0,United States of AmericaWashingtonSnohomish
3,2020-01-24,24,1,0,United States of AmericaIllinoisCook
4,2020-01-24,24,1,0,United States of AmericaWashingtonSnohomish


In [10]:
le = preprocessing.LabelEncoder()
le.fit(df['area'])
le.transform(df['area']) 

array([1713, 1713, 1713, ..., 1794, 1795, 1796])

In [11]:
df["areaCode"] = le.transform(df['area'])

In [12]:
df['Date'] = pd.to_datetime(df.Date , format = '%Y/%m/%d')

In [13]:
data = df.drop(['Date'], axis=1)

In [14]:
df.drop('area', axis=1, inplace=True)

In [15]:
df.drop('Date', axis=1, inplace=True)

In [16]:
df.rename(columns = {'Days Since 2019-12-31':'daysSince'}, inplace = True) 

In [17]:
df.head()

Unnamed: 0,daysSince,Confirmed,Deaths,areaCode
0,21,1,0,1713
1,22,1,0,1713
2,23,1,0,1713
3,24,1,0,407
4,24,1,0,1713


In [47]:
feature_cols = ['daysSince', 'areaCode']
X = df[feature_cols] # Features
y = df.Confirmed # Target variable

In [62]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.25,random_state=0)

In [49]:
from sklearn.linear_model import LogisticRegression

# instantiate the model (using the default parameters)
logreg = LogisticRegression()

# fit the model with data
logreg.fit(X_train,y_train)

#
y_pred=logreg.predict(X_test)



In [55]:
polynomial_features= PolynomialFeatures(degree=3)
x_poly = polynomial_features.fit_transform(X_train)

In [59]:
model = LinearRegression()
model.fit(x_poly, y_train)
y_poly_pred = model.predict(x_poly)

rmse = np.sqrt(mean_squared_error(y_train,y_poly_pred))
r2 = r2_score(y_train,y_poly_pred)
print(rmse)
print(r2)

374.38409267386174
0.0036653536309723123


In [67]:
from sklearn import metrics
cnf_matrix = metrics.confusion_matrix(y_test, y_pred)
cnf_matrix

array([[1333,    0,    0, ...,    0,    0,    0],
       [ 584,    0,    0, ...,    0,    0,    0],
       [ 359,    0,    0, ...,    0,    0,    0],
       ...,
       [   1,    0,    0, ...,    0,    0,    0],
       [   1,    0,    0, ...,    0,    0,    0],
       [   1,    0,    0, ...,    0,    0,    0]], dtype=int64)

In [40]:
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))


Accuracy: 0.3367011871684769


In [61]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

In [63]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
train_scaled = scaler.fit_transform(X_train)
test_scaled = scaler.transform(X_test)

In [64]:
tree_model = DecisionTreeRegressor()
rf_model = RandomForestRegressor()
tree_model.fit(train_scaled, y_train)
rf_model.fit(train_scaled, y_train)



RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=10,
                      n_jobs=None, oob_score=False, random_state=None,
                      verbose=0, warm_start=False)

In [66]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error

tree_mse = mean_squared_error(y_train, tree_model.predict(train_scaled))
tree_mae = mean_absolute_error(y_train, tree_model.predict(train_scaled))
rf_mse = mean_squared_error(y_train, rf_model.predict(train_scaled))
rf_mae = mean_absolute_error(y_train, rf_model.predict(train_scaled))

print("Decision Tree training mse = ",tree_mse," & mae = ",tree_mae," & rmse = ", math.sqrt(tree_mse))
print("Random Forest training mse = ",rf_mse," & mae = ",rf_mae," & rmse = ", math.sqrt(rf_mse))



Decision Tree training mse =  0.0  & mae =  0.0  & rmse =  0.0
Random Forest training mse =  2835.0371137492643  & mae =  5.575111560158289  & rmse =  53.245066567234794


In [70]:
y_pred = tree_model.predict(X_test)

In [71]:
from sklearn import metrics
cnf_matrix = metrics.confusion_matrix(y_test, y_pred)
cnf_matrix

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [72]:
df2 = df
df2['daysSince'] = df2['daysSince'] + 30

In [73]:
df2.head()

Unnamed: 0,daysSince,Confirmed,Deaths,areaCode
0,81,1,0,1713
1,82,1,0,1713
2,83,1,0,1713
3,84,1,0,407
4,84,1,0,1713


In [None]:
X = df2[feature_cols] # Features

In [74]:
X = df2[feature_cols]

In [75]:
df2['Confirmed']=tree_model.predict(X)

In [76]:
df2.head()

Unnamed: 0,daysSince,Confirmed,Deaths,areaCode
0,81,12.0,0,1713
1,82,12.0,0,1713
2,83,12.0,0,1713
3,84,12.0,0,407
4,84,12.0,0,1713


In [77]:
df2.to_csv('test2.csv') 

In [67]:
df["daysSince"] = df["Days Since 2019-12-31"] 

In [71]:
data.drop('Days Since 2019-12-31', axis=1, inplace=True)

In [77]:

data

Unnamed: 0_level_0,Confirmed,Deaths,areaCode,daysSince
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2019-12-31,0,0,1,0
2019-12-31,0,0,2,0
2019-12-31,0,0,6,0
2019-12-31,0,0,10,0
2019-12-31,0,0,20,0
...,...,...,...,...
2020-03-28,16,0,316,88
2020-03-28,5,1,317,88
2020-03-29,560,4,57,89
2020-03-29,132,2,63,89


In [74]:
data.to_csv('cleanedComplete.csv') 

In [46]:
df = pd.read_csv("cleanedComplete.csv")

In [75]:
data.index = df.Date

In [76]:
data.drop('Date', axis=1, inplace=True)

In [78]:
train = data[:int(0.9*(len(data)))]
valid = data[int(0.9*(len(data))):]

In [79]:
from statsmodels.tsa.vector_ar.var_model import VAR

model = VAR(endog=train)
model_fit = model.fit()



In [80]:
prediction = model_fit.forecast(model_fit.y, steps=len(valid))

In [92]:
def rmsle(ytrue, ypred):
    return np.sqrt(mean_squared_log_error(ytrue, ypred))

In [91]:
cols = data.columns
pred = pd.DataFrame(index=range(0,len(prediction)),columns=[cols])
for j in range(0,3):
    for i in range(0, len(prediction)):
       pred.iloc[i][j] = prediction[i][j]

#check rmse
for i in cols:
    print('rmse value for', i, 'is : ', math.sqrt(mean_squared_error(pred[i], valid[i])))


TypeError: only integer scalar arrays can be converted to a scalar index

In [93]:
data


Unnamed: 0_level_0,Confirmed,Deaths,areaCode,daysSince
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2019-12-31,0,0,1,0
2019-12-31,0,0,2,0
2019-12-31,0,0,6,0
2019-12-31,0,0,10,0
2019-12-31,0,0,20,0
...,...,...,...,...
2020-03-28,16,0,316,88
2020-03-28,5,1,317,88
2020-03-29,560,4,57,89
2020-03-29,132,2,63,89


In [94]:
mean_error = []
for days in range(0,89):
    train = data[data['daysSince'] < days]
    val = data[data['daysSince'] == days]

    p = val['Deaths'].values

    error = rmsle(val['Deaths'].values, p)
    print('Days Since %d - Error %.5f' % (days, error))
    mean_error.append(error)
print('Mean Error = %.5f' % np.mean(mean_error))

NameError: name 'mean_squared_log_error' is not defined

In [95]:
data.reset_index()

Unnamed: 0,Date,Confirmed,Deaths,areaCode,daysSince
0,2019-12-31,0,0,1,0
1,2019-12-31,0,0,2,0
2,2019-12-31,0,0,6,0
3,2019-12-31,0,0,10,0
4,2019-12-31,0,0,20,0
...,...,...,...,...,...
10728,2020-03-28,16,0,316,88
10729,2020-03-28,5,1,317,88
10730,2020-03-29,560,4,57,89
10731,2020-03-29,132,2,63,89


In [100]:
data2 = data[['Confirmed', 'Deaths', 'areaCode', 'daysSince']].copy()


In [101]:
data2.index = data['daysSince']

In [102]:
data

Unnamed: 0_level_0,Confirmed,Deaths,areaCode,daysSince
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2019-12-31,0,0,1,0
2019-12-31,0,0,2,0
2019-12-31,0,0,6,0
2019-12-31,0,0,10,0
2019-12-31,0,0,20,0
...,...,...,...,...
2020-03-28,16,0,316,88
2020-03-28,5,1,317,88
2020-03-29,560,4,57,89
2020-03-29,132,2,63,89
