In [1]:
import csv
import math
import pickle
import gzip
import random
import numpy as np
import pandas as pd
import sklearn as sk
import matplotlib.pylab as plt

In [56]:
# read in data as pandas dataframe
df = pd.read_csv('data/data_sanitized.csv')

# drop a couple of different metrics
del df['County']
del df['Total Sum Cases']
del df['Number of MIP Charges']

# generate validation and training ids
# ids are row positions in df; roughly one fifth of them are held out for validation
# NOTE(review): valid_ids / train_ids are not referenced again in the cells shown
# here (the later split uses train_test_split) - confirm whether they are still needed
N = len(df)
valid_ids = np.asarray(random.sample(range(N), N // 5), dtype=int)
# every position that was not sampled for validation, in ascending order;
# np.setdiff1d avoids scanning valid_ids once per row as `i not in valid_ids` did
train_ids = np.setdiff1d(np.arange(N), valid_ids)

# separating labels from features
X = np.asarray(df[['Days Since March 17',
                   'Population',
                   'Average Cases Per 7 Days',
                   'Average Daily Change in Cases Per 7 Days',
                   'Area of County']].copy())
# y keeps a single column, i.e. shape (N, 1); later cells call y.ravel() before fitting
y = np.asarray(df[['New Cases per Day']].copy())

In [3]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, explained_variance_score, precision_score

# using train_test_split() to create train and test sets (20% held out for testing)
# random_state is fixed so that re-running the notebook reproduces the same split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=42)

In [26]:
# measure of predictions that lie within a margin of the actual
def check_accuracy(y_predict, y_true, variance=.1):
    """Return the fraction of predictions that fall within a relative margin of the actual value.

    A prediction counts as correct when
    ``abs(prediction - actual) <= variance * abs(actual)``.

    Parameters
    ----------
    y_predict : array-like
        Predicted values; flattened to one dimension.
    y_true : array-like
        Actual values, one per prediction. A column vector of shape ``(n, 1)``
        is accepted and flattened.
    variance : float, optional
        Width of the margin as a fraction of the actual value.

    Returns
    -------
    float
        Share of predictions inside the margin, or ``0.0`` when there are no
        predictions at all.

    Raises
    ------
    ValueError
        If ``y_predict`` and ``y_true`` do not hold the same number of values.
    """
    # flatten both inputs so an (n,) prediction vector and an (n, 1) label column
    # are compared element by element instead of being broadcast to (n, n)
    predicted = np.asarray(y_predict, dtype=float).ravel()
    actual = np.asarray(y_true, dtype=float).ravel()

    if predicted.size != actual.size:
        raise ValueError(
            f'y_predict has {predicted.size} values but y_true has {actual.size}')
    if predicted.size == 0:
        return 0.0

    # the margin is taken on the magnitude of the actual value, so a negative
    # actual does not produce a negative (unsatisfiable) tolerance
    within_margin = np.abs(predicted - actual) <= variance * np.abs(actual)
    return int(np.count_nonzero(within_margin)) / predicted.size


# measure of predictions that lie outside of a margin of the actual
def check_inaccuracy(y_predict, y_true, variance=.5):
    """Return the fraction of predictions that fall outside a relative margin of the actual value.

    A prediction counts as incorrect when
    ``abs(prediction - actual) > variance * abs(actual)``.

    Parameters
    ----------
    y_predict : array-like
        Predicted values; flattened to one dimension.
    y_true : array-like
        Actual values, one per prediction. A column vector of shape ``(n, 1)``
        is accepted and flattened.
    variance : float, optional
        Width of the margin as a fraction of the actual value.

    Returns
    -------
    float
        Share of predictions outside the margin, or ``0.0`` when there are no
        predictions at all.

    Raises
    ------
    ValueError
        If ``y_predict`` and ``y_true`` do not hold the same number of values.
    """
    # flatten both inputs so an (n,) prediction vector and an (n, 1) label column
    # are compared element by element instead of being broadcast to (n, n)
    predicted = np.asarray(y_predict, dtype=float).ravel()
    actual = np.asarray(y_true, dtype=float).ravel()

    if predicted.size != actual.size:
        raise ValueError(
            f'y_predict has {predicted.size} values but y_true has {actual.size}')
    if predicted.size == 0:
        return 0.0

    # the margin is taken on the magnitude of the actual value, so a negative
    # actual does not make every prediction count as outside the margin
    outside_margin = np.abs(predicted - actual) > variance * np.abs(actual)
    return int(np.count_nonzero(outside_margin)) / predicted.size

**Finding Best Parameters for Random Forest**

In [43]:
# hyper-parameter values to sweep
estimators = [50, 100, 150]
criteria = ['gini', 'entropy']
bootstrapping = [True, False]

# result columns: position k in each list describes the k-th combination tried
n_estimators_list, criteria_list, bootstrapping_list = [], [], []
accuracy_list, inaccuracy_list = [], []

# every (n_estimators, criterion, bootstrap) combination, bootstrap varying fastest
rf_grid = [
    (n_trees, split_rule, use_bootstrap)
    for n_trees in estimators
    for split_rule in criteria
    for use_bootstrap in bootstrapping
]

for n_trees, split_rule, use_bootstrap in rf_grid:
    rf = RandomForestClassifier(
        n_estimators=n_trees,
        criterion=split_rule,
        bootstrap=use_bootstrap,
        n_jobs=-1,
    )
    # y_train is a single-column 2-D array; ravel() hands fit() a flat label vector
    rf.fit(X_train, y_train.ravel())
    rf_pred = rf.predict(X_test)

    # score on the held-out split: within 25% of the actual / more than 75% off
    within_margin = check_accuracy(rf_pred, y_test, variance=.25)
    outside_margin = check_inaccuracy(rf_pred, y_test, variance=.75)

    n_estimators_list.append(n_trees)
    criteria_list.append(split_rule)
    bootstrapping_list.append(use_bootstrap)
    accuracy_list.append(within_margin)
    inaccuracy_list.append(outside_margin)

In [54]:
# All five result lists must come from the same Random Forest grid-search run.
# The AdaBoost search further down rebuilds n_estimators_list, accuracy_list and
# inaccuracy_list under the same names, so running the cells out of order leaves
# lists of different lengths and makes the indices below point at the wrong run.
result_lengths = {
    len(n_estimators_list),
    len(criteria_list),
    len(bootstrapping_list),
    len(accuracy_list),
    len(inaccuracy_list),
}
assert len(result_lengths) == 1, (
    'result lists are out of sync - re-run the Random Forest grid search cell')

# highest accuracy
# np.argmax returns the first maximum, and does not rebind the builtins max/min
# the way the hand-written scan did; an all-zero list now selects index 0
# rather than the last entry
index_max = int(np.argmax(accuracy_list))

# lowest inaccuracy (first minimum)
index_min = int(np.argmin(inaccuracy_list))

print(f'Highest Accuracy: {accuracy_list[index_max]:.3}')
print('Used ', n_estimators_list[index_max], ' estimators')
print('Used ', criteria_list[index_max], ' to measure the quality of the split')
print('Used bootstrapping?: ', bootstrapping_list[index_max])

print(f'Inaccuracy of Above: {inaccuracy_list[index_max]:.3}')

print('\n')

print(f'Lowest Inaccuracy: {inaccuracy_list[index_min]:.3}')
print('Used ', n_estimators_list[index_min], ' estimators')
print('Used ', criteria_list[index_min], ' to measure the quality of the split')
print('Used bootstrapping?: ', bootstrapping_list[index_min])

print(f'Accuracy of Above: {accuracy_list[index_min]:.3}')

Highest Accuracy: 0.443
Used  100  estimators
Used  entropy  to measure the quality of the split
Used bootstrapping?:  True
Inaccuracy of Above: 0.373


Lowest Inaccuracy: 0.334
Used  50  estimators
Used  gini  to measure the quality of the split
Used bootstrapping?:  True
Accuracy of Above: 0.403


**Finding Best Parameters for Adaboost**

In [47]:
# hyper-parameter values to sweep
estimators = [50, 100, 150]
learning_rate = np.linspace(.01, 2, 10)

# result columns: position k in each list describes the k-th combination tried
n_estimators_list, learning_rate_list = [], []
accuracy_list, inaccuracy_list = [], []

# every (n_estimators, learning_rate) pair, learning rate varying fastest
ab_grid = [(n_rounds, rate) for n_rounds in estimators for rate in learning_rate]

for n_rounds, rate in ab_grid:
    ab = AdaBoostClassifier(n_estimators=n_rounds, learning_rate=rate)
    # y_train is a single-column 2-D array; ravel() hands fit() a flat label vector
    ab.fit(X_train, y_train.ravel())
    ab_pred = ab.predict(X_test)

    # score on the held-out split: within 15% of the actual / more than 75% off
    within_margin = check_accuracy(ab_pred, y_test, variance=.15)
    outside_margin = check_inaccuracy(ab_pred, y_test, variance=.75)

    n_estimators_list.append(n_rounds)
    learning_rate_list.append(rate)
    accuracy_list.append(within_margin)
    inaccuracy_list.append(outside_margin)

In [52]:
# All four result lists must come from the same AdaBoost grid-search run.
# The Random Forest search further up builds n_estimators_list, accuracy_list and
# inaccuracy_list under the same names, so running the cells out of order leaves
# lists of different lengths and makes the indices below point at the wrong run.
result_lengths = {
    len(n_estimators_list),
    len(learning_rate_list),
    len(accuracy_list),
    len(inaccuracy_list),
}
assert len(result_lengths) == 1, (
    'result lists are out of sync - re-run the AdaBoost grid search cell')

# highest accuracy
# np.argmax returns the first maximum, and does not rebind the builtins max/min
# the way the hand-written scan did; an all-zero list now selects index 0
# rather than the last entry
index_max = int(np.argmax(accuracy_list))

# lowest inaccuracy (first minimum)
index_min = int(np.argmin(inaccuracy_list))

print(f'Highest Accuracy: {accuracy_list[index_max]:.3}')
print('Used ', n_estimators_list[index_max], ' estimators')
print('Used ', learning_rate_list[index_max], ' as learning rate')
print(f'Inaccuracy of Above: {inaccuracy_list[index_max]:.3}')

print('\n')

print(f'Lowest Inaccuracy: {inaccuracy_list[index_min]:.3}')
print('Used ', n_estimators_list[index_min], ' estimators')
print('Used ', learning_rate_list[index_min], ' as learning rate')
print(f'Accuracy of Above: {accuracy_list[index_min]:.3}')

Highest Accuracy: 0.443
Used  100  estimators
Used  0.01  as learning rate
Inaccuracy of Above: 0.373


Lowest Inaccuracy: 0.334
Used  50  estimators
Used  0.8944444444444445  as learning rate
Accuracy of Above: 0.403


**Predicting the Month of April**

Using the results from above, we are going to generate two sets of estimates for the month of April: one seeking to minimize inaccuracy, and one seeking to maximize accuracy. We will use Random Forest to generate these estimates since it had the … (**TODO:** this sentence was left unfinished; state the result that favoured Random Forest over AdaBoost.)

Our highest accuracy used 100 estimators, the entropy function to measure the quality of the split, and bootstrapping.

Our lowest inaccuracy used 50 estimators and the gini function to measure the quality of the split, and it also used bootstrapping.

In [58]:
# X and y from the training-data cell are still in scope and are reused below

# load the April data and discard the metrics that are not modelled
df = pd.read_csv('data/data_sanitized_april.csv').drop(
    columns=['County', 'Total Sum Cases', 'Number of MIP Charges']
)

# separating labels from features (same feature columns, same order, as for X)
april_feature_columns = [
    'Days Since March 17',
    'Population',
    'Average Cases Per 7 Days',
    'Average Daily Change in Cases Per 7 Days',
    'Area of County',
]
X_april = np.asarray(df[april_feature_columns].copy())
# single-column selection keeps y_april two-dimensional, matching the layout of y
y_april = np.asarray(df[['New Cases per Day']].copy())

In [61]:
# the two Random Forest configurations to compare, fitted in this order
forest_settings = [
    dict(n_estimators=100, criterion='entropy', bootstrap=True),
    dict(n_estimators=50, criterion='gini', bootstrap=True),
]

april_scores = []
for settings in forest_settings:
    # train on the full original data set, then predict the April rows
    rf = RandomForestClassifier(n_jobs=-1, **settings)
    rf.fit(X, y.ravel())
    rf_pred = rf.predict(X_april)

    # (share within 25% of the actual, share more than 75% off)
    april_scores.append((
        check_accuracy(rf_pred, y_april, variance=.25),
        check_inaccuracy(rf_pred, y_april, variance=.75),
    ))

(a1, i1), (a2, i2) = april_scores

report = [
    ('100 Estimators, Entropy, Bootstrapping', a1, i1),
    ('50 Estimators, Gini, Bootstrapping', a2, i2),
]
for position, (title, accuracy, inaccuracy) in enumerate(report):
    if position:
        # blank separator between the two reports
        print('\n')
    print(title)
    print('accuracy: ', accuracy)
    print('inaccuracy: ', inaccuracy)

100 Estimators, Entropy, Bootstrapping
accuracy:  0.5838541666666667
inaccuracy:  0.26979166666666665


50 Estimators, Gini, Bootstrapping
accuracy:  0.5739583333333333
inaccuracy:  0.26510416666666664
