In [43]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from xgboost import XGBRegressor
from xgboost import plot_importance
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GroupShuffleSplit, GroupKFold, GridSearchCV, cross_val_score
from sklearn import metrics
import seaborn as sns
from sklearn.metrics import mean_absolute_error, mean_squared_error
import math
from xgboost import plot_importance
from matplotlib import pyplot
from sklearn.tree import DecisionTreeRegressor, ExtraTreeRegressor, plot_tree
from sklearn.pipeline import make_pipeline

In [2]:
#read in state data
df=pd.read_csv("Downloads/AllDataStateFinalFinalFinal.csv")

In [3]:
df.head()

Unnamed: 0,State,variant,share_max,share_first,max_date,first_date,CDC_Region,max_var_cases_per_week,first_var_cases_week,Quarter_first_cases,...,7 Region,8 Region,9 Region,Omicron,% Fair or Poor Health,% Smokers,% Adults with Obesity,% Flu Vaccinated,% Severe Housing Problems,Population_Density
0,Alabama,B.1.1.529,0.255104,0.007616,1/22/22,12/4/21,4,29658.17465,23.342629,4,...,0,0,0,1,21.410189,21.092716,36.3,42,13.685917,99.513163
1,Alabama,B.1.617.2,0.984046,0.006657,9/4/21,4/24/21,4,32504.99902,25.657797,2,...,0,0,0,0,21.410189,21.092716,36.3,42,13.685917,99.513163
2,Alabama,BA.1.1,0.735191,0.006385,1/22/22,12/4/21,4,85472.53443,19.569541,4,...,0,0,0,1,21.410189,21.092716,36.3,42,13.685917,99.513163
3,Alabama,BA.2,0.733301,0.001973,4/2/22,1/8/22,4,2086.24079,136.665702,1,...,0,0,0,1,21.410189,21.092716,36.3,42,13.685917,99.513163
4,Alabama,BA.2.12.1,0.579754,0.000658,6/11/22,1/8/22,4,4830.511278,45.555234,1,...,0,0,0,1,21.410189,21.092716,36.3,42,13.685917,99.513163


In [4]:
df.shape

(408, 56)

In [5]:
#create split using group shuffle to keep state together
gss=GroupShuffleSplit(n_splits=2, train_size=.8, random_state=42)

#column to group by
split = gss.split(df, groups=df['state_group'])

#splitting testing and training indexes
train_inds, test_inds = next(split)

#test and train dfs split by state
train=df.iloc[train_inds].copy()
test=df.iloc[test_inds].copy()

In [6]:
print(train.shape)
test.shape

(320, 56)


(88, 56)

In [7]:
#create y test & train data
y_test = np.array(test['diff_weeks'])
y_train = np.array(train['diff_weeks'])

In [8]:
#create x test and train data
x_test = test.iloc[:,-42:]
x_train = train.iloc[:,-42:]

In [9]:
#array of the group column
group_train=np.array(train['state_group'])

In [10]:
rf = RandomForestRegressor()
rf.fit(x_train, y_train)
preds = rf.predict(x_test)

In [11]:
#evaluation metrics

print("MAE:", mean_absolute_error(y_test, preds))
print("MSE:", mean_squared_error(y_test, preds))
print("RMSE:", math.sqrt(mean_squared_error(y_test, preds)))

MAE: 2.4295454545454547
MSE: 12.87135
RMSE: 3.5876663724488096


In [12]:
#grid search for best parameters using group kfold to keep the states grouped together in testing and training.
#score to minimize is RMSE

rf = RandomForestRegressor()
grid_values = {'n_estimators' : [ 30, 40, 50,60,70,80,90], 'max_depth': [10,20,30,40,50,60], 'min_samples_leaf':[10,20,30,40,50]}
gkf=GroupKFold(n_splits=5).split(x_train, y_train,group_train )
grid_rf = GridSearchCV(rf, param_grid = grid_values, scoring = 'neg_root_mean_squared_error', cv=gkf)
result=grid_rf.fit(x_train, y_train )
print(grid_rf.best_params_)

{'max_depth': 60, 'min_samples_leaf': 10, 'n_estimators': 50}


In [13]:
#predicting test data using best parameters from grid search
tuned_preds = grid_rf.predict(x_test)

In [14]:
#evaluation metrics of predictions using best parameters from grid search

print("MAE:", mean_absolute_error(y_test, tuned_preds))
print("MSE:", mean_squared_error(y_test, tuned_preds))
print("RMSE:", math.sqrt(mean_squared_error(y_test, tuned_preds)))

MAE: 2.688738578410062
MSE: 13.867130063513317
RMSE: 3.7238595654929467


In [31]:
#grid search for best parameters using group kfold to keep the states grouped together in testing and training.
#score to minimize is RMSE

rf = RandomForestRegressor()
grid_values = {'n_estimators' : [ 30, 40, 50,60,70,80,90], 'max_depth': [10,20,30,40,50,60]}
gkf=GroupKFold(n_splits=5).split(x_train, y_train,group_train )
grid_rf = GridSearchCV(rf, param_grid = grid_values, scoring = 'neg_root_mean_squared_error', cv=gkf)
result=grid_rf.fit(x_train, y_train )
print(grid_rf.best_params_)

{'max_depth': 10, 'n_estimators': 30}


In [32]:
#predicting test data using best parameters from grid search
tuned_preds = grid_rf.predict(x_test)

In [33]:
#evaluation metrics of predictions using best parameters from grid search

print("MAE:", mean_absolute_error(y_test, tuned_preds))
print("MSE:", mean_squared_error(y_test, tuned_preds))
print("RMSE:", math.sqrt(mean_squared_error(y_test, tuned_preds)))

MAE: 2.413920496237742
MSE: 12.53234994242647
RMSE: 3.5401059224868496


In [23]:
#run xtree no parameters
xtree = ExtraTreeRegressor(random_state=0)
xtree.fit(x_train, y_train )
preds = xtree.predict(x_test)
#evaluation metrics

print("MAE:", mean_absolute_error(y_test, preds))
print("MSE:", mean_squared_error(y_test, preds))
print("RMSE:", math.sqrt(mean_squared_error(y_test, preds)))

MAE: 2.465909090909091
MSE: 15.375
RMSE: 3.9210967853395307


In [20]:
#grid search for best parameters using group kfold to keep the states grouped together in testing and training.
#score to minimize is RMSE
xtree = ExtraTreeRegressor(random_state=0)
param = {'max_leaf_nodes': [10,20, 30, 40, 50,60,70 ], 'min_samples_split': [2,3,4,5,6], 'splitter':['best','random']}
#param = {'max_leaf_nodes': [3,6,9,12,15], 'min_samples_split': [2,3,4,5,6]}
gkf=GroupKFold(n_splits=5).split(x_train, y_train,group_train )
gridxtree = GridSearchCV(xtree, param_grid = param, scoring = 'neg_root_mean_squared_error', cv=gkf)
result=gridxtree.fit(x_train, y_train )
print(gridxtree.best_params_)

{'max_leaf_nodes': 30, 'min_samples_split': 5, 'splitter': 'random'}


In [29]:
#predicting test data using best parameters from grid search
tree_tuned_preds = gridxtree.predict(x_test)

In [30]:
#evaluation metrics of predictions using best parameters from grid search

print("MAE:", mean_absolute_error(y_test, tree_tuned_preds))
print("MSE:", mean_squared_error(y_test, tree_tuned_preds))
print("RMSE:", math.sqrt(mean_squared_error(y_test, tree_tuned_preds)))

MAE: 2.423915283992156
MSE: 12.683456483925534
RMSE: 3.5613840685786102


In [34]:
#SVR

In [37]:
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [38]:
scale = MinMaxScaler()
x_train2 = scale.fit_transform(x_train)
x_test2 = scale.fit_transform(x_test)

In [39]:
svr=SVR()
svr.fit(x_train2, y_train)

SVR()

In [40]:
SVRpreds = svr.predict(x_test2)

In [41]:
#evaluation metrics
print("MAE:", mean_absolute_error(y_test, SVRpreds))
print("MSE:", mean_squared_error(y_test, SVRpreds))
print("RMSE:", math.sqrt(mean_squared_error(y_test, SVRpreds)))

MAE: 4.328296159137
MSE: 37.206820315334234
RMSE: 6.099739364541262


In [44]:
#grid search for best parameters using group kfold to keep the states grouped together in testing and training.
#score to minimize is RMSE
#make pipeline for scaling
make_pipe = make_pipeline(scale, SVR())
#param = {'kernel':['poly','rbf','linear'],'degree':[1,2,3,4,5], 'gamma':[0.1,0.025,0.05,0.01]}
param = {'svr__kernel':['poly','rbf','linear'],'svr__degree':[1,2,3,4,5],'svr__gamma':['scale', 'auto']}
gkf=GroupKFold(n_splits=5).split(x_train, y_train,group_train )
gridsvr = GridSearchCV(make_pipe, param_grid = param, scoring = 'neg_root_mean_squared_error', cv=gkf)
result=gridsvr.fit(x_train, y_train )
print(gridsvr.best_params_)

{'svr__degree': 1, 'svr__gamma': 'scale', 'svr__kernel': 'linear'}


In [45]:
#predicting test data using best parameters from grid search
SVR_tuned_preds = gridsvr.predict(x_test)

In [46]:
#evaluation metrics of predictions using best parameters from grid search

print("MAE:", mean_absolute_error(y_test, SVR_tuned_preds))
print("MSE:", mean_squared_error(y_test, SVR_tuned_preds))
print("RMSE:", math.sqrt(mean_squared_error(y_test, SVR_tuned_preds)))

MAE: 3.5997636813127163
MSE: 23.842655125324406
RMSE: 4.882894134150812


In [47]:
#SVM with Feature Tuning

In [58]:
#drop correlated features
x_test_no_corr = x_test.drop(['Mutation Fitness', '#_Public_Airports' ,'#_business_establishments','Miles freight railroad','Miles passenger railroad' ,'PovertyRate'], axis=1)
x_train_no_corr = x_train.drop(['Mutation Fitness', '#_Public_Airports' , '#_business_establishments','Miles freight railroad','Miles passenger railroad' ,'PovertyRate'], axis=1)

In [59]:
scale = MinMaxScaler()
x_train3 = scale.fit_transform(x_train_no_corr)
x_test3 = scale.fit_transform(x_test_no_corr)

In [60]:
svr=SVR()
svr.fit(x_train3, y_train)
SVRpreds = svr.predict(x_test3)

In [61]:
#evaluation metrics
print("MAE:", mean_absolute_error(y_test, SVRpreds))
print("MSE:", mean_squared_error(y_test, SVRpreds))
print("RMSE:", math.sqrt(mean_squared_error(y_test, SVRpreds)))

MAE: 4.159786625949457
MSE: 35.609947012259134
RMSE: 5.96740705937337


In [62]:
#grid search for best parameters using group kfold to keep the states grouped together in testing and training.
#score to minimize is RMSE
#make pipeline for scaling
make_pipe = make_pipeline(scale, SVR())
#param = {'kernel':['poly','rbf','linear'],'degree':[1,2,3,4,5], 'gamma':[0.1,0.025,0.05,0.01]}
param = {'svr__kernel':['poly','rbf','linear'],'svr__degree':[1,2,3,4,5],'svr__gamma':['scale', 'auto']}
gkf=GroupKFold(n_splits=5).split(x_train_no_corr, y_train,group_train )
gridsvr = GridSearchCV(make_pipe, param_grid = param, scoring = 'neg_root_mean_squared_error', cv=gkf)
result=gridsvr.fit(x_train_no_corr, y_train )
print(gridsvr.best_params_)

{'svr__degree': 2, 'svr__gamma': 'scale', 'svr__kernel': 'poly'}


In [63]:
#predicting test data using best parameters from grid search
SVR_tuned_preds = gridsvr.predict(x_test_no_corr)

In [64]:
#evaluation metrics of predictions using best parameters from grid search

print("MAE:", mean_absolute_error(y_test, SVR_tuned_preds))
print("MSE:", mean_squared_error(y_test, SVR_tuned_preds))
print("RMSE:", math.sqrt(mean_squared_error(y_test, SVR_tuned_preds)))

MAE: 3.5672818424592965
MSE: 26.118154122831683
RMSE: 5.110592345592797
