In [1]:
import pandas as pd
import numpy as np
import holidays
import zipfile
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

In [3]:
raw_data = pd.read_csv("../data/us-covid-raw-data.csv")

# Split data by regions

In [4]:
regions = {
    'South': ['Alabama', 'Arkansas', 'Delaware', 'Florida', 'Georgia', 'Kentucky',
              'Louisiana', 'Maryland', 'Mississippi', 'North Carolina', 'South Carolina',
              'Tennessee', 'Virginia', 'West Virginia'],
    'NewEngland': ['Connecticut', 'Maine', 'Massachusetts', 'New Hampshire', 'Rhode Island', 'Vermont', 'New York',
                   'New Jersey', 'Pennsylvania']}

In [5]:
NewEngland = raw_data[raw_data['state_name'].isin(regions['NewEngland'])]
South = raw_data[raw_data['state_name'].isin(regions['South'])]

# Aggregate data by county
- confirmed_cases --> sum
- numerical columns --> mean
- categorical columns --> mode

In [917]:
NewEngland.drop(columns = ["county_name", "state_fips", "state_name", "covid_19_deaths"], inplace = True)
South.drop(columns = ["county_name", "state_fips", "state_name", "covid_19_deaths"], inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  NewEngland.drop(columns = ["county_name", "state_fips", "state_name", "covid_19_deaths"], inplace = True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  South.drop(columns = ["county_name", "state_fips", "state_name", "covid_19_deaths"], inplace = True)


In [918]:
num_cols = NewEngland._get_numeric_data().columns.tolist()
cat_cols = [i for i in NewEngland.columns if i not in num_cols]
cases_cols = ["date", "covid_19_confirmed_cases"]

In [919]:
num_cols.insert(0, "date")
num_cols.remove("covid_19_confirmed_cases")
num_cols.remove("county_fips")

In [920]:
NewEngland_num = NewEngland.loc[:, num_cols]
NewEngland_cat = NewEngland.loc[:, cat_cols]
NewEngland_cases = NewEngland.loc[:, cases_cols]

In [921]:
South_num = South.loc[:, num_cols]
South_cat = South.loc[:, cat_cols]
South_cases = South.loc[:, cases_cols]

## Numerical Columns

In [922]:
NewEngland_num = NewEngland_num.groupby(["date"]).mean()

In [923]:
South_num = South_num.groupby(["date"]).mean()

In [924]:
South_num = South_num.fillna(value = 0)

## Categorical Columns

In [925]:
NewEngland_cat = NewEngland_cat.fillna(value = "F")
South_cat = South_cat.fillna(value = "F")

In [926]:
NewEngland_cat = NewEngland_cat.groupby(["date"]).agg(pd.Series.mode)

In [927]:
South_cat = South_cat.groupby(["date"]).agg(pd.Series.mode)

In [928]:
NewEngland_cat["social_distancing_total_grade"]["04/09/20"] = "C"
NewEngland_cat["social_distancing_total_grade"]["04/21/20"] = "C"
NewEngland_cat["social_distancing_total_grade"]["05/31/20"] = "D"

NewEngland_cat["social_distancing_encounters_grade"]["03/25/20"] = "A"
NewEngland_cat["social_distancing_encounters_grade"]["03/28/20"] = "A"
NewEngland_cat["social_distancing_encounters_grade"]["05/03/20"] = "A"

## Confirmed Cases Column

In [929]:
NewEngland_cases = NewEngland_cases.groupby(["date"]).sum()

In [930]:
South_cases = South_cases.groupby(["date"]).sum()

# Merge data by regions

In [931]:
NewEngland = NewEngland_cases.merge(NewEngland_num, left_index = True, right_index = True)

In [932]:
NewEngland = NewEngland.merge(NewEngland_cat, left_index = True, right_index = True)

In [933]:
South = South_cases.merge(South_num, left_index = True, right_index = True)

In [934]:
South = South.merge(South_cat, left_index = True, right_index = True)

In [935]:
NewEngland.index = pd.to_datetime(NewEngland.index, format = "%m/%d/%y") 
South.index = pd.to_datetime(South.index, format = "%m/%d/%y")

# Train-Test Split

In [936]:
NewEngland_train = NewEngland.iloc[:191,:]
NewEngland_test = NewEngland.iloc[191:,:]
South_train = South.iloc[:191,:]
South_test = South.iloc[191:,:]

# Feature Engineering
- Time-based features
    - Weekends
- Lagged variable
- Moving window statistics

In [937]:
def is_weekend(d):
  return d.weekday() > 4

In [938]:
def feature_engineering(df):
    df = df.assign(weekend = [is_weekend(i) for i in df.index.to_list()])
    df["weekend"].replace({False: 0, True: 1}, inplace=True)
    
    df["lagged_covid_19_confirmed_cases_15"] = df["covid_19_confirmed_cases"].shift(15)
    df["lagged_covid_19_confirmed_cases_30"] = df["covid_19_confirmed_cases"].shift(30)
    df["lagged_covid_19_confirmed_cases_45"] = df["covid_19_confirmed_cases"].shift(45)
    
    df["rolling_covid_19_confirmed_cases_15"] = df["covid_19_confirmed_cases"].rolling(15).mean()
    df["rolling_covid_19_confirmed_cases_30"] = df["covid_19_confirmed_cases"].rolling(30).mean()
    df["rolling_covid_19_confirmed_cases_45"] = df["covid_19_confirmed_cases"].rolling(45).mean()
    
    df.fillna(0, inplace = True)
    
    return df

In [939]:
NewEngland_train = feature_engineering(NewEngland_train)
NewEngland_test = feature_engineering(NewEngland_test)

In [940]:
South_train = feature_engineering(South_train)
South_test = feature_engineering(South_test)

# Data Preprocessing
- scaling numerical columns
- one-hot-encode categorical columns

In [941]:
num_cols = NewEngland_train._get_numeric_data().columns.to_list()
cat_cols = [i for i in NewEngland_train.columns if i not in num_cols]

In [942]:
num_cols.remove("covid_19_confirmed_cases")

In [943]:
def scale_encode(dfs):
    cat_pipeline = Pipeline([("encoding", OneHotEncoder())])
    num_pipeline = Pipeline([("scaling", StandardScaler())])
    
    full_pipeline = ColumnTransformer([
                    ("cat", cat_pipeline, cat_cols),
                    ("num", num_pipeline, num_cols)
    ])
    
    train_preprocessed = full_pipeline.fit_transform(dfs[0])
    test_preprocessed = full_pipeline.transform(dfs[1])
    
    column_names = full_pipeline.get_feature_names_out()
    
    train = pd.DataFrame(train_preprocessed, columns = column_names, index = dfs[0].index)
    test = pd.DataFrame(test_preprocessed, columns = column_names, index = dfs[1].index)
    
    return train, test

In [944]:
NEW_train = scale_encode([NewEngland_train, NewEngland_test])[0].merge(NewEngland_train.loc[:,"covid_19_confirmed_cases"],left_index=True, right_index=True)
NEW_test = scale_encode([NewEngland_train, NewEngland_test])[1].merge(NewEngland_test.loc[:,"covid_19_confirmed_cases"],left_index=True, right_index=True)

S_train = scale_encode([South_train, South_test])[0].merge(South_train.loc[:,"covid_19_confirmed_cases"],left_index=True, right_index=True)
S_test = scale_encode([South_train, South_test])[1].merge(South_test.loc[:,"covid_19_confirmed_cases"],left_index=True, right_index=True)

# Export to CSV

In [945]:
NEW_train.to_csv("/Users/Huey.ts/Desktop/STAT_390/data/ne_train.csv")
NEW_test.to_csv("/Users/Huey.ts/Desktop/STAT_390/data/ne_test.csv")
S_train.to_csv("/Users/Huey.ts/Desktop/STAT_390/data/s_train.csv")
S_test.to_csv("/Users/Huey.ts/Desktop/STAT_390/data/s_test.csv")