In [2]:
import zipfile

import holidays
import numpy as np
import pandas as pd
from pandas.tseries.holiday import USFederalHolidayCalendar

# To do
- Time-based features
    - Extract month & holiday
- Lagged variable
- Moving window statistics

# Test-Train Split (Raw)

In [2]:
# Load the raw data, then drop every county for which at least one
# partially-missing column is missing on 100% of that county's rows.
data_raw = pd.read_csv("/Users/Huey.ts/Desktop/STAT_390/data/us-covid-raw-data.csv")

# Percentage of nulls per column: over the whole file, and within each county.
null_per = data_raw.isnull().mean()*100
null_dat = data_raw.groupby("county_name", dropna=False).apply(lambda x: x.isnull().mean()*100)

# Columns that contain at least one null (kept as a one-element list holding an Index).
null_features = [null_per[null_per != 0].index]

# Counties where any of those columns is entirely null, as a sorted unique array.
counties = np.unique(np.concatenate([
    null_dat.index[(null_dat[cols] == 100).any(axis=1).to_numpy()]
    for cols in null_features
]))

# reset_index() without drop=True keeps the old row labels in an "index" column.
new_data = data_raw.loc[~data_raw["county_name"].isin(counties)].reset_index()

In [3]:
# Per-county split: the first 191 rows of each county go to `train`,
# the remaining rows go to `test`. Counties are concatenated in order of
# first appearance, so each county's rows end up contiguous.
TRAIN_ROWS_PER_COUNTY = 191

county_frames = [
    new_data[new_data.county_fips == fips]
    for fips in new_data.county_fips.unique()
]

train = pd.concat([frame.iloc[:TRAIN_ROWS_PER_COUNTY, :] for frame in county_frames])
test = pd.concat([frame.iloc[TRAIN_ROWS_PER_COUNTY:, :] for frame in county_frames])

In [4]:
# Renumber the rows after the per-county concat and discard the "index" column.
train = train.reset_index().drop(columns=["index"])

In [5]:
# Renumber the rows after the per-county concat and discard the "index" column.
test = test.reset_index().drop(columns=["index"])

In [6]:
# Remove the "level_0" helper column from the training frame.
train = train.drop(columns=["level_0"])

In [7]:
# Remove the "level_0" helper column from the test frame.
test = test.drop(columns=["level_0"])

# Time-based features

## Month

In [74]:
# Parse the "date" strings (month/day/two-digit-year) into datetimes.
train = train.assign(date=pd.to_datetime(train["date"], format="%m/%d/%y"))

In [75]:
# Calendar month name of each row's date, stored as a new "month" column.
train = train.assign(month=train["date"].dt.month_name())

## Holiday

In [76]:
# Daily date range used to bound the holiday lookup below.
dr = pd.date_range("2020-01-22", "2020-09-16")

In [77]:
# List the US federal holidays that fall inside `dr`.
# The original called an undefined `calendar()`; pandas' USFederalHolidayCalendar
# is assumed to be the intended class (its `holidays(start=, end=)` matches this call).
# NOTE: the result stays bound to `holidays` because the next cell reads that name;
# be aware that this shadows the `holidays` module imported at the top of the notebook.
cal = USFederalHolidayCalendar()
holidays = cal.holidays(start=dr.min(), end=dr.max())

In [78]:
# Boolean flag: True where the row's date is one of the dates in `holidays`.
train = train.assign(holiday=train["date"].isin(holidays))

# Lagged variable & Moving window statistics

In [79]:
# For every county (in order of first appearance) take its confirmed-case
# series once, then derive 15/30/45-row lags and 15/30/45-row rolling means.
# Each result is a list holding one Series per county.
case_column = "covid_19_confirmed_cases"
county_cases = [
    train.loc[train.county_fips == fips, case_column]
    for fips in train.county_fips.unique()
]

lagged_15 = [cases.shift(15) for cases in county_cases]
lagged_30 = [cases.shift(30) for cases in county_cases]
lagged_45 = [cases.shift(45) for cases in county_cases]

# The rolling window ends at (and includes) the current row.
rolling_15 = [cases.rolling(15).mean() for cases in county_cases]
rolling_30 = [cases.rolling(30).mean() for cases in county_cases]
rolling_45 = [cases.rolling(45).mean() for cases in county_cases]

In [80]:
# Flatten each list of per-county lag Series into a single 1-D array.
lagged_15, lagged_30, lagged_45 = map(
    np.concatenate, (lagged_15, lagged_30, lagged_45)
)

In [81]:
# Flatten each list of per-county rolling-mean Series into a single 1-D array.
rolling_15, rolling_30, rolling_45 = map(
    np.concatenate, (rolling_15, rolling_30, rolling_45)
)

In [88]:
# Attach the lag arrays to `train` as new columns, one index-on-index merge each.
# The arrays are matched to rows by position (they are given train's index).
lag_columns = {
    "lagged_covid_19_confirmed_cases_15": lagged_15,
    "lagged_covid_19_confirmed_cases_30": lagged_30,
    "lagged_covid_19_confirmed_cases_45": lagged_45,
}
for column_name, values in lag_columns.items():
    lag_frame = pd.DataFrame(values, columns=[column_name], index=train.index)
    train = train.merge(lag_frame, left_index=True, right_index=True)

In [89]:
# Attach the rolling-mean arrays to `train` as new columns, one index-on-index merge each.
# The arrays are matched to rows by position (they are given train's index).
rolling_columns = {
    "rolling_covid_19_confirmed_cases_15": rolling_15,
    "rolling_covid_19_confirmed_cases_30": rolling_30,
    "rolling_covid_19_confirmed_cases_45": rolling_45,
}
for column_name, values in rolling_columns.items():
    rolling_frame = pd.DataFrame(values, columns=[column_name], index=train.index)
    train = train.merge(rolling_frame, left_index=True, right_index=True)

In [90]:
# Show the training frame with the month, holiday, lag and rolling columns added above.
train

Unnamed: 0,date,county_fips,county_name,state_fips,state_name,covid_19_confirmed_cases,covid_19_deaths,social_distancing_total_grade,social_distancing_visitation_grade,social_distancing_encounters_grade,...,age_85_or_higher,immigrant_student_ratio,month,holiday,lagged_covid_19_confirmed_cases_15,lagged_covid_19_confirmed_cases_30,lagged_covid_19_confirmed_cases_45,rolling_covid_19_confirmed_cases_15,rolling_covid_19_confirmed_cases_30,rolling_covid_19_confirmed_cases_45
0,2020-01-22,1003,Baldwin County,1,Alabama,0.0,0.0,,,,...,2,0.021048,January,False,,,,,,
1,2020-01-23,1003,Baldwin County,1,Alabama,0.0,0.0,,,,...,2,0.021048,January,False,,,,,,
2,2020-01-24,1003,Baldwin County,1,Alabama,0.0,0.0,,,,...,2,0.021048,January,False,,,,,,
3,2020-01-25,1003,Baldwin County,1,Alabama,0.0,0.0,,,,...,2,0.021048,January,False,,,,,,
4,2020-01-26,1003,Baldwin County,1,Alabama,0.0,0.0,,,,...,2,0.021048,January,False,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
445789,2020-07-26,56037,Sweetwater County,56,Wyoming,1.0,0.0,D+,F,A,...,1,0.025714,July,False,8.0,7.0,1.0,5.266667,4.500000,3.977778
445790,2020-07-27,56037,Sweetwater County,56,Wyoming,6.0,0.0,D+,F,A,...,1,0.025714,July,False,2.0,1.0,2.0,5.533333,4.666667,4.066667
445791,2020-07-28,56037,Sweetwater County,56,Wyoming,7.0,0.0,D+,F,A,...,1,0.025714,July,False,12.0,1.0,2.0,5.200000,4.866667,4.177778
445792,2020-07-29,56037,Sweetwater County,56,Wyoming,6.0,0.0,D+,F,A,...,1,0.025714,July,False,6.0,4.0,0.0,5.200000,4.933333,4.311111


# Function

In [8]:
def feature_engineering(df, target="covid_19_confirmed_cases",
                        group_col="county_fips", windows=(15, 30, 45)):
    """Return a copy of ``df`` with calendar, lag and rolling-mean features added.

    Added columns, in this order:

    * ``month`` - month name of ``date``.
    * ``holiday`` - True where ``date`` is a US federal holiday (as listed by
      pandas' ``USFederalHolidayCalendar``) between the frame's first and last date.
    * ``lagged_<target>_<w>`` for each ``w`` in ``windows`` - ``target`` shifted
      by ``w`` rows within each ``group_col`` group.
    * ``rolling_<target>_<w>`` for each ``w`` in ``windows`` - mean of ``target``
      over the last ``w`` rows (current row included) within each group.

    Rows without enough history get 0 in the lag/rolling columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``date`` (strings in ``%m/%d/%y`` form, or datetimes),
        ``group_col`` and ``target``. It is not modified.
    target : str
        Column to lag and average.
    group_col : str
        Column identifying the series each row belongs to.
    windows : iterable of int
        Lag distances / rolling window sizes, in rows.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same rows and index as ``df``.

    Notes
    -----
    Features are computed with ``groupby`` and stay aligned to ``df``'s index,
    so rows of one group need not be contiguous; rows are taken in their
    existing order within each group. If a feature column already exists it is
    overwritten rather than duplicated.

    NOTE(review): the rolling mean includes the current row's ``target`` value;
    confirm that is intended if ``target`` is also the quantity being predicted.
    """
    out = df.copy()  # leave the caller's frame untouched

    out["date"] = pd.to_datetime(out["date"], format="%m/%d/%y")
    out["month"] = out["date"].dt.month_name()

    # Look up holidays over the span of dates actually present in the frame.
    valid_dates = out["date"].dropna()
    if valid_dates.empty:
        holiday_dates = pd.DatetimeIndex([])
    else:
        holiday_dates = USFederalHolidayCalendar().holidays(
            start=valid_dates.min(), end=valid_dates.max()
        )
    out["holiday"] = out["date"].isin(holiday_dates)

    # Group once; shift/transform return results aligned to the original index.
    per_group = out.groupby(group_col, sort=False)[target]
    new_columns = {}
    for window in windows:
        new_columns[f"lagged_{target}_{window}"] = per_group.shift(window).fillna(0)
    for window in windows:
        new_columns[f"rolling_{target}_{window}"] = per_group.transform(
            lambda cases: cases.rolling(window).mean()
        ).fillna(0)

    for column_name, values in new_columns.items():
        out[column_name] = values

    return out

In [9]:
# Build the engineered training set by applying feature_engineering to the train split.
train_engineered = feature_engineering(train)

In [10]:
# Build the engineered test set by applying feature_engineering to the test split.
test_engineered = feature_engineering(test)

# Export to CSV

In [12]:
# Write both engineered frames as CSV members of one deflate-compressed zip archive.
archive_path = "/Users/Huey.ts/Desktop/STAT_390/data/Engineered_Data.zip"
exports = {
    'train.csv': train_engineered,
    'test.csv': test_engineered,
}

with zipfile.ZipFile(archive_path, 'w', compression=zipfile.ZIP_DEFLATED) as archive:
    for member_name, frame in exports.items():
        # Each member is streamed straight into the archive, without the row index.
        with archive.open(member_name, 'w') as member:
            frame.to_csv(member, index=False)