In [1]:
import numpy as np
import pandas as pd
import torch
from tqdm import tqdm

In [2]:
train_data_path = "../Datasets/hw1/covid.train.csv"
test_data_path = "../Datasets/hw1/covid.test.csv"

In [3]:
train_csv = pd.read_csv(train_data_path)
test_csv = pd.read_csv(test_data_path)

In [4]:
train_csv.head()

Unnamed: 0,id,AL,AK,AZ,AR,CA,CO,CT,FL,GA,...,work_outside_home.4,shop.4,restaurant.4,spent_time.4,large_event.4,public_transit.4,anxious.4,depressed.4,worried_finances.4,tested_positive.4
0,0,0,0,0,0,0,0,0,1,0,...,31.113209,67.394551,36.674291,40.743132,17.842221,4.093712,10.440071,8.627117,37.329512,7.456154
1,1,0,0,0,0,0,1,0,0,0,...,33.920257,64.39838,34.612238,44.035688,17.808103,4.924935,10.172662,9.954333,32.508881,8.010957
2,2,0,0,0,0,0,0,0,0,0,...,31.604604,62.101064,26.521875,36.746453,13.903667,7.313833,10.388712,7.956139,36.745588,2.906977
3,3,0,0,0,0,0,0,0,0,0,...,35.115738,67.93552,38.022492,48.434809,27.134876,3.101904,10.498683,8.231522,38.680162,12.575816
4,4,0,0,0,0,0,0,0,0,0,...,35.129714,69.934592,38.242368,49.095933,22.683709,4.59462,9.878927,9.46929,28.344123,21.428589


共118列：
+ 0：id
+ 1~37: State
+ 38~41: COVID-like illness  (5天)
+ 42~49: Behavior Indicators  (5天)
+ 50~52: Medical Health Indicators  (5天)
+ 53: Tested Positive Cases  (5天)

$$1 + 37 + 3\times 5 + 8\times 5 + 3\times 5 + 1\times 5 = 118$$

## Dataset
+ 统计信息
+ 缺失值
+ 特征相关性
+ 特征规约

In [5]:
# 去掉'id'列
coulmns = train_csv.columns
train_dataset = train_csv[coulmns[1:]]

coulmns = test_csv.columns
test_dataset = test_csv[coulmns[1:]]

In [6]:
# 查看每一列Nan值的个数
nacount = train_dataset.isna().sum()
print(f"Nan元素总数：{nacount.sum()}")

print(nacount)

Nan元素总数：0
AL                    0
AK                    0
AZ                    0
AR                    0
CA                    0
                     ..
public_transit.4      0
anxious.4             0
depressed.4           0
worried_finances.4    0
tested_positive.4     0
Length: 117, dtype: int64


In [7]:
import csv

def save_pred(preds, save_path):
    with open(save_path, 'w') as f:
        writer = csv.writer(f)
        writer.writerow(['id', 'tested_positive'])
        for i, p in enumerate(preds):
            writer.writerow([i, p])

### Baseline

**All original features, linear regression**

In [8]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score

def cross_val(model, x, y):
    scores = cross_val_score(model, x, y, scoring='neg_mean_squared_error', cv=10)
    print(np.mean(np.sqrt(-scores)))

In [9]:
lin_reg = LinearRegression()
# 使用全部原始特征
cross_val(lin_reg, train_dataset.iloc[:, :-1], train_dataset.iloc[:, -1])

1.0521641724873885


In [10]:
# save test result

lin_reg = LinearRegression()
lin_reg.fit(train_dataset.iloc[:, :-1], train_dataset.iloc[:, -1])
preds = lin_reg.predict(test_dataset)

save_pred(preds, './plain_line_reg.txt')

**All original features, decision tree**

In [11]:
from sklearn.tree import DecisionTreeRegressor

tree_reg = DecisionTreeRegressor()

# 使用全部原始特征
cross_val(tree_reg, train_dataset.iloc[:, :-1], train_dataset.iloc[:, -1])

1.5966659089652886


In [12]:
# save test results

tree_reg = DecisionTreeRegressor()
tree_reg.fit(train_dataset.iloc[:, :-1], train_dataset.iloc[:, -1])
preds = tree_reg.predict(test_dataset)

save_pred(preds, './plain_tree_reg.txt')

#### Feature Selection

In [13]:
# 特征与特征之间的相关性矩阵
corr_matrix = train_dataset.corr()

# 查看与test_positive.4与其他特征之间的相关性
positive4_coor = corr_matrix['tested_positive.4'].sort_values(ascending=False)
mask = positive4_coor > 0.5
print(f'相关性大于0.5的特征数：{sum(mask)}')
# print(positive4_coor[mask])

相关性大于0.5的特征数：35


**Selected Original features, linear regression**

In [14]:
lin_reg = LinearRegression()
# 使用相关性较大的若干原始特征
selcted_columns = list(positive4_coor.index[mask])
cross_val(lin_reg, train_dataset[selcted_columns[1:]], train_dataset[selcted_columns[0]])

1.0492169977778807


In [15]:
# save test result

lin_reg = LinearRegression()
lin_reg.fit(train_dataset[selcted_columns[1:]], train_dataset[selcted_columns[0]])
preds = lin_reg.predict(test_dataset[selcted_columns[1:]])

save_pred(preds, './plain_line_reg_with_feature_selction.txt')

**Selected Original features, Decision Tree**

In [16]:
from sklearn.tree import DecisionTreeRegressor

tree_reg = DecisionTreeRegressor()
# 使用相关性较大的若干原始特征
selcted_columns = list(positive4_coor.index[mask])
cross_val(tree_reg, train_dataset[selcted_columns[1:]], train_dataset[selcted_columns[0]])

1.5812097726200158


In [17]:
# save test result

tree_reg = LinearRegression()
tree_reg.fit(train_dataset[selcted_columns[1:]], train_dataset[selcted_columns[0]])
preds = tree_reg.predict(test_dataset[selcted_columns[1:]])

save_pred(preds, './plain_tree_reg_with_feature_selction.txt')

**Feature Scaling**

In [18]:
from sklearn.base import BaseEstimator, TransformerMixin

In [19]:
class NormalScaler(BaseEstimator, TransformerMixin):
    def __init__(self, skip=None):
        self.skip = skip

    def fit(self, x, y=None):
        return self
    
    def transform(self, x):
        for col in x.columns:
            if self.skip not in col and x[col].max() > 1:
                mean = x[col].mean()
                std = x[col].std()
                x[col] = x[col].map(lambda i: (i - mean) / std)
                
        return x


class MaxminScaler(BaseEstimator, TransformerMixin):
    def __init__(self, skip=None):
        self.skip = skip

    def fit(self, x, y=None):
        return self
    
    def transform(self, x):
        for col in x.columns:
            if self.skip not in col and x[col].max() > 1:
                max_v = x[col].max()
                min_v = x[col].min()
                x[col] = x[col].map(lambda i: (i - min_v) / (max_v - min_v))
                
        return x

In [20]:
from sklearn.pipeline import Pipeline

In [21]:
normal_scaled_pipeline = Pipeline([('std_scaler', NormalScaler('tested_positive'))])
train_dataset_normal_scaled = normal_scaled_pipeline.transform(train_dataset.copy())
test_dataset_normal_scaled = normal_scaled_pipeline.transform(test_dataset.copy())

maxmin_scaled_pipeline = Pipeline([('maxmin_scaler', MaxminScaler('tested_positive'))])
train_dataset_maxmin_scaled = maxmin_scaled_pipeline.transform(train_dataset.copy())
test_dataset_maxmin_scaled = maxmin_scaled_pipeline.transform(test_dataset.copy())

In [22]:
train_dataset_normal_scaled.head()

Unnamed: 0,AL,AK,AZ,AR,CA,CO,CT,FL,GA,ID,...,work_outside_home.4,shop.4,restaurant.4,spent_time.4,large_event.4,public_transit.4,anxious.4,depressed.4,worried_finances.4,tested_positive.4
0,0,0,0,0,0,0,0,1,0,0,...,-0.497704,0.375942,0.45434,-1.121859,-0.626113,-0.281985,-0.671085,-0.854542,0.691637,7.456154
1,0,0,0,0,0,1,0,0,0,0,...,0.235301,-0.845236,-0.052472,-0.221926,-0.635917,0.108189,-0.78887,-0.024633,-0.642577,8.010957
2,0,0,0,0,0,0,0,0,0,0,...,-0.369386,-1.781575,-2.040928,-2.214246,-1.75786,1.229533,-0.693707,-1.274106,0.530023,2.906977
3,0,0,0,0,0,0,0,0,0,0,...,0.547478,0.596431,0.785702,0.980457,2.04414,-0.747538,-0.645268,-1.101909,1.065458,12.575816
4,0,0,0,0,0,0,0,0,0,1,...,0.551127,1.411212,0.839743,1.161158,0.765093,-0.04686,-0.91825,-0.327931,-1.795264,21.428589


In [23]:
train_dataset_maxmin_scaled.head()

Unnamed: 0,AL,AK,AZ,AR,CA,CO,CT,FL,GA,ID,...,work_outside_home.4,shop.4,restaurant.4,spent_time.4,large_event.4,public_transit.4,anxious.4,depressed.4,worried_finances.4,tested_positive.4
0,0,0,0,0,0,0,0,1,0,0,...,0.327454,0.62273,0.65406,0.297424,0.373811,0.184499,0.252049,0.257307,0.608567,7.456154
1,0,0,0,0,0,1,0,0,0,0,...,0.467725,0.405868,0.560692,0.475805,0.372223,0.242983,0.230257,0.397139,0.37825,8.010957
2,0,0,0,0,0,0,0,0,0,0,...,0.35201,0.239588,0.194364,0.080897,0.190477,0.411063,0.247863,0.186614,0.580669,2.906977
3,0,0,0,0,0,0,0,0,0,0,...,0.527464,0.661885,0.715106,0.714135,0.806369,0.114716,0.256825,0.215628,0.673097,12.575816
4,0,0,0,0,0,0,0,0,0,1,...,0.528163,0.806577,0.725062,0.749953,0.599174,0.219742,0.206319,0.346036,0.179269,21.428589


In [24]:
lin_reg = LinearRegression()
cross_val(lin_reg, train_dataset_normal_scaled.iloc[:, :-1], train_dataset_normal_scaled.iloc[:, -1])

lin_reg = LinearRegression()
cross_val(lin_reg, train_dataset_maxmin_scaled.iloc[:, :-1], train_dataset_maxmin_scaled.iloc[:, -1])

1.0519556275693471
1.0519556275693447


In [25]:
# save test result

lin_reg = LinearRegression()
lin_reg.fit(train_dataset_normal_scaled.iloc[:, :-1], train_dataset_normal_scaled.iloc[:, -1])
preds = lin_reg.predict(test_dataset_normal_scaled)

save_pred(preds, './line_reg_with_normal_scaled_features.txt')

In [26]:
# save test result

lin_reg = LinearRegression()
lin_reg.fit(train_dataset_maxmin_scaled.iloc[:, :-1], train_dataset_maxmin_scaled.iloc[:, -1])
preds = lin_reg.predict(test_dataset_maxmin_scaled)

save_pred(preds, './tree_reg_with_normal_scaled_features.txt')

In [27]:
# 特征与特征之间的相关性矩阵
corr_matrix = train_dataset_normal_scaled.corr()

# 查看与test_positive.4与其他特征之间的相关性
positive4_coor = corr_matrix['tested_positive.4'].sort_values(ascending=False)

scores = []
thrs = []
for thr in np.linspace(0.1, 0.9, num=100):
    mask = positive4_coor > thr
    # print(f'相关性大于{thr}的特征数：{sum(mask)}')
    thrs.append(thr)
    lin_reg = LinearRegression()
    # 使用相关性较大的若干原始特征
    selcted_columns = list(positive4_coor.index[mask])
    x, y = train_dataset_normal_scaled[selcted_columns[1:]], train_dataset_normal_scaled[selcted_columns[0]]
    score_list = cross_val_score(lin_reg, x, y, scoring='neg_mean_squared_error', cv=10)
    scores.append(np.mean(np.sqrt(-score_list)))

print(f"minimal score: {min(scores)}, index {np.argmin(scores)}, thr: {thrs[np.argmin(scores)]}")

minimal score: 1.0468164063875014, index 65, thr: 0.6252525252525253


In [28]:
# save test result

# 特征与特征之间的相关性矩阵
corr_matrix = train_dataset_normal_scaled.corr()
# 查看与test_positive.4与其他特征之间的相关性
positive4_coor = corr_matrix['tested_positive.4'].sort_values(ascending=False)

mask = positive4_coor > thrs[np.argmin(scores)]
# 使用相关性较大的若干原始特征
selcted_columns = list(positive4_coor.index[mask])
x = train_dataset_normal_scaled[selcted_columns[1:]]
y = train_dataset_normal_scaled[selcted_columns[0]]

lin_reg = LinearRegression()
lin_reg.fit(x, y)
preds = lin_reg.predict(test_dataset_normal_scaled[selcted_columns[1:]])

save_pred(preds, './line_reg_with_selected_normal_scaled_features.txt')

### Deep Learning

In [29]:
from torch.utils.data import Dataset, DataLoader, random_split
import numpy as np
import pandas as pd
import sklearn
import torch
from tqdm import tqdm
from pathlib import Path
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split

In [30]:
def same_seed(seed):
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

In [31]:
def train_valid_split(dataset, valid_ratio, seed):
    valid_set_size = int(valid_ratio * len(dataset))
    train_set_size = len(dataset) - valid_set_size
    train_set, valid_set = random_split(dataset, [train_set_size, valid_set_size], generator=torch.Generator().manual_seed(seed))
    return np.array(train_set), np.array(valid_set)

class COVID19Dataset(Dataset):

    def __init__(self, x, y=None):
        if y is not None:
            self.y = torch.from_numpy(y)
        else:
            self.y = y
        self.x = torch.from_numpy(x)
    
    def __len__(self):
        return len(self.x)

    def __getitem__(self, item):
        if self.y is None:
            return self.x[item]
        else:
            return self.x[item], self.y[item]

In [78]:
class Model(torch.nn.Module):

    def __init__(self, input_channel):
        super(Model, self).__init__()
        self.linear1 = torch.nn.Linear(input_channel, 32)
        self.linear2 = torch.nn.Linear(32, 16)
        self.act = torch.nn.ReLU(inplace=True)
        self.linear3 = torch.nn.Linear(16, 1)

    def forward(self, x):
        x = self.act(self.linear1(x))
        # x = self.dropout(x)
        x = self.act(self.linear2(x))
        x = self.linear3(x)
        # x = self.linear4(x)
        return x

#### Training

In [92]:
from sklearn.model_selection import train_test_split

same_seed(77)
dataset = pd.read_csv(train_data_path)
dataset = dataset[dataset.columns[1:]]  # remove 'id' column
corr_matrix = dataset.corr()
target_coor = corr_matrix['tested_positive.4'].sort_values(ascending=False)
mask = target_coor > 0.5
print(f"selected features num: {np.sum(mask)}")
selected_feature_idx = list(target_coor.index[mask])

x_dataset = dataset[selected_feature_idx[1:]]
y_dataset = dataset.iloc[:, -1]
x_train, x_val, y_train, y_val = train_test_split(x_dataset, y_dataset, test_size=0.2, random_state=77)
print(x_train.shape, x_val.shape, y_train.shape, y_val.shape)


selected features num: 35
(2159, 34) (540, 34) (2159,) (540,)


In [93]:
train_loader = DataLoader(COVID19Dataset(x_train.values, y_train.values), 
                            batch_size=16, 
                            shuffle=True, 
                            num_workers=0, 
                            drop_last=True)

val_loader = DataLoader(COVID19Dataset(x_val.values, y_val.values), 
                        batch_size=8, 
                        shuffle=False)

device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = Model(input_channel=len(selected_feature_idx[1:])).to(device=device)
loss_fcn = torch.nn.MSELoss(reduction='mean')
optimizer = torch.optim.SGD(model.parameters(), lr=0.00001, momentum=0.9, weight_decay=1e-4, nesterov=True)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4, weight_decay=1e-4)

total_epoch = 600
for epoch in range(total_epoch):
    with tqdm(train_loader, total=len(train_loader)) as tbar:
        tbar.set_description(f"epoch {epoch+1}/{total_epoch}")
        for i, (x, y) in enumerate(train_loader):
            # print(x, y)
            model.train()
            x = x.float().to(device)
            preds = model(x).squeeze(dim=1)                
            loss = loss_fcn(y.float().to(device), preds)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            if i % len(train_loader) == 0:
                tot_mse = []
                for j, (x, y) in enumerate(val_loader):
                    model.eval()
                    preds = model(x.float().to(device)).squeeze(dim=1)
                    tot_mse.append(loss_fcn(y.float().to(device), preds).detach().cpu().numpy())
                tbar.set_postfix_str(f'train loss {loss.item():.3f} ; val loss {np.mean(tot_mse):.3f}')
            tbar.update(1)

epoch 1/600: 100%|██████████| 134/134 [00:00<00:00, 671.80it/s, train loss 100.557 ; val loss 98.535]
epoch 2/600: 100%|██████████| 134/134 [00:00<00:00, 926.62it/s, train loss 10.778 ; val loss 17.749]
epoch 3/600: 100%|██████████| 134/134 [00:00<00:00, 933.23it/s, train loss 7.993 ; val loss 10.960]
epoch 4/600: 100%|██████████| 134/134 [00:00<00:00, 926.80it/s, train loss 3.974 ; val loss 7.668]
epoch 5/600: 100%|██████████| 134/134 [00:00<00:00, 932.87it/s, train loss 6.898 ; val loss 6.478]
epoch 6/600: 100%|██████████| 134/134 [00:00<00:00, 920.22it/s, train loss 6.076 ; val loss 5.828]
epoch 7/600: 100%|██████████| 134/134 [00:00<00:00, 939.59it/s, train loss 3.761 ; val loss 5.387]
epoch 8/600: 100%|██████████| 134/134 [00:00<00:00, 926.63it/s, train loss 7.971 ; val loss 5.033]
epoch 9/600: 100%|██████████| 134/134 [00:00<00:00, 901.56it/s, train loss 3.648 ; val loss 4.732]
epoch 10/600: 100%|██████████| 134/134 [00:00<00:00, 920.26it/s, train loss 3.345 ; val loss 4.421]
epo

In [54]:
# for j, (x, y) in enumerate(val_loader):
#     model.eval()
#     preds = model(x.float().to(device)).squeeze(dim=1)
#     print(preds, y, sep='\n')
#     print(f"{'=' * 80}")

In [94]:
test_dataset = pd.read_csv(test_data_path)
test_dataset = test_dataset[test_dataset.columns[1:]]
test_dataset = test_dataset[selected_feature_idx[1:]]

test_loader = DataLoader(COVID19Dataset(test_dataset.values, None), batch_size=16, shuffle=False)

model.eval()
preds_all = []
for x in test_loader:
    x = x.float().to(device)
    preds = model(x).detach().cpu().numpy().squeeze()
    # print(preds)
    preds_all.extend(preds)


save_pred(preds_all, "./dl_selected_original_feature_adamw.txt")

#### 使用所有训练数据进行训练

In [95]:
same_seed(77)
dataset = pd.read_csv(train_data_path)
dataset = dataset[dataset.columns[1:]]  # remove 'id' column

feature_process_pipeline = Pipeline([('maxmin_scaler', MaxminScaler('tested_positive.4'))])  # 对除了target column之外的feature值进行规约化
dataset = feature_process_pipeline.transform(dataset.copy())

corr_matrix = dataset.corr()
target_coor = corr_matrix['tested_positive.4'].sort_values(ascending=False)
mask = target_coor > 0.5  # 选择与target相关性大于0.5的feature参与训练
selected_feature_idx = list(target_coor.index[mask])

x_dataset = dataset[selected_feature_idx[1:]]
y_dataset = dataset.iloc[:, -1]
x_train, x_val, y_train, y_val = train_test_split(x_dataset, y_dataset, test_size=0.2, random_state=77)  # 划分测试集和验证集
print(x_train.shape, x_val.shape, y_train.shape, y_val.shape)

(2159, 34) (540, 34) (2159,) (540,)


In [96]:
x_train.head()

Unnamed: 0,tested_positive.3,tested_positive.2,tested_positive.1,tested_positive,hh_cmnty_cli.4,hh_cmnty_cli.3,nohh_cmnty_cli.4,hh_cmnty_cli.2,nohh_cmnty_cli.3,hh_cmnty_cli.1,...,anxious.4,anxious.3,anxious.2,anxious.1,anxious,work_outside_home.4,work_outside_home.3,work_outside_home.2,work_outside_home.1,work_outside_home
1611,0.179428,0.157589,0.112928,0.095897,0.030578,0.038501,0.043391,0.035927,0.050363,0.03626,...,0.349364,0.315327,0.365267,0.352714,0.315041,0.12502,0.101719,0.108746,0.125541,0.118981
1738,0.16333,0.193173,0.25615,0.23635,0.12311,0.137675,0.101318,0.140786,0.106964,0.132884,...,0.424925,0.448514,0.426557,0.41761,0.46968,0.461224,0.470054,0.477118,0.463857,0.506354
1156,0.458181,0.475254,0.475254,0.437008,0.795198,0.790392,0.810744,0.811431,0.795247,0.801507,...,0.61092,0.637113,0.735003,0.739021,0.779809,0.432208,0.454133,0.552732,0.601344,0.591493
530,0.421764,0.328752,0.2794,0.29047,0.19316,0.193023,0.159913,0.176219,0.154867,0.181626,...,0.393338,0.43717,0.466091,0.519418,0.507335,0.498154,0.50657,0.537812,0.517657,0.528373
614,0.085595,0.101132,0.105805,0.104925,0.060224,0.056132,0.050647,0.057334,0.046105,0.058849,...,0.29007,0.281768,0.294648,0.268472,0.224005,0.152179,0.156876,0.128048,0.107788,0.110595


In [97]:
train_loader = DataLoader(COVID19Dataset(x_dataset.values, y_dataset.values),   # 使用全部的训练数据
                            batch_size=16, 
                            shuffle=True, 
                            num_workers=0, 
                            drop_last=True)

val_loader = DataLoader(COVID19Dataset(x_val.values, y_val.values),   # 从训练数据中拿出一部分测试（其实这部分数据也参加了训练）
                        batch_size=8, 
                        shuffle=False)

device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = Model(input_channel=len(selected_feature_idx[1:])).to(device=device)
loss_fcn = torch.nn.MSELoss(reduction='mean')
optimizer = torch.optim.SGD(model.parameters(), lr=0.0001, momentum=0.9, weight_decay=1e-4, nesterov=True)
lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=[50, 100, 250, 300, 400], gamma=0.1)

total_epoch = 500
for epoch in range(total_epoch):
    with tqdm(train_loader, total=len(train_loader)) as tbar:
        tbar.set_description(f"{epoch+1}/{total_epoch}")
        for i, (x, y) in enumerate(train_loader):
            # print(x, y)
            model.train()
            x = x.float().to(device)
            preds = model(x).squeeze(dim=1)                
            loss = loss_fcn(y.float().to(device), preds)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            if i % 100 == 0:
                tot_mse = []
                for j, (x, y) in enumerate(val_loader):
                    model.eval()
                    preds = model(x.float().to(device)).squeeze(dim=1)
                    tot_mse.append(loss_fcn(y.float().to(device), preds).detach().cpu().numpy())
                tbar.set_postfix_str(f'train loss {loss.item():.3f} ; val loss {np.mean(tot_mse):.3f}; lr {lr_scheduler.get_last_lr()[0]:.2e}')
            tbar.update(1)
            
    lr_scheduler.step()

1/500: 100%|██████████| 168/168 [00:00<00:00, 838.06it/s, train loss 7.519 ; val loss 9.416; lr 1.00e-04]   
2/500: 100%|██████████| 168/168 [00:00<00:00, 1039.66it/s, train loss 2.924 ; val loss 5.910; lr 1.00e-04]
3/500: 100%|██████████| 168/168 [00:00<00:00, 1020.91it/s, train loss 9.121 ; val loss 5.261; lr 1.00e-04]
4/500: 100%|██████████| 168/168 [00:00<00:00, 1033.44it/s, train loss 1.112 ; val loss 4.151; lr 1.00e-04]
5/500: 100%|██████████| 168/168 [00:00<00:00, 979.52it/s, train loss 2.098 ; val loss 3.442; lr 1.00e-04]
6/500: 100%|██████████| 168/168 [00:00<00:00, 1039.81it/s, train loss 2.406 ; val loss 2.882; lr 1.00e-04]
7/500: 100%|██████████| 168/168 [00:00<00:00, 1046.28it/s, train loss 3.055 ; val loss 2.804; lr 1.00e-04]
8/500: 100%|██████████| 168/168 [00:00<00:00, 1008.86it/s, train loss 1.529 ; val loss 2.255; lr 1.00e-04]
9/500: 100%|██████████| 168/168 [00:00<00:00, 973.86it/s, train loss 1.171 ; val loss 2.098; lr 1.00e-04]
10/500: 100%|██████████| 168/168 [00:

#### Testing

In [40]:
# for j, (x, y) in enumerate(val_loader):
#     model.eval()
#     preds = model(x.float().to(device)).squeeze(dim=1)
#     print(preds, y, sep='\n')
#     print(f"{'=' * 80}")

In [41]:
test_dataset = pd.read_csv(test_data_path)
test_dataset = test_dataset[test_dataset.columns[1:]]
test_dataset = feature_process_pipeline.transform(test_dataset.copy())

test_dataset = test_dataset[selected_feature_idx[1:]]
test_loader = DataLoader(COVID19Dataset(test_dataset.values, None), batch_size=16, shuffle=False)

model.eval()
preds_all = []
for x in test_loader:
    x = x.float().to(device)
    preds = model(x).detach().cpu().numpy().squeeze()
    preds_all.extend(preds)

save_pred(preds_all, "./dl_selected_maxmin_normalized_feature.txt")