Author: Heng-Jui Chang


# **Download Data**


If the Google drive links are dead, you can download data from [kaggle](https://www.kaggle.com/c/ml2021spring-hw1/data), and upload data manually to the workspace.

# **Import Some Packages**

In [1]:
# PyTorch
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, random_split

# For data preprocess
import pandas as pd
import numpy as np
import csv
import os

from tqdm import tqdm

# For plotting
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure

myseed = 80215  # set a random seed for reproducibility
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
np.random.seed(myseed)
torch.manual_seed(myseed)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(myseed)

# **Some Utilities**

You do not need to modify this part.

In [2]:
def get_device():
    ''' Get device (if GPU is available, use GPU) '''
    return 'cuda' if torch.cuda.is_available() else 'cpu'

def plot_learning_curve(loss_record, title=''):
    ''' Plot learning curve of your DNN (train & dev loss) '''
    total_steps = len(loss_record['train'])
    x_1 = range(total_steps)
    x_2 = x_1[::len(loss_record['train']) // len(loss_record['dev'])]
    figure(figsize=(6, 4))
    plt.plot(x_1, loss_record['train'], c='tab:red', label='train')
    plt.plot(x_2, loss_record['dev'], c='tab:cyan', label='dev')
    plt.ylim(1000.0, 100000.)
    plt.xlabel('Training steps')
    plt.ylabel('MSE loss')
    plt.title('Learning curve of {}'.format(title))
    plt.legend()
    plt.show()


def plot_pred(dv_set, model, device, lim=35., preds=None, targets=None):
    ''' Plot prediction of your DNN '''
    if preds is None or targets is None:
        model.eval()
        preds, targets = [], []
        for x, y in dv_set:
            x, y = x.to(device), y.to(device)
            with torch.no_grad():
                pred = model(x)
                preds.append(pred.detach().cpu())
                targets.append(y.detach().cpu())
        preds = torch.cat(preds, dim=0).numpy()
        targets = torch.cat(targets, dim=0).numpy()

    figure(figsize=(5, 5))
    plt.scatter(targets, preds, c='r', alpha=0.5)
    plt.plot([-0.2, lim], [-0.2, lim], c='b')
    plt.xlim(-0.2, lim)
    plt.ylim(-0.2, lim)
    plt.xlabel('ground truth value')
    plt.ylabel('predicted value')
    plt.title('Ground Truth v.s. Prediction')
    plt.show()
    


# **Preprocess**

We have three kinds of datasets:
* `train`: for training
* `dev`: for validation
* `test`: for testing (w/o target value)

In [3]:
attri_data = pd.read_csv('../LiterallyWikidata/files_needed/numeric_literals_final_ver04',sep='\t')

In [4]:
attri_data

Unnamed: 0,e,a,rescale_v
0,Q100,P1082,1.483085e-04
1,Q100,P2044,1.032598e-08
2,Q100,P2046,5.574606e-02
3,Q100,P571,1.630000e+03
4,Q100,P625_Latitude,4.235833e+01
...,...,...,...
296489,Q99987,P1335_Longtiude,9.564391e+00
296490,Q99987,P2044,6.411716e-08
296491,Q99987,P2046,9.941761e-04
296492,Q99987,P625_Latitude,4.571767e+01


In [5]:
x = attri_data[['e','a']].to_numpy()
y = attri_data['rescale_v'].to_numpy()

In [6]:
# ## constraint needed:
# pop_idx = dict_all_2_idx['P1082']
# gdp = dict_all_2_idx['P4010']
# nominal_gdp = dict_all_2_idx['P2131']
# nominal_gdp_per = dict_all_2_idx['P2132']
# gdp_per = dict_all_2_idx['P2299']
# date_of_birth = dict_all_2_idx['P569']
# date_of_death = dict_all_2_idx['P570']
# area = ['P2046']
# # net_profit = dict_all_2_idx['P2295']
# # retirement_age = dict_all_2_idx['P3001']
# # age_of_majority = dict_all_2_idx['P2997']
# # work_start = dict_all_2_idx['P2031']
# # work_end = dict_all_2_idx['P2032']

NameError: name 'dict_all_2_idx' is not defined

In [7]:
## Load pretrain embedding
emb_ent = torch.load('../LiterallyWikidata/files_needed/pretrained_kge/pretrained_complex_entemb.pt')
list_ent_ids =[]
with open('../LiterallyWikidata/files_needed/list_ent_ids.txt','r') as f:
    for line in f:
        list_ent_ids.append(line.strip())
## Preparing ent embedding
ent2idx = {e:i for i,e in enumerate(list_ent_ids)}
attri_data['ent_idx']= attri_data['e'].map(ent2idx)
embedding_e = torch.nn.Embedding.from_pretrained(emb_ent)
input_e = torch.LongTensor(attri_data['ent_idx'].to_numpy())

entity_embedding = embedding_e(input_e)
## Preparing att embedding
att2idx = {a:i for i,a in enumerate(attri_data['a'].unique())}
attri_data['a_idx']=attri_data['a'].map(att2idx)
embedding_a = torch.nn.Embedding(len(attri_data['a'].unique()),128,padding_idx=0)
input_a = torch.LongTensor(attri_data['a_idx'].to_numpy())

attribute_embedding = embedding_a(input_a)
## concat two embedding
x_data = torch.cat([entity_embedding,attribute_embedding],dim=1).detach().numpy()



In [8]:
from sklearn.model_selection import train_test_split
X_trainset, X_testset, y_trainset, y_testset = train_test_split(x_data, y,test_size=0.2, random_state=802)

In [9]:
x_data.shape

(296494, 256)

## **Dataset**


In [10]:
class KGMTL_Data(Dataset):
    '''
    x: Features.
    y: Targets, if none, do prediction.
    '''
    def __init__(self, x, y=None):
        if y is None:
            self.y = y
        else:
            self.y = torch.FloatTensor(y)
        self.x = torch.FloatTensor(x)
        

    def __getitem__(self, idx):
        if self.y is None:
            return self.x[idx]
        else:
            return self.x[idx], self.y[idx]

    def __len__(self):
        return len(self.x)


    

# **Setup Hyper-parameters**

`config` contains hyper-parameters for training and the path to save your model.

In [11]:
device = get_device()                 # get the current available device ('cpu' or 'cuda')
os.makedirs('models', exist_ok=True)  # The trained model will be saved to ./models/
target_only = False                   # TODO: Using 40 states & 2 tested_positive features

# TODO: How to tune these hyper-parameters to improve your model's performance?
config = {
    'n_epochs': 5,                # maximum number of epochs
    'batch_size': 200,               # mini-batch size for dataloader
    'learning_rate':0.001,
    'early_stop': 200,               # early stopping epochs (the number epochs since your model's last improvement)
    'save_path': 'models/model.pth' , # your model will be saved here
    'valid_ratio': 0.1,   # validation_size = train_size * valid_ratio
}


## **DataLoader**

A `DataLoader` loads data from a given `Dataset` into batches.


In [12]:
train_set =KGMTL_Data(X_trainset,y_trainset)
valid_set =KGMTL_Data(X_testset,y_testset)
train_loader = DataLoader(train_set, batch_size=config['batch_size'], shuffle=True, pin_memory=True)
valid_loader = DataLoader(valid_set, batch_size=config['batch_size'], shuffle=True, pin_memory=True)


# **Deep Neural Network**

`NeuralNet` is an `nn.Module` designed for regression.
The DNN consists of 2 fully-connected layers with ReLU activation.
This module also included a function `cal_loss` for calculating loss.


In [13]:
class NeuralNet(nn.Module):
    ''' A simple fully-connected deep neural network '''
    def __init__(self, input_dim):
        super(NeuralNet, self).__init__()

        # Define your neural network here
        # TODO: How to modify this model to achieve better performance?
        self.net = nn.Sequential(
            nn.Linear(input_dim, 64),
            nn.ReLU(),
            nn.Linear(64, 1)
        )

        # Mean squared error loss
        self.criterion = nn.MSELoss(reduction='mean')

    def forward(self, x):
        ''' Given input of size (batch_size x input_dim), compute output of the network '''
        return self.net(x).squeeze(1)

    def cal_loss(self, pred, target):
        ''' Calculate loss '''
        # TODO: you may implement L1/L2 regularization here
        return torch.sqrt(self.criterion(pred, target))

# **Train/Dev/Test**

## **Training**

In [22]:
def train(tr_set, dv_set, model, config, device):
    ''' DNN training '''

    n_epochs = config['n_epochs']  # Maximum number of epochs

    # Setup optimizer
    optimizer = torch.optim.SGD(model.parameters(), lr=config['learning_rate'], momentum=0.9, weight_decay=1e-6) 

    min_mse = 10.**20
    loss_record = {'train': [], 'dev': []}      # for recording training loss
    early_stop_cnt = 0
    epoch = 0

    
    for epoch in range(n_epochs):
        model.train() 
        
        # tqdm is a package to visualize your training progress.
        train_pbar = tqdm(train_loader, position=0, leave=True)
        
        # set model to training mode
        for x, y in train_pbar:                     # iterate through the dataloader
            optimizer.zero_grad()               # set gradient to zero
            x, y = x.to(device), y.to(device)   # move data to device (cpu/cuda)
            pred = model(x)                     # forward pass (compute output)
            mse_loss = model.cal_loss(pred, y)  # compute loss
            mse_loss.backward()                 # compute gradient (backpropagation)
            optimizer.step()                    # update model with optimizer
            loss_record['train'].append(mse_loss.detach().cpu().item())
        print(np.mean(loss_record['train']))

        # After each epoch, test your model on the validation (development) set.
        dev_mse = dev(dv_set, model, device)
        print(dev_mse)
        if dev_mse < min_mse:
            # Save model if your model improved
            min_mse = dev_mse
            print('Saving model (epoch = {:4d}, loss = {:.4f})'
                .format(epoch + 1, min_mse))
            torch.save(model.state_dict(), config['save_path'])  # Save model to specified path
            early_stop_cnt = 0
        else:
            early_stop_cnt += 1

        epoch += 1
        loss_record['dev'].append(dev_mse)
        if early_stop_cnt > config['early_stop']:
            # Stop training if your model stops improving for "config['early_stop']" epochs.
            break

    print('Finished training after {} epochs'.format(epoch))
    return min_mse, loss_record

## **Validation**

In [28]:
def dev(dv_set, model, device):
    model.eval()                                # set model to evalutation mode
    total_loss = 0
    for x, y in dv_set:                         # iterate through the dataloader
        x, y = x.to(device), y.to(device)       # move data to device (cpu/cuda)
        with torch.no_grad():                   # disable gradient calculation
            pred = model(x)# forward pass (compute output)
            #print(pred)
            mse_loss = model.cal_loss(pred, y)  # compute loss
            print(mse_loss)
        total_loss += mse_loss.detach().cpu().item() * len(x)  # accumulate loss
    total_loss = total_loss / len(dv_set.dataset)              # compute averaged loss

    return total_loss

## **Testing**

In [25]:
def test(tt_set, model, device):
    model.eval()                                # set model to evalutation mode
    preds = []
    for x in tt_set:                            # iterate through the dataloader
        x = x.to(device)                        # move data to device (cpu/cuda)
        with torch.no_grad():                   # disable gradient calculation
            pred = model(x)                     # forward pass (compute output)
            preds.append(pred.detach().cpu())   # collect prediction
    preds = torch.cat(preds, dim=0).numpy()     # concatenate all predictions and convert to a numpy array
    return preds

# **Load data and model**

In [26]:
model = NeuralNet(256).to(device)  # Construct model and move to device

# **Start Training!**

In [29]:
model_loss, model_loss_record = train(train_loader, valid_loader, model, config, device)

100%|██████████████████████████████████████████████████| 1186/1186 [00:02<00:00, 425.48it/s]


27206794380737.605
tensor(56.9351, device='cuda:0')
tensor(34.1710, device='cuda:0')
tensor(53.2306, device='cuda:0')
tensor(140.1584, device='cuda:0')
tensor(65.0979, device='cuda:0')
tensor(36.5466, device='cuda:0')
tensor(38.7197, device='cuda:0')
tensor(22.6992, device='cuda:0')
tensor(118.2332, device='cuda:0')
tensor(39.6474, device='cuda:0')
tensor(71.5075, device='cuda:0')
tensor(25.0179, device='cuda:0')
tensor(45.1396, device='cuda:0')
tensor(36.4636, device='cuda:0')
tensor(28.6648, device='cuda:0')
tensor(37.8877, device='cuda:0')
tensor(42.0716, device='cuda:0')
tensor(22.2514, device='cuda:0')
tensor(66.4389, device='cuda:0')
tensor(67.0015, device='cuda:0')
tensor(39.7849, device='cuda:0')
tensor(58.4328, device='cuda:0')
tensor(36.0650, device='cuda:0')
tensor(38.1333, device='cuda:0')
tensor(23.1611, device='cuda:0')
tensor(195.8438, device='cuda:0')
tensor(29.1927, device='cuda:0')
tensor(51.6525, device='cuda:0')
tensor(35.4155, device='cuda:0')
tensor(140.8380, devi

100%|██████████████████████████████████████████████████| 1186/1186 [00:02<00:00, 472.34it/s]


27206794380737.125
tensor(67.7863, device='cuda:0')
tensor(44.0201, device='cuda:0')
tensor(56.3329, device='cuda:0')
tensor(45.0872, device='cuda:0')
tensor(136.4607, device='cuda:0')
tensor(40.4640, device='cuda:0')
tensor(31.4957, device='cuda:0')
tensor(47.4053, device='cuda:0')
tensor(27.8939, device='cuda:0')
tensor(41.8816, device='cuda:0')
tensor(32.7508, device='cuda:0')
tensor(72.4392, device='cuda:0')
tensor(33.6549, device='cuda:0')
tensor(34.8303, device='cuda:0')
tensor(80.1100, device='cuda:0')
tensor(35.4107, device='cuda:0')
tensor(33.4964, device='cuda:0')
tensor(19.8629, device='cuda:0')
tensor(26.5780, device='cuda:0')
tensor(55.2108, device='cuda:0')
tensor(78.4699, device='cuda:0')
tensor(93.3876, device='cuda:0')
tensor(36.0953, device='cuda:0')
tensor(55.8108, device='cuda:0')
tensor(24.3825, device='cuda:0')
tensor(40.3903, device='cuda:0')
tensor(37.5397, device='cuda:0')
tensor(31.2360, device='cuda:0')
tensor(719.7882, device='cuda:0')
tensor(30.0808, device

100%|██████████████████████████████████████████████████| 1186/1186 [00:02<00:00, 468.88it/s]


27206794380736.8
tensor(59.5055, device='cuda:0')
tensor(41.4490, device='cuda:0')
tensor(108.2228, device='cuda:0')
tensor(52.1941, device='cuda:0')
tensor(43.3926, device='cuda:0')
tensor(24.6522, device='cuda:0')
tensor(198.8937, device='cuda:0')
tensor(59.8032, device='cuda:0')
tensor(28.5866, device='cuda:0')
tensor(30.0417, device='cuda:0')
tensor(93.0155, device='cuda:0')
tensor(49.0789, device='cuda:0')
tensor(22.7965, device='cuda:0')
tensor(29.1715, device='cuda:0')
tensor(44.6015, device='cuda:0')
tensor(148.3117, device='cuda:0')
tensor(22.7449, device='cuda:0')
tensor(75.3102, device='cuda:0')
tensor(26.9894, device='cuda:0')
tensor(24.6029, device='cuda:0')
tensor(25.7312, device='cuda:0')
tensor(29.2029, device='cuda:0')
tensor(33.6259, device='cuda:0')
tensor(28.2224, device='cuda:0')
tensor(133.7465, device='cuda:0')
tensor(42.3293, device='cuda:0')
tensor(42.0162, device='cuda:0')
tensor(38.9887, device='cuda:0')
tensor(27.2299, device='cuda:0')
tensor(50.9995, device

100%|██████████████████████████████████████████████████| 1186/1186 [00:02<00:00, 444.59it/s]


27206794380736.37
tensor(27.4895, device='cuda:0')
tensor(25.4391, device='cuda:0')
tensor(391.1062, device='cuda:0')
tensor(26.4285, device='cuda:0')
tensor(166.1151, device='cuda:0')
tensor(22.3322, device='cuda:0')
tensor(126.7725, device='cuda:0')
tensor(44.0218, device='cuda:0')
tensor(49.9444, device='cuda:0')
tensor(36.3035, device='cuda:0')
tensor(22.3368, device='cuda:0')
tensor(82.2035, device='cuda:0')
tensor(24.4519, device='cuda:0')
tensor(41.3500, device='cuda:0')
tensor(250.4832, device='cuda:0')
tensor(36.5689, device='cuda:0')
tensor(21.5009, device='cuda:0')
tensor(25.1881, device='cuda:0')
tensor(40.0963, device='cuda:0')
tensor(31.5904, device='cuda:0')
tensor(45.5168, device='cuda:0')
tensor(28.4727, device='cuda:0')
tensor(30.2480, device='cuda:0')
tensor(28.9182, device='cuda:0')
tensor(197.8361, device='cuda:0')
tensor(35.4887, device='cuda:0')
tensor(43.2872, device='cuda:0')
tensor(17.6428, device='cuda:0')
tensor(33.1286, device='cuda:0')
tensor(32.0235, devi

100%|██████████████████████████████████████████████████| 1186/1186 [00:02<00:00, 472.51it/s]


27206794380736.027
tensor(52.2849, device='cuda:0')
tensor(30.2561, device='cuda:0')
tensor(88.2066, device='cuda:0')
tensor(132.7478, device='cuda:0')
tensor(58.2039, device='cuda:0')
tensor(29.8984, device='cuda:0')
tensor(62.4203, device='cuda:0')
tensor(26.4327, device='cuda:0')
tensor(43.8363, device='cuda:0')
tensor(34.5208, device='cuda:0')
tensor(23.1599, device='cuda:0')
tensor(48.7301, device='cuda:0')
tensor(59.1352, device='cuda:0')
tensor(115.0720, device='cuda:0')
tensor(29.6410, device='cuda:0')
tensor(18.5002, device='cuda:0')
tensor(101.6867, device='cuda:0')
tensor(114.9152, device='cuda:0')
tensor(32.0956, device='cuda:0')
tensor(31.5888, device='cuda:0')
tensor(35.5929, device='cuda:0')
tensor(40.4319, device='cuda:0')
tensor(18.7632, device='cuda:0')
tensor(65175.5781, device='cuda:0')
tensor(33.8615, device='cuda:0')
tensor(26.4582, device='cuda:0')
tensor(28.5492, device='cuda:0')
tensor(24.4272, device='cuda:0')
tensor(22.3088, device='cuda:0')
tensor(24.6820, d

In [None]:
plot_learning_curve(model_loss_record, title='deep model')

In [21]:
model_loss

1e+20

In [None]:
del model
model = NeuralNet(tr_set.dataset.dim).to(device)
ckpt = torch.load(config['save_path'], map_location='cpu')  # Load your best model
model.load_state_dict(ckpt)
plot_pred(dv_set, model, device)  # Show prediction on the validation set

# **Testing**
The predictions of your model on testing set will be stored at `pred.csv`.

In [None]:
def save_pred(preds, file):
    ''' Save predictions to specified file '''
    print('Saving results to {}'.format(file))
    with open(file, 'w') as fp:
        writer = csv.writer(fp)
        writer.writerow(['id', 'tested_positive'])
        for i, p in enumerate(preds):
            writer.writerow([i, p])

preds = test(valid_loader, model, device)  # predict COVID-19 cases with your model
print('pred')         # save prediction file to pred.csv

# **Hints**

## **Simple Baseline**
* Run sample code

## **Medium Baseline**
* Feature selection: 40 states + 2 `tested_positive` (`TODO` in dataset)

## **Strong Baseline**
* Feature selection (what other features are useful?)
* DNN architecture (layers? dimension? activation function?)
* Training (mini-batch? optimizer? learning rate?)
* L2 regularization
* There are some mistakes in the sample code, can you find them?

# **Reference**
This code is completely written by Heng-Jui Chang @ NTUEE.  
Copying or reusing this code is required to specify the original author. 

E.g.  
Source: Heng-Jui Chang @ NTUEE (https://github.com/ga642381/ML2021-Spring/blob/main/HW01/HW01.ipynb)
