# **Import Some Packages**

In [1]:
# PyTorch
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, random_split

# For data preprocess
import pandas as pd
import numpy as np
import csv
import os

from tqdm import tqdm

import math
from torch.utils.tensorboard import SummaryWriter
# For plotting
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
    
from sklearn.preprocessing import StandardScaler

# **Some Utilities**

You do not need to modify this part.

In [2]:
def get_device():
    ''' Get device (if GPU is available, use GPU) '''
    return 'cuda' if torch.cuda.is_available() else 'cpu'

def plot_learning_curve(loss_record, title=''):
    ''' Plot learning curve of your DNN (train & dev loss) '''
    total_steps = len(loss_record['mean_train_loss'])
    x_1 = range(total_steps)
    x_2 = x_1[::len(loss_record['mean_train_loss']) // len(loss_record['mean_valid_loss'])]
    figure(figsize=(6, 4))
    plt.plot(x_1, loss_record['mean_train_loss'], c='tab:red', label='train')
    plt.plot(x_2, loss_record['mean_valid_loss'], c='tab:cyan', label='dev')
    plt.ylim(-1,6)
    plt.xlabel('Training steps')
    plt.ylabel('MSE loss')
    plt.title('Learning curve of {}'.format(title))
    plt.legend()
    plt.show()


def plot_pred(dv_set, model, device, lim=35., preds=None, targets=None):
    ''' Plot prediction of your DNN '''
    if preds is None or targets is None:
        model.eval()
        preds, targets = [], []
        for x, y in dv_set:
            x, y = x.to(device), y.to(device)
            with torch.no_grad():
                pred = model(x)
                preds.append(pred.detach().cpu())
                targets.append(y.detach().cpu())
        preds = torch.cat(preds, dim=0).numpy()
        targets = torch.cat(targets, dim=0).numpy()

    figure(figsize=(5, 5))
    plt.scatter(targets, preds, c='r', alpha=0.5)
    plt.plot([-0.2, lim], [-0.2, lim], c='b')
    plt.xlim(-0.2, lim)
    plt.ylim(-0.2, lim)
    plt.xlabel('ground truth value')
    plt.ylabel('predicted value')
    plt.title('Ground Truth v.s. Prediction')
    plt.show()
    

In [3]:
def same_seed(seed): 
    '''Fixes random number generator seeds for reproducibility.'''
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

def train_valid_split(data_set, valid_ratio, seed):
    '''Split provided training data into training set and validation set'''
    valid_set_size = int(valid_ratio * len(data_set)) 
    train_set_size = len(data_set) - valid_set_size
    train_set, valid_set = random_split(data_set, [train_set_size, valid_set_size], generator=torch.Generator().manual_seed(seed))
    return np.array(train_set), np.array(valid_set)

def predict(test_loader, model, device):
    model.eval() # Set your model to evaluation mode.
    preds = []
    for x in tqdm(test_loader):
        x = x.to(device)                        
        with torch.no_grad():                   
            pred = model(x)                     
            preds.append(pred.detach().cpu())   
    preds = torch.cat(preds, dim=0).numpy()  
    return preds

In [4]:
attri_data = pd.read_csv('../LiterallyWikidata/files_needed/train_attri_data_minmax.csv')
attri_data_valid = pd.read_csv('../LiterallyWikidata/files_needed/valid_attri_data_minmax.csv')

In [11]:
attri_data['name_a'].value_counts()[:10]

coordinate location(latitude)        19961
coordinate location(logtitude)       19961
area                                 17401
population                           17157
elevation above sea level            13134
date of birth                        12746
inception                             9742
mass                                  7127
coordinates of northernmost point     6830
coordinates of easternmost point      6802
Name: name_a, dtype: int64

In [13]:
attri_data[attri_data['e']=='Q9682']

Unnamed: 0,e,a,v,name_e,name_a,ent_type,minmax
143668,Q9682,P569,1926.0,Elizabeth II,date of birth,Q5,0.963184
155906,Q9682,P2048,1.63,Elizabeth II,height,Q5,0.000159


count    6227.000000
mean        5.328110
std       126.105278
min         0.077000
25%         1.780000
50%         1.830000
75%         1.910000
max      9753.600000
Name: v, dtype: float64

In [18]:
height = attri_data[attri_data['a']=="P2048"]
height[height.v==0.077]

Unnamed: 0,e,a,v,name_e,name_a,ent_type,minmax
170142,Q660841,P2048,0.077,Rose Trellis,height,Q331225,0.0


In [51]:
attri_data_valid=attri_data_valid[['e','a','v']]

In [52]:
# 用kgeemb順序
ent2idx ={}
with open('../LiterallyWikidata/files_needed/list_ent_ids.txt','r') as fr:
    for i, word in enumerate(fr.readlines()):
        ent2idx[word.strip()] = i


#attri_data_std_v = attri_data[['e','a','new_stdv']]
attri_data_train = attri_data[['e','a','v']]
#attri_data_valid = attri_data[['e','a','v']]
# att2idx = {}
# #rel2idx = {v:k for k,v in enumerate(relations['label'].unique())}

# with open('../LiterallyWikidata/files_needed/attribute.txt','r') as fr:
#     for i, word in enumerate(fr.readlines()):
#         att2idx[word.strip()] = i
        
att2idx = {v:k for k,v in enumerate(attri_data['a'].unique())}

In [None]:
# attri_data['a_idx']=attri_data['a'].map(att2idx)
# attri_data['e_idx']=attri_data['e'].map(ent2idx)

In [64]:
attri_data_eav

Unnamed: 0,e,a,v
0,Q111349,P1332_Longtiude,10.397868
1,Q51413,P625_Longtiude,13.859156
2,Q7382874,P569,1926.000000
3,Q213459,P2067,73.000000
4,Q498680,P619,1962.000000
...,...,...,...
237041,Q342419,P2031,1992.000000
237042,Q725787,P2067,86.182550
237043,Q43067,P570,1945.000000
237044,Q810317,P582,1977.000000


In [59]:
def numeric_literal_array(data, ent2idx, att2idx):
    #'LiterallyWikidata/LitWD48K/train_attri_data'
    df_all = data

    # Resulting file
    num_lit = np.zeros([len(ent2idx), len(att2idx)],dtype=np.float32)

# Create literal wrt vocab
    for i, (s, p, lit) in enumerate(df_all.values):
        try:
            num_lit[ent2idx[s], att2idx[p]] = lit
        except KeyError:
            continue
    return num_lit


# num_lit shape (47998, 86)


In [60]:
num_lit = numeric_literal_array(attri_data_train, ent2idx, att2idx)
print(num_lit.shape)

(47998, 86)


In [None]:
num_lit

In [None]:
#num_lit_stdv = numeric_literal_array(attri_data_std_v, ent2idx, att2idx)
#print(num_lit_stdv.shape)

In [18]:
# ## constraint needed:
pop_idx = att2idx['P1082']
gdp = att2idx['P4010']
nominal_gdp = att2idx['P2131']
# nominal_gdp_per = att2idx['P2132']
gdp_per = att2idx['P2299']
# date_of_birth = att2idx['P569']
# date_of_death = att2idx['P570']
# area = ['P2046']
# # net_profit = att2idx['P2295']
# # retirement_age = att2idx['P3001']
# # age_of_majority = att2idx['P2997']
# # work_start = att2idx['P2031']
# # work_end = att2idx['P2032']

In [None]:
gdp

In [None]:
#attri_data[attri_data['a']=='P4010']

In [None]:
num_lit[84][gdp]/num_lit[84][gdp_per],num_lit[84][pop_idx]

In [61]:
# x_list: ent的pop有值，把除了pop那個值之外的值存到inner_x_list，len(x_list)是有幾組變數有值
# normalized or non-normalized
x_list=[]

for ent in num_lit:
    if ent[pop_idx] == 0:
        pass
    else:
        inner_x_list=[]
        for j in range(len(ent)):
            if j != pop_idx :
                inner_x_list.append(ent[j])
        inner_x_list.append(ent[pop_idx])
        x_list.append(inner_x_list)


In [75]:

def create_x_list(var_idx,num_lit):
    x_list=[]

    for ent in num_lit:
        if ent[var_idx] == 0:
            pass
        else:
            inner_x_list=[]
            for j in range(len(ent)):
                if j != var_idx :
                    inner_x_list.append(ent[j])
            inner_x_list.append(ent[var_idx])
            x_list.append(inner_x_list)
    return x_list

In [76]:
x_list_train=create_x_list(pop_idx)


In [78]:
len(x_list_train)

17075

In [56]:
x_list[1][85]
#len(x_list[1])

97385.0

In [30]:
# y 
select_feature = []
for i in range(len(x_list[1])-1):
    if x_list[0][i] !=0:
        select_feature.append(i)

In [31]:
select_feature


[0, 1, 5, 6, 7, 13, 14, 15, 16, 25, 33, 38, 48, 55, 70, 72, 84]

In [None]:
#[list(att2idx.keys())[list(att2idx.values()).index(i)] for i in select_feature]

In [20]:
#data=pd.DataFrame(x_list,columns=list(range(len(x_list[0]))))

In [21]:
# data
#data.loc[:,(data!=0).any(axis=0)]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,76,77,78,79,80,81,82,83,84,85
0,-156.479996,-98.579498,0.0,0.0,0.0,-97.394829,-168.118286,39.828175,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,120.0,325145952.0
1,0.000000,-105.292778,0.0,0.0,0.0,0.000000,0.000000,40.019444,1655.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,97385.0
2,-77.041000,-77.036667,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,705749.0
3,0.000000,13.520207,0.0,0.0,0.0,0.000000,0.000000,49.231163,472.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,11032.0
4,0.000000,-0.653611,0.0,0.0,0.0,0.000000,0.000000,45.028057,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1375.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17070,0.000000,-80.220001,0.0,0.0,0.0,0.000000,0.000000,33.660000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,34355.0
17071,0.000000,0.381389,0.0,0.0,0.0,0.000000,0.000000,43.882221,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1135.0
17072,0.000000,13.528611,0.0,0.0,0.0,0.000000,0.000000,45.997776,196.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5632.0
17073,0.000000,4.233889,0.0,0.0,0.0,0.000000,0.000000,44.184723,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,902.0


In [None]:
# y_list=list()
# for ent2 in num_lit:
#     if ent2[gdp] ==0:
#         pass
#     else:
#         y_list.append(ent2[gdp])

In [None]:
# for i in range(len(x_list)):
#     inner_x = x_list[i]
#     inner_x.append(y_list[i])
# x_list.append(inner_x)


In [62]:
attri_data_train[attri_data_train['a']=='P1082'].describe()

Unnamed: 0,v
count,17157.0
mean,997784.6
std,33624580.0
min,0.0
25%,1250.0
50%,6576.0
75%,45670.0
max,4164252000.0


In [13]:
attri_data[attri_data['a']=='P1082']

Unnamed: 0,e,a,v,name_e,name_a,ent_type,new_stdv
17,Q25160,P1082,789.0,Olcenengo,population,Q747074,-0.029652
18,Q17296,P1082,769.0,Camino,population,Q747074,-0.029652
26,Q23243,P1082,756.0,"Bee, Piedmont",population,Q747074,-0.029653
46,Q591156,P1082,85930.0,St. Croix County,population,Q12178928,-0.027119
47,Q44835372,P1082,3289.0,Rio,population,Q747074,-0.029577
...,...,...,...,...,...,...,...
237007,Q46665,P1082,17879.0,Villorba,population,Q747074,-0.029143
237009,Q50083,P1082,280.0,Civitella Alfedena,population,Q747074,-0.029667
237020,Q113438,P1082,3570.0,Vågå,population,Q755707,-0.029569
237039,Q17780,P1082,162.0,Castelletto Molina,population,Q747074,-0.029670


In [63]:
def select_feat(train_data, valid_data, select_all=True):
    '''Selects useful features to perform regression'''

    sc = StandardScaler()
    train_data = sc.fit_transform(train_data)
    valid_data = sc.transform(valid_data)
    
    y_train, y_valid = train_data[:,-1], valid_data[:,-1]
    raw_x_train, raw_x_valid = train_data[:,:-1], valid_data[:,:-1]
    

    if select_all:
        feat_idx = list(range(raw_x_train.shape[1]))
    else:
        feat_idx = select_feature # TODO: Select suitable feature columns.
        
    return raw_x_train[:,feat_idx], raw_x_valid[:,feat_idx], y_train, y_valid

In [33]:
class KGMTL_Data(Dataset):
    '''
    x: Features.
    y: Targets, if none, do prediction.
    '''
    def __init__(self, x, y=None):
        if y is None:
            self.y = y
        else:
            self.y = torch.FloatTensor(y)
        self.x = torch.FloatTensor(x)
        

    def __getitem__(self, idx):
        if self.y is None:
            return self.x[idx]
        else:
            return self.x[idx], self.y[idx]

    def __len__(self):
        return len(self.x)


    

In [55]:
train_data 

array([['Q111349', 'P1332_Longtiude', 10.397868, ...,
        'coordinates of northernmost point', 'Q747074',
        -0.0796088333226554],
       ['Q51413', 'P625_Longtiude', 13.859156, ...,
        'coordinate location(logtitude)', 'Q747074', 0.2003265611430018],
       ['Q7382874', 'P569', 1926.0, ..., 'date of birth', 'Q5',
        -0.0840688844160978],
       ...,
       ['Q43067', 'P570', 1945.0, ..., 'date of death', 'Q5',
        0.0923617917975669],
       ['Q810317', 'P582', 1977.0, ..., 'end time', 'Q27020041',
        -0.4844878349514918],
       ['Q801206', 'P1619', 1889.0, ..., 'date of official opening',
        'Q55488', -0.7155432573543784]], dtype=object)

In [35]:
x_list[0][-1]

325145950.0

In [None]:
raw_x_train = train_data[:,:-1]

In [None]:
raw_x_train[0][84]

In [58]:
attri_data_valid.values

array([['Q977964', 'P2047', 1110004.0],
       ['Q5847390', 'P2044', 564.0],
       ['Q29367', 'P625_Longtiude', 12.3243],
       ...,
       ['Q8034527', 'P1540', 743.0],
       ['Q1143978', 'P1418', 129.0],
       ['Q669003', 'P1082', 664.0]], dtype=object)

In [54]:
# Set seed for reproducibility
same_seed(config['seed'])


# train_data size: 2699 x 118 (id + 37 states + 16 features x 5 days) 
# test_data size: 1078 x 117 (without last day's positive rate)
train_data, valid_data = x_list_train, x_list_valid
#train_data, valid_data = train_valid_split(x_list, config['valid_ratio'], config['seed'])

# Print out the data size.
print(f"""train_data size: {train_data.shape} 
valid_data size: {valid_data.shape} """)
# test_data size: {test_data.shape}""")


# Select features
x_train, x_valid, y_train, y_valid = select_feat(train_data, valid_data, config['select_all'])

# Print out the number of features.
print(f'number of features: {x_train.shape[1]}')

train_dataset, valid_dataset = KGMTL_Data(x_train, y_train), \
                                            KGMTL_Data(x_valid, y_valid)

print('train_dataset', train_dataset[0])

# Pytorch data loader loads pytorch dataset into batches.
train_loader = DataLoader(train_dataset, batch_size=config['batch_size'], shuffle=True, pin_memory=True)
valid_loader = DataLoader(valid_dataset, batch_size=config['batch_size'], shuffle=False, pin_memory=True)
# test_loader = DataLoader(test_dataset, batch_size=config['batch_size'], shuffle=False, pin_memory=True)

train_data size: (237046, 7) 
valid_data size: (29631, 3) 


ValueError: could not convert string to float: 'Q111349'

In [None]:
train_dataset[6]

In [None]:
class NeuralNet(nn.Module):
    ''' A simple fully-connected deep neural network '''
    def __init__(self, input_dim):
        super(NeuralNet, self).__init__()

        self.layers = nn.Sequential(
            nn.Linear(input_dim, 10),
            nn.ReLU(),
            nn.Dropout(0.5),
#             nn.Linear(64,32),
#             nn.ReLU(),
            nn.Linear(10, 1),
            nn.Tanh()
        )

        # Mean squared error loss
        self.criterion = nn.MSELoss(reduction='mean')

    def forward(self, x):
        ''' Given input of size (batch_size x input_dim), compute output of the network '''
        x = self.layers(x)
        x = x.squeeze(1)
        return x

    def cal_loss(self, pred, target):
        ''' Calculate loss '''
        # TODO: you may implement L1/L2 regularization here
        return self.criterion(pred, target)

# **Preprocess**

We have three kinds of datasets:
* `train`: for training
* `dev`: for validation
* `test`: for testing (w/o target value)

In [None]:
loss_record={'train': [], 'dev': [],'mean_train_loss':[],'mean_valid_loss':[]} 

def trainer(train_loader, valid_loader, model, config, device):

    criterion = nn.MSELoss(reduction='mean') # Define your loss function, do not modify this.

    # Define your optimization algorithm. 
    # TODO: Please check https://pytorch.org/docs/stable/optim.html to get more available algorithms.
    # TODO: L2 regularization (optimizer(weight decay...) or implement by your self).
    optimizer = torch.optim.SGD(model.parameters(), lr=config['learning_rate'], momentum=0.9, weight_decay=1e-6) 

    n_epochs, best_loss, step, early_stop_count = config['n_epochs'], math.inf, 0, 0

    writer = SummaryWriter() # Writer of tensoboard
#     if not os.path.isdir('./models_var'):
#         os.mkdir('./models_var') # Create directory of saving models.
    
    for epoch in range(n_epochs):
        model.train() # Set your model to train mode.

        # tqdm is a package to visualize your training progress.
        train_pbar = tqdm(train_loader, position=0, leave=True)

        for x, y in train_pbar:
            optimizer.zero_grad()               # Set gradient to zero.
            
            x, y = x.to(device), y.to(device)   # Move your data to device. 
            pred = model(x)   
            print(f'-------predict: {pred}, y: {y}----------') 
            #x_constraint = torch.tensor([ (y[i] - x[i][0]*x[i][18]) ** 2 for i in range(len(x))])
            #x_constraint = torch.tensor([x[i][pop_idx]*x[i][gdp_per] for i in range(len(x))])
            #print(x_constraint)
            #x_constraint = x_constraint.to(device)          
            #loss = criterion(pred, y) + criterion(pred, x_constraint)
            loss = criterion(pred, y) 
            # criterion(pred,x_constraint)
                # ((pred-x[0]*x[18])**2) 

            loss.backward()                     # Compute gradient(backpropagation).
            optimizer.step()                    # Update parameters.
            step += 1
            loss_record["train"].append(loss.detach().item())
            
            # Display current epoch number and loss on tqdm progress bar.
            train_pbar.set_description(f'Epoch [{epoch+1}/{n_epochs}]')
            train_pbar.set_postfix({'loss': loss.detach().item()})

        mean_train_loss = sum(loss_record["train"])/len(loss_record["train"])
        writer.add_scalar('Loss/train', mean_train_loss, step)
        loss_record['mean_train_loss'].append(mean_train_loss)

        model.eval() # Set your model to evaluation mode.
        for x, y in valid_loader:
            x, y = x.to(device), y.to(device)
            with torch.no_grad():
                pred = model(x)
                # print(f'x: {x}')
                loss = criterion(pred, y)

            loss_record["dev"].append(loss.item())
            
        mean_valid_loss = sum(loss_record["dev"])/len(loss_record["dev"])
        print(f'Epoch [{epoch+1}/{n_epochs}]: Train loss: {mean_train_loss:.4f}, Valid loss: {mean_valid_loss:.4f}')
        writer.add_scalar('Loss/valid', mean_valid_loss, step)
        loss_record['mean_valid_loss'].append(mean_valid_loss)

        if mean_valid_loss < best_loss:
            best_loss = mean_valid_loss
            torch.save(model.state_dict(), config['save_path']) # Save your best model
            print('Saving model with loss {:.3f}...'.format(best_loss))
            early_stop_count = 0
        else: 
            early_stop_count += 1

        if early_stop_count >= config['early_stop']:
            print('\nModel is not improving, so we halt the training session.')
            return

## **Validation**

In [None]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
def eval_matrics(y_test, y_pred):

    MSE = mean_squared_error(y_test, y_pred)
    print('MSE=',MSE)
    RMSE =np.sqrt(MSE)
    print('RMSE=',RMSE)
    MAE= mean_absolute_error(y_test, y_pred)
    print('MAE=',MAE)

    R2=1-MSE/np.var(y_test)
    print("R2=", R2)

## **Testing**

In [None]:
def test(tt_set, model, device):
    model.eval()                                # set model to evalutation mode
    preds = []; y_b=[]
    for x,y in tt_set:                            # iterate through the dataloader
        x ,y = x.to(device), y.to(device)                          # move data to device (cpu/cuda)
        with torch.no_grad():                   # disable gradient calculation
            pred = model(x)                     # forward pass (compute output)
            preds.append(pred.detach().cpu())
            y_b.append(y.detach().cpu())   # collect prediction
    preds = torch.cat(preds, dim=0).numpy().reshape(-1,1)     # concatenate all predictions and convert to a numpy array
    y_b= torch.cat(y_b,0).numpy().reshape(-1,1) 
    table  = np.concatenate((preds, y_b),axis=1)
    eval_matrics(y_b,preds)
    return table

In [38]:
device = get_device()                 # get the current available device ('cpu' or 'cuda')
#os.makedirs('models', exist_ok=True)  # The trained model will be saved to ./models/

# TODO: How to tune these hyper-parameters to improve your model's performance?
config = {
    'seed': 5201314,      # Your seed number, you can pick your lucky number. :)
    'select_all': False,   # Whether to use all features.
    'n_epochs': 50,                # maximum number of epochs
    'batch_size': 200,               # mini-batch size for dataloader
    'learning_rate':1e-3,
    'early_stop': 15,               # early stopping epochs (the number epochs since your model's last improvement)
    'save_path': './models_var/model_pop_no_cons.pt' , # your model will be saved here
    'valid_ratio': 0.1,   # validation_size = train_size * valid_ratio
}


# **Load data and model**

In [None]:
model = NeuralNet(input_dim=x_train.shape[1]).to(device)  # Construct model and move to device
print(model)

# **Start Training!**

In [None]:
trainer(train_loader, valid_loader, model, config, device)

In [None]:
loss_record["mean_valid_loss"]

In [None]:
plot_learning_curve(loss_record, title='deep model')

In [None]:
del model
model = NeuralNet(input_dim=x_train.shape[1]).to(device)
ckpt = torch.load(config['save_path'], map_location='cpu')  # Load your best model
model.load_state_dict(ckpt)

In [None]:
preds = test(valid_loader, model, device) 

In [None]:
valid_data

# **Testing**
The predictions of your model on testing set will be stored at `pred.csv`.

In [None]:
def save_pred(preds, file):
    ''' Save predictions to specified file '''
    print('Saving results to {}'.format(file))
    with open(file, 'w') as fp:
        writer = csv.writer(fp)
        writer.writerow(['id', 'tested_positive'])
        for i, p in enumerate(preds):
            writer.writerow([i, p])

preds = test(valid_loader, model, device)  # predict COVID-19 cases with your model
print('pred')         # save prediction file to pred.csv

# **Hints**

## **Simple Baseline**
* Run sample code

## **Medium Baseline**
* Feature selection: 40 states + 2 `tested_positive` (`TODO` in dataset)

## **Strong Baseline**
* Feature selection (what other features are useful?)
* DNN architecture (layers? dimension? activation function?)
* Training (mini-batch? optimizer? learning rate?)
* L2 regularization
* There are some mistakes in the sample code, can you find them?

# **Reference**
This code is completely written by Heng-Jui Chang @ NTUEE.  
Copying or reusing this code is required to specify the original author. 

E.g.  
Source: Heng-Jui Chang @ NTUEE (https://github.com/ga642381/ML2021-Spring/blob/main/HW01/HW01.ipynb)


In [None]:
## Load pretrain embedding
emb_ent = torch.load('../LiterallyWikidata/files_needed/pretrained_kge/pretrained_complex_entemb.pt')
list_ent_ids =[]
with open('../LiterallyWikidata/files_needed/list_ent_ids.txt','r') as f:
    for line in f:
        list_ent_ids.append(line.strip())
## Preparing ent embedding
ent2idx = {e:i for i,e in enumerate(list_ent_ids)}
attri_data['ent_idx']= attri_data['e'].map(ent2idx)
embedding_e = torch.nn.Embedding.from_pretrained(emb_ent)
input_e = torch.LongTensor(attri_data['ent_idx'].to_numpy())

entity_embedding = embedding_e(input_e)
## Preparing att embedding
# att2idx = {a:i for i,a in enumerate(attri_data['a'].unique())}
attri_data['a_idx']=attri_data['a'].map(att2idx)
embedding_a = torch.nn.Embedding(len(att2idx),128,padding_idx=0)
input_a = torch.LongTensor(attri_data['a'].to_numpy())

attribute_embedding = embedding_a(input_a)
## concat two embedding
x_data = torch.cat([entity_embedding,attribute_embedding],dim=1).detach().numpy()

