In [1]:
import torch
from torchvision import transforms, datasets
# Device that every tensor and the network in this notebook is moved to via `.to(device)`.
device = 'cpu'

import pandas as pd
import numpy as np

from sklearn import linear_model
from sklearn.model_selection import cross_val_score

# Seed torch's random number generators (CPU and CUDA) with one fixed value and
# request deterministic cuDNN kernels, so repeated runs start from the same state.
seed = 42069
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.backends.cudnn.deterministic = True

from tqdm import tqdm

In [2]:
# --- input file names ---
train_file = 'train_features.csv'
train_lab_file = 'train_labels.csv'
pretrain_file = 'pretrain_features.csv'
pretrain_lab_file = 'pretrain_labels.csv'
test_file = 'test_features.csv'

# --- raw tables: feature files are kept whole, label files contribute only
# their last column ---
df_x_pretrain = pd.read_csv(pretrain_file)
df_y_pretrain = pd.read_csv(pretrain_lab_file).iloc[:, -1]
df_x_train = pd.read_csv(train_file)
df_y_train = pd.read_csv(train_lab_file).iloc[:, -1]
df_x_test = pd.read_csv(test_file)

# Stack the three feature tables and ignore their first two columns, so the
# column selection below is decided on pretrain, train and test rows together.
df_features = pd.concat([df_x_pretrain, df_x_train, df_x_test]).iloc[:, 2:]
# delete features that are constant: a column survives only if it holds at
# least one value other than 1 and at least one value other than 0
df_features = df_features.loc[
    :, df_features.ne(1).any(axis=0) & df_features.ne(0).any(axis=0)
]
selected_features = df_features.columns

# Keep the test ids before the test table is reduced to the selected columns.
df_test_index = df_x_test['Id']
df_x_pretrain, df_x_train, df_x_test = (
    frame[selected_features] for frame in (df_x_pretrain, df_x_train, df_x_test)
)



In [3]:
# Pretraining inputs and targets as float32 tensors on `device`.
pretrain_x_t = torch.tensor(df_x_pretrain.values, dtype=torch.float32, device=device)
pretrain_y_t = torch.tensor(df_y_pretrain.values, dtype=torch.float32, device=device)

# normalize output data: centre on the mean and scale by the standard deviation
# (the small constant keeps the divisor away from zero)
pre_y_mean = pretrain_y_t.mean()
pre_y_std = pretrain_y_t.std()
pretrain_y_norm_t = pretrain_y_t.sub(pre_y_mean).div(pre_y_std + 1e-5)

# The inputs are used as features without further transformation.
pretrain_x_feature_t = pretrain_x_t

pretrain_ds = torch.utils.data.TensorDataset(pretrain_x_feature_t, pretrain_y_norm_t)
# split into train and validation set: 80% of the samples for training,
# the remainder for validation
pretrain_size = int(0.8 * len(pretrain_ds))
prevalid_size = len(pretrain_ds) - pretrain_size
pretrain_ds, pretrain_valid_ds = torch.utils.data.random_split(
    pretrain_ds, [pretrain_size, prevalid_size]
)

# Mini-batches of 128 for training; validation uses batches of up to 10001 samples.
pretrain_loader = torch.utils.data.DataLoader(
    pretrain_ds, batch_size=128, shuffle=True
)
prevalid_loader = torch.utils.data.DataLoader(
    pretrain_valid_ds, batch_size=10001, shuffle=True
)

In [4]:
class lumo_predictor(torch.nn.Module):
    """Fully connected network: a stack of hidden layers plus a linear regression head.

    Every hidden layer is applied as ``Linear -> ReLU -> Dropout(0.1)``. While
    ``do_regress`` is True (the default) the last hidden representation is
    mapped to a single output by ``reg_layer``; setting ``do_regress`` to False
    makes ``forward`` return the last hidden representation instead.

    Args:
        input_dim: number of input features per sample.
        hidden_dims: widths of the hidden layers, in order. Defaults to one
            hidden layer of 500 units.
    """

    def __init__(self, input_dim: int, hidden_dims=(500,)):
        super().__init__()

        layer_seq = [input_dim, *hidden_dims]

        # The head is created and registered BEFORE the hidden layers on purpose:
        # this fixes both the order in which random initial weights are drawn and
        # the order of `parameters()`, which saved optimizer states rely on.
        self.reg_layer = torch.nn.Linear(layer_seq[-1], 1)
        self.do_regress = True

        self.layer_fc = torch.nn.ModuleList(
            [
                torch.nn.Linear(fan_in, fan_out)
                for fan_in, fan_out in zip(layer_seq[:-1], layer_seq[1:])
            ]
        )
        self.activation_fn = torch.nn.ReLU()
        self.dropout_hid = torch.nn.Dropout(0.1)

        # Count weights and biases of the hidden layers and of the head.
        num_param = sum(p.numel() for p in self.parameters())
        print(f"{num_param} parameters")
        print(f"{len(self.layer_fc)} hidden layers")

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Run ``x`` through the hidden stack and, if ``do_regress``, the head.

        Returns:
            The output of ``reg_layer`` when ``do_regress`` is True, otherwise
            the activations of the last hidden layer.
        """
        result = x
        for func in self.layer_fc:
            result = func(result)
            result = self.activation_fn(result)
            result = self.dropout_hid(result)
        if self.do_regress:
            result = self.reg_layer(result)

        return result
            
# Network whose input width equals the number of pretraining feature columns.
model = lumo_predictor(pretrain_x_feature_t.size(1))
model = model.to(device)

# Mean-squared-error objective, minimised with Adam at a learning rate of 1e-4.
loss_func = torch.nn.MSELoss()
optim = torch.optim.Adam(params=model.parameters(), lr=1e-4)

488501 parameters
1 hidden layers


In [6]:


def evaluate(model: torch.nn.Module, loader=None, verbose: bool = True) -> torch.Tensor:
    """Compute the root-mean-square error of ``model`` in original target units.

    Predictions and targets are mapped back from the normalised scale before the
    error is computed. The model is left in eval mode when the function returns.

    Args:
        model: network to evaluate; called as ``model(x_batch)``.
        loader: iterable of ``(x_batch, y_batch)`` pairs with normalised
            targets. Defaults to the global ``prevalid_loader``.
        verbose: when True, print the first eight predictions and targets of
            every batch.

    Returns:
        0-dim tensor holding the RMSE over all samples produced by ``loader``.
    """
    if loader is None:
        loader = prevalid_loader

    # Exact inverse of the target normalisation (y - mean) / (std + 1e-5);
    # the constant must match the one used when the targets were normalised.
    y_scale = pre_y_std + 1e-5

    model.eval()  # bring the model into eval mode
    with torch.no_grad():
        loss_cum = 0.0
        num_eval_samples = 0

        for x_batch, y_batch in loader:
            output = model(x_batch).flatten() * y_scale + pre_y_mean
            y_batch = y_batch * y_scale + pre_y_mean
            batch_size = output.shape[0]

            if verbose:
                print(f"predicted:{output[:8]}")
                print(f"actual:   {y_batch[:8]}")
            num_eval_samples += batch_size
            # Weight the per-batch loss by the batch size so that batches of
            # different sizes contribute proportionally to the average.
            loss_batch = loss_func(output, y_batch)
            loss_cum += loss_batch * batch_size

        avg_loss = loss_cum / num_eval_samples
        return torch.sqrt(avg_loss)
            
        
# Pretraining: 11 passes over the training split, validating after every pass.
for epoch in range(11):
    # reset statistics trackers
    train_loss_cum = 0.0
    num_samples_epoch = 0

    # evaluate() leaves the model in eval mode, so switch back to train mode
    # (dropout active) once at the start of every epoch.
    model.train()
    for x_batch, y_batch in pretrain_loader:
        optim.zero_grad()
        # forward pass
        output = model(x_batch).flatten()
        loss = loss_func(output, y_batch)
        # backward pass
        loss.backward()
        optim.step()

        # keep track of train stats; .item() yields a plain Python float, so the
        # running sum does not hold on to the autograd graph of every batch
        num_samples_batch = output.shape[0]
        num_samples_epoch += num_samples_batch
        train_loss_cum += loss.item() * num_samples_batch

    # average the accumulated statistics
    # (train loss is measured on normalised targets, while evaluate() reports
    # an RMS error after mapping back to the original target scale)
    avg_train_loss = train_loss_cum / num_samples_epoch
    verif_loss = evaluate(model)

    # print some infos
    print(f'Epoch {epoch} | Train loss: {avg_train_loss:.4f} | verif rms: {verif_loss:.4f}')
    # save checkpoint of model every fifth epoch (epoch 0 excluded)
    if epoch % 5 == 0 and epoch > 0:
        save_path = f'model_lumo_epoch_{epoch}.pt'
        torch.save({'epoch': epoch,
                    'model_state_dict': model.state_dict(),
                    'optimizer_state_dict': optim.state_dict()},
                    save_path)
        print(f'Saved model checkpoint to {save_path}')

predicted:tensor([-3.6573, -3.5196, -4.1616, -3.0519, -3.0425, -3.0426, -3.1657, -2.8293])
actual:   tensor([-3.7025, -3.6878, -4.0376, -3.0299, -3.1004, -2.7587, -3.3262, -2.7418])
Epoch 0 | Train loss: 0.2643 | verif loss: 0.1229
predicted:tensor([-3.5114, -3.3087, -3.7808, -3.4855, -3.9178, -3.4424, -3.6918, -3.3689])
actual:   tensor([-3.4710, -3.4436, -4.0092, -3.3869, -3.9373, -3.5650, -3.7813, -3.4592])
Epoch 1 | Train loss: 0.0828 | verif loss: 0.1034
predicted:tensor([-3.2908, -2.8637, -3.8747, -2.8474, -2.8535, -3.3317, -2.9391, -3.0289])
actual:   tensor([-3.2077, -2.8636, -3.6948, -2.8012, -2.8783, -3.3338, -2.7123, -3.1293])
Epoch 2 | Train loss: 0.0635 | verif loss: 0.0945
predicted:tensor([-3.4509, -3.0825, -3.8961, -2.8301, -3.6790, -4.0233, -3.1661, -3.4717])
actual:   tensor([-3.4506, -3.0140, -3.9731, -2.9336, -3.7103, -3.9692, -3.1077, -3.5771])
Epoch 3 | Train loss: 0.0534 | verif loss: 0.0882
predicted:tensor([-3.6085, -3.0946, -3.7560, -3.1289, -3.0631, -2.7868, 

In [86]:
def reload_model(number):
    """Restore the global ``model`` and ``optim`` from a saved checkpoint.

    Args:
        number: epoch number that is part of the checkpoint file name
            ``./model_lumo_epoch_{number}.pt``.
    """
    # map_location places the stored tensors on the notebook's device, so a
    # checkpoint written on a different device can still be loaded.
    # NOTE: torch.load unpickles the file - only load checkpoints you trust.
    previous_run = torch.load(f'./model_lumo_epoch_{number}.pt', map_location=device)
    model.load_state_dict(previous_run['model_state_dict'])
    optim.load_state_dict(previous_run['optimizer_state_dict'])
# Load the checkpoint saved for epoch 10 back into the live model and optimizer.
reload_model(10)

In [7]:
# Labelled training data and test inputs as float32 tensors on `device`.
train_x_t = torch.tensor(df_x_train.values, dtype=torch.float32, device=device)
train_y_t = torch.tensor(df_y_train.values, dtype=torch.float32, device=device)
test_t = torch.tensor(df_x_test.values, dtype=torch.float32, device=device)

# Use the network as a fixed feature extractor: bypass the regression head so
# forward() returns the last hidden activations, and switch to eval mode.
# Note that `do_regress` stays False on the shared model after this cell.
model.do_regress = False
model.eval()

with torch.no_grad():
    train_x_feature_t, test_feature_t = model(train_x_t), model(test_t)



In [8]:
# Candidate penalties: no regularisation plus log-spaced values from 1e-5 to 1e1.
alphas = [0, *np.logspace(-5, 1)]
models = [linear_model.Ridge(alpha=penalty) for penalty in alphas]

# One row per alpha, one entry per fold of a 10-fold cross-validation. The
# scorer yields negated RMSE values, hence the sign flip.
rms_loss = [
    -cross_val_score(
        ridge,
        train_x_feature_t,
        train_y_t,
        scoring='neg_root_mean_squared_error',
        n_jobs=2,
        cv=10,
    )
    for ridge in tqdm(models)
]

# Judge every alpha by its worst fold.
max_loss = np.asarray(rms_loss).max(axis=1)
max_loss

100%|██████████████████████████████████████████████████████████████████████████████████| 51/51 [00:06<00:00,  8.46it/s]


array([0.20249552, 0.20249537, 0.20249533, 0.20249526, 0.20249518,
       0.20249507, 0.20249493, 0.20249474, 0.20249449, 0.20249415,
       0.20249371, 0.20249312, 0.20249234, 0.20249131, 0.20248994,
       0.20248813, 0.20248574, 0.20248258, 0.20247841, 0.20247291,
       0.20246568, 0.20245619, 0.20244379, 0.20242765, 0.20240678,
       0.20238   , 0.20234605, 0.20230368, 0.20225195, 0.20219075,
       0.20212172, 0.20204963, 0.2019844 , 0.20194385, 0.20195688,
       0.2020668 , 0.20233402, 0.20283733, 0.20367353, 0.20495549,
       0.20680955, 0.2099286 , 0.21558562, 0.22261083, 0.23115837,
       0.24132914, 0.25313963, 0.26650341, 0.28123489, 0.29707681,
       0.313745  ])

In [10]:
# Choose the alpha with the smallest worst-fold RMSE and report it.
index = max_loss.argmin()
print(f"alpha={alphas[index]}, rms={max_loss[index]}")

# Refit the chosen ridge model on all extracted training features, then
# predict the test features.
reg = models[index]
reg.fit(train_x_feature_t, train_y_t)
answer = reg.predict(test_feature_t)

# Submission table: test ids next to predictions rounded to one decimal.
sol = pd.DataFrame({'Id': df_test_index, 'y': np.around(answer, decimals=1)})
sol.to_csv("submission.csv", index=False, float_format='%.1f')
sol['y'].head(10)

alpha=0.08286427728546843, rms=0.20194384790260664


0    1.7
1    2.2
2    1.7
3    2.4
4    1.6
5    1.6
6    2.1
7    2.3
8    1.8
9    1.6
Name: y, dtype: float64