## Install Packages

In [None]:
# !pip install scikit-learn
# !pip install numpy
# !pip install pandas
# !pip install torch
# !pip install tqdm

## Import Packages

In [None]:
import numpy as np
import pandas as pd
import os
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.optim.lr_scheduler import CosineAnnealingWarmRestarts

from tqdm import tqdm
import math
import random
import csv

## Useful Functions

### Set seeds

In [None]:
def same_seeds(seed):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch
    (plus the CUDA generators when CUDA is available), then disables cuDNN
    autotuning and requests deterministic cuDNN kernels.

    Args:
        seed: Integer seed handed to each generator.
    """
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)

    if torch.cuda.is_available():
        # Current device first, then every visible device.
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)

    cudnn = torch.backends.cudnn
    cudnn.benchmark = False
    cudnn.deterministic = True

### Read Dataset from CSV Files

In [None]:
def preprocessData(previousData, data, dataType):
    """Collapse consecutive rows that share a timestamp into one flat sample.

    Every row of ``data`` is read by position:

    * indices 0-3: date/time fields used verbatim in the grouping key
      (presumably month, day, weekday, hour -- confirm against the CSV header);
    * index 4: divided by 60 before being used in the grouping key
      (presumably the minute -- confirm);
    * indices 5 and 6: shifted by -24.5 and -121 respectively
      (presumably latitude / longitude -- confirm);
    * index 7: copied through unchanged as the third per-row feature;
    * index 8: per-row label;
    * index 9: second per-row label, read only when ``dataType == 'test'``.

    Consecutive rows with an identical 5-element key form one group. Each group
    becomes one flat list::

        [f5, f6, f7 for every row in the group] + key + [labels for every row]

    Args:
        previousData: List that the new samples are appended to (modified in
            place and also returned).
        data: Sequence of rows (each an indexable sequence) in file order.
            The rows themselves are not modified.
        dataType: ``'test'`` emits two label entries per row (indices 8 and 9);
            any other value emits one (index 8).

    Returns:
        ``previousData`` with one flat sample appended per group. When ``data``
        is empty, ``previousData`` is returned unchanged.
    """
    # An empty table has no group to flush; np.concatenate([]) would raise.
    if len(data) == 0:
        return previousData

    isTest = dataType == 'test'
    preprocessedData = []

    def _emit(groupFeatures, groupKey, groupLabels):
        # np.concatenate flattens the per-row [f5, f6, f7] triples into one run.
        flatFeatures = list(np.concatenate(groupFeatures).flat)
        preprocessedData.append(flatFeatures + groupKey + groupLabels)

    currentDate = None
    dateData, dateLabel = [], []

    for singleData in data:
        # Compute the normalised values into fresh lists so the caller's rows
        # stay untouched (re-processing the same rows must not shift them twice).
        rowDate = [singleData[0], singleData[1], singleData[2], singleData[3], singleData[4] / 60]
        rowFeatures = [singleData[5] - 24.5, singleData[6] - 121, singleData[7]]
        rowLabels = [singleData[8]]
        if isTest:
            rowLabels.append(singleData[9])

        if currentDate is None:
            currentDate = rowDate
        elif currentDate != rowDate:
            # Timestamp changed: close the running group and start a new one.
            _emit(dateData, currentDate, dateLabel)
            currentDate = rowDate
            dateData, dateLabel = [], []

        dateData.append(rowFeatures)
        dateLabel.extend(rowLabels)

    # Flush the final group, which the loop never closes.
    _emit(dateData, currentDate, dateLabel)

    previousData.extend(preprocessedData)
    return previousData

def _listDataFiles(dirpath, reverse=False):
    """Return the entry names of ``dirpath`` in sorted order, skipping ``.DS_Store``."""
    assert os.path.exists(dirpath), f"directory not found: {dirpath}"
    names = [name for name in os.listdir(dirpath) if name != '.DS_Store']
    return sorted(names, reverse=reverse)


def readDataset(data_filepath, inference_filepath, valid_file_count=14):
    """Load the training/validation CSVs and the inference CSVs into flat samples.

    Files in ``data_filepath`` are processed in sorted name order; the last
    ``valid_file_count`` of them feed the validation set and the rest feed the
    training set. Files in ``inference_filepath`` are processed in reverse
    sorted order and feed the test set. Every file is read with
    ``pd.read_csv`` and grouped by ``preprocessData``.

    Args:
        data_filepath: Directory holding the training/validation CSV files.
        inference_filepath: Directory holding the inference CSV files.
        valid_file_count: How many trailing files (in sorted order) of
            ``data_filepath`` are held out for validation. Defaults to 14.

    Returns:
        Tuple ``(train, valid, test)`` of lists of flat samples.

    Raises:
        AssertionError: If either directory does not exist. Both directories
            are checked before any file is read.
    """
    # List both directories up front so a missing inference directory fails
    # before the training CSVs are parsed.
    data_files = _listDataFiles(data_filepath)
    inference_files = _listDataFiles(inference_filepath, reverse=True)

    first_valid_idx = len(data_files) - valid_file_count
    train, valid, test = [], [], []

    for idx, filename in enumerate(data_files):
        # os.path.join works whether or not the directory ends with a separator.
        data = pd.read_csv(os.path.join(data_filepath, filename)).values.tolist()
        if idx >= first_valid_idx:
            valid = preprocessData(valid, data, 'valid')
        else:
            train = preprocessData(train, data, 'train')

    for filename in inference_files:
        print(filename)
        data = pd.read_csv(os.path.join(inference_filepath, filename)).values.tolist()
        test = preprocessData(test, data, 'test')

    return train, valid, test

## My Youbike Dataset Class

In [None]:
class YoubikeDataset(Dataset):
    """Map-style dataset over flat sample rows.

    Each row is split into a leading feature part and a fixed-width tail:

    * ``"train"`` / ``"val"``: the last 112 entries are the labels; both parts
      are returned as ``torch.FloatTensor``.
    * ``"test"``: the last 224 entries are returned untouched as a plain list
      (no tensor conversion); only the features become a tensor.

    Any other ``dataType`` makes ``__getitem__`` raise ``NotImplementedError``.
    """

    def __init__(self, data, dataType):
        super().__init__()
        self.data = data
        self.datasize = len(data)  # cached once; later changes to `data` are not tracked
        self.type = dataType

    def __len__(self):
        return self.datasize

    def __getitem__(self, idx):
        mode = self.type
        if mode not in ("train", "val", "test"):
            raise NotImplementedError

        row = self.data[idx]
        if mode == "test":
            head, tail = row[:-224], row[-224:]
            return torch.FloatTensor(head), tail

        head, tail = row[:-112], row[-112:]
        return torch.FloatTensor(head), torch.FloatTensor(tail)

## My Model(s)

### DNN model

In [None]:
class My_Model(nn.Module):
    """Fully connected network that widens to 8x the input width and narrows again.

    Hidden stages are ``Linear -> Dropout(0.1) -> ReLU`` with widths, as
    multiples of ``input_dim``: 1, 2, 4, 8, 8, 4, 2, 1, followed by one stage
    of width ``2 * output_dim``. The head is ``Linear -> Sigmoid``, so every
    output lies in [0, 1].

    The modules live in ``self.layers`` (an ``nn.Sequential``) in that exact
    order, so parameter names are ``layers.<index>.weight`` / ``.bias``.
    """

    def __init__(self, input_dim, output_dim):
        super().__init__()

        # Successive layer widths: each adjacent pair is one hidden Linear.
        widths = [input_dim * factor for factor in (1, 1, 2, 4, 8, 8, 4, 2, 1)]
        widths.append(output_dim * 2)

        stack = []
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            stack.extend([nn.Linear(fan_in, fan_out), nn.Dropout(p=0.1), nn.ReLU()])

        # Output head: no dropout, squashed to [0, 1].
        stack.extend([nn.Linear(widths[-1], output_dim), nn.Sigmoid()])

        self.layers = nn.Sequential(*stack)

    def forward(self, x):
        return self.layers(x)

## My Loss Function

In [None]:
def getLoss(pred, label):
    """Return the mean squared error between ``pred`` and ``label``.

    The squared difference is averaged over every element, giving a scalar tensor.
    """
    residual = pred - label
    return torch.square(residual).mean()

## Training function

In [None]:
def train(model, config, train_loader, valid_loader, device):
    """Train ``model`` with SGD and keep the checkpoint with the lowest validation loss.

    Each epoch runs one pass over ``train_loader`` (optimising ``getLoss``),
    steps a ``CosineAnnealingWarmRestarts`` schedule (``T_0=1``, ``T_mult=2``)
    once, then evaluates on ``valid_loader`` with gradients disabled. Whenever
    the mean validation loss improves, the model's ``state_dict`` is saved to
    ``<save_dir>/<model_name>``.

    Args:
        model: The ``nn.Module`` to optimise; updated in place.
        config: Mapping providing ``learning_rate``, ``weight_decay``,
            ``epochs``, ``early_stop``, ``save_dir`` and ``model_name``.
        train_loader: Iterable of ``(x, y)`` training batches.
        valid_loader: Iterable of ``(x, y)`` validation batches.
        device: Device the batches are moved to before the forward pass.

    Returns:
        None. Training stops after ``config['epochs']`` epochs, or earlier once
        ``config['early_stop']`` consecutive epochs bring no improvement.

    Raises:
        ZeroDivisionError: If either loader yields no batches (the mean loss
            is computed as ``sum / len``).
    """
    optimizer = torch.optim.SGD(model.parameters(), lr=config['learning_rate'], weight_decay=config['weight_decay'])
    scheduler = CosineAnnealingWarmRestarts(optimizer, T_0=1, T_mult=2)

    # makedirs also creates missing parents and tolerates an existing directory;
    # os.path.join keeps the checkpoint inside save_dir even without a trailing '/'.
    os.makedirs(config["save_dir"], exist_ok=True)
    checkpoint_path = os.path.join(config["save_dir"], config["model_name"])

    n_epochs, best_loss, early_stop_count = config['epochs'], math.inf, 0

    for epoch in range(n_epochs):
        model.train()  # Enable dropout for the training pass.
        loss_record = []
        print(scheduler.get_last_lr())

        # tqdm renders a progress bar over the training batches.
        train_pbar = tqdm(train_loader, position=0, leave=True)
        for x, y in train_pbar:
            optimizer.zero_grad()
            x, y = x.to(device), y.to(device)
            pred = model(x)

            loss = getLoss(pred, y)
            loss.backward()
            optimizer.step()

            batch_loss = loss.item()
            loss_record.append(batch_loss)

            # Show the current epoch number and batch loss on the progress bar.
            train_pbar.set_description(f'Epoch [{epoch+1}/{n_epochs}]')
            train_pbar.set_postfix({'loss': batch_loss})

        # The schedule advances once per epoch, not once per batch.
        scheduler.step()
        mean_train_loss = sum(loss_record)/len(loss_record)

        model.eval()  # Disable dropout for validation.
        loss_record = []
        with torch.no_grad():
            for x, y in valid_loader:
                x, y = x.to(device), y.to(device)
                loss_record.append(getLoss(model(x), y).item())

        mean_valid_loss = sum(loss_record)/len(loss_record)
        print(f'Epoch [{epoch+1}/{n_epochs}]: Train loss: {mean_train_loss:.4f}, Valid loss: {mean_valid_loss:.4f}')

        if mean_valid_loss < best_loss:
            best_loss = mean_valid_loss
            torch.save(model.state_dict(), checkpoint_path)  # Keep only the best weights on disk.
            print('Saving model with loss {:.4f}...'.format(best_loss))
            early_stop_count = 0
        else:
            early_stop_count += 1

        if early_stop_count >= config['early_stop']:
            print('\nModel is not improving, so we halt the training session.')
            print('best loss {:.4f}...'.format(best_loss))
            return

## Predict function

In [None]:
def predict(test_loader, model, device, output_path='prediction.csv'):
    """Run ``model`` over ``test_loader`` and write the scaled predictions to a CSV.

    Every batch from ``test_loader`` is a pair ``(x, tmp)``. ``tmp`` is indexed
    as ``tmp[field][sample]``: for each sample the fields are read in order and
    treated as alternating ``tot`` (even positions) and ``title`` (odd
    positions) values -- presumably the collated trailing entries that the
    dataset returns in "test" mode; confirm against the loader. Each ``tot``
    must support ``.item()``.

    The flattened model outputs are paired one-to-one with ``(tot, title)`` and
    written as rows ``[title, pred * tot]`` under the header ``id,sbi``.

    Args:
        test_loader: Iterable of ``(x, tmp)`` batches as described above.
        model: Trained ``nn.Module``; switched to eval mode here.
        device: Device the input batches are moved to.
        output_path: Destination CSV file. Defaults to ``'prediction.csv'``.

    Raises:
        AssertionError: If the number of predictions, ``tot`` values and
            ``title`` values do not all match.
    """
    model.eval()  # Disable dropout for inference.
    preds, tots, titles = [], [], []

    for x, tmp in tqdm(test_loader):
        x = x.to(device)

        # Re-order tmp[field][sample] into one sample-major run of fields.
        infos = []
        for sample_idx in range(len(tmp[0])):
            infos.extend(tmp[field_idx][sample_idx] for field_idx in range(len(tmp)))

        # Even positions hold `tot`, odd positions hold `title`.
        tots.extend(infos[0::2])
        titles.extend(infos[1::2])

        with torch.no_grad():
            preds.append(model(x).detach().cpu())

    # Flatten all batches into one sequence of scalars, row-major per sample.
    preds = list(np.concatenate(preds).flat)
    print(len(preds), len(tots), len(titles))
    assert len(preds) == len(tots)
    assert len(tots) == len(titles)

    prediction = [['id','sbi']]
    for (pred, tot, title) in zip(preds, tots, titles):
        prediction.append([title, pred*tot.item()])

    with open(output_path, 'w', newline='') as file:
        csv.writer(file).writerows(prediction)

    return

## Hyperparameters

In [None]:
# Run-wide settings read by the data-loading, checkpoint, training and inference cells.
config = {
    "batch_size": 2,  # batch size shared by the train, validation and test DataLoaders
#     "data_filepath": 'dataset_csv/',
    "data_filepath": '/kaggle/input/dataset-1201-new/dataset_w_csv/',  # directory of CSVs split into train/validation
#     "inference_filepath": '/kaggle/input/inference-1204/inference_csv/',
    "inference_filepath": '/kaggle/input/inference-new-1204/inference_w_csv/',  # directory of CSVs used as the test set
    "epochs": 100,  # upper bound on training epochs
    "learning_rate": 2.5e-3,  # initial SGD learning rate (then driven by the scheduler)
#     "weight_decay": 5e-4,
    "weight_decay": 0,  # passed to SGD as weight_decay
    "save_dir": "./models/",  # directory the best checkpoint is written to
    "model_name": "1201-DNN.ckpt",  # checkpoint file name inside save_dir
    "checkpoint": "/kaggle/input/300-ckpt/1201-DNN.ckpt",  # weights loaded before training when useCheckpoint is True
    "useCheckpoint": True,
    "early_stop":100,  # epochs without validation improvement before stopping; equal to "epochs" here
    "seeds": 10901039  # seed passed to same_seeds
}
# Use the GPU when PyTorch reports CUDA is available, otherwise the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

## Load Datasets

In [None]:
# Seed all RNGs before anything else in this cell runs.
same_seeds(config["seeds"])
# Read every CSV and group it into flat samples (train / validation / test).
train_data, valid_data, test_data = readDataset(config['data_filepath'], config['inference_filepath'])
print(f'train_data_size: {len(train_data)}')
print(f'valid_data_size: {len(valid_data)}')
print(f'test_data_size : {len(test_data)}')
# The dataset mode strings must be "train" / "val" / "test"; anything else raises in __getitem__.
train_dataset, valid_dataset, test_dataset = YoubikeDataset(train_data, "train"), YoubikeDataset(valid_data, "val"), YoubikeDataset(test_data, "test")
train_loader = DataLoader(train_dataset, batch_size=config['batch_size'], shuffle=True, pin_memory=True)
# NOTE(review): validation batches are shuffled too; the mean validation loss weights every batch equally.
valid_loader = DataLoader(valid_dataset, batch_size=config['batch_size'], shuffle=True, pin_memory=True)
# Test order is kept fixed (shuffle=False) so predictions stay aligned with the input rows.
test_loader  = DataLoader(test_dataset, batch_size=config['batch_size'], shuffle=False, pin_memory=True)

## Load Checkpoint

In [None]:
# Input width is the sample length minus its 112 trailing labels; the model emits 112 outputs.
model = My_Model(input_dim=len(train_data[0])-112, output_dim=112).to(device) # put your model and data on the same computation device.
if config['useCheckpoint']:
    # map_location remaps the stored tensors onto `device`, so a checkpoint
    # saved on a GPU still loads when this runtime only has a CPU.
    model.load_state_dict(torch.load(config['checkpoint'], map_location=device))

print(model)

## Start Training

In [None]:
# Optimise `model` in place; the lowest-validation-loss weights are saved under config["save_dir"].
train(model, config, train_loader, valid_loader, device)

## Inference

In [None]:
# Predict with the weights `model` currently holds and write them to prediction.csv.
# NOTE(review): after training these are the last-epoch weights; the best checkpoint
# saved by train() is not reloaded here -- confirm that is intended.
predict(test_loader, model, device)