In [17]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
from torch.utils.tensorboard import SummaryWriter
import os
import numpy as np
import pandas as pd
import warnings
# from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_auc_score
import random
%config InlineBackend.figure_format = 'retina'
warnings.filterwarnings('ignore')


In [18]:
# Ask CUDA to launch kernels synchronously so a device-side error is reported
# at the call that caused it (the variable is only read by the CUDA runtime).
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

# TensorBoard event files are written under the "train result" directory.
writer = SummaryWriter("train result")

# Pick the best accelerator that is actually usable on this machine.
# `torch.has_mps` is deprecated and only says MPS support was compiled in;
# `torch.backends.mps.is_available()` also checks that the device can be used.
_mps_backend = getattr(torch.backends, 'mps', None)  # absent on very old torch builds
if torch.cuda.is_available():
    device = torch.device('cuda:0')
elif _mps_backend is not None and _mps_backend.is_available():
    device = torch.device('mps')
else:
    device = torch.device('cpu')
device


device(type='mps')

In [19]:
# load dataset
def load_dataset(path):
    """Read the CSV at ``path`` into a DataFrame, using its first column as the index."""
    frame = pd.read_csv(path, index_col=0)
    return frame

# extract X_train, y_train, X_test, y_test


class MyDataset(Dataset):
    """Map-style dataset that pairs each input sample with its target.

    Both arrays are converted with ``torch.as_tensor`` and are indexed along
    their first axis; the dataset length is the first dimension of ``x``.
    """

    def __init__(self, x, y):
        self.x, self.y = torch.as_tensor(x), torch.as_tensor(y)
        self.len = x.shape[0]

    def __len__(self):
        return self.len

    def __getitem__(self, index):
        sample = self.x[index]
        target = self.y[index]
        return sample, target


def extract_Xy(train_df, target_df, window=28):
    """Build next-window prediction samples and split them 80/10/10 by row.

    ``train_df`` and ``target_df`` are concatenated column-wise and the columns
    are cut into consecutive, non-overlapping windows of ``window`` columns.
    Window ``k`` is an input and window ``k + 1`` is its target, so for every
    row the result holds

        X[row, t, k] = combined[row, k * window + t]
        y[row, t, k] = combined[row, (k + 1) * window + t]

    i.e. both arrays have shape ``(n_rows, window, n_windows)``. Trailing
    columns that do not fill a whole window are ignored.

    Args:
        train_df: DataFrame of source columns (one row per sample).
        target_df: DataFrame of target columns with the same rows.
        window: Number of columns per window. Must be positive.

    Returns:
        ``(train, test, val)`` as ``MyDataset`` objects (note the order).
        80% of the rows go to train, 10% to validation, the rest to test.
        The split is drawn with the stdlib ``random`` module, so call
        ``random.seed(...)`` beforehand for a reproducible split.

    Raises:
        ValueError: If ``window`` is not positive or there are fewer than
            ``2 * window`` columns (no input/target pair can be formed).
    """
    if window <= 0:
        raise ValueError(f"window must be positive, got {window}")

    combined = pd.concat([train_df, target_df], axis=1)
    values = combined.to_numpy()
    n_cols = values.shape[1]

    # Inputs start at 0, targets one window later; both need a full window.
    x_starts = range(0, n_cols - 2 * window + 1, window)
    y_starts = range(window, n_cols - window + 1, window)
    if len(x_starts) == 0:
        raise ValueError(
            f"need at least {2 * window} columns to build one input/target "
            f"pair, got {n_cols}")

    # Stacking the (rows, window) slices on a new last axis yields
    # (rows, window, n_windows) directly, with no per-row transpose loop.
    X = np.stack([values[:, s:s + window] for s in x_starts], axis=-1)
    y = np.stack([values[:, s:s + window] for s in y_starts], axis=-1)

    n_samples = X.shape[0]
    train_len, val_len = int(0.8 * n_samples), int(0.1 * n_samples)

    # random.sample() no longer accepts a set (TypeError on Python 3.11+),
    # so the held-out pool is turned into a sorted list before sampling.
    train_idx = sorted(random.sample(range(n_samples), train_len))
    held_out = sorted(set(range(n_samples)) - set(train_idx))
    val_idx = sorted(random.sample(held_out, val_len))
    test_idx = sorted(set(held_out) - set(val_idx))

    train_set = MyDataset(X[train_idx], y[train_idx])
    test_set = MyDataset(X[test_idx], y[test_idx])
    val_set = MyDataset(X[val_idx], y[val_idx])

    return train_set, test_set, val_set

In [20]:
class RNN(nn.Module):
    """Bidirectional multi-layer LSTM with a per-time-step sigmoid head.

    The input is ``(batch, seq_len, input_size)`` (``batch_first=True``) and
    the output is ``(batch, seq_len, output_size)`` with values in ``[0, 1]``,
    suitable for ``nn.BCELoss``.

    Args:
        input_size: Number of features per time step (default 37).
        output_size: Number of outputs per time step (default 37).
        hidden_size: LSTM hidden size per direction (default 100).
        num_layers: Number of stacked LSTM layers (default 2).
        dropout: Dropout between LSTM layers (default 0.5).

    The defaults reproduce the original fixed architecture, and the submodule
    names (``lstm``, ``fc``) are unchanged, so existing checkpoints still load.
    """

    def __init__(self, input_size=37, output_size=37, hidden_size=100,
                 num_layers=2, dropout=0.5):
        super().__init__()
        self.dropout = dropout
        self.input_size = input_size
        self.output_size = output_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        self.lstm = nn.LSTM(self.input_size, self.hidden_size, self.num_layers,
                            bidirectional=True, batch_first=True, dropout=self.dropout)
        # Forward and backward hidden states are concatenated, hence * 2.
        self.fc = nn.Linear(self.hidden_size * 2, self.output_size)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Return per-time-step probabilities for ``x`` of shape (batch, seq, input_size)."""
        # nn.LSTM uses zero initial hidden/cell states when none are given, and
        # creates them on the input's device and dtype. That matches the old
        # explicit h0/c0 without depending on a notebook-global `device`.
        out, _ = self.lstm(x)
        return self.sigmoid(self.fc(out))


def train(model, train_loader, test_loader, val_loader, window,
          num_epochs=4, require_improvement=1000):
    """Train ``model`` with Adam + BCE loss, checkpointing on validation loss.

    After every optimisation step the model is evaluated on ``val_loader``.
    Whenever the validation loss improves, the weights are saved to
    ``'RNN.ckpt'``. Training stops early once ``require_improvement`` batches
    pass without an improvement. Per-epoch scalars are written to the
    module-level ``writer``, and ``test(model, test_loader)`` is called at the end.

    NOTE: validating after every batch is expensive; the cost of one epoch is
    roughly ``len(train_loader)`` full passes over the validation set.

    Args:
        model: Module whose outputs are probabilities in [0, 1].
        train_loader: Iterable of ``(inputs, labels)`` training batches.
        test_loader: Batches passed to ``test`` after training.
        val_loader: Batches passed to ``evaluate`` for model selection.
        window: Unused; kept for call-site compatibility.
        num_epochs: Number of passes over ``train_loader`` (default 4).
        require_improvement: Early-stopping patience, in batches (default 1000).
    """
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
    criterion = nn.BCELoss()

    total_batch = 0  # number of optimisation steps taken so far
    dev_best_loss = float('inf')
    last_improve = 0  # value of total_batch when validation loss last improved
    stop_early = False
    n_batches = len(train_loader)

    for epoch in range(num_epochs):
        # Reset so an empty loader cannot log stale or undefined values.
        loss = dev_loss = auc = None

        for i, (datas, labels) in enumerate(train_loader):
            model.train()  # evaluate() below leaves the model in eval mode
            datas = datas.to(device, dtype=torch.float)
            labels = labels.to(device, dtype=torch.float)

            optimizer.zero_grad()
            outputs = model(datas)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            dev_loss, auc = evaluate(model, val_loader, test=False)
            dev_loss = float(dev_loss)  # accept either a float or a 0-dim tensor
            if dev_loss < dev_best_loss:
                dev_best_loss = dev_loss
                torch.save(model.state_dict(), 'RNN.ckpt')
                last_improve = total_batch

            # `i` runs from 0 to n_batches - 1, so the last batch is n_batches - 1
            # (the previous `i == len(train_loader)` test could never be true).
            if i % 50 == 0 or i == n_batches - 1:
                print(
                    f'Epoch:{epoch + 1}/{num_epochs}  Iter: {i + 1}/{n_batches} || Train Loss: {loss.item():.4f}, Val Loss: {dev_loss:.4f}, AUC: {auc:.4f}')

            total_batch += 1
            if total_batch - last_improve > require_improvement:
                print("No optimization for a long time, auto-stopping...")
                stop_early = True
                break

        if stop_early:
            break

        if loss is not None:
            # Log the metrics of the last batch of this epoch.
            writer.add_scalar("loss/train", loss.item(), epoch)
            writer.add_scalar("loss/val", dev_loss, epoch)
            writer.add_scalar('AUC/val', auc, epoch)

    writer.close()
    test(model, test_loader)


def test(model, test_loader):
    model.load_state_dict(torch.load('RNN64.ckpt'))
    model.eval()
    test_loss = evaluate(model, test_loader, test=True)
    msg = 'Test Loss: {0:>5.2}'
    print(msg.format(test_loss))


def evaluate(model, data_iter, test=False):
    """Compute the mean BCE loss and ROC-AUC of ``model`` over ``data_iter``.

    The model is put in eval mode and left there; callers that continue
    training must call ``model.train()`` again.

    Args:
        model: Module whose outputs are probabilities in [0, 1].
        data_iter: Iterable of ``(inputs, labels)`` batches.
        test: Unused; kept for call-site compatibility.

    Returns:
        ``(loss, auc)``: ``loss`` is a Python float, the mean of the per-batch
        BCE losses; ``auc`` is ``roc_auc_score`` over all flattened labels and
        predictions.

    Raises:
        ValueError: If ``data_iter`` yields no batches.
    """
    model.eval()
    criterion = nn.BCELoss()

    loss_total = 0.0
    n_batches = 0
    # Collect per-batch arrays and concatenate once; np.append inside the loop
    # re-copies everything gathered so far on each batch.
    label_chunks, predict_chunks = [], []

    with torch.no_grad():
        for datas, labels in data_iter:
            datas = datas.to(device, dtype=torch.float)
            labels = labels.to(device, dtype=torch.float)
            outputs = model(datas)

            # .item() keeps the running total a plain float instead of a
            # device tensor that callers would have to convert later.
            loss_total += criterion(outputs, labels).item()
            n_batches += 1

            label_chunks.append(labels.cpu().numpy().ravel())
            predict_chunks.append(outputs.cpu().numpy().ravel())

    if n_batches == 0:
        raise ValueError("evaluate() received an empty data iterator")

    labels_all = np.concatenate(label_chunks)
    predict_all = np.concatenate(predict_chunks)
    auc = roc_auc_score(labels_all, predict_all)

    return loss_total / n_batches, auc


# main


In [None]:
# Random seeds. extract_Xy draws the train/val/test split with the stdlib
# `random` module, so it has to be seeded too for the split to be reproducible.
random.seed(1)
np.random.seed(1)
torch.manual_seed(1)
torch.cuda.manual_seed_all(1)

# Load data (paths are relative to the notebook's working directory).
light_train = load_dataset('light_train_source_labels.csv')
light_test = load_dataset('light_test_source_labels.csv')
light_target = load_dataset('light_train_target_labels.csv')

# Data preprocessing (currently disabled).
# light_train = minmaxscaler(light_train, is_train=True)
# light_test = minmaxscaler(light_test, is_train=False)

# Dataset: pass the window length explicitly instead of relying on the default.
window = 28
trainset, testset, valset = extract_Xy(
    light_train, light_target, window=window)

# Dataloaders. Pinned host memory only helps host-to-CUDA copies, so request
# it only on CUDA. Validation/test batches are not shuffled: the reported loss
# is a mean of per-batch means, and a fixed batch composition keeps it
# identical between calls, which the early-stopping comparison relies on.
use_pinned = device.type == 'cuda'
train_loader = DataLoader(trainset, batch_size=64,
                          shuffle=True, pin_memory=use_pinned)
test_loader = DataLoader(testset, batch_size=64,
                         shuffle=False, pin_memory=use_pinned)
val_loader = DataLoader(valset, batch_size=64,
                        shuffle=False, pin_memory=use_pinned)

# Build model
model = RNN().to(device)

# Train
train(model, train_loader=train_loader, test_loader=test_loader,
      val_loader=val_loader, window=window)


In [None]:
# Rebuild the network and load the best weights saved during training.
# Deserialise on CPU so the checkpoint loads whatever device it was saved
# from; load_state_dict copies the values onto the model's device.
model = RNN().to(device)
model.load_state_dict(torch.load('RNN.ckpt', map_location='cpu'))
model.eval()

# Cut the test columns into the same non-overlapping 28-column windows used
# for training. The step must equal the window length: stride-1 windows would
# give one feature per column offset instead of the 37 features the model
# was trained on.
window = 28
test_values = light_test.to_numpy()
window_starts = range(0, test_values.shape[1] - window + 1, window)
X_pre = np.stack([test_values[:, s:s + window] for s in window_starts], axis=-1)
X_pre = torch.as_tensor(X_pre).to(
    device, dtype=torch.float)  # (n_users, window, n_windows)

# Inference only: no autograd graph is needed.
with torch.no_grad():
    output = model(X_pre)  # (n_users, window, n_windows)
output = output.cpu().numpy()

# The last output channel is the prediction for the window that follows the
# final input window, i.e. the unseen target period.
cols = [f'time_slot_{i}' for i in range(window)]
n_users = output.shape[0]
# Test user ids are assumed to continue right after the 30460 training users
# (TODO confirm against the competition's sample submission).
first_user_id = 30460
index = list(range(first_user_id, first_user_id + n_users))
output = pd.DataFrame(output[:, :, -1], columns=cols)
user = pd.DataFrame(index, columns=['user_id'])
output = pd.concat([user, output], axis=1)

output.to_csv('RNN64_mac.csv', index=False)