*This code runs on synthetically generated data from "../data_preprocessing/Synthetic data generation and preprocessing.ipynb"*

*The cell below assigns each user an integer id, sorts the data by timestamp, one-hot encodes the user id, and saves the result to a new data file.*

In [None]:
import numpy as np
import pandas as pd

# Load the raw synthetic call log produced by the data-generation notebook.
calls_train = pd.read_csv('../data/378calls_data.csv')

# Give every distinct user a dense integer id. Taking the codes straight from a
# Categorical built on the full column guarantees that id i always refers to
# the i-th category; the previous version paired the (sorted) categories with
# codes listed in order of first appearance, which scrambled the mapping.
calls_train['user_id'] = pd.Categorical(calls_train['user']).codes

# Keep only the modelled columns and order the calls chronologically.
calls_train = (
    calls_train[['user_id', 'duration', 'time_normalized']]
    .sort_values(by=['time_normalized'])
    .reset_index(drop=True)
)

# One-hot encode the user id once. dtype=int keeps the columns 0/1 on every
# pandas version (pandas >= 2.0 defaults to bool, which would be written as
# True/False and later break the conversion of the frame to a tensor).
one_hot = pd.get_dummies(calls_train['user_id'], dtype=int)
cols = one_hot.columns

# Final layout: user_id, one column per user, duration, time_normalized.
calls_train = pd.concat(
    [calls_train[['user_id']], one_hot, calls_train[['duration', 'time_normalized']]],
    axis=1,
)
calls_train.to_csv('../data/large_calls_data.csv', index=False)

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import numpy as np
import pandas as pd
import pickle as pkl
import logging

# Use the GPU when PyTorch can see one, otherwise stay on the CPU.
dev = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# dev = torch.device("cpu")
print(dev)

*Train test split(70:30)*

In [None]:
# Read the one-hot encoded call log written by the preprocessing cell and drop
# the integer user id column so that only the encoded features remain.
data = pd.read_csv("../data/large_calls_data.csv").drop(columns=['user_id'])

# Positional split: rows before split_row go to training, the rest to testing.
split_row = 81000
train_data = torch.tensor(data.iloc[:split_row].values)
test_data = torch.tensor(data.iloc[split_row:].values)
print("train_dataset:",train_data.shape, " test_dataset:",test_data.shape)

*resource predictor model*

In [None]:
class RNN_v1(nn.Module):
    """Single-layer tanh RNN followed by a linear read-out of the last time step.

    Args:
        batch_size: Stored as ``self.bs`` for backward compatibility; the
            forward pass no longer depends on it and accepts any batch size.
        n_steps: Stored as ``self.n_steps``; not used by the forward pass.
        input_size: Number of features per time step.
        output_size: Width of the read-out layer. The notebook passes 380
            (378 user columns + duration + time_normalized).
        hidden_size: Number of hidden units of the RNN.
        softmax: Callable stored as ``self.softmax``. It is not applied in
            ``forward``, so the model returns raw scores.
    """

    def __init__(self, batch_size, n_steps, input_size, output_size, hidden_size, softmax):
        super(RNN_v1, self).__init__()
        self.bs = batch_size
        self.n_steps = n_steps
        self.h_neurons = hidden_size
        self.input_size = input_size
        self.output_size = output_size
        self.rnn = nn.RNN(input_size=self.input_size, hidden_size=self.h_neurons, num_layers=1,
                          batch_first=True, nonlinearity='tanh', dropout=0, bias=True)
        # Read-out sized from output_size instead of a hard-coded 380, so the
        # constructor argument is actually honoured.
        self.final_layer = nn.Linear(self.h_neurons, self.output_size)
        torch.nn.init.xavier_uniform_(self.final_layer.weight)
        self.softmax = softmax

    def forward(self, X):
        """Return the read-out of the final time step for every sequence.

        Args:
            X: Batch-first input of shape (batch, seq_len, input_size).

        Returns:
            Tensor of shape (batch, output_size) holding raw (un-normalised) scores.
        """
        hidden_states, _ = self.rnn(X)
        # Only the last time step is used, so select it before the linear layer.
        # Slicing replaces the per-sample torch.cat loop and works for any batch size.
        return self.final_layer(hidden_states[:, -1, :])

*The cell below defines the loss calculation, builds batched input windows of the size required by the RNN, and computes the accuracy used for the train, validation and test sets.*

In [None]:
# Shared criteria used by the loss/accuracy helpers defined in this cell.
loss = nn.CrossEntropyLoss()            # classification over the user columns
mseLoss = nn.MSELoss()                  # defined for experimentation
l1loss = nn.L1Loss(reduction='mean')    # regression on the two trailing columns
# Column indices of the one-hot user block (0..377).
z = list(range(378))

def loss_func(X, Y, n_users=378):
    """Cross-entropy on the user columns plus L1 on the two columns after them.

    Args:
        X: Model output, indexed as (batch, columns); the first ``n_users``
            columns are treated as class scores, the next two as regression
            outputs.
        Y: Target rows laid out like ``X``; the class label is the arg-max of
            its first ``n_users`` columns.
        n_users: Number of leading class columns (378 in this notebook).

    Returns:
        Scalar tensor ``ce_loss + l1_loss`` placed on ``dev``.
    """
    # Work on X's device rather than copying both tensors to the CPU and back
    # for every batch; Y is aligned to X so mixed-device inputs still work.
    Y = Y.to(X.device)
    user_labels = Y[:, :n_users].argmax(dim=1)
    ce_loss = loss(X[:, :n_users], user_labels)
    # Targets are cast to float32 to match the model output dtype.
    duration_loss = l1loss(Y[:, n_users:n_users + 2].float(), X[:, n_users:n_users + 2])
    total_loss = ce_loss + duration_loss
    return total_loss.to(dev)
    
# Re-split the first 81000 rows into training and validation parts. The
# validation slice stops at row 81000 because rows from 81000 onwards form
# test_data; an open-ended slice would leak the test set into validation.
train_data = torch.tensor(data[0:72000].values)
validation_data = torch.tensor(data[72000:81000].values)

    
def preproc_input(bs, k, dataset, n_steps=10):
    """Build one batch of sliding windows and their next-row targets.

    Window ``i`` of the batch holds rows ``k+i .. k+i+n_steps-1`` of the chosen
    split; its target is the row immediately after the window,
    ``k+i+n_steps``. (The previous version used row ``k+i+1`` as the target,
    which is already part of the input window.)

    Args:
        bs: Number of windows in the batch (must be >= 1).
        k: Row index at which the first window starts.
        dataset: 'train', 'valid' or 'test'; selects the module-level tensor
            ``train_data``, ``validation_data`` or ``test_data``.
        n_steps: Window length in rows (10 in this notebook).

    Returns:
        ``(x, y)`` on ``dev``: ``x`` has shape (bs, n_steps, n_columns) and
        ``y`` has shape (bs, n_columns).

    Raises:
        ValueError: Unknown ``dataset`` name or ``bs < 1``.
        IndexError: The requested windows/targets run past the end of the split.
    """
    # Resolve the split lazily so that only the requested tensor has to exist.
    if dataset == 'train':
        source = train_data
    elif dataset == 'valid':
        source = validation_data
    elif dataset == 'test':
        source = test_data
    else:
        raise ValueError("dataset must be 'train', 'valid' or 'test', got %r" % (dataset,))
    if bs < 1:
        raise ValueError("bs must be at least 1, got %r" % (bs,))
    if k < 0 or k + bs + n_steps > source.shape[0]:
        raise IndexError(
            "batch starting at row %d needs %d rows but the '%s' split has only %d"
            % (k, bs + n_steps, dataset, source.shape[0])
        )

    # Stack all windows in one call instead of growing the batch with torch.cat.
    x = torch.stack([source[start:start + n_steps] for start in range(k, k + bs)], dim=0)
    y = source[k + n_steps:k + bs + n_steps]
    return x.to(dev), y.to(dev)
    
def calc_accuracy(x, y, n_users=378):
    """Fraction of rows whose predicted class matches the target class.

    Args:
        x: Model output indexed as (batch, columns); only the first
            ``n_users`` columns are considered.
        y: Target rows laid out like ``x``.
        n_users: Number of leading class columns (378 in this notebook).

    Returns:
        Accuracy as a Python float in [0, 1] (NaN for an empty batch).
    """
    predicted = x.detach()[:, :n_users].argmax(dim=1)
    true_labels = y[:, :n_users].argmax(dim=1).to(predicted.device)
    # Averaging a float mask avoids dividing an integer tensor, which is
    # floor division on older PyTorch releases.
    return (predicted == true_labels).float().mean().item()

*Training code*

In [None]:
def _run_pass(rnn, dataset, n_rows, bs, optimizer=None):
    """Sweep once over a split and return (mean loss, mean accuracy) per batch.

    Batches start at rows 0, bs, 2*bs, ... while ``start <= n_rows - 2*bs``.
    When ``optimizer`` is given, every batch also performs a gradient step.
    Returns NaN for both values when the split is too short for one batch.
    """
    total_loss, total_accuracy, n_batches = 0.0, 0.0, 0
    for start in range(0, n_rows - 2 * bs + 1, bs):
        x, y = preproc_input(bs, start, dataset)
        if optimizer is not None:
            optimizer.zero_grad()
        y_pred = rnn(x.float())
        batch_loss = loss_func(y_pred, y)
        if optimizer is not None:
            batch_loss.backward()
            optimizer.step()
        total_loss += batch_loss.item()
        total_accuracy += calc_accuracy(y_pred, y)
        n_batches += 1
    if n_batches == 0:
        return float('nan'), float('nan')
    return total_loss / n_batches, total_accuracy / n_batches


def train(epochs, optimizer, rnn, train_data, bs):
    """Train ``rnn`` and report train/validation metrics after every epoch.

    Args:
        epochs: Number of passes over the training split.
        optimizer: Optimizer already bound to ``rnn``'s parameters.
        rnn: Model to train; called as ``rnn(x.float())``.
        train_data: Used only for its row count; the batches themselves are
            read by ``preproc_input`` from the module-level ``train_data``.
        bs: Batch size, also used as the stride between consecutive batches
            (previously the stride was hard-coded to 20).

    Side effects:
        Prints and logs one metrics line per epoch and, every 10th epoch,
        saves the whole model object to ``rnn<epoch>.pkl`` with ``torch.save``.
    """
    for epoch in range(epochs):
        rnn.train()
        train_loss, train_accuracy = _run_pass(rnn, 'train', train_data.shape[0], bs, optimizer)

        #########------------------validation----------------------##########

        print("in validation")
        rnn.eval()
        with torch.no_grad():
            valid_loss, valid_accuracy = _run_pass(rnn, 'valid', validation_data.shape[0], bs)
        print("epoch: ", epoch, " train_loss:", train_loss, " train_accuracy:", train_accuracy, " validation_loss:", valid_loss,
             " validation_accuracy: ", valid_accuracy)
        logging.info("%d,      %f,    %f,       %f,         %f", epoch, train_loss, train_accuracy, valid_loss, valid_accuracy)
        if epoch % 10 == 0:
            torch.save(rnn, "rnn" + str(epoch) + ".pkl")

*The cell below creates an RNN model and trains it on the training dataset.*

In [None]:
# Build the model (batch size 20, windows of 10 rows, 380 input/output columns,
# 128 hidden units) and move it to the selected device.
rnn = RNN_v1(
    batch_size=20,
    n_steps=10,
    input_size=380,
    output_size=380,
    hidden_size=128,
    softmax=F.softmax,
).to(dev)

optimizer = optim.Adam(rnn.parameters(), lr=1e-3)

# Per-epoch metrics are appended to a fresh log file by train().
logging.basicConfig(
    filename='./rnnLog_new.txt',
    filemode='w',
    format='%(asctime)s - %(message)s',
    level=logging.INFO,
)
logging.info("parms: lr=1e-3, batch size=20 ,session length=10, hidden layers=1, hidden size=128")
logging.info("epoch, train_loss, train_accuracy, validation_loss, validation_accuracy")

train(epochs=101, optimizer=optimizer, rnn=rnn, train_data=train_data, bs=20)

*The below cell generates predictions using the last saved model during RNN training*

In [None]:
#generate data

# Load the model saved after epoch 100. map_location lets a checkpoint written
# on a GPU machine be restored on a CPU-only one. weights_only=False is needed
# on recent PyTorch because the file holds a fully pickled module, not a
# state_dict.
# SECURITY NOTE: this unpickles arbitrary Python objects - only load
# checkpoints that you created yourself.
try:
    rnn = torch.load('./rnn100.pkl', map_location=dev, weights_only=False)
except TypeError:
    # Older PyTorch releases do not know the weights_only keyword.
    rnn = torch.load('./rnn100.pkl', map_location=dev)

# Filled by test(): one tensor per evaluated batch.
predictions = []
true_values = []
def test(rnn, test_data, bs):
    """Evaluate ``rnn`` on the test split and collect its outputs.

    Args:
        rnn: Trained model; called as ``rnn(x.float())``.
        test_data: Used only for its row count; the batches themselves are read
            by ``preproc_input`` from the module-level ``test_data``.
        bs: Batch size, also used as the stride between consecutive batches
            (previously the stride was hard-coded to 20).

    Side effects:
        Appends every batch's output and target to the module-level lists
        ``predictions`` and ``true_values`` and prints the mean loss/accuracy.
    """
    rnn.eval()
    test_loss, test_accuracy, n_batches = 0.0, 0.0, 0
    with torch.no_grad():
        for start in range(0, test_data.shape[0] - 2 * bs + 1, bs):
            x, y = preproc_input(bs, start, 'test')
            v_pred = rnn(x.float())
            predictions.append(v_pred)
            true_values.append(y)
            test_loss += loss_func(v_pred, y).item()
            test_accuracy += calc_accuracy(v_pred, y)
            n_batches += 1
    # Report NaN instead of dividing by zero when no full batch fits.
    if n_batches:
        test_loss /= n_batches
        test_accuracy /= n_batches
    else:
        test_loss, test_accuracy = float('nan'), float('nan')
    print( "test_loss:", test_loss, " test_accuracy:", test_accuracy)

In [None]:
# Evaluate the loaded model on the held-out split with batches of 20 windows.
test(rnn, test_data, bs=20)

*The code below saves the RNN predictions in the ../data/results/ folder.*

In [None]:
# Turn the per-batch tensors collected by test() into two DataFrames and write
# the predicted (user_id, duration, time_normalized) rows to disk.
if not predictions:
    raise RuntimeError("no predictions collected - run test(...) before saving results")

n_users = 378                       # columns 0..377 hold the per-user scores
user_columns = list(range(n_users))
# Columns 378 and 379 are the two regression outputs; renaming keeps them in
# place, after the user columns.
named_outputs = {n_users: 'duration', n_users + 1: 'time_normalized'}

# A single concatenation works for any number of batches (including one or
# two) and avoids re-copying the growing array on every iteration.
preds = pd.DataFrame(torch.cat(predictions, dim=0).cpu().numpy()).rename(columns=named_outputs)
print(preds.head())
tru = pd.DataFrame(torch.cat(true_values, dim=0).cpu().numpy()).rename(columns=named_outputs)
print(tru.head())

# The predicted / true user is the column with the largest value in each row.
preds['user_id'] = preds[user_columns].idxmax(axis=1)
tru['user_id'] = tru[user_columns].idxmax(axis=1)

pred1 = preds[['user_id', 'duration', 'time_normalized']]
tru1 = tru[['user_id', 'duration', 'time_normalized']]  # kept for inspection; not written to disk
pred1.to_csv('../data/results/rnn_prediction_100.csv', index=False)