In [3]:
import matplotlib.pyplot as plt
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.preprocessing import StandardScaler
from torch.utils.data import DataLoader

from helpers.EarlyStopping import EarlyStopping
from helpers.Trainer import Trainer
from helpers.YoutubeDataset import YoutubeDataset
from model.MultiBranch import MultiBranchModel


# Load data

In [4]:
# Read the three pre-split CSV files (train / validation / test) into DataFrames.
train_data, val_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data/train_data.csv',
        'data/val_data.csv',
        'data/test_data.csv',
    )
)

# Preview the first rows of the training split.
train_data.head()

Unnamed: 0,ytvideoid,views,comments,likes,dislikes,timestamp
0,1,472549,3887,72559,1339,2019-11-25 17:30:00
1,1,472549,4026,74427,1365,2019-11-25 18:00:00
2,1,472549,4150,76092,1388,2019-11-25 18:30:00
3,1,485143,4235,77472,1420,2019-11-25 19:00:00
4,1,532254,4334,78655,1443,2019-11-25 19:30:00


In [5]:
# Numeric columns that get standardised; every other column is left untouched.
feature_cols = ['views', 'comments', 'likes', 'dislikes']

# The scaler statistics come from the training split only; the validation and
# test splits are transformed with those same statistics.
scaler = StandardScaler().fit(train_data[feature_cols])


def _standardise(frame):
    """Return a copy of `frame` whose feature columns hold the scaled values."""
    scaled_frame = frame.copy()
    scaled_frame[feature_cols] = scaler.transform(frame[feature_cols])
    return scaled_frame


train_scaled = _standardise(train_data)
val_scaled = _standardise(val_data)
test_scaled = _standardise(test_data)

# Preview the scaled training split.
train_scaled.head()

Unnamed: 0,ytvideoid,views,comments,likes,dislikes,timestamp
0,1,-0.405903,-0.157636,-0.160458,-0.040363,2019-11-25 17:30:00
1,1,-0.405903,-0.154329,-0.153561,-0.040076,2019-11-25 18:00:00
2,1,-0.405903,-0.151378,-0.147413,-0.039823,2019-11-25 18:30:00
3,1,-0.403192,-0.149355,-0.142317,-0.03947,2019-11-25 19:00:00
4,1,-0.393049,-0.146999,-0.137949,-0.039217,2019-11-25 19:30:00


In [6]:
# Windowing / batching settings; the later dataset and loader cells reuse them.
# NOTE(review): presumably seq_len is the input window length and
# forecast_horizon the number of steps ahead to predict — confirm in YoutubeDataset.
seq_len, forecast_horizon, batch_size = 10, 1, 128

# Training dataset built from the scaled training split, with 'views' as target.
train_dataset_args = dict(
    df=train_scaled,
    features=feature_cols,
    target='views',
    seq_len=seq_len,
    forecast_horizon=forecast_horizon,
)
train_dataset = YoutubeDataset(**train_dataset_args)

# Shuffled training batches; the final incomplete batch is dropped.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, drop_last=True)

In [7]:
# Settings shared by the validation and test datasets; only the frame differs.
eval_dataset_args = dict(
    features=feature_cols,
    target='views',
    seq_len=seq_len,
    forecast_horizon=forecast_horizon,
)

val_dataset = YoutubeDataset(df=val_scaled, **eval_dataset_args)
test_dataset = YoutubeDataset(df=test_scaled, **eval_dataset_args)

In [8]:
# Evaluation loaders: same batch size as training, kept in dataset order (no shuffling).
val_loader, test_loader = (
    DataLoader(split_dataset, batch_size=batch_size, shuffle=False)
    for split_dataset in (val_dataset, test_dataset)
)

# Train

In [9]:
# Seed PyTorch's global random number generator before anything stochastic runs,
# so that a Restart & Run All draws the same random numbers on every run.
# NOTE(review): presumably this covers the model's weight initialisation and the
# shuffled training loader (when no explicit generator is given) — confirm.
# Bit-exact GPU runs may need further settings (see PyTorch reproducibility notes).
SEED = 42
torch.manual_seed(SEED)

# Model with one LSTM and one GRU branch configuration (per-branch hidden size,
# layer count and dropout); it takes one input per feature column and has a
# single output.
model = MultiBranchModel(
    input_size=len(feature_cols),
    hidden_size={'lstm': 50, 'gru': 50},
    num_layers={'lstm': 1, 'gru': 1},
    dropout={'lstm': 0.3, 'gru': 0.3},
    output_size=1
)



In [10]:
# torch, torch.nn (nn) and torch.optim (optim) are imported in the notebook's
# top import cell rather than here, so all dependencies are declared in one place.

# Adam optimiser over all model parameters, with mean-squared-error loss.
optimizer = optim.Adam(model.parameters(), lr=0.001)
loss_fn = nn.MSELoss()

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f'DEVICE: {device.type}')

# Early-stopping helper configured with a patience of 30 and a minimum delta of
# 1e-4; it is given "best_model.pt" as its path.
# NOTE(review): presumably the best checkpoint is written to that path — confirm
# in helpers.EarlyStopping.
early_stopping = EarlyStopping(
    patience=30,
    min_delta=0.0001,
    verbose=True,
    path="best_model.pt",
)

# Bundle model, optimiser, loss, device and early stopping into the trainer.
trainer = Trainer(
    model=model,
    optimizer=optimizer,
    loss_fn=loss_fn,
    device=device,
    early_stopping=early_stopping,
)

DEVICE: cpu


In [None]:
# Epoch count handed to the trainer.
# NOTE(review): presumably the early-stopping helper given to the Trainer ends
# the run well before this limit — confirm in helpers.Trainer.
epochs = 5000

# Fit on the training loader while evaluating on the validation loader; the
# trainer returns the training and validation loss histories.
loss_history = trainer.fit(
    train_loader=train_loader,
    val_loader=val_loader,
    epochs=epochs,
)
train_loss, val_loss = loss_history

In [None]:
# Plot the training and validation loss histories returned by trainer.fit.
fig, ax = plt.subplots()
ax.plot(train_loss, label='Training loss')
ax.plot(val_loss, label='Validation loss')
ax.set_title('Loss evolution')
# NOTE(review): assumes trainer.fit returns one loss value per epoch — confirm.
ax.set_xlabel('Epoch')
ax.set_ylabel('Loss')
ax.grid(True)
ax.legend()
# Lay out last so the legend and axis labels are taken into account.
fig.tight_layout()
plt.show()
