In [3]:
import json
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import tqdm
from sklearn.model_selection import train_test_split

def generate_purchase_data(dates, average_period, std_period, minor_purchase_probability, main_mean, main_std, main_clip, noise_mean, noise_std, noise_clip):
    """Simulate one synthetic purchase series with periodic "main" purchases and sparse "minor" ones.

    Randomness comes from NumPy's global generator (``np.random``); seed it
    beforehand if reproducible output is required.

    Args:
        dates: Sequence defining the timeline; only ``len(dates)`` is used.
        average_period: Mean gap (in timeline steps) between main purchases.
        std_period: Standard deviation of that gap.
        minor_purchase_probability: Per-step probability of trying to place a
            minor purchase on a step that has no purchase yet.
        main_mean, main_std: Mean / std of the main purchase amount.
        main_clip: Half-width of the clipping window around ``main_mean``
            (each bound is jittered by up to ``main_std / 10``).
        noise_mean, noise_std, noise_clip: Same as the ``main_*`` parameters,
            but for minor purchases.

    Returns:
        tuple: ``(purchases, last_purchase_index)`` where ``purchases`` is a
        float array of length ``len(dates)`` and ``last_purchase_index`` is the
        index of the final main purchase (``-1`` when ``dates`` is empty).
    """
    n_dates = len(dates)
    purchases = np.zeros(n_dates)

    current_date_index = 0
    last_purchase_index = -1
    while current_date_index < n_dates:
        # Main purchase amount: Gaussian draw, clipped to a jittered window
        # around the mean, then floored at half the mean.
        rand_value = main_std*np.random.randn() + main_mean
        lower_bound = main_mean - main_clip + (np.random.rand()-0.5)*main_std/5
        upper_bound = main_mean + main_clip + (np.random.rand()-0.5)*main_std/5
        clip_rand_value = np.clip(rand_value, lower_bound, upper_bound)
        clip_rand_value = np.clip(clip_rand_value, a_min=main_mean//2, a_max=None)
        purchases[current_date_index] += clip_rand_value
        last_purchase_index = current_date_index

        # Gap to the next main purchase, kept within +/- 50% of the average.
        next_period = int(np.random.normal(average_period, std_period))
        next_period = np.clip(next_period, average_period//2, average_period+average_period//2)
        # For average_period < 2 the lower clip bound is 0; a zero step would
        # never advance the index and the loop would not terminate.
        next_period = np.maximum(next_period, 1)
        current_date_index += next_period

    # Sprinkle minor purchases on steps that are still empty. The probability
    # draw is made for every step, so the order of random draws is unchanged.
    for i in range(n_dates):
        if np.random.rand() < minor_purchase_probability and purchases[i] == 0:
            rand_value = noise_std*np.random.randn() + noise_mean
            lower_bound = noise_mean - noise_clip + (np.random.rand()-0.5)*noise_std/5
            upper_bound = noise_mean + noise_clip + (np.random.rand()-0.5)*noise_std/5
            clip_rand_value = np.clip(rand_value, lower_bound, upper_bound)
            clip_rand_value = np.clip(clip_rand_value, a_min=0, a_max=None)
            purchases[i] += clip_rand_value

    return purchases, last_purchase_index

# Prefer the GPU when one is available, but fall back to the CPU so the
# notebook still runs on machines without CUDA.
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

In [None]:
start_date = '2018-01-01'
end_date = '2022-12-31'

# Seed NumPy's global generator (the one generate_purchase_data draws from) so
# that a re-run produces the same dataset.
SEED = 42
np.random.seed(SEED)

# Month-end timestamps as int64 nanoseconds, rescaled to [0, 1].
# 'int64' rather than `int` keeps the cast platform independent.
# NOTE(review): 'M' is the legacy month-end alias; pandas >= 2.2 prefers 'ME'.
dates = pd.date_range(start=start_date, end=end_date, freq='M').astype('int64')
dates = (dates-dates.min())/(dates.max()-dates.min())
dates = dates.to_numpy()

n = 5000000

# Preallocate the outputs instead of growing Python lists and stacking them
# afterwards, which would hold two copies of the data at peak.
X = np.empty((n, len(dates)))
y = np.empty((n, 2), dtype=np.float32)

for i in tqdm.tqdm(range(n)):
    # Randomise the generator parameters for every sample.
    average_period = np.random.randint(3, 15)
    main_mean = np.random.randint(3000, 5000)
    main_std = np.random.randint(500, 2000)
    main_clip = np.random.randint(500, 3000)
    noise_mean = np.random.randint(250, 1000)
    noise_std = np.random.randint(100, 500)
    noise_clip = np.random.randint(100, 600)
    minor_purchase_probability = np.random.rand()/3
    # NOTE(review): 64-24 looks like a leftover from a different series length — confirm.
    to_zero_range = np.random.randint(0, 64-24)

    # Occasionally make long-period series less regular.
    if average_period > 7 and np.random.rand() > 0.8:
        std_period = 2
    else:
        std_period = 1

    purchases, last_purchase_index = generate_purchase_data(dates, average_period, std_period, minor_purchase_probability, main_mean, main_std, main_clip, noise_mean, noise_std, noise_clip)

    # For ~30% of the samples blank out a random-length prefix of the series.
    if np.random.rand() > 0.7:
        purchases[:to_zero_range] *= 0

    X[i] = purchases
    y[i] = (average_period, last_purchase_index)

# Per-series min-max scaling, computed once and applied in place.
row_min = X.min(axis=-1, keepdims=True)
row_range = X.max(axis=-1, keepdims=True) - row_min
# A constant series would otherwise divide by zero and produce NaNs.
row_range = np.where(row_range > 0, row_range, 1.0)
X -= row_min
X /= row_range

# Final layout: (n, 2, len(dates)) with channel 0 = normalised dates and
# channel 1 = normalised purchases. broadcast_to avoids materialising an
# n-fold copy of `dates` before stacking.
X = np.stack((np.broadcast_to(dates, X.shape), X), axis=1)

In [5]:
# np.savez_compressed does not create missing folders, so make sure the target
# directory exists before writing. NumPy appends the '.npz' suffix itself.
os.makedirs('synthetic_data', exist_ok=True)
np.savez_compressed('synthetic_data/chunk1', X=X, y=y)

In [18]:
# Read the chunk from the same relative location it was saved to, instead of a
# machine-specific absolute path. The context manager closes the archive once
# both arrays have been read into memory.
with np.load('synthetic_data/chunk1.npz') as chunk:
    X, y = chunk['X'], chunk['y']

In [19]:


# Hold out 20% of the samples (fixed random_state for a repeatable split) and
# convert every resulting array to a float32 tensor in one pass.
X_train, X_test, y_train, y_test = (
    torch.tensor(part, dtype=torch.float32)
    for part in train_test_split(X, y, test_size=0.2, random_state=42)
)

In [67]:
class ConvNet(nn.Module):
    """Small 1-D CNN that maps a 2-channel sequence to two regression outputs.

    The convolutional stack halves the sequence length twice; a per-position
    MLP head then produces two values per remaining position, which are
    averaged over the sequence axis.
    """

    def __init__(self):
        super().__init__()

        # Feature extractor: (batch, 2, L) -> (batch, 32, L')
        conv_layers = [
            nn.Conv1d(2, 16, kernel_size=3, padding=1),
            nn.MaxPool1d(2),
            nn.ReLU(),
            nn.Conv1d(16, 32, kernel_size=3, padding=1),
            nn.MaxPool1d(2),
            nn.ReLU(),
            nn.Conv1d(32, 32, kernel_size=5, padding=1),
            nn.ReLU(),
        ]
        self.block = nn.Sequential(*conv_layers)

        # Head applied independently at every sequence position: 32 -> 16 -> 2
        head_layers = [
            nn.Linear(32, 16),
            nn.ReLU(),
            nn.Linear(16, 2),
        ]
        self.fc = nn.Sequential(*head_layers)

    def forward(self, x):
        """Return a ``(batch, 2)`` tensor for a ``(batch, 2, length)`` input."""
        features = self.block(x)
        # Linear layers act on the last axis, so move channels there first.
        per_position = self.fc(features.permute(0, 2, 1))
        # Average the per-position predictions over the sequence axis.
        return per_position.mean(dim=1)

# Build the network, then move its parameters to the configured device.
model = ConvNet()
model = model.to(DEVICE)

In [20]:
from torch.utils.data import DataLoader, TensorDataset

batch_size = 1024

# Pair each input tensor with its targets.
train_dataset = TensorDataset(X_train, y_train)
test_dataset = TensorDataset(X_test, y_test)

# Only the training data is reshuffled every epoch; evaluation order is fixed.
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)

In [21]:

# Mean-squared-error regression loss, optimised with Adam.
# The optimizer captures the parameters of the `model` object that exists when
# this cell runs; re-run this cell after re-creating the model.
criterion = nn.MSELoss()
optimizer = optim.Adam(params=model.parameters(), lr=1e-3)

In [12]:
num_epochs = 20
validation_steps = 10  # evaluate and report every this many epochs

for epoch in tqdm.tqdm(range(num_epochs)):
    # ---- training pass over the whole training set ----
    model.train()
    running_loss = 0.0

    for i, (inputs, labels) in enumerate(train_loader):
        inputs, labels = inputs.to(DEVICE), labels.to(DEVICE)

        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()

    # Skip evaluation unless this is a reporting epoch.
    if (epoch + 1) % validation_steps != 0:
        continue

    # ---- validation pass (no gradients) ----
    model.eval()
    val_loss = 0.0
    with torch.no_grad():
        for test_inputs, test_labels in test_loader:
            test_inputs, test_labels = test_inputs.to(DEVICE), test_labels.to(DEVICE)
            val_outputs = model(test_inputs)
            val_loss += criterion(val_outputs, test_labels).item()

    # Both figures are averages of per-batch mean losses.
    val_loss /= len(test_loader)
    train_loss = running_loss / len(train_loader)
    print(f'Epoch [{epoch+1}/{num_epochs}], Step [{i+1}], Loss: {train_loss:.4f}, Validation Loss: {val_loss:.4f}')
    running_loss = 0.0

print('Finished Training')

 50%|█████     | 10/20 [06:07<06:16, 37.69s/it]

Epoch [10/20], Step [3907], Loss: 0.3671, Validation Loss: 0.3599


100%|██████████| 20/20 [12:12<00:00, 36.60s/it]

Epoch [20/20], Step [3907], Loss: 0.3340, Validation Loss: 0.3364
Finished Training





In [23]:
# torch.save does not create missing folders, so make sure the checkpoint
# directory exists before writing the weights.
os.makedirs('checkpoints/1', exist_ok=True)
torch.save(model.state_dict(), 'checkpoints/1/model.pt')

In [31]:
# Record how the date axis of the training data was built so that it can be
# reconstructed alongside the checkpoint. `start_date` / `end_date` must
# already be defined in the kernel (they are set in the data-generation cell).
# NOTE(review): 'ME' is the pandas >= 2.2 spelling of the month-end alias; the
# data-generation cell passes the legacy 'M' alias — confirm they should match.
metadata = {
    'dates_range': (start_date, end_date),
    'dates_freq': 'ME',
}

# Make sure the checkpoint directory exists before writing into it.
os.makedirs('checkpoints/1', exist_ok=True)
with open('checkpoints/1/metadata.json', mode='w') as f:
    json.dump(metadata, f)