In [1]:
"""
Preamble for most code and jupyter notebooks
@author: tobinsouth
@notebook date: 28 Oct 2021
"""

import numpy as np, pandas as pd, matplotlib.pyplot as plt, matplotlib as mpl, seaborn as sns
import math, string, re, pickle, json, os, sys, datetime, itertools
from collections import Counter
from tqdm import tqdm

# Set pandas display options
pd.set_option("display.max_rows", 50)
pd.set_option("display.max_columns", 120)

# Better graphics
from matplotlib_inline.backend_inline import set_matplotlib_formats
set_matplotlib_formats('retina')  # higher-resolution inline figures

# Matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*' and
# later releases dropped the old names, so 'seaborn-poster' raises on a current
# install. Use whichever spelling this Matplotlib provides; if neither exists,
# keep the default style rather than crashing the preamble.
_poster_style = next(
    (name for name in ('seaborn-v0_8-poster', 'seaborn-poster')
     if name in plt.style.available),
    None,
)
if _poster_style is not None:
    plt.style.use(_poster_style)

In [10]:
from torch.utils.data import Dataset, DataLoader
import torch

class StaysDataset(Dataset):
    """Per-user stay sequences loaded from gzipped CSV files.

    Every ``*.csv.gz`` file directly inside ``root_dir`` is read and the frames
    are concatenated in sorted filename order. Each file must provide the
    columns ``user``, ``GEOID`` and ``GEOID_home``.

    Item ``idx`` is a 1-D ``torch.long`` tensor for the ``idx``-th user: the
    index of the user's home GEOID followed by the indices of the user's
    ``GEOID`` values in row order. Indices come from ``all_geoid_mapping`` and
    start at 1; index 0 is never assigned to a location (the notebook's
    collate function pads with 0).

    Args:
        root_dir: Directory containing the ``*.csv.gz`` files.
        nrows: Maximum number of rows read from each file; ``None`` reads
            every row. Defaults to 100000.

    Raises:
        FileNotFoundError: If ``root_dir`` contains no ``*.csv.gz`` file.
    """

    def __init__(self, root_dir, nrows=100000):
        from glob import glob
        self.root_dir = root_dir
        # glob returns files in arbitrary order; sorting keeps the row order
        # (and therefore sequences and index assignment) reproducible.
        self.all_csvs = sorted(glob(os.path.join(root_dir, '*.csv.gz')))
        if not self.all_csvs:
            raise FileNotFoundError(f"No '*.csv.gz' files found in {root_dir!r}")
        stays = pd.concat([pd.read_csv(csv, nrows=nrows) for csv in self.all_csvs])

        self.all_users = list(stays['user'].unique())
        self.grouped_users = stays.groupby('user')
        # user -> array of that user's distinct home GEOIDs, in order of appearance
        self.user_homes = dict(self.grouped_users['GEOID_home'].unique())
        self.grouped_stays = self.grouped_users['GEOID']

        # Vocabulary of every GEOID seen as a stay or as a home, de-duplicated
        # in first-appearance order so the mapping is deterministic.
        known_geoids = pd.concat([stays['GEOID'], stays['GEOID_home']], ignore_index=True)
        self.all_geoid = known_geoids.drop_duplicates().tolist()
        # Indices start at 1 so that 0 stays free for padding.
        self.all_geoid_mapping = {
            geoid: index for index, geoid in enumerate(self.all_geoid, start=1)
        }

        # TODO: we could also truncate each sequence every time the user leaves home?

    def __len__(self):
        """Number of distinct users."""
        return len(self.all_users)

    def __getitem__(self, idx):
        """Return the home-prefixed sequence of location indices for user ``idx``."""
        user = self.all_users[idx]
        # A user may have several recorded homes; the first one seen is used
        # instead of failing on `.item()` for multi-element arrays.
        user_home = self.user_homes[user][0]
        visited = self.grouped_stays.get_group(user).to_list()
        indices = [self.all_geoid_mapping[user_home]]
        indices.extend(self.all_geoid_mapping[geoid] for geoid in visited)
        return torch.tensor(indices, dtype=torch.long)


from torch.nn.utils.rnn import pad_sequence


def pad_stay_sequences(batch):
    """Stack variable-length index tensors into one batch-first tensor, padded with 0."""
    return pad_sequence(batch, batch_first=True, padding_value=0)


# Build the dataset from the gzipped CSVs and wrap it in a shuffling loader
root_dir = '../data/'
staysDataset = StaysDataset(root_dir)

dataloader = DataLoader(
    staysDataset,
    batch_size=1,
    shuffle=True,
    collate_fn=pad_stay_sequences,
)

In [11]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

# Run on the first CUDA GPU when one is visible, otherwise fall back to the CPU
use_cuda = torch.cuda.is_available()
if use_cuda:
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

In [12]:

class SentEnc(nn.Module):
    """Embedding -> LSTM -> linear model that scores every location at each step.

    Args:
        num_locations: Size of the location vocabulary. It sizes both the
            embedding table and the output layer, so valid input indices are
            ``0 .. num_locations - 1``.
        hidden_size: Width of the embeddings and of the LSTM hidden state.
        dropout: Dropout probability applied between stacked LSTM layers. It
            has no effect with a single layer, so it is only forwarded to
            ``nn.LSTM`` when ``num_layers > 1`` (this avoids the PyTorch
            warning about dropout on a one-layer RNN).
        num_layers: Number of stacked LSTM layers. Defaults to 1.
        padding_idx: Optional index passed to ``nn.Embedding`` as its
            ``padding_idx``. Defaults to ``None`` (no special padding row),
            which matches the previous behaviour.
    """

    def __init__(self, num_locations, hidden_size, dropout=0, num_layers=1, padding_idx=None):
        super().__init__()
        self.embedding = nn.Embedding(num_locations, hidden_size, padding_idx=padding_idx)
        self.lstm = nn.LSTM(
            hidden_size,
            hidden_size,
            num_layers=num_layers,
            # Inter-layer dropout is meaningless (and warned about) for one layer.
            dropout=dropout if num_layers > 1 else 0,
            batch_first=True,
        )
        self.linear = nn.Linear(hidden_size, num_locations)

    def forward(self, x):
        """Return unnormalised location scores for every step of every sequence.

        Args:
            x: Integer tensor of location indices, batch first: ``(batch, seq_len)``.

        Returns:
            Tensor of shape ``(batch, seq_len, num_locations)``.
        """
        embedded = self.embedding(x)
        lstm_out, _ = self.lstm(embedded)
        return self.linear(lstm_out)

In [13]:
HIDDEN_SIZE = 64
batch_size = 32  # NOTE: not wired into the DataLoader, which is built with batch_size=1
num_layers = 1   # NOTE: currently unused by the model construction below
num_epochs = 3
dropout = 0

# The dataset numbers locations 1..len(all_geoid) and pads with 0, so the
# embedding and output layers need len(all_geoid) + 1 entries. Sizing them with
# len(all_geoid) makes the highest location index overflow the embedding table.
PAD_INDEX = 0
NUM_LOCATIONS = len(staysDataset.all_geoid) + 1

# Padded positions carry no information, so they are excluded from the loss.
criterion = nn.CrossEntropyLoss(ignore_index=PAD_INDEX)

lstm = SentEnc(NUM_LOCATIONS, HIDDEN_SIZE, dropout)
optimizer = torch.optim.Adam(lstm.parameters())

# # Training LSTM next step prediction on sequences
# # (inputs are steps 0..T-2, targets are the same sequence shifted by one)
# lstm.to(device)
# for epoch in range(num_epochs):
#     for seq_batch in dataloader:
#         seq_batch = seq_batch.to(device)
#         inputs, targets = seq_batch[:, :-1], seq_batch[:, 1:]
#         optimizer.zero_grad()
#         logits = lstm(inputs)  # (batch, seq_len - 1, NUM_LOCATIONS)
#         # CrossEntropyLoss expects the class dimension second: (batch, classes, steps)
#         loss = criterion(logits.transpose(1, 2), targets)
#         loss.backward()
#         optimizer.step()
#         print(loss.item())

In [14]:
# Grab a single batch for inspection. next(iter(...)) fails loudly on an empty
# loader instead of silently leaving `seq_batch` undefined or stale.
seq_batch = next(iter(dataloader))

In [15]:
# Step through SentEnc.forward one layer at a time to check the shapes
print(seq_batch.shape)
x = lstm.embedding(seq_batch)
print(x.shape)
lstm_out, _ = lstm.lstm(x)
# The output layer consumes the LSTM output, not the raw embeddings
lstm.linear(lstm_out)

In [10]:
from glob import glob

# Re-read the raw stays for ad-hoc inspection (first 100000 rows of each file).
# glob returns paths in arbitrary order; sorting keeps the concatenated row
# order, and hence each user's sequence order, reproducible.
all_csvs = sorted(glob(root_dir+'/*.csv.gz'))
if not all_csvs:
    raise FileNotFoundError(f"No '*.csv.gz' files found in {root_dir!r}")
stays = pd.concat([pd.read_csv(csv, nrows = 100000) for csv in all_csvs])

In [35]:
# Take the first user's stays and encode them as a (1, seq_len) tensor of
# location indices. The raw Series gets its own name so `seq` is only ever the
# tensor, and next(iter(...)) fails loudly if there are no groups.
user, user_geoids = next(iter(stays.groupby('user')['GEOID']))
seq = torch.tensor(
    [staysDataset.all_geoid_mapping[geoid] for geoid in user_geoids],
    dtype=torch.long,
).reshape(1, -1)