In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import gzip
from collections import defaultdict
import operator
import string
from tqdm import tqdm_notebook as tqdm
import torch

In [2]:
def parse(path):
    """Lazily yield one record per line of the text file at `path`.

    Each line is evaluated as a Python expression and the resulting object
    is yielded. The file is opened in a ``with`` block so the handle is
    closed as soon as the generator is exhausted (or closed), instead of
    being left for the garbage collector.

    SECURITY NOTE(review): ``eval`` executes whatever code the file
    contains, so this must only be used on trusted data. If every line is
    a plain literal, ``ast.literal_eval`` would be a safer replacement --
    confirm against the full dataset before switching.

    Args:
        path: Path of the file to read.

    Yields:
        The value of each line, in file order.
    """
    with open(path, 'r') as handle:
        for line in handle:
            yield eval(line)

# Read every review record into memory (one record per line of the file).
data = list(parse("lthing_data/reviews.json"))

# Each line of edges.txt becomes the list of its space-separated tokens.
# The file is read inside a `with` block so the handle is closed right away
# instead of being leaked by a bare open(...).readlines().
with open("lthing_data/edges.txt") as edges_file:
    relationships = [line.replace("\n", "").split(" ") for line in edges_file]

In [3]:
# Keep only the reviews that carry a star rating.
data_new = [review for review in tqdm(data) if "stars" in review]
data = data_new

# Show one record of each kind as a quick sanity check.
print(data[0])
print(relationships[0])

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for d in tqdm(data):


  0%|          | 0/1707070 [00:00<?, ?it/s]

{'work': '3206242', 'flags': [], 'unixtime': 1194393600, 'stars': 5.0, 'nhelpful': 0, 'time': 'Nov 7, 2007', 'comment': 'This a great book for young readers to be introduced to the world of Middle Earth. ', 'user': 'van_stef'}
['Rodo', 'anehan']


In [4]:
# Lookup tables built from the review records:
userItemRating = {}               # (user, work) -> star rating
usersPerItem = defaultdict(set)   # work -> set of users who reviewed it
itemsPerUser = defaultdict(set)   # user -> set of works they reviewed

for read in tqdm(data):
    uid = read['user']
    bid = read['work']
    if "stars" in read:
        userItemRating[(uid, bid)] = read['stars']
    usersPerItem[bid].add(uid)
    itemsPerUser[uid].add(bid)

# Relationship graph, stored in both directions:
usersPerUser = defaultdict(set)   # user -> set of users they share an edge with

for u1, u2 in tqdm(relationships):
    usersPerUser[u1].add(u2)
    usersPerUser[u2].add(u1)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for read in tqdm(data):


  0%|          | 0/1387209 [00:00<?, ?it/s]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for relationship in tqdm(relationships):


  0%|          | 0/219790 [00:00<?, ?it/s]

In [5]:
# First 90% of the records (in file order) for training, the rest for validation.
train_split = int(0.9 * len(data))
train, val = data[:train_split], data[train_split:]

### N-Grams

In [None]:
# Occurrence counts of every unigram / bigram seen in the training comments.
unigrams = {}
bigrams = {}

# The punctuation-stripping table is the same for every review, so build it
# once here instead of once per loop iteration.
punct_table = str.maketrans('', '', string.punctuation)

for d in tqdm(train):
    # Normalise: lower-case, drop punctuation, split on whitespace.
    review = d["comment"].lower().translate(punct_table)
    d_unigrams = review.split()
    d_bigrams = list(zip(d_unigrams[:-1], d_unigrams[1:]))

    for ug in d_unigrams:
        unigrams[ug] = unigrams.get(ug, 0) + 1

    for bg in d_bigrams:
        bigrams[bg] = bigrams.get(bg, 0) + 1

# From here on `unigrams` / `bigrams` are rebound from count dicts to lists of
# n-grams ordered by decreasing count; the *Id dicts map an n-gram to its
# position in that list. Ties keep first-seen order because the sort is stable.
list_unigrams = sorted(tqdm(unigrams.items()), reverse=True, key=operator.itemgetter(1))
unigrams = [k for k, v in list_unigrams]
ugId = {ug: rank for rank, ug in enumerate(unigrams)}

list_bigrams = sorted(tqdm(bigrams.items()), reverse=True, key=operator.itemgetter(1))
bigrams = [k for k, v in list_bigrams]
bgId = {bg: rank for rank, bg in enumerate(bigrams)}

# Joint ranking of unigrams (strings) and bigrams (2-tuples) by count.
list_unigrams_bigrams = sorted(list_unigrams + list_bigrams, reverse=True, key=operator.itemgetter(1))
unigrams_bigrams = [k for k, v in list_unigrams_bigrams]
ugbgId = {gram: rank for rank, gram in enumerate(unigrams_bigrams)}

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for d in tqdm(train):


  0%|          | 0/1248488 [00:00<?, ?it/s]

### N-Gram Features:

In [None]:
def unigrams_feat(datum, N):
    """Count vector over the N most frequent training unigrams.

    The comment is lower-cased, stripped of punctuation and split on
    whitespace, the same normalisation used when the vocabulary was built.

    Relies on the module-level ``ugId`` dict, which maps a unigram to its
    rank in the frequency-sorted ``unigrams`` list.

    Args:
        datum: Review record; only ``datum["comment"]`` is read.
        N: Number of top-ranked unigrams to use as features.

    Returns:
        A list of length ``N + 1``: entry ``i`` is how often the rank-``i``
        unigram occurs in the comment, and the last entry is a constant 1
        (presumably an offset/bias feature).
    """
    review = datum["comment"].lower().translate(str.maketrans('', '', string.punctuation))
    feat = [0] * N
    for ug in review.split():
        # A rank below N is the same test as membership in unigrams[:N],
        # without rebuilding a set of N strings on every call.
        rank = ugId.get(ug)
        if rank is not None and rank < N:
            feat[rank] += 1
    feat.append(1)
    return feat

def bigrams_feat(datum, N):
    """Count vector over the N most frequent training bigrams.

    The comment is lower-cased, stripped of punctuation and split on
    whitespace; bigrams are pairs of adjacent tokens.

    Relies on the module-level ``bgId`` dict, which maps a bigram tuple to
    its rank in the frequency-sorted ``bigrams`` list.

    Args:
        datum: Review record; only ``datum["comment"]`` is read.
        N: Number of top-ranked bigrams to use as features.

    Returns:
        A list of length ``N + 1``: entry ``i`` is how often the rank-``i``
        bigram occurs in the comment, and the last entry is a constant 1
        (presumably an offset/bias feature).
    """
    review = datum["comment"].lower().translate(str.maketrans('', '', string.punctuation))
    tokens = review.split()
    feat = [0] * N
    for bg in zip(tokens[:-1], tokens[1:]):
        # A rank below N is the same test as membership in bigrams[:N],
        # without rebuilding a set of N tuples on every call.
        rank = bgId.get(bg)
        if rank is not None and rank < N:
            feat[rank] += 1
    feat.append(1)
    return feat

def unigrams_bigrams_feat(datum, N):
    """Count vector over the N most frequent n-grams of the joint ranking.

    The joint ranking mixes unigrams (strings) and bigrams (2-tuples of
    adjacent tokens). The comment is lower-cased, stripped of punctuation
    and split on whitespace before n-grams are extracted.

    Relies on the module-level ``ugbgId`` dict, which maps an n-gram to its
    rank in the frequency-sorted ``unigrams_bigrams`` list.

    Args:
        datum: Review record; only ``datum["comment"]`` is read.
        N: Number of top-ranked n-grams to use as features.

    Returns:
        A list of length ``N + 1``: entry ``i`` is how often the rank-``i``
        n-gram occurs in the comment, and the last entry is a constant 1
        (presumably an offset/bias feature).
    """
    review = datum["comment"].lower().translate(str.maketrans('', '', string.punctuation))
    tokens = review.split()
    grams = tokens + list(zip(tokens[:-1], tokens[1:]))
    feat = [0] * N
    for gram in grams:
        # A rank below N is the same test as membership in
        # unigrams_bigrams[:N], without rebuilding a set on every call.
        rank = ugbgId.get(gram)
        if rank is not None and rank < N:
            feat[rank] += 1
    feat.append(1)
    return feat

def label(datum):
    """Return the rating of a review record.

    The review records in this notebook store the rating under ``"stars"``
    (that is the key the dataset class reads), so that key is preferred.
    ``"rating"`` is still accepted as a fallback for records that use the
    key this function originally looked up.

    Raises:
        KeyError: If the record has neither ``"stars"`` nor ``"rating"``.
    """
    if "stars" in datum:
        return datum["stars"]
    return datum["rating"]

### Relationship Features:

In [None]:
def Jaccard(s1, s2):
    """Jaccard similarity |s1 & s2| / |s1 | s2| of two collections.

    Both arguments are converted to sets first, so any iterable of hashable
    items is accepted and duplicates are ignored.

    Returns:
        A float in [0, 1]. When both inputs are empty the ratio is
        undefined; 0.0 is returned instead of raising ZeroDivisionError.
    """
    first, second = set(s1), set(s2)
    union = first | second
    if not union:
        return 0.0
    return len(first & second) / len(union)

def get_feat(datum, experiment, N=2000):
    """Assemble the feature vector for one review.

    Args:
        datum: Review record passed through to the n-gram feature helpers.
        experiment: Three truthy/falsy flags ``(uf, bf, ubf)`` selecting, in
            order, the unigram, bigram and joint unigram+bigram features.
        N: Vocabulary size handed to each enabled helper. Defaults to 2000,
            the value that used to be hard-coded here.

    Returns:
        A flat list with the enabled feature families concatenated in the
        order unigram, bigram, joint. Each helper appends its own constant
        1, so the result holds one such entry per enabled family. With all
        flags off the list is empty.
    """
    uf, bf, ubf = experiment
    feat = []

    # Local names deliberately avoid `unigrams` / `bigrams` /
    # `unigrams_bigrams`, which are module-level vocabularies.
    if uf:
        feat.extend(unigrams_feat(datum, N))
    if bf:
        feat.extend(bigrams_feat(datum, N))
    if ubf:
        feat.extend(unigrams_bigrams_feat(datum, N))

    return feat


In [None]:
from torch.utils.data import Dataset, DataLoader

class ThingDataset(Dataset):
    """Dataset that turns review records into (features, rating) pairs.

    Args:
        d: Indexable collection of review records; each record must have a
            "stars" entry and whatever ``get_feat`` reads.
        experiment: Feature-selection flags forwarded to ``get_feat``.
        test: Stored as ``self.test``; no method of this class reads it.
    """

    def __init__(self, d, experiment, test=False):
        super().__init__()
        self.data = d
        self.experiment = experiment
        self.test = test

    def __len__(self):
        """Number of review records."""
        return len(self.data)

    def __getitem__(self, i):
        """Return ``(feature_list, stars)`` for the record at index ``i``."""
        record = self.data[i]
        features = get_feat(record, self.experiment)
        return features, record["stars"]

def my_collate(batch):
    """Split a batch of (features, label) samples into two parallel lists.

    Returns ``[feats, labels]`` where ``feats[k]`` and ``labels[k]`` are the
    first and second element of the k-th sample. Nothing is converted to
    tensors here.
    """
    feats = []
    labels = []
    for sample in batch:
        feats.append(sample[0])
        labels.append(sample[1])

    return [feats, labels]

def test_collate(batch):
    """ collate lists of samples into batches, create [ batch_sz x agent_sz x seq_len x feature] """
    feats = [scene[0] for scene in batch]
    
    return [feats]

In [None]:
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

class SimilarityModel(nn.Module):
    """Fully connected regressor: ReLU hidden layers and one linear output.

    The layers are registered as ``fc1 .. fcK`` in order. With the default
    ``hidden_sizes`` this gives ``fc1 .. fc8`` with the widths that used to
    be hard-coded, so parameter names and creation order are unchanged and
    previously saved state dicts still load.

    Args:
        input_vector_size: Length of each input feature vector.
        hidden_sizes: Widths of the hidden layers, in order. Overriding it
            lets the same architecture be built at a smaller scale.
    """

    DEFAULT_HIDDEN_SIZES = (4096, 8192, 4096, 2048, 1024, 512, 256)

    def __init__(self, input_vector_size, hidden_sizes=DEFAULT_HIDDEN_SIZES):
        super().__init__()

        # Consecutive pairs of `widths` are the (in, out) sizes of each layer;
        # the trailing 1 is the single regression output.
        widths = [input_vector_size, *hidden_sizes, 1]
        self.num_fc = len(widths) - 1
        for idx in range(self.num_fc):
            setattr(self, f"fc{idx + 1}", nn.Linear(widths[idx], widths[idx + 1]))

        # Not applied in forward(); kept so the attribute stays available.
        self.sig = nn.Sigmoid()

    def forward(self, x):
        """Map a (batch, input_vector_size) tensor to a (batch, 1) tensor.

        The input is cast to float first, so integer count features are
        accepted. No activation is applied to the final layer.
        """
        out = x.float()

        # ReLU after every layer except the last one.
        for idx in range(1, self.num_fc):
            out = F.relu(getattr(self, f"fc{idx}")(out))

        return getattr(self, f"fc{self.num_fc}")(out)
        

In [None]:
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

class RNN(nn.Module):
    """LSTMCell-based encoder/decoder followed by a fully connected head.

    NOTE(review): nothing else in the visible notebook instantiates this
    class. Several things below look unfinished (a hard-coded 120, a
    dependency on a global `device`, overwritten intermediate results);
    they are pointed out inline and left unchanged.
    """
    def __init__(self, input_size=120, num_layers=2, hidden_size=256):
        super().__init__()
        # Stored on the instance; no other line of this class reads them.
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        
        # Hidden widths of the three LSTM stages.
        self.s1 = 512
        self.s2 = 1024
        self.s3 = 2048
        
        # Encoder
        self.lstm1 = nn.LSTMCell(input_size, self.s1)
        self.lstm2 = nn.LSTMCell(self.s1, self.s2)
        # lstm3 is created, but its only use in forward() is commented out.
        self.lstm3 = nn.LSTMCell(self.s2, self.s3)
        
        # Decoder
        self.lstm4 = nn.LSTMCell(input_size, self.s1)
        self.lstm5 = nn.LSTMCell(self.s1, self.s2)
        # lstm6 is created, but its only use in forward() is commented out.
        self.lstm6 = nn.LSTMCell(self.s2, self.s3)
        
        # FC
        self.fc1 = nn.Linear(self.s3, 2048)
        self.fc2 = nn.Linear(2048, 1024)
        self.fc3 = nn.Linear(1024, 512)
        self.fc4 = nn.Linear(512, 256)
        self.fc5 = nn.Linear(256, 120)
        
    def forward(self, x, future=0):
        """Encode `x` step by step along dim 1, then emit `future` decoder steps.

        Each decoder step produces an (n_samples, 120) tensor (fc5 has 120
        outputs); the steps are concatenated along dim 1.

        NOTE(review): with the default ``future=0`` the list passed to
        ``torch.cat`` is empty, which raises -- callers need ``future >= 1``.
        """
        x = x.float()
        outputs = []
        n_samples = x.size(0)
        
        # Zero initial hidden/cell states for the three stages.
        # NOTE(review): `device` is neither a parameter nor an attribute; it is
        # looked up as a module-level global when forward() runs.
        ht = torch.zeros(n_samples, self.s1, dtype=torch.float32).to(device)
        ct = torch.zeros(n_samples, self.s1, dtype=torch.float32).to(device)
        ht2 = torch.zeros(n_samples, self.s2, dtype=torch.float32).to(device)
        ct2 = torch.zeros(n_samples, self.s2, dtype=torch.float32).to(device)
        ht3 = torch.zeros(n_samples, self.s3, dtype=torch.float32).to(device)
        ct3 = torch.zeros(n_samples, self.s3, dtype=torch.float32).to(device)
        # Encoder: one slice of width 1 along dim 1 per iteration.
        for input_t in x.split(1, dim=1):
            # NOTE(review): 120 is hard-coded here rather than taken from the
            # `input_size` constructor argument.
            input_t = input_t.reshape((n_samples, 120))
            ht, ct = self.lstm1(input_t, (ht, ct))
            ht2, ct2 = self.lstm2(ht, (ht2, ct2))
#             ht3, ct3 = self.lstm3(ht2, (ht3, ct3))
            # After the loop `out` is the last input step; it seeds the decoder.
            # If dim 1 of x is empty, `out` is never assigned.
            out = input_t
        
        # Decoder: feeds its own previous output back in as the next input and
        # continues from the encoder's (ht, ct) / (ht2, ct2) states.
        for i in range(future):
            ht, ct = self.lstm4(out, (ht, ct))
            ht2, ct2 = self.lstm5(ht, (ht2, ct2))
#             ht3, ct3 = self.lstm6(ht2, (ht3, ct3))
            
            # NOTE(review): ht3 is never updated (both third-stage calls are
            # commented out), so it is still all zeros here, and the result of
            # these two lines is overwritten by the fc3(ht2) line below.
            out = F.relu(self.fc1(ht3))
            out = F.relu(self.fc2(out))
            out = F.relu(self.fc3(ht2))
            out = F.relu(self.fc4(out))
            out = self.fc5(out)
            outputs.append(out)
            
        outputs = torch.cat(outputs, dim=1)
        return outputs

In [None]:
def train_func(model, device, train_loader, optimizer, epoch):
    """Train `model` for one pass over `train_loader` with an MSE loss.

    Args:
        model: Module called as ``model(feat)``.
        device: Device the batch tensors are moved to.
        train_loader: Iterable with a length whose batches unpack into
            ``(features, targets)`` (two parallel sequences of numbers).
        optimizer: Optimizer stepped once per batch.
        epoch: Accepted for the caller's convenience; not used here.

    Returns:
        The mean of the per-batch losses, or NaN if the loader produced no
        batches. (This used to return None; existing callers ignore it.)
    """
    model.train()
    iterator = tqdm(train_loader, total=int(len(train_loader)))
    criterion = nn.MSELoss()

    total = 0
    count = 0
    for feat, target in iterator:
        feat = torch.from_numpy(np.array(feat)).to(device)
        target = torch.from_numpy(np.array(target)).float().to(device)

        optimizer.zero_grad()
        output = model(feat)

        # A model ending in a 1-unit layer returns (batch, 1) while the
        # targets are (batch,). MSELoss would broadcast that pair to
        # (batch, batch) and compare every prediction with every target,
        # so drop the trailing singleton dimension first.
        if output.dim() == target.dim() + 1 and output.size(-1) == 1:
            output = output.squeeze(-1)

        loss = criterion(output, target)
        loss.backward()
        optimizer.step()

        total += loss.item()
        count += 1
        iterator.set_postfix_str("loss={}, avg.={}".format(loss.item(), total/count))

    return total / count if count else float("nan")
        
def val_func(model, device, test_loader):
    """Evaluate `model` on `test_loader` and return the mean per-batch MSE.

    Args:
        model: Module called as ``model(feat)``.
        device: Device the batch tensors are moved to.
        test_loader: Iterable with a length whose batches unpack into
            ``(features, targets)`` (two parallel sequences of numbers).

    Returns:
        The mean of the per-batch losses.

    Raises:
        ZeroDivisionError: If the loader produced no batches.
    """
    model.eval()
    iterator = tqdm(test_loader, total=int(len(test_loader)))
    criterion = nn.MSELoss()

    total = 0
    count = 0
    # No gradients are needed for evaluation; skipping the autograd graph
    # saves memory and time.
    with torch.no_grad():
        for feat, target in iterator:
            feat = torch.from_numpy(np.array(feat)).to(device)
            target = torch.from_numpy(np.array(target)).float().to(device)

            output = model(feat)

            # A model ending in a 1-unit layer returns (batch, 1) while the
            # targets are (batch,). MSELoss would broadcast that pair to
            # (batch, batch), so drop the trailing singleton dimension first.
            if output.dim() == target.dim() + 1 and output.size(-1) == 1:
                output = output.squeeze(-1)

            loss = criterion(output, target)

            total += loss.item()
            count += 1
            # The previous format string had a single placeholder, so it
            # showed the batch loss labelled as "avg." and dropped the
            # running average; show both, as train_func does.
            iterator.set_postfix_str("loss={}, avg.={}".format(loss.item(), total/count))

    return total/count

In [None]:
learning_rate = 0.001
momentum = 0.5  # NOTE: not read anywhere below; the AdamW call only receives `lr`
# Use the GPU when one is present and fall back to the CPU otherwise, so the
# notebook still runs on a machine without CUDA.
device = "cuda" if torch.cuda.is_available() else "cpu"
batch_sz = 1000
num_epoch = 1

# Each experiment is the flag triple that get_feat unpacks as
# (unigram features, bigram features, joint unigram+bigram features).
experiments = [
    [True, True, True],
    [True, False, True]
]

for experiment in experiments:
    # Probe one record to learn how long the feature vector is for these flags.
    input_vector_size = len(get_feat(data[0], experiment))
    print(f"{experiment}:{input_vector_size}")

    model = SimilarityModel(input_vector_size).to(device)
    optimizer = optim.AdamW(model.parameters(), lr=learning_rate)

    train_data = ThingDataset(train, experiment)
    val_data = ThingDataset(val, experiment)

    train_loader = DataLoader(train_data, batch_size=batch_sz, shuffle=True, collate_fn=my_collate, num_workers=0)
    # Validation does not need shuffling. A fixed order keeps the batch
    # composition, and therefore the averaged per-batch loss, the same from
    # one evaluation to the next.
    val_loader = DataLoader(val_data, batch_size=batch_sz, shuffle=False, collate_fn=my_collate, num_workers=0)

    PATH = f"a2_{experiment}.pth"

    # Start from +inf so the first epoch always counts as an improvement,
    # whatever the scale of the loss.
    best_val = float("inf")
    for epoch in range(1, num_epoch + 1):
        print("EPOCH: {} -----------------------------------".format(epoch))
        train_func(model, device, train_loader, optimizer, epoch)
        val_mse = val_func(model, device, val_loader)
        # Checkpoint only when validation MSE improves.
        if val_mse < best_val:
            best_val = val_mse
            torch.save(model.state_dict(), PATH)

In [None]:
# Gather the training ratings globally and per user for the average baseline.
allRatings = []
userRatings = defaultdict(list)   # user -> list of that user's training ratings

for d in train:
    uid = d["user"]
    # Keep the rating exactly as stored. Wrapping it in int() would truncate
    # any fractional rating (e.g. 4.5 -> 4) and bias the averages downward,
    # while the baseline is scored against the untruncated "stars" values.
    r = d['stars']
    allRatings.append(r)
    userRatings[uid].append(r)

In [None]:
# Baseline predictor: a user's mean training rating, or the global mean for
# users that have no training ratings.
globalAverage = sum(allRatings) / len(allRatings)
userAverage = {}
for u in userRatings:
    userAverage[u] = sum(userRatings[u]) / len(userRatings[u])

# NOTE: despite the names, `count` accumulates the squared error and `total`
# counts the validation records; the names are kept as they were.
count = 0
total = 0
for d in val:
    uid = d["user"]
    if uid in userAverage:
        # Look up this review's user. Indexing with `u` (the variable left
        # over from the loop above) gave every known user the average of
        # whichever user happened to be iterated last.
        pred = userAverage[uid]
    else:
        pred = globalAverage
    count += (pred - d["stars"]) ** 2
    total += 1

print(f"Baseline performance on val: {count/total}")

In [None]:
# Distinct integer-truncated star values seen in training, each mapped to 0,
# in order of first appearance.
rating_stars = dict.fromkeys((int(d['stars']) for d in train), 0)
print(rating_stars)