In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import gzip
import pickle
from collections import defaultdict
import operator
import string
from tqdm.notebook import tqdm
import torch
import matplotlib.pyplot as plt
import copy

In [2]:
def parse(path):
    g = open(path, 'r')
    for l in g:
        yield eval(l)

data = list(parse("lthing_data/reviews.json"))
relationships = [l.replace("\n", "").split(" ") for l in open("lthing_data/edges.txt").readlines()]

In [3]:
data_new = []

for d in tqdm(data):
    if "stars" in d:
        data_new.append(d)
data = data_new[:100000]

# data.sort(reverse=True, key = lambda x: x["unixtime"])
print(data[0])
print(relationships[0])

  0%|          | 0/1707070 [00:00<?, ?it/s]

{'work': '3206242', 'flags': [], 'unixtime': 1194393600, 'stars': 5.0, 'nhelpful': 0, 'time': 'Nov 7, 2007', 'comment': 'This a great book for young readers to be introduced to the world of Middle Earth. ', 'user': 'van_stef'}
['Rodo', 'anehan']


In [4]:
# Data:
userItemRating = {}
usersPerItem = defaultdict(set) # Maps an item to the users who rated it
itemsPerUser = defaultdict(set) # Maps a user to the items that they rated

for read in tqdm(data):
    uid, bid = read['user'], read['work']
    if "stars" in read:
        userItemRating[(uid, bid)] = read['stars']
    usersPerItem[bid].add(uid)
    itemsPerUser[uid].add(bid)
    

def cosine_similarity(v1, v2):
    denom = (np.linalg.norm(v1) * np.linalg.norm(v2))
    if denom == 0:
        return 0
    else:
        return np.dot(v1, v2) / denom
    
# find similarity between users on the basis of the ratings they give to books
def user_similarity(u1, u2):
    u1_books = itemsPerUser[u1]
    u2_books = itemsPerUser[u2]
    all_books = u1_books.union(u2_books)
    u1_ratings = []
    u2_ratings = []
    for b in all_books:
        if (u1, b) in userItemRating:
            u1_ratings.append(userItemRating[(u1, b)])
        else:
            u1_ratings.append(0)
        if (u2, b) in userItemRating:
            u2_ratings.append(userItemRating[(u2, b)])
        else:
            u2_ratings.append(0)
    return cosine_similarity(u1_ratings, u2_ratings)

# Relationships:
usersPerUser = defaultdict(set) # Maps an item to the users who rated it
for relationship in tqdm(relationships):
    u1, u2 = relationship
    userSim = user_similarity(u1, u2)
    usersPerUser[u1].add((userSim, u2))
    usersPerUser[u2].add((userSim, u1))
    
for u in usersPerUser:
    usersPerUser[u] = list(usersPerUser[u])
    usersPerUser[u].sort(reverse=True)

  0%|          | 0/100000 [00:00<?, ?it/s]

  0%|          | 0/219790 [00:00<?, ?it/s]

In [5]:
train_split = int(0.9 * len(data))
train = data[:train_split]
val = data[train_split:]

### N-Grams

In [6]:
unigrams = {}
bigrams ={}

for d in tqdm(train):
    review = d["comment"].lower().translate(str.maketrans('', '', string.punctuation))
    d_unigrams = review.split()
#     d_bigrams = list(zip(d_unigrams[:-1], d_unigrams[1:]))
    
    for ug in d_unigrams:
        if ug in unigrams:
            unigrams[ug] += 1
        else:
            unigrams[ug] = 1
            
#     for bg in d_bigrams:
#         if bg in bigrams:
#             bigrams[bg] += 1
#         else:
#             bigrams[bg] = 1
            
list_unigrams = [(k, v) for k, v in tqdm(unigrams.items())]
list_unigrams.sort(reverse=True, key=operator.itemgetter(1))
unigrams = [k for k, v in list_unigrams]
ugId = dict(zip(unigrams, range(len(unigrams))))

N=500
unigrams_set = set(unigrams[:N])

unigram_feats = {}
for datum in tqdm(train):
    work = datum["work"]
    if work in unigram_feats:
        feat = unigram_feats[work]
    else:
        feat = ([0] * N)
        feat.append(1)
    review = datum["comment"].lower().translate(str.maketrans('', '', string.punctuation))
    d_unigrams = review.split()
    for ug in d_unigrams:
        if ug in unigrams_set:
            index = ugId[ug]
            feat[index] += 1
    unigram_feats[work] = feat

# list_bigrams = [(k, v) for k, v in tqdm(bigrams.items())]
# list_bigrams .sort(reverse=True, key=operator.itemgetter(1))
# bigrams = [k for k, v in list_bigrams]
# bgId = dict(zip(bigrams, range(len(bigrams))))

# list_unigrams_bigrams = list_unigrams + list_bigrams
# list_unigrams_bigrams.sort(reverse=True, key=operator.itemgetter(1))
# unigrams_bigrams = [k for k, v in list_unigrams_bigrams]
# ugbgId = dict(zip(unigrams_bigrams, range(len(unigrams_bigrams))))

# with open('unigrams.pickle', 'wb') as handle:
#     pickle.dump(unigrams, handle, protocol=pickle.HIGHEST_PROTOCOL)
# with open('ugId.pickle', 'wb') as handle:
#     pickle.dump(ugId, handle, protocol=pickle.HIGHEST_PROTOCOL)
    
# with open('bigrams.pickle', 'wb') as handle:
#     pickle.dump(bigrams, handle, protocol=pickle.HIGHEST_PROTOCOL)
# with open('bgId.pickle', 'wb') as handle:
#     pickle.dump(bgId, handle, protocol=pickle.HIGHEST_PROTOCOL)

# with open('unigrams_bigrams.pickle', 'wb') as handle:
#     pickle.dump(unigrams_bigrams, handle, protocol=pickle.HIGHEST_PROTOCOL)
# with open('ugbgId.pickle', 'wb') as handle:
#     pickle.dump(ugbgId, handle, protocol=pickle.HIGHEST_PROTOCOL)

  0%|          | 0/90000 [00:00<?, ?it/s]

  0%|          | 0/279838 [00:00<?, ?it/s]

  0%|          | 0/90000 [00:00<?, ?it/s]

In [7]:
userRatingAverages = {}
for t in tqdm(train):
    if t["user"]  in userRatingAverages:
        total, count = userRatingAverages[t["user"]]
        total += t["stars"]
        count += 1
        userRatingAverages[t["user"]] = (total, count)
    else:
        total = t["stars"]
        count = 1
        userRatingAverages[t["user"]] = (total, count)
        
        
for u in userRatingAverages:
    total, count = userRatingAverages[u]
    userRatingAverages[u] = total/count
    
global_count = 0
global_total = 0
global_average = 0    

itemRatingAverages = {}
for t in tqdm(train):
    if t["work"]  in itemRatingAverages:
        total, count = itemRatingAverages[t["work"]]
        total += t["stars"]
        count += 1
        itemRatingAverages[t["work"]] = (total, count)
    else:
        total = t["stars"]
        count = 1
        itemRatingAverages[t["work"]] = (total, count)
        
    global_total += t["stars"]
    global_count += 1
    
global_average = global_total / global_count
        
for u in itemRatingAverages:
    total, count = itemRatingAverages[u]
    itemRatingAverages[u] = total/count

  0%|          | 0/90000 [00:00<?, ?it/s]

  0%|          | 0/90000 [00:00<?, ?it/s]

### N-Gram Features:

In [8]:
def unigrams_feat(datum, N):
    review = datum["comment"].lower().translate(str.maketrans('', '', string.punctuation))
    d_unigrams = review.split()
    feat = [0] * N
    unigrams_set = set(unigrams[:N])
    for ug in d_unigrams:
        if ug in unigrams_set:
            feat[ugId[ug]] += 1
    feat.append(1)
    return feat

def bigrams_feat(datum, N):
    review = datum["comment"].lower().translate(str.maketrans('', '', string.punctuation))
    d_unigrams = review.split()
    d_bigrams = list(zip(d_unigrams[:-1], d_unigrams[1:]))
    feat = [0] * N
    bigrams_set = set(bigrams[:N])
    for bg in d_bigrams:
        if bg in bigrams_set:
            feat[bgId[bg]] += 1
    feat.append(1)
    return feat

def unigrams_bigrams_feat(datum, N):
    review = datum["comment"].lower().translate(str.maketrans('', '', string.punctuation))
    d_unigrams = review.split()
    d_bigrams = list(zip(d_unigrams[:-1], d_unigrams[1:]))
    d_unigrams_bigrams  = d_unigrams + d_bigrams
    feat = [0] * N
    unigrams_bigrams_set = set(unigrams_bigrams[:N])
    for ugbg in d_unigrams_bigrams:
        if ugbg in unigrams_bigrams_set:
            feat[ugbgId[ugbg]] += 1
    feat.append(1)
    return feat

def label(datum):
    return datum["rating"]

### Relationship Features:

In [9]:
def Jaccard(s1, s2):
    s1 = set(s1)
    s2 = set(s2)
    return len(s1.intersection(s2)) / len(s1.union(s2))

def getFriendsFeat(datum):
    uid, bid = datum["user"], datum["work"]
    friends = usersPerUser[uid]
    bid_books = usersPerItem[bid]
    
    feat_friends = []
    for sim, f in friends:
        f_feat = [1]                                  # is a real user
        
        f_books = itemsPerUser[f]                    # user rating
        if bid in f_books:
            f_feat.append(userItemRating[(f, bid)])
        else:
            f_feat.append(-1)
            
        f_feat.append(sim)
            
        f_friends = [ff for sim, ff in usersPerUser[f]]
        f_jacc = Jaccard(f_friends, bid_books) # user jacc
        f_feat.append(f_jacc)
        
        feat_friends.append(f_feat)
    
    if len(feat_friends) < 5:
        while(len(feat_friends) < 5):
            feat_friends.append([0, -1, 0, 0])
    
    feat_friends.sort(reverse=True, key=operator.itemgetter(3)) # book sims for friends
    feat_friends.sort(reverse=True, key=operator.itemgetter(2)) # similar friends in opinion
    feat_friends.sort(reverse=True, key=operator.itemgetter(1)) # friends who had book
    feat_friends.sort(reverse=True, key=operator.itemgetter(0)) # real users first
        
    feat_friends = feat_friends[:5]
    return feat_friends

# for i in range(10):
#     print(getFriendsFeat(data[i]))

def getJaccFriendsFeat(datum):
    uid, bid = datum["user"], datum["work"]
    if uid in usersPerUser:
        s1 = usersPerUser[uid]
    else:
        s1 = set()
    if bid in usersPerItem:
        s2 = usersPerItem[bid]
    else:
        s2 = set()
        
    return Jaccard(s1, s2)
    
    
def get_feat(datum, experiment):
    uf, ff, jf, uaf, iaf = experiment
    feat = []
    
    if uf:
        work = datum["work"]
        if work in unigram_feats:
            feat = np.array(copy.deepcopy(unigram_feats[datum["work"]]))
        else:
            feat = np.zeros((N + 1))
            
    if ff:
        friends_feats = getFriendsFeat(datum)
        for f in friends_feats:
            for i in f:
                feat = np.append(feat, i)
                
    if jf:
        jacc = getJaccFriendsFeat(datum)
        feat  = np.append(feat, jacc)
        
    if uaf:
        if datum["user"] in userRatingAverages:
            feat = np.append(feat, userRatingAverages[datum["user"]])
        else:
            feat = np.append(feat, global_average)
            
    if uaf:
        if datum["work"] in itemRatingAverages:
            feat = np.append(feat, itemRatingAverages[datum["work"]])
        else:
            feat = np.append(feat, global_average)
    
    feat  = np.append(feat,  1)
            
    return feat

# print(len(get_feat(data[0], [True, False, False])))

def get_stars_one_hot(stars):
    stars = int(stars)
    out = np.zeros((6))
    out[stars] = 1
    return out

test = np.zeros((10))
print(np.append(test, 1))

[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]


In [10]:

experiments = [
    (False, [True, True, True, True, True]),
    (False, [False, True, True, True, True]),
    (False, [True, False, True, True, True]),
    (False, [True, True, False, True, True]),
    (False, [True, True, True, False, True]),
    (False, [True, True, True, True, False]),
]

for oh, experiment in experiments:
    X_train = []
    y_train =[]
    for t in tqdm(train):
        feat = get_feat(t, experiment)
        X_train.append(feat)
        y_train.append(t["stars"])

    X_val = []
    y_val  = []
    for t in tqdm(val):
        feat = get_feat(t, experiment)
        X_val.append(feat)
        y_val.append(t["stars"])

    
    ## regression,  

  0%|          | 0/90000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/90000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/90000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/90000 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [11]:
from torch.utils.data import Dataset, DataLoader

class ThingDataset(Dataset):
    """Dataset class for Argoverse"""
    def __init__(self, d, experiment, stars_oh=False):
        super(ThingDataset, self).__init__()
        self.data = d
        self.experiment = experiment
        self.stars_oh = stars_oh
#         self.all_feats = {}
#         for d in tqdm(self.data):
#             uid, bid = d["user"], d["work"]
#             feat = get_feat(d, [True, False, False])
#             self.all_feats[(uid, bid)] = feat
        
        
    def __len__(self):
        return len(self.data)

    def __getitem__(self, i):
        review = self.data[i]
        
#         feat = self.all_feats[review]
        feat = get_feat(review, self.experiment)
        
        if self.stars_oh:
            label = get_stars_one_hot(review["stars"])
        else:
            label = review["stars"]
        
        return feat, label

def my_collate(batch):
    """ collate lists of samples into batches, create [ batch_sz x agent_sz x seq_len x feature] """
    feats = [scene[0] for scene in batch]
    labels = [scene[1] for scene in batch]
    
    return [feats, labels]

def test_collate(batch):
    """ collate lists of samples into batches, create [ batch_sz x agent_sz x seq_len x feature] """
    feats = [scene[0] for scene in batch]
    
    return [feats]

In [12]:
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

class SimilarityModel(nn.Module):
    def __init__(self, input_vector_size, stars_oh):
        super().__init__()
        
        self.stars_oh = stars_oh
        
        # FC
        self.fc1 = nn.Linear(input_vector_size, 4096)
        self.fc2 = nn.Linear(4096, 8192)
        self.fc3 = nn.Linear(8192, 4096)
        self.fc4 = nn.Linear(4096, 2048)
        self.fc5 = nn.Linear(2048, 1024)
        self.fc6 = nn.Linear(1024, 512)
        self.fc7 = nn.Linear(512, 256)
        if stars_oh:
            self.fc8 = nn.Linear(256, 6)
        else:
            self.fc8 = nn.Linear(256, 1)
            
        self.sig = nn.Sigmoid()
        
    def forward(self, x):
        x = x.float()
            
        out = F.relu(self.fc1(x))
        out = F.relu(self.fc2(out))
        out = F.relu(self.fc3(out))
        out = F.relu(self.fc4(out))
        out = F.relu(self.fc5(out))
        out = F.relu(self.fc6(out))
        out = self.fc7(out)
        out = self.fc8(out)
        
        if self.stars_oh:
            out = self.sig(out)
        
        return out
        

In [13]:
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

class RNN(nn.Module):
    def __init__(self, input_size=120, num_layers=2, hidden_size=256):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        
        self.s1 = 512
        self.s2 = 1024
        self.s3 = 2048
        
        # Encoder
        self.lstm1 = nn.LSTMCell(input_size, self.s1)
        self.lstm2 = nn.LSTMCell(self.s1, self.s2)
        self.lstm3 = nn.LSTMCell(self.s2, self.s3)
        
        # Decoder
        self.lstm4 = nn.LSTMCell(input_size, self.s1)
        self.lstm5 = nn.LSTMCell(self.s1, self.s2)
        self.lstm6 = nn.LSTMCell(self.s2, self.s3)
        
        # FC
        self.fc1 = nn.Linear(self.s3, 2048)
        self.fc2 = nn.Linear(2048, 1024)
        self.fc3 = nn.Linear(1024, 512)
        self.fc4 = nn.Linear(512, 256)
        self.fc5 = nn.Linear(256, 120)
        
    def forward(self, x, future=0):
        x = x.float()
        outputs = []
        n_samples = x.size(0)
        
        ht = torch.zeros(n_samples, self.s1, dtype=torch.float32).to(device)
        ct = torch.zeros(n_samples, self.s1, dtype=torch.float32).to(device)
        ht2 = torch.zeros(n_samples, self.s2, dtype=torch.float32).to(device)
        ct2 = torch.zeros(n_samples, self.s2, dtype=torch.float32).to(device)
        ht3 = torch.zeros(n_samples, self.s3, dtype=torch.float32).to(device)
        ct3 = torch.zeros(n_samples, self.s3, dtype=torch.float32).to(device)
        for input_t in x.split(1, dim=1):
            input_t = input_t.reshape((n_samples, 120))
            ht, ct = self.lstm1(input_t, (ht, ct))
            ht2, ct2 = self.lstm2(ht, (ht2, ct2))
#             ht3, ct3 = self.lstm3(ht2, (ht3, ct3))
            out = input_t
        
        for i in range(future):
            ht, ct = self.lstm4(out, (ht, ct))
            ht2, ct2 = self.lstm5(ht, (ht2, ct2))
#             ht3, ct3 = self.lstm6(ht2, (ht3, ct3))
            
            out = F.relu(self.fc1(ht3))
            out = F.relu(self.fc2(out))
            out = F.relu(self.fc3(ht2))
            out = F.relu(self.fc4(out))
            out = self.fc5(out)
            outputs.append(out)
            
        outputs = torch.cat(outputs, dim=1)
        return outputs

In [14]:
def train_func(model, device, train_loader, optimizer, epoch, stars_oh):
    model.train()
    iterator = tqdm(train_loader, total=int(len(train_loader)))

    total = 0
    count = 0
    losses = []
    for i, batch in enumerate(iterator):
        feat, target = batch
           
#         print(len(feat[0]))
            
        feat = np.array(feat)
        feat = torch.from_numpy(feat).to(device)
        
        target  = np.array(target)
        target = torch.from_numpy(target).float().to(device)
            
        optimizer.zero_grad()
        output = model(feat)
            
        if stars_oh:
            target = torch.argmax(target, dim=1)
            loss =  nn.CrossEntropyLoss()(output, target)
        else:
            loss =  nn.MSELoss()(output, target)
            
        loss.backward()
        optimizer.step()
    
        loss_val = loss.item()
        total += loss_val
        count += 1
        losses.append(loss_val)
        iterator.set_postfix_str("loss={}, avg.={}".format(loss.item(), total/count))
        
    return losses
        
def val_func(model, device, test_loader, stars_oh):
    model.eval()
    iterator = tqdm(test_loader, total=int(len(test_loader)))

    total = 0
    count = 0
    for i, batch in enumerate(iterator):
        feat, target = batch
           
        feat = np.array(feat)
        feat = torch.from_numpy(feat).to(device)
        
        target  = np.array(target)
        target = torch.from_numpy(target).float().to(device)
            
        output = model(feat)
#         print(output.cpu())
        
        if stars_oh:
            target = torch.argmax(target, dim=1)
            loss =  nn.CrossEntropyLoss()(output, target)
        else:
            loss =  nn.MSELoss()(output, target)
    
        total += loss.item()
        count += 1
        iterator.set_postfix_str("avg.={}".format(loss.item(), total/count))
        
    return total/count

In [15]:
torch.cuda.is_available()

True

In [17]:
learning_rate = 0.0001
momentum = 0.5
device = "cuda"
batch_sz = 100
num_epoch = 5

experiments = [
    (False, [True, True, True, True, True]),
    (False, [False, True, True, True, True]),
    (False, [True, False, True, True, True]),
    (False, [True, True, False, True, True]),
    (False, [True, True, True, False, True]),
    (False, [True, True, True, True, False]),
]


experiment_losses_b = []
for stars_oh, experiment in experiments:
    input_vector_size = len(get_feat(data[0], experiment))
    print(f"{stars_oh}, {experiment}:{input_vector_size}")
    
    model = SimilarityModel(input_vector_size, stars_oh).to(device) #using cpu here
    optimizer = optim.AdamW(model.parameters(), lr=learning_rate)
    
    
    train_data = ThingDataset(train, experiment, stars_oh)
    val_data = ThingDataset(val, experiment, stars_oh)

    train_loader = DataLoader(train_data, batch_size=batch_sz, shuffle = True, collate_fn=my_collate, num_workers=4)
    val_loader = DataLoader(val_data, batch_size=batch_sz, shuffle = True, collate_fn=my_collate, num_workers=4)

#     total = 0
#     count = 0
#     for v in val_loader:
#         feat, target = v
#         total += sum([s for s in target])
#         count += len(target)
#     print(f"average: {total / count}")
    
    
    
    PATH = f"a2_{experiment}.pth"

    all_losses = []
    best_val =  100000
    best_model = None
    for epoch in range(1, num_epoch + 1):
        if epoch != 1:
            model.load_state_dict(torch.load(PATH))
        
        print("EPOCH: {} -----------------------------------".format(epoch))
        losses = train_func(model, device, train_loader, optimizer, epoch, stars_oh)
        all_losses.append(losses) # per epoch
        val_mse = val_func(model, device, val_loader, stars_oh)
        if val_mse <  best_val:
            best_val  =  val_mse
            state = model.state_dict()
            best_model = state
            torch.save(state, PATH)
    experiment_losses_b.append(all_losses)

False, [True, True, True, True, True]:525
EPOCH: 1 -----------------------------------


  0%|          | 0/900 [00:00<?, ?it/s]

  return F.mse_loss(input, target, reduction=self.reduction)


  0%|          | 0/100 [00:00<?, ?it/s]

EPOCH: 2 -----------------------------------


  0%|          | 0/900 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

EPOCH: 3 -----------------------------------


  0%|          | 0/900 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

EPOCH: 4 -----------------------------------


  0%|          | 0/900 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

EPOCH: 5 -----------------------------------


  0%|          | 0/900 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

False, [False, True, True, True, True]:24
EPOCH: 1 -----------------------------------


  0%|          | 0/900 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

EPOCH: 2 -----------------------------------


  0%|          | 0/900 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

EPOCH: 3 -----------------------------------


  0%|          | 0/900 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

EPOCH: 4 -----------------------------------


  0%|          | 0/900 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

EPOCH: 5 -----------------------------------


  0%|          | 0/900 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

False, [True, False, True, True, True]:505
EPOCH: 1 -----------------------------------


  0%|          | 0/900 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

EPOCH: 2 -----------------------------------


  0%|          | 0/900 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

EPOCH: 3 -----------------------------------


  0%|          | 0/900 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

EPOCH: 4 -----------------------------------


  0%|          | 0/900 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

EPOCH: 5 -----------------------------------


  0%|          | 0/900 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

False, [True, True, False, True, True]:524
EPOCH: 1 -----------------------------------


  0%|          | 0/900 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

EPOCH: 2 -----------------------------------


  0%|          | 0/900 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

EPOCH: 3 -----------------------------------


  0%|          | 0/900 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

EPOCH: 4 -----------------------------------


  0%|          | 0/900 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

EPOCH: 5 -----------------------------------


  0%|          | 0/900 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

False, [True, True, True, False, True]:523
EPOCH: 1 -----------------------------------


  0%|          | 0/900 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

EPOCH: 2 -----------------------------------


  0%|          | 0/900 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

EPOCH: 3 -----------------------------------


  0%|          | 0/900 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

EPOCH: 4 -----------------------------------


  0%|          | 0/900 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

EPOCH: 5 -----------------------------------


  0%|          | 0/900 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

False, [True, True, True, True, False]:525
EPOCH: 1 -----------------------------------


  0%|          | 0/900 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

EPOCH: 2 -----------------------------------


  0%|          | 0/900 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

EPOCH: 3 -----------------------------------


  0%|          | 0/900 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

EPOCH: 4 -----------------------------------


  0%|          | 0/900 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

EPOCH: 5 -----------------------------------


  0%|          | 0/900 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

In [None]:
allRatings = []
userRatings = defaultdict(list)

for d in train:
    uid = d["user"]
    r = int(d['stars'])
    allRatings.append(r)
    userRatings[uid].append(r)

In [None]:
globalAverage = sum(allRatings) / len(allRatings)
userAverage = {}
for u in userRatings:
    userAverage[u] = sum(userRatings[u]) / len(userRatings[u])
        
count = 0
total = 0
for d in val:
    uid = d["user"]
    if uid in userAverage:
        pred = userAverage[u]
    else:
        pred = globalAverage
    count += (pred - d["stars"]) ** 2
    total += 1

print(f"Baseline performance on val: {count/total}")

In [None]:
rating_stars = {}
for d in train:
    r = int(d['stars'])
    rating_stars[r] = 0
print(rating_stars)