In [6]:
import sys
import numpy as np

import spotlight
from spotlight.datasets.movielens import get_movielens_dataset
from spotlight.datasets.synthetic import generate_sequential
from spotlight.cross_validation import random_train_test_split
import json
sys.path.insert(0, 'src')
from models.main_model import build_model
from models.mean_baseline import build_mean_baseline_model


from data.data import generate_data, save_data



def main(targets):
    data_config = json.load(open('config/data-params.json'))
    main_model_config = json.load(open('config/main-model-params.json'))
    
    if 'test' in targets:
        dataset = generate_data(**data_config)
        print(dataset)
        save_data(dataset, **data_config)
        
        main_rsme = build_model(dataset, **main_model_config)
        mean_baseline_rsme = build_mean_baseline_model(dataset)
        print(main_rsme, mean_baseline_rsme)

if __name__ == "__main__":
    main(sys.argv)

In [2]:
main(['test'])

<Interactions dataset (944 users x 1683 items x 100000 interactions)>
Epoch 0: loss 13.131227529501613
Epoch 1: loss 7.341110045396829
Epoch 2: loss 1.7469144652161417
Epoch 3: loss 1.0693844089025184
Epoch 4: loss 0.9435770405998712
Epoch 5: loss 0.899013837681541
Epoch 6: loss 0.8750966915601417
Epoch 7: loss 0.8663170020791549
Epoch 8: loss 0.8521912950503675
Epoch 9: loss 0.8412833832487275
RangeIndex(start=0, stop=1683, step=1)
RangeIndex(start=0, stop=1683, step=1)


KeyError: 'movieId'

In [8]:
import pandas as pd
import numpy as np
import sklearn.model_selection
import sklearn
import math
from spotlight.cross_validation import random_train_test_split
from spotlight.evaluation import rmse_score

def train(train_data):
    df = pd.DataFrame(train_data.tocoo().toarray())
    print(df.head())
    avg_ratings =  df.groupby('movieId').rating.mean().to_dict()
    for k, v in avg_ratings.items():
            avg_ratings[k] = round(v*2)/2
    mean_avg = round(train_data['rating'].mean()*2)/2
    return avg_ratings, mean_avg

def make_predictions(movieId, avg_ratings, mean_avg):
    if movieId in avg_ratings:
        return avg_ratings[movieId]
    else: 
        return mean_avg
    
def predict(test_data, avg_ratings, mean_avg):
    test_data['predictions'] = test_data['movieId'].apply(make_predictions, args=[avg_ratings, mean_avg])
    rmse = math.sqrt(sklearn.metrics.mean_squared_error(test_data['rating'], test_data['predictions']))
    return rmse

def build_mean_baseline_model(data):
    train_data, test_data = random_train_test_split(data)
    avg_ratings, mean_avg = train(train_data)
    rmse = predict(test_data, avg_ratings, mean_avg)
    return rmse

In [12]:
import pandas as pd
dataset = get_movielens_dataset(variant='100K')
print(dataset)
df = pd.DataFrame(dataset.tocoo().toarray())
print(df.shape)
df.head()

<Interactions dataset (944 users x 1683 items x 100000 interactions)>
(944, 1683)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,5.0,3.0,4.0,3.0,3.0,5.0,4.0,1.0,5.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [18]:
import numpy as np
import pandas as pd
import spotlight
from spotlight.datasets.movielens import get_movielens_dataset
import sys
from spotlight.datasets.synthetic import generate_sequential

    


    
print('dataset: ', dataset)

from spotlight.cross_validation import random_train_test_split

train, test = random_train_test_split(dataset, random_state=np.random.RandomState(42))

print('Split into \n {} and \n {}.'.format(train, test))

dataset:  <Interactions dataset (944 users x 1683 items x 100000 interactions)>
Split into 
 <Interactions dataset (944 users x 1683 items x 80000 interactions)> and 
 <Interactions dataset (944 users x 1683 items x 20000 interactions)>.


In [20]:
df = pd.DataFrame(dataset.tocoo().toarray())
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,5.0,3.0,4.0,3.0,3.0,5.0,4.0,1.0,5.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
train = pd.DataFrame(dataset.tocoo().toarray())

In [None]:
data = Dataset.load_builtin('ml-100k')

In [37]:
from surprise import SVD
from surprise import AlgoBase
from surprise import KNNBasic
from surprise import BaselineOnly
from surprise import Dataset
from surprise.model_selection import cross_validate
from surprise import accuracy
from surprise.model_selection import train_test_split


In [39]:
data = Dataset.load_builtin('ml-100k')
trainset, testset = train_test_split(data, test_size=.25)

# We'll use the famous SVD algorithm.
algo = KNNBasic()

# Train the algorithm on the trainset, and predict ratings for the testset
algo.fit(trainset)
predictions = algo.test(testset)

# Then compute RMSE
accuracy.rmse(predictions)

Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.9823


0.9823434681054423

In [35]:
cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Evaluating RMSE, MAE of algorithm BaselineOnly on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9425  0.9457  0.9455  0.9496  0.9353  0.9437  0.0048  
MAE (testset)     0.7474  0.7493  0.7484  0.7532  0.7424  0.7481  0.0035  
Fit time          0.15    0.18    0.21    0.16    0.15    0.17    0.02    
Test time         0.29    0.09    0.10    0.17    0.10    0.15    0.08    


{'test_rmse': array([0.94246628, 0.94571014, 0.94548723, 0.94964678, 0.93526156]),
 'test_mae': array([0.74744733, 0.74934312, 0.74837466, 0.75317016, 0.74239851]),
 'fit_time': (0.1461350917816162,
  0.17673778533935547,
  0.20891499519348145,
  0.15533018112182617,
  0.15259456634521484),
 'test_time': (0.2896537780761719,
  0.0895228385925293,
  0.09906172752380371,
  0.16896653175354004,
  0.09778738021850586)}

In [25]:
from surprise import dump
from surprise import KNNBasic
from surprise.accuracy import rmse

algo = KNNBasic(sim_options=sim_options, min_k=3)
for trainset, testset in data.folds():
    algo.train(trainset)
    predictions = algo.test(testset)
    rmse(predictions)
    dump('./dump_KNN', predictions, trainset, algo)

NameError: name 'sim_options' is not defined

In [15]:
import torch

from spotlight.factorization.explicit import ExplicitFactorizationModel

model = ExplicitFactorizationModel(loss='regression',
                                   embedding_dim=128,  # latent dimensionality
                                   n_iter=10,  # number of epochs of training
                                   batch_size=1024,  # minibatch size
                                   l2=1e-9,  # strength of L2 regularization
                                   learning_rate=1e-3,
                                   use_cuda=torch.cuda.is_available())

In [16]:
%%time
model.fit(train, verbose=True)

Epoch 0: loss 13.094090727311146
Epoch 1: loss 7.283141341390489
Epoch 2: loss 1.7687220829951613
Epoch 3: loss 1.0708536935757986
Epoch 4: loss 0.9388363934770415
Epoch 5: loss 0.8925132245957097
Epoch 6: loss 0.8678643197952947
Epoch 7: loss 0.8519294518458692
Epoch 8: loss 0.8403563280648823
Epoch 9: loss 0.8315414145023008
CPU times: user 2.3 s, sys: 134 ms, total: 2.43 s
Wall time: 2.44 s


In [7]:
from spotlight.evaluation import rmse_score

train_rmse = rmse_score(model, train)
test_rmse = rmse_score(model, test)

print('Train RMSE {:.3f}, test RMSE {:.3f}'.format(train_rmse, test_rmse))

Train RMSE 0.895, test RMSE 0.938
