Let's install `h2o4gpu`

In [1]:
# Install system packages before h2o4gpu itself.
# NOTE(review): presumably these are native dependencies of h2o4gpu (OpenBLAS
# development files, parallel bzip2) - confirm against the h2o4gpu install docs.
!apt-get install -y libopenblas-dev pbzip2
# tabulate is force-upgraded/pinned to 0.8.2.
# NOTE(review): the reason for this exact pin is not stated here - presumably a
# compatibility requirement of h2o4gpu; confirm before changing it.
!pip install -U tabulate==0.8.2
# NOTE(review): h2o4gpu is installed without a version pin, so re-running this
# notebook later may pull a different release than the one that produced the
# outputs below.
!pip install h2o4gpu
# Imported in the same cell so a failed installation surfaces immediately.
import h2o4gpu




The following additional packages will be installed:
  libopenblas-base
The following NEW packages will be installed:
  libopenblas-base libopenblas-dev pbzip2
0 upgraded, 3 newly installed, 0 to remove and 46 not upgraded.
Need to get 7644 kB of archives.
After this operation, 91.6 MB of additional disk space will be used.
Get:1 http://deb.debian.org/debian stretch/main amd64 libopenblas-base amd64 0.2.19-3 [3793 kB]
Get:2 http://deb.debian.org/debian stretch/main amd64 libopenblas-dev amd64 0.2.19-3 [3809 kB]
Get:3 http://deb.debian.org/debian stretch/main amd64 pbzip2 amd64 1.1.9-1+b1 [41.8 kB]
Fetched 7644 kB in 0s (40.6 MB/s)
debconf: delaying package configuration, since apt-utils is not installed
Selecting previously unselected package libopenblas-base.
(Reading database ... 37072 files and directories currently installed.)
Preparing to unpack .../libopenblas-base_0.2.19-3_amd64.deb ...
Unpacking libopenblas-base (0.2.19-3) ...
Selecting previously unselect

Read the Netflix dataset and convert it into a SciPy sparse matrix

In [2]:
import gc
from array import array

import numpy as np
import scipy
import scipy.sparse
from sklearn.model_selection import train_test_split

# Rating files to parse. Each file is a sequence of sections: a header line
# "<movie_id>:" followed by comma-separated rows whose first two fields are
# the user id and the rating (any further fields are ignored).
files = [
    '../input/combined_data_1.txt',
    '../input/combined_data_2.txt',
    '../input/combined_data_3.txt',
    '../input/combined_data_4.txt',
]

# Typed buffers instead of Python lists: every element is stored as a raw
# C value rather than a boxed Python object, which keeps peak memory far
# lower while the files are being read. Appending an id that does not fit a
# C int raises OverflowError instead of silently wrapping later on.
coo_row = array('i')  # raw user ids, one per rating
coo_col = array('i')  # zero-based movie index, one per rating
coo_val = array('f')  # rating values

for file_name in files:
    print('processing {0}'.format(file_name))
    with open(file_name, "r") as f:
        movie = -1  # no movie header seen yet in this file
        for line_no, line in enumerate(f, start=1):
            # Strip whitespace so that CRLF line endings and a missing final
            # newline do not break header detection.
            record = line.strip()
            if not record:
                continue
            if record.endswith(':'):
                # Header line: movie ids are 1-based in the file.
                movie = int(record[:-1]) - 1
                continue
            if movie < 0:
                # Explicit check (not `assert`) so it survives `python -O`.
                raise ValueError(
                    '{0}:{1}: rating row found before any movie header'.format(
                        file_name, line_no))
            fields = record.split(',')
            coo_row.append(int(fields[0]))
            coo_col.append(movie)
            coo_val.append(float(fields[1]))
    gc.collect()

print('transformation...')

# Rebind the buffers to NumPy arrays of matching element type.
coo_val = np.array(coo_val, dtype=np.float32)
coo_col = np.array(coo_col, dtype=np.int32)
coo_row = np.array(coo_row, dtype=np.int32)
# `user` holds the sorted unique raw user ids; `indices` maps every rating to
# the position of its user in `user`, i.e. a dense 0..n_users-1 row index.
user, indices = np.unique(coo_row, return_inverse=True)
user = user.astype(np.int32, copy=False)

gc.collect()

# No explicit shape is passed, so it is inferred from the largest row/column
# index present in the data.
coo_matrix = scipy.sparse.coo_matrix((coo_val, (indices, coo_col)))
shape = coo_matrix.shape
print('R matrix size', shape)

gc.collect()

print('splitting into training and validation set')
# Split the individual (row, col, value) triplets 80/20 with a fixed seed.
train_row, test_row, train_col, test_col, train_data, test_data = train_test_split(
    coo_matrix.row, coo_matrix.col, coo_matrix.data, test_size=0.2, random_state=42)

# Both parts keep the full matrix shape so that row/column indices stay aligned.
train = scipy.sparse.coo_matrix(
    (train_data, (train_row, train_col)), shape=shape)
test = scipy.sparse.coo_matrix(
    (test_data, (test_row, test_col)), shape=shape)


processing ../input/combined_data_1.txt
processing ../input/combined_data_2.txt
processing ../input/combined_data_3.txt
processing ../input/combined_data_4.txt
transformation...
R matrix size (480189, 17770)
splitting into training and validation set


Let's factorize the matrix R

In [3]:
# Factorization settings.
n_components = 40  # first positional argument of FactorizationH2O
_lambda = 0.01     # second positional argument (presumably the regularisation weight - confirm)
# Raise BATCHES if the GPU runs out of memory; n_components / BATCHES still
# has to be a multiple of 10.
BATCHES = 1

# List handed to fit() through its `scores` argument.
scores = []

factorization = h2o4gpu.solvers.FactorizationH2O(n_components, _lambda, max_iter=100)

# Keyword options for fit(), gathered in one place: the held-out matrix, the
# batch counts for both factor matrices, and the early-stopping setup.
fit_options = dict(
    X_test=test,
    X_BATCHES=BATCHES,
    THETA_BATCHES=BATCHES,
    scores=scores,
    verbose=True,
    early_stopping_rounds=5,
)
factorization.fit(train, **fit_options)

print('best iteration:', factorization.best_iteration)

iteration 0 train: 0.8289686441421509 cv: 0.9866130352020264
iteration 1 train: 0.7518881559371948 cv: 0.9291425347328186
iteration 2 train: 0.721118152141571 cv: 0.9005951285362244
iteration 3 train: 0.7065906524658203 cv: 0.8870471119880676
iteration 4 train: 0.6984377503395081 cv: 0.8793769478797913
iteration 5 train: 0.6933264136314392 cv: 0.8745821118354797
iteration 6 train: 0.6898868680000305 cv: 0.871374785900116
iteration 7 train: 0.6874406933784485 cv: 0.8690966963768005
iteration 8 train: 0.6856245994567871 cv: 0.8673988580703735
iteration 9 train: 0.6842295527458191 cv: 0.8660860657691956
iteration 10 train: 0.6831294894218445 cv: 0.865032434463501
iteration 11 train: 0.6822428107261658 cv: 0.8641777634620667
iteration 12 train: 0.6815149188041687 cv: 0.8634673357009888
iteration 13 train: 0.6809085011482239 cv: 0.8628664612770081
iteration 14 train: 0.6803968548774719 cv: 0.8623507618904114
iteration 15 train: 0.6799608469009399 cv: 0.8619084358215332
iteration 16 train: 0

`factorization.XT` and `factorization.thetaT` now contain the dense representations of users and movies, respectively.

In [4]:
# Report the dimensions of both learned factor matrices.
for label, matrix in (
    ('X shape:', factorization.XT),
    ('ThetaT shape:', factorization.thetaT),
):
    print(label, matrix.shape)


X shape: (480189, 40)
ThetaT shape: (17770, 40)
