In [1]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
%load_ext autoreload
%autoreload 2
np.random.seed(2)

## Load the Data

In [2]:
data = np.loadtxt('data/data.txt').astype(int)
movies = np.genfromtxt('data/movies.txt', delimiter='\t', dtype=None,encoding=None, 
                       names=['Movie ID','Title','Unknown','Action','Adventure','Animation','Childrens','Comedy','Crime',
                             'Documentary','Drama','Fantasy','Film-Noir','Horror','Musical','Mystery','Romance','Sci-Fi',
                              'Thriller','War','Western'], deletechars='')

# Section 5

## Simple Factorization Parameter Selection

In [3]:
# choice of optimizer: 'plain', 'bias' or 'oos'
optimization = 'plain'

Y_train = np.loadtxt('data/train.txt').astype(int)
Y_test = np.loadtxt('data/test.txt').astype(int)

M = max(max(Y_train[:,0]), max(Y_test[:,0])).astype(int) # users
N = movies.shape[0] # movies
    
if optimization == 'plain':
    from collab_0 import train_model, get_err

    print('Factorizing with plain model.')

    regs = [1e-2, 1e-1, 0.2, 0.5]
    eta = 0.03 # learning rate
    # Epochs to plot learning curve
    checkpoints = np.arange(40)
    
    K = 20
    E_ins = []
    E_outs = []
    epochs = []
    
    for reg in regs:
        E_in_reg = []
        E_out_reg = []
        print("Training model with M = %s, N = %s, k = %s, eta = %s, reg = %s" % (M, N, K, eta, reg))
        Us, Vs, epochs_reg = train_model(M, N, K, eta, reg, Y_train, checkpoints=checkpoints)
        # Compute insample and out of sample errors
        for idx in range(len(Us)):
            E_in_reg.append(get_err(Us[idx], Vs[idx], Y_train))
            E_out_reg.append(get_err(Us[idx], Vs[idx], Y_test))
        E_ins.append(E_in_reg)
        E_outs.append(E_out_reg)
        epochs.append(epochs_reg)
    
    # Plot E_ins
    for i in range(len(regs)):
        plt.plot(epochs[i], E_ins[i], label='$E_{in}, \lambda=$'+str(regs[i]))
    plt.title('$E_{in}$ learning curve')
    plt.xlabel('epoch')
    plt.ylabel('MSE')
    plt.legend()
    plt.savefig('figs/plain_E_in.png')
    plt.clf()
    
    # Plot E_outs
    for i in range(len(regs)):
        plt.plot(epochs[i], E_outs[i], label='$E_{out}, \lambda=$'+str(regs[i]))
    plt.title('$E_{out}$ learning curve')
    plt.xlabel('epoch')
    plt.ylabel('MSE')
    plt.legend()
    plt.savefig('figs/plain_E_out.png')
    plt.clf()
    print('Printed figures!')
    
    
elif optimization == 'bias':
    from collab_bias import train_model, get_err

    print('Factorizing with bias model.')

    reg = 0.1
    eta = 0.03 # learning rate
    K = 20

    print("Training model with M = %s, N = %s, k = %s, eta = %s, reg = %s"%(M, N, K, eta, reg))
    U, V, a, b, err_train = train_model(M, N, K, eta, reg, Y_train)
    print('Factorization complete.')
    
elif optimization == 'oos':
    # oos model using https://github.com/benfred/implicit
    """
    import implicit
    from scipy import sparse
    
    print('Factorizing with off-the-shelf model.')
    
    m = sparse.coo_matrix((Y_train[:,2].astype(np.float32),
                    (Y_train[:,1], Y_train[:,0])))
    
    model = implicit.als.AlternatingLeastSquares(factors=20)
    model.fit(m.tocsr())
    
    V = model.item_factors
    print('Factorization complete.')
    """
    
    
    # oos model using http://surprise.readthedocs.io/en/stable/matrix_factorization.html
    from surprise.prediction_algorithms.matrix_factorization import NMF
    from surprise import Dataset
    from surprise import Reader
    from surprise import accuracy
    
    reader = Reader(line_format='user item rating', sep='\t')
    model = NMF(n_factors=20)
    surprise_train = Dataset.load_from_file('data/data.txt', reader=reader)
    #surprise_test = Dataset.load_from_file('data/test.txt', reader=reader)
    model.fit(surprise_train.build_full_trainset())
    #predictions = model.test(surprise_test.build_full_trainset().build_testset())
    #err_test = accuracy.rmse(predictions)**2
    V = model.qi
    #print('Test error: %f' % err_test)
    
else:
    print('Invalid optimization method specified')
    

Factorizing with plain model.
Training model with M = 943, N = 1682, k = 20, eta = 0.03, reg = 0.01
('Error:', 0.5180921475496216)
('Added!', 0)
('Error:', 0.3989173858254322)
('Added!', 1)
('Error:', 0.35409415536851546)
('Added!', 2)
('Error:', 0.3262124255733348)
('Added!', 3)
('Error:', 0.303014249709784)
('Added!', 4)
('Error:', 0.28545023955273585)
('Added!', 5)
('Error:', 0.2723045809760114)
('Added!', 6)
('Error:', 0.2588631499175684)
('Added!', 7)
('Error:', 0.2515403640907767)
('Added!', 8)
('Error:', 0.2403697837906989)
('Added!', 9)
('Error:', 0.23469271656679674)
('Added!', 10)
('Error:', 0.22953856817582943)
('Added!', 11)
('Error:', 0.22602968247358912)
('Added!', 12)
('Error:', 0.22454977988226418)
('Added!', 13)
('Error:', 0.21609722270465842)
('Added!', 14)
('Error:', 0.2157399220991249)
('Stopped!', 15)
Training model with M = 943, N = 1682, k = 20, eta = 0.03, reg = 0.1
('Error:', 0.5093922872607904)
('Added!', 0)
('Error:', 0.4350846031441088)
('Added!', 1)
('Error

<matplotlib.figure.Figure at 0x7f7420380f50>

In [5]:
V

NameError: name 'V' is not defined

## SVD and Projection

In [4]:
# number of movies to visualize
nmovies = 10

# Perform the SVD

# Following convention in the guide where V is KxN
V_centered = (V-np.tile(np.mean(V,axis=1),(V.shape[1],1)).T).T
u,s,vh = np.linalg.svd(V_centered, full_matrices=False)
VT = np.dot(u[:,0:2].T,V_centered)

# Normalize data for the plots
VT[0] = (VT[0] - np.mean(VT[0]))/np.std(VT[0])
VT[1] = (VT[1] - np.mean(VT[1]))/np.std(VT[1])

# Find the data range for the plots
ylim = [min(VT[1,:]),max(VT[1,:])]
xlim = [min(VT[0,:]),max(VT[0,:])]

## Popular movies
r_counts = np.bincount(data[:,1])
pop_inds_mov = np.argpartition(r_counts,-nmovies)[-nmovies:]+1
inds = pop_inds_mov[:nmovies]

fig,ax = plt.subplots(figsize=(14,8))
ax.scatter(VT[0,inds],VT[1,inds])
plt.title('Principal components of the %i most popular movies' % nmovies);
plt.xlabel('PC1');
plt.ylabel('PC2');
plt.ylim(ylim)
plt.xlim(xlim)

xrange = max(VT[0,inds])-min(VT[0,inds])
yrange = max(VT[1,inds])-min(VT[1,inds])
    
for j in inds:
    # Make titles consistent wrt "
    mtitle = movies[j][1].replace('"', '')[:-7]
    # Make the titles more readable
    if mtitle[-5:] == ', The':
        mtitle = 'The ' + mtitle[:-5]
    if mtitle[-3:] == ', A':
        mtitle = 'A ' + mtitle[:-3]
    ax.annotate(mtitle,(VT[0,j]+0.015*xrange,VT[1,j]-0.008*yrange))
        

## Movies by genre
genres = ['Action','Adventure','Animation','Childrens','Comedy','Crime',
                             'Documentary','Drama','Fantasy','Film-Noir','Horror','Musical','Mystery','Romance','Sci-Fi',
                              'Thriller','War','Western']

for idx,val in enumerate(genres):
    g_inds_mov = np.array(np.where(movies[val]==1))[0]
    inds = g_inds_mov[:nmovies]

    fig,ax = plt.subplots(figsize=(14,8))
    ax.scatter(VT[0,inds],VT[1,inds])
    plt.title('Principal components of %i movies in \'%s\' genre' % (nmovies,val));
    plt.xlabel('PC1');
    plt.ylabel('PC2');
    plt.ylim(ylim)
    plt.xlim(xlim)
    
    xrange = max(VT[0,inds])-min(VT[0,inds])
    yrange = max(VT[1,inds])-min(VT[1,inds])

    for j in inds:
        # Make titles consistent wrt "
        mtitle = movies[j][1].replace('"', '')[:-7]
        # Make the titles more readable
        if mtitle[-5:] == ', The':
            mtitle = 'The ' + mtitle[:-5]
        if mtitle[-3:] == ', A':
            mtitle = 'A ' + mtitle[:-3]
        ax.annotate(mtitle,(VT[0,j]+0.015*xrange,VT[1,j]-0.008*yrange))

NameError: name 'V' is not defined

(20, 20)

(1682, 20)