In [3]:
import numpy as np
import numpy.random as npr
import matplotlib.pyplot as plt
import pandas as pd

from pmf import PoissonMF
from scipy import sparse

%matplotlib inline

## Simulate data

In [4]:
# Dimensions of the simulated user-by-item matrix.
n_users = 1000
n_items = 1000
# Number of latent factors drawn per user and per item in simulate_data.
K = 10

def simulate_data(corr, gamma, num_users=None, num_items=None, num_factors=None, seed=None):
    """Simulate a binary exposure matrix and a 1-5 rating matrix.

    Parameters
    ----------
    corr : float
        Mixing weight between the exposure user factors and an independent
        gamma draw when building the rating user factors
        (``corr * theta_A + (1 - corr) * fresh_draw``).
    gamma : float
        Weight of the exposure rate ``theta_A @ beta.T`` added to the
        Poisson rate of the ratings.
    num_users, num_items, num_factors : int, optional
        Problem sizes. Each one falls back to the notebook-level
        ``n_users`` / ``n_items`` / ``K`` when left as ``None``.
    seed : int, optional
        When given, all draws come from a private ``RandomState(seed)`` so
        the result is reproducible; otherwise numpy's global generator is
        used.

    Returns
    -------
    ydf : pandas.DataFrame
        Columns ``uid``, ``sid``, ``rating``. Ratings are clipped to 1..5
        and are NOT masked by exposure, so every (user, item) pair appears.
    Adf : pandas.DataFrame
        Columns ``uid``, ``sid``, ``obs`` listing the exposed pairs
        (``obs`` is always 1).
    """
    # Resolve sizes lazily so explicit arguments never touch the globals.
    if num_users is None:
        num_users = n_users
    if num_items is None:
        num_items = n_items
    if num_factors is None:
        num_factors = K

    # The module and a RandomState expose the same gamma/poisson methods.
    rng = npr if seed is None else npr.RandomState(seed)

    theta_A = rng.gamma(0.3, scale=0.3, size=(num_users, num_factors))
    beta = rng.gamma(0.3, scale=0.3, size=(num_items, num_factors))

    # The exposure rate is needed for both the exposure and the rating draw.
    exposure_rate = theta_A.dot(beta.T)
    A = np.minimum(rng.poisson(exposure_rate), 1)

    theta_Y = corr * theta_A + (1 - corr) * rng.gamma(
        0.3, scale=0.3, size=(num_users, num_factors))
    y = rng.poisson(theta_Y.dot(beta.T) + gamma * exposure_rate)
    # Shift by one and cap at five so every rating lies in 1..5.
    y = np.minimum(y + 1, 5)

    y = sparse.coo_matrix(y)
    A = sparse.coo_matrix(A)
    ydf = pd.DataFrame({'uid': y.row, 'sid': y.col, 'rating': y.data})
    Adf = pd.DataFrame({'uid': A.row, 'sid': A.col, 'obs': A.data})
    return ydf, Adf

# Draw one simulated data set with both mixing weights set to 0.5.
ydf, Adf = simulate_data(corr=0.5, gamma=0.5)

In [5]:
# Number of rating rows produced by the simulation.
ydf.uid.shape

(1000000,)

In [6]:
def load_data(df, colnames=("uid", "sid", "rating"), shape=(n_users, n_items)):
    """Build a sparse float32 user-by-item matrix from a long-format frame.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per (user, item, value) triple.
    colnames : sequence of str
        Names of the row-index, column-index and value columns, in that
        order. Only the first three entries are read.
    shape : tuple of int
        Shape of the resulting matrix. The default is evaluated once, when
        this function is defined, so reassigning ``n_users`` / ``n_items``
        afterwards does not change it; pass ``shape`` explicitly instead.

    Returns
    -------
    scipy.sparse.csr_matrix
        Matrix with ``data[row, col] = value``. scipy sums the values of
        repeated (row, col) pairs.
    """
    user, item, rating = colnames[0], colnames[1], colnames[2]
    rows = np.asarray(df[user])
    cols = np.asarray(df[item])
    vals = np.asarray(df[rating])
    return sparse.csr_matrix((vals, (rows, cols)), dtype=np.float32, shape=shape)


def exp_to_imp(data, cutoff=1e-10):
    """Binarize a sparse matrix of explicit values into implicit 0/1 feedback.

    Stored entries with value ``>= cutoff`` become 1; every other stored
    entry (including NaN) becomes 0 and is dropped from the sparsity
    structure. The input matrix is left untouched.

    Parameters
    ----------
    data : scipy.sparse matrix
        Matrix exposing ``.data``, ``.copy()`` and ``.eliminate_zeros()``.
    cutoff : float
        Threshold at or above which an entry counts as observed.

    Returns
    -------
    scipy.sparse matrix
        Copy of ``data`` in the same format with int32 values, all equal to 1.
    """
    data_imp = data.copy()
    # Classify every stored value exactly once. Thresholding in two in-place
    # passes would let entries zeroed by the first pass be set to 1 by the
    # second whenever cutoff <= 0.
    keep = data_imp.data >= cutoff
    data_imp.data = keep.astype('int32')
    data_imp.eliminate_zeros()
    return data_imp

In [7]:
# Sparse rating matrix for the simulated data, using load_data's default shape.
data = load_data(ydf)
# Binarize at 0.5, then read off the coordinates of the surviving entries.
data_imp = exp_to_imp(data, cutoff=0.5)
data_coo = data_imp.tocoo()
row_tr = data_coo.row
col_tr = data_coo.col

In [8]:
# Quick fit on the simulated matrix: 10 components, max_iter=1.
pf = PoissonMF(n_components=10, max_iter=1)
pf.fit(data, row_tr, col_tr)
# pi is an independent copy of pf.Eb; lamb is the transpose of pf.Et.
pi = pf.Eb.copy()
lamb = pf.Et.T

In [9]:
# Show both factor-matrix shapes, one per line.
print(pi.shape, lamb.shape, sep="\n")

(1000, 10)
(1000, 10)


## Movie data

In [10]:
data_dir = "./data/ml-100k/"

# Read movies
# The "" entry names a column that is dropped right after loading
# (presumably the video-release-date field of u.item; confirm against the
# ml-100k README).
movies_colnames = [
    "id", "name", "year", "", "url",
    "unknown", "Action", "Adventure", "Animation", "Children", "Comedy",
    "Crime", "Documentary", "Drama", "Fantasy", "Film-Noir", "Horror",
    "Musical", "Mystery", "Romance", "Sci-fi", "Thriller", "War", "Western",
]
movies = (
    pd.read_csv(
        data_dir + "u.item",
        sep="|",
        encoding="latin-1",
        header=None,
        names=movies_colnames,
    )
    .drop(columns=["", "url", "id"])
    # Keep only the text after the last "-" of the date string.
    .assign(year=lambda frame: frame["year"].str.split("-").str[-1])
)
print(movies.shape)

# Read users
users_colnames = ["id", "age", "gender", "occupation", "zip"]
users = pd.read_csv(
    data_dir + "u.user",
    sep="|",
    encoding="latin-1",
    header=None,
    names=users_colnames,
)
# Swap the categorical occupation column for one indicator column per value.
users = users.drop(columns=["occupation", "id"]).join(
    pd.get_dummies(users["occupation"])
)
print(users.shape)

# Read ratings
ratings_colnames = ["user", "movie", "rating", "timestamp"]
ratings = (
    pd.read_csv(
        data_dir + "u.data",
        sep="\t",
        encoding="latin-1",
        header=None,
        names=ratings_colnames,
    )
    # Shift both id columns down by one so they can be used as 0-based indices.
    .assign(
        user=lambda frame: frame["user"] - 1,
        movie=lambda frame: frame["movie"] - 1,
    )
)
print(ratings.shape)

(1682, 21)
(943, 24)
(100000, 4)


In [11]:
# Matrix dimensions come from the row counts of the users and movies tables.
# Note that this rebinds n_users, which previously held the simulation size.
n_users, n_movies = users.shape[0], movies.shape[0]
data = load_data(
    ratings,
    colnames=["user", "movie", "rating"],
    shape=(n_users, n_movies),
)
print(data.shape)

# Binarize at 0.5 and pull out the coordinates of the surviving entries.
data_imp = exp_to_imp(data, cutoff=0.5)
data_coo = data_imp.tocoo()
row_tr = data_coo.row
col_tr = data_coo.col

(943, 1682)


In [12]:
# Fit on the MovieLens matrix: 10 components, max_iter=100.
pf = PoissonMF(n_components=10, max_iter=100)
pf.fit(data, row_tr, col_tr)
# pi is an independent copy of pf.Eb; lamb is the transpose of pf.Et.
pi = pf.Eb.copy()
lamb = pf.Et.T

# Show both factor-matrix shapes, one per line.
print(pi.shape, lamb.shape, sep="\n")

(943, 10)
(1682, 10)
