In [1]:
import argparse
from utils.io import load_numpy
import numpy as np
from experiment.latent_analysis import latent_analysis
from experiment.popular_analysis import popular_overlapping
from utils.argument import shape
from utils.argument import check_float_positive, check_int_positive, shape

from models.lrec import embedded_lrec_items
from models.weighted_lrec import weighted_lrec_items
from models.pure_svd import pure_svd, eigen_boosted_pure_svd
from models.als import als
from models.pmi_lrec import pmi_lrec_items
from models.weighted_pmi_lrec import weighted_pmi_lrec_items
from models.chainitemitem import chain_item_item
from models.predictor import predict
import pandas as pd


Widget registration using a string name has been deprecated. Widget registration now uses a plain `@register` decorator.



In [2]:
# Input locations. `argpath` is the base directory: it is passed as `path` to
# load_numpy and prepended to the other file names when they are read.
argpath = 'data/'
argtrain = 'Rtrain.npz'  # loaded into R_train via load_numpy
argvalid = 'Rvalid.npz'  # loaded into R_valid via load_numpy
argindex = 'Index.npy'  # array later used to filter Side_info by movieId
argside = 'ml-20m/movies.csv'  # CSV read into Side_info (movieId, title, genres)

In [3]:
# Read the three inputs stored under `argpath`: the training matrix, the
# index array and the movie metadata table.
index_file = argpath + argindex
side_file = argpath + argside
R_train = load_numpy(name=argtrain, path=argpath)
Index = np.load(index_file)
Side_info = pd.read_csv(side_file, encoding='utf-8', delimiter=',')

In [4]:
Side_info

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
5,6,Heat (1995),Action|Crime|Thriller
6,7,Sabrina (1995),Comedy|Romance
7,8,Tom and Huck (1995),Adventure|Children
8,9,Sudden Death (1995),Action
9,10,GoldenEye (1995),Action|Adventure|Thriller


In [5]:
R_train.shape

(138494, 22884)

In [6]:
Side_info.size

81834

In [7]:
np.asarray(np.sum(R_train,axis=0)).reshape(-1).shape

(22884,)

In [9]:
# Keep only the movies whose movieId occurs in Index, then renumber rows from 0.
in_index = Side_info['movieId'].isin(Index)
Side_info = Side_info[in_index].reset_index(drop=True)

In [10]:
# Column-wise sum of R_train, flattened to one value per column.
# NOTE(review): assumes the rows of Side_info are in the same order as the
# columns of R_train -- TODO confirm against how Index was built.
column_totals = np.asarray(np.sum(R_train, axis=0))
Side_info['popularity'] = column_totals.reshape(-1)

In [12]:
# Label string per movie: title, genres and popularity joined by HTML line breaks.
popularity_text = Side_info['popularity'].astype(str)
Side_info['notes'] = (Side_info['title'] + '<br>' + Side_info['genres']
                      + '<br>Popularity:' + popularity_text)

In [13]:
Side_info

Unnamed: 0,movieId,title,genres,popularity,notes
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,25021.0,Toy Story (1995)<br>Adventure|Animation|Childr...
1,2,Jumanji (1995),Adventure|Children|Fantasy,4298.0,Jumanji (1995)<br>Adventure|Children|Fantasy<b...
2,3,Grumpier Old Men (1995),Comedy|Romance,2570.0,Grumpier Old Men (1995)<br>Comedy|Romance<br>P...
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,230.0,Waiting to Exhale (1995)<br>Comedy|Drama|Roman...
4,5,Father of the Bride Part II (1995),Comedy,2070.0,Father of the Bride Part II (1995)<br>Comedy<b...
5,6,Heat (1995),Action|Crime|Thriller,9082.0,Heat (1995)<br>Action|Crime|Thriller<br>Popula...
6,7,Sabrina (1995),Comedy|Romance,2951.0,Sabrina (1995)<br>Comedy|Romance<br>Popularity...
7,8,Tom and Huck (1995),Adventure|Children,129.0,Tom and Huck (1995)<br>Adventure|Children<br>P...
8,9,Sudden Death (1995),Action,381.0,Sudden Death (1995)<br>Action<br>Popularity:381.0
9,10,GoldenEye (1995),Action|Adventure|Thriller,8914.0,GoldenEye (1995)<br>Action|Adventure|Thriller<...


In [15]:
np.sum(R_train,axis=0).shape

(1, 22884)

In [16]:
# Experiment settings. Among these keys only 'rank' is read by the cells
# visible in this notebook (to build the latent-factor file names); the rest
# are presumably consumed by the training/evaluation code -- TODO confirm.
params = {
    # Display name -> model function imported at the top of the notebook.
    'models': {"PLRec": embedded_lrec_items,
               "PmiPLRec": pmi_lrec_items,
               "ALS": als
               },
    'alphas': 10,
    'rank': 100,  # used in the 'latent/U_{name}_{rank}.npy' / 'latent/V_...' file names
    'lambda': 0.01,
    'topK': 10,
    'iter': 7,
    'metric': ['R-Precision', 'NDCG'],
}

In [17]:
# Per-column sum of R_train as a flat 1-D array (the same quantity that is
# stored in Side_info['popularity']).
column_sums = np.sum(R_train, axis=0)
item_popularity = np.asarray(column_sums).flatten()

In [426]:
# Saved PmiPLRec factors for the configured rank. The U_* array is used
# downstream as per-user vectors and the V_* array as per-item vectors.
# NOTE(review): the metrics shown under this cell are not printed by these
# statements; they appear to be left over from an earlier version of the cell.
pmi_tag = ("PmiPLRec", params['rank'])
RQ_pmi = np.load('latent/U_{0}_{1}.npy'.format(*pmi_tag))
Y_pmi = np.load('latent/V_{0}_{1}.npy'.format(*pmi_tag))

NDCG:(0.2166897925036953, 0.0008172874169728984)
R-Precision:(0.13134723220563294, 0.0006043677976043569)
Recall@50:(0.27740511280609054, 0.001203150507462379)
Clicks:(0.7533201203634049, 0.0077082474670282205)
Precision@50:(0.14607444592326835, 0.0006567643132106959)
Elapsed: 00:01:05


In [33]:
# Saved PLRec factors for the configured rank. The U_* array is used
# downstream as per-user vectors and the V_* array as per-item vectors.
# NOTE(review): the metrics shown under this cell are not printed by these
# statements; they appear to be left over from an earlier version of the cell.
lrec_tag = ("PLRec", params['rank'])
RQ_lrec = np.load('latent/U_{0}_{1}.npy'.format(*lrec_tag))
Y_lrec = np.load('latent/V_{0}_{1}.npy'.format(*lrec_tag))

NDCG:(0.1678184369426288, 0.0006633969240322184)
R-Precision:(0.10747242788033418, 0.0005131395650126645)
Recall@50:(0.20278737039541192, 0.0008773051481912347)
Clicks:(0.90515595162317, 0.0084544466779494)
Precision@50:(0.11765262426942887, 0.0005800609541356605)
Elapsed: 00:01:01


In [106]:
# Saved ALS factors for the configured rank. The U_* array is used
# downstream as per-user vectors and the V_* array as per-item vectors.
# NOTE(review): the metrics shown under this cell are not printed by these
# statements; they appear to be left over from an earlier version of the cell.
als_tag = ("ALS", params['rank'])
RQ_als = np.load('latent/U_{0}_{1}.npy'.format(*als_tag))
Y_als = np.load('latent/V_{0}_{1}.npy'.format(*als_tag))

NDCG:(0.1830385913490645, 0.0007789548514031727)
R-Precision:(0.10710897321868414, 0.0005531485311658711)
Recall@50:(0.25291314842944895, 0.001255835182705188)
Clicks:(0.9313407788901106, 0.00826187273725927)
Precision@50:(0.12009359990741279, 0.0005156618276501792)
Elapsed: 00:01:00


In [19]:
import os

from sklearn.manifold import TSNE
import plotly.plotly as py
import plotly
import plotly.graph_objs as go

# Plotly credentials are taken from the environment so that no secret is
# stored in the notebook. Set PLOTLY_USERNAME and PLOTLY_API_KEY before
# starting the kernel.
# NOTE(review): the API key that used to be hard-coded in this cell has been
# shared together with the notebook and should be revoked/regenerated.
plotly_username = os.environ.get('PLOTLY_USERNAME')
plotly_api_key = os.environ.get('PLOTLY_API_KEY')
if plotly_username and plotly_api_key:
    plotly.tools.set_credentials_file(username=plotly_username,
                                      api_key=plotly_api_key)
# When either variable is missing nothing is written here; plotly is then
# expected to use whatever credentials file already exists -- TODO confirm.

### Get Some User Records

In [108]:
# Validation counterpart of R_train, read from the same base directory.
R_valid = load_numpy(name=argvalid, path=argpath)

In [48]:
# Row sums of R_train (one total per row/user) as a flat array.
user_totals = np.asarray(np.sum(R_train, axis=1)).reshape(-1)
# Ten positions holding the largest totals among rows with a total of at least 2.
# NOTE(review): these positions index the filtered array, not the rows of
# R_train -- confirm that this is what was intended.
busy_totals = user_totals[user_totals >= 2]
np.argpartition(-busy_totals, 10)[:10]

array([117851, 125418,   8375,  82165, 131508, 131498,  20070,  78916,
       130376,  54299])

In [365]:
# Row indices of R_train ordered by ascending row sum.
activity = np.asarray(np.sum(R_train, axis=1)).reshape(-1)
can = np.argsort(activity)
# First 20 row indices (least active first) whose row sum is at least 5.
can[activity[can] >= 5][:20]

array([ 87541,  96219,  99482,  70315,  69898,  12414,  12422, 102675,
       113039,  10458,  69897, 112459,  12635, 113048,  70104,  87771,
       112383,  69801,  70493,   8855])

In [280]:
R_train[13201].nonzero()[1]

array([  15, 4197], dtype=int32)

In [90]:
# Lift the column-width limit so the long 'notes' strings are shown in full.
# NOTE(review): newer pandas releases deprecate -1 here in favour of None --
# confirm against the pandas version this notebook is run with before changing.
pd.set_option('display.max_colwidth', -1)

In [91]:
def getRecommendation(R_train, RQ, Y, index, k, side_info=None):
    """Return the 'notes' of the top-k items not yet seen by one user.

    Every item is scored with the inner product of its row in ``Y`` and the
    user's row in ``RQ``; items that are non-zero in the user's row of
    ``R_train`` are removed before the best ``k`` are taken.

    Parameters
    ----------
    R_train : 2-D matrix
        ``R_train[index].nonzero()[1]`` must yield the column indices of the
        items the user already has (presumably a scipy sparse matrix --
        confirm against load_numpy).
    RQ : array
        ``RQ[index]`` is the latent vector of the user.
    Y : array
        One row per item; ``Y.dot(user)`` gives the per-item scores.
    index : int
        Row of the user in ``R_train`` and ``RQ``.
    k : int
        Number of recommendations to return.
    side_info : DataFrame, optional
        Table with a 'notes' column whose row positions match the item
        indices. Defaults to the notebook-level ``Side_info``.

    Returns
    -------
    numpy.ndarray
        At most ``k`` values of the 'notes' column, best-scoring item first.
    """
    if side_info is None:
        # Backward-compatible default: the notebook-level table, looked up at call time.
        side_info = Side_info
    user = RQ[index]
    ranked = np.argsort(-Y.dot(user))
    seen = R_train[index].nonzero()[1]
    # The boolean mask keeps the ranking order while dropping already-seen items.
    unseen = ranked[~np.isin(ranked, seen)]
    # Slice before the lookup so that only k labels are materialised.
    return side_info['notes'].values[unseen[:k]]

In [300]:
# Row index of the user to inspect; the expression below shows the 'notes' of
# every item that is non-zero in this user's row of R_train.
case = 129385
Side_info['notes'].values[R_train[case].nonzero()[1]]

array([u'Apollo 13 (1995)<br>Adventure|Drama|IMAX<br>Popularity:25913.0',
       u'True Lies (1994)<br>Action|Adventure|Comedy|Romance|Thriller<br>Popularity:16725.0'],
      dtype=object)

In [301]:
Side_info['notes'].values[R_valid[case].nonzero()[1]]

array([u'Outbreak (1995)<br>Action|Drama|Sci-Fi|Thriller<br>Popularity:7936.0',
       u'Clear and Present Danger (1994)<br>Action|Crime|Drama|Thriller<br>Popularity:13092.0'],
      dtype=object)

In [302]:
getRecommendation(R_train, RQ_pmi, Y_pmi, case, 5)

array([u'Clear and Present Danger (1994)<br>Action|Crime|Drama|Thriller<br>Popularity:13092.0',
       u'Die Hard: With a Vengeance (1995)<br>Action|Crime|Thriller<br>Popularity:12631.0',
       u'Speed (1994)<br>Action|Romance|Thriller<br>Popularity:12650.0',
       u'Terminator 2: Judgment Day (1991)<br>Action|Sci-Fi<br>Popularity:24787.0',
       u'While You Were Sleeping (1995)<br>Comedy|Romance<br>Popularity:7526.0'],
      dtype=object)

In [303]:
getRecommendation(R_train, RQ_lrec, Y_lrec, case, 5)

array([u'GoldenEye (1995)<br>Action|Adventure|Thriller<br>Popularity:8914.0',
       u'While You Were Sleeping (1995)<br>Comedy|Romance<br>Popularity:7526.0',
       u'Beauty and the Beast (1991)<br>Animation|Children|Fantasy|Musical|Romance|IMAX<br>Popularity:15101.0',
       u'L\xe9on: The Professional (a.k.a. The Professional) (L\xe9on) (1994)<br>Action|Crime|Drama|Thriller<br>Popularity:11163.0',
       u'Die Hard: With a Vengeance (1995)<br>Action|Crime|Thriller<br>Popularity:12631.0'],
      dtype=object)

In [304]:
getRecommendation(R_train, RQ_als, Y_als, case, 5)

array([u'Dances with Wolves (1990)<br>Adventure|Drama|Western<br>Popularity:20929.0',
       u'Clear and Present Danger (1994)<br>Action|Crime|Drama|Thriller<br>Popularity:13092.0',
       u'Fugitive, The (1993)<br>Thriller<br>Popularity:27495.0',
       u'Aladdin (1992)<br>Adventure|Animation|Children|Comedy|Musical<br>Popularity:18780.0',
       u'Beauty and the Beast (1991)<br>Animation|Children|Fantasy|Musical|Romance|IMAX<br>Popularity:15101.0'],
      dtype=object)

In [253]:
# NOTE(review): `x` is not defined in any cell visible in this notebook, so
# this cell fails on a fresh kernel -- define `x` explicitly or remove the cell.
np.argmin(x[x>2])

0

In [391]:
# Second user to inspect: 12414 is one of the row indices returned by the
# `can[...]` cell above (rows of R_train with a row sum of at least 5).
case = 12414
# 'notes' of every item that is non-zero in this user's row of R_train.
Side_info['notes'].values[R_train[case].nonzero()[1]]

array([u'Die Hard: With a Vengeance (1995)<br>Action|Crime|Thriller<br>Popularity:12631.0',
       u'Clear and Present Danger (1994)<br>Action|Crime|Drama|Thriller<br>Popularity:13092.0',
       u'True Lies (1994)<br>Action|Adventure|Comedy|Romance|Thriller<br>Popularity:16725.0',
       u'Jurassic Park (1993)<br>Action|Adventure|Sci-Fi|Thriller<br>Popularity:24316.0',
       u'Beauty and the Beast (1991)<br>Animation|Children|Fantasy|Musical|Romance|IMAX<br>Popularity:15101.0'],
      dtype=object)

In [392]:
Side_info['notes'].values[R_valid[case].nonzero()[1]]

array([u'Seven (a.k.a. Se7en) (1995)<br>Mystery|Thriller<br>Popularity:21370.0',
       u'Braveheart (1995)<br>Action|Drama|War<br>Popularity:29159.0',
       u'Star Trek: Generations (1994)<br>Adventure|Drama|Sci-Fi<br>Popularity:7914.0',
       u'Firm, The (1993)<br>Drama|Thriller<br>Popularity:7270.0',
       u'Terminator 2: Judgment Day (1991)<br>Action|Sci-Fi<br>Popularity:24787.0'],
      dtype=object)

In [393]:
getRecommendation(R_train, RQ_pmi, Y_pmi, case, 5)

array([u'Aladdin (1992)<br>Adventure|Animation|Children|Comedy|Musical<br>Popularity:18780.0',
       u'Crimson Tide (1995)<br>Drama|Thriller|War<br>Popularity:10241.0',
       u'Stargate (1994)<br>Action|Adventure|Sci-Fi<br>Popularity:9789.0',
       u'GoldenEye (1995)<br>Action|Adventure|Thriller<br>Popularity:8914.0',
       u'Star Trek: Generations (1994)<br>Adventure|Drama|Sci-Fi<br>Popularity:7914.0'],
      dtype=object)

In [394]:
getRecommendation(R_train, RQ_lrec, Y_lrec, case, 5)

array([u'Aladdin (1992)<br>Adventure|Animation|Children|Comedy|Musical<br>Popularity:18780.0',
       u'Crimson Tide (1995)<br>Drama|Thriller|War<br>Popularity:10241.0',
       u'Cliffhanger (1993)<br>Action|Adventure|Thriller<br>Popularity:5052.0',
       u'Outbreak (1995)<br>Action|Drama|Sci-Fi|Thriller<br>Popularity:7936.0',
       u'Die Hard (1988)<br>Action|Crime|Thriller<br>Popularity:13357.0'],
      dtype=object)

In [395]:
getRecommendation(R_train, RQ_als, Y_als, case, 5)

array([u'Aladdin (1992)<br>Adventure|Animation|Children|Comedy|Musical<br>Popularity:18780.0',
       u'Dances with Wolves (1990)<br>Adventure|Drama|Western<br>Popularity:20929.0',
       u'Apollo 13 (1995)<br>Adventure|Drama|IMAX<br>Popularity:25913.0',
       u'Fugitive, The (1993)<br>Thriller<br>Popularity:27495.0',
       u'Batman (1989)<br>Action|Crime|Thriller<br>Popularity:15158.0'],
      dtype=object)

In [433]:
# Third user to inspect: 69801 is one of the row indices returned by the
# `can[...]` cell above (rows of R_train with a row sum of at least 5).
case = 69801
# 'notes' of every item that is non-zero in this user's row of R_train.
Side_info['notes'].values[R_train[case].nonzero()[1]]

array([u'28 Days Later (2002)<br>Action|Horror|Sci-Fi<br>Popularity:4700.0',
       u'Assassination of Richard Nixon, The (2004)<br>Crime|Drama|Thriller<br>Popularity:121.0',
       u'Children of Men (2006)<br>Action|Adventure|Drama|Sci-Fi|Thriller<br>Popularity:3237.0',
       u'Zodiac (2007)<br>Crime|Drama|Thriller<br>Popularity:943.0',
       u'28 Weeks Later (2007)<br>Horror|Sci-Fi|Thriller<br>Popularity:1042.0'],
      dtype=object)

In [434]:
Side_info['notes'].values[R_valid[case].nonzero()[1]]

array([u'Clockwork Orange, A (1971)<br>Crime|Drama|Sci-Fi|Thriller<br>Popularity:10989.0',
       u'Shining, The (1980)<br>Horror<br>Popularity:10234.0',
       u'Insider, The (1999)<br>Drama|Thriller<br>Popularity:3990.0',
       u'Donnie Darko (2001)<br>Drama|Mystery|Sci-Fi|Thriller<br>Popularity:8788.0',
       u'Machinist, The (Maquinista, El) (2004)<br>Mystery|Thriller<br>Popularity:1852.0'],
      dtype=object)

In [435]:
getRecommendation(R_train, RQ_pmi, Y_pmi, case, 5)

array([u'Donnie Darko (2001)<br>Drama|Mystery|Sci-Fi|Thriller<br>Popularity:8788.0',
       u'300 (2007)<br>Action|Fantasy|War|IMAX<br>Popularity:3919.0',
       u'Shaun of the Dead (2004)<br>Comedy|Horror<br>Popularity:4388.0',
       u'Sin City (2005)<br>Action|Crime|Film-Noir|Mystery|Thriller<br>Popularity:6412.0',
       u'V for Vendetta (2006)<br>Action|Sci-Fi|Thriller|IMAX<br>Popularity:5717.0'],
      dtype=object)

In [436]:
getRecommendation(R_train, RQ_lrec, Y_lrec, case, 5)

array([u'Donnie Darko (2001)<br>Drama|Mystery|Sci-Fi|Thriller<br>Popularity:8788.0',
       u'300 (2007)<br>Action|Fantasy|War|IMAX<br>Popularity:3919.0',
       u'Sin City (2005)<br>Action|Crime|Film-Noir|Mystery|Thriller<br>Popularity:6412.0',
       u'V for Vendetta (2006)<br>Action|Sci-Fi|Thriller|IMAX<br>Popularity:5717.0',
       u'Shaun of the Dead (2004)<br>Comedy|Horror<br>Popularity:4388.0'],
      dtype=object)

In [437]:
getRecommendation(R_train, RQ_als, Y_als, case, 5)

array([u'300 (2007)<br>Action|Fantasy|War|IMAX<br>Popularity:3919.0',
       u"Pan's Labyrinth (Laberinto del fauno, El) (2006)<br>Drama|Fantasy|Thriller<br>Popularity:4690.0",
       u'Borat: Cultural Learnings of America for Make Benefit Glorious Nation of Kazakhstan (2006)<br>Comedy<br>Popularity:2117.0',
       u'3:10 to Yuma (2007)<br>Action|Crime|Drama|Western<br>Popularity:1649.0',
       u'Hot Fuzz (2007)<br>Action|Comedy|Crime|Mystery<br>Popularity:2550.0'],
      dtype=object)