# Data Import

In [217]:
from __future__ import division, absolute_import, print_function
import pandas as pd
import numpy as np
from tfrec import Recommender
import pickle
import os
from sklearn.metrics import mean_squared_error

In [179]:
joke_ratings = pd.read_csv("jester_train.csv", header=0)

In [180]:
joke_ratings.describe()

Unnamed: 0,user_id,joke_id,rating
count,1000000.0,1000000.0,1000000.0
mean,32737.97955,70.710539,1.618454
std,18282.777812,46.004394,5.303466
min,1.0,5.0,-10.0
25%,17217.0,21.0,-2.031
50%,34837.0,69.0,2.25
75%,47302.0,112.0,5.719
max,63978.0,150.0,10.0


In [181]:
joke_ratings.shape

(1000000, 3)

In [182]:
joke_matrix = pd.pivot_table(joke_ratings,
                             values='rating',
                             index='user_id',
                             columns='joke_id'
                            )
joke_matrix.fillna(value=0, inplace=True)

joke_matrix.head()

joke_id,5,7,8,13,15,16,17,18,19,20,...,141,142,143,144,145,146,147,148,149,150
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,-9.281,-9.281,0.0,0.875,0.0,0.0,0.0,-8.719,-9.156,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,9.938,0.0,9.938,0.406,3.719,9.656,-2.688,0.0,-9.125,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,-9.844,-9.844,-7.219,-2.031,0.0,-9.969,-9.875,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,-5.812,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,4.75,0.0,0.0,0.0,0.0,6.219,0.0,0.0,5.406,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Train Model

In [183]:
jr1 = joke_ratings[joke_ratings['user_id'] < 17000]
jr2 = joke_ratings[(joke_ratings['user_id'] < 35000) & (joke_ratings['user_id']>=17500)]
jr3 = joke_ratings[(joke_ratings['user_id'] < 47500) & (joke_ratings['user_id']>=35000)]          
jr4 = joke_ratings[joke_ratings['user_id'] >=47500]

In [218]:
model = Recommender(k=50,
                    dtype='float32',
                    lambda_factors=0.0001,
                    lambda_biases=0.001,
                    init_factor_mean=0.17, #0.0, 
                    init_factor_stddev=0.01, # 0.01,
                    n_iter=3000,
                    learning_rate=1e-05,
                    batch_size=-1)

X = joke_ratings[['user_id', 'joke_id']].values
y = joke_ratings['rating'].values

model.fit(X, y, verbose=True, verbose_period=1000)

2017-05-15 15:34:00,194: tfrec.recommender : INFO      : will `fit()` fresh
INFO:tfrec.recommender:will `fit()` fresh
2017-05-15 15:34:00,847: tfrec.recommender : INFO      : new_num_users: 58127, new_num_items: 142
INFO:tfrec.recommender:new_num_users: 58127, new_num_items: 142
2017-05-15 15:34:00,848: tfrec.recommender : INFO      : num_users: 58127, num_items: 142
INFO:tfrec.recommender:num_users: 58127, num_items: 142
2017-05-15 15:34:01,750: tfrec.recommender : INFO      : instantiated a new TensorFlow session
INFO:tfrec.recommender:instantiated a new TensorFlow session
2017-05-15 15:34:04,058: tfrec.recommender : INFO      : Starting Gradient Descent for 3000 iterations
INFO:tfrec.recommender:Starting Gradient Descent for 3000 iterations
2017-05-15 15:34:04,198: tfrec.recommender : INFO      : training set RMSE = 5.49710655212
INFO:tfrec.recommender:training set RMSE = 5.49710655212
2017-05-15 15:34:04,624: tfrec.recommender : INFO      : Finished iteration #1
INFO:tfrec.recommen

Recommender(batch_size=-1, dtype='float32', init_factor_mean=0.17,
      init_factor_stddev=0.01, k=50, lambda_biases=0.001,
      lambda_factors=0.0001, learning_rate=1e-05, n_iter=3000)

# Check Jokes

In [109]:
def predict_jokes_for_user(user,
                           userid,
                           model, 
                           new=0, 
                           jokenum=5, 
                           verbosity=0
                          ):
    """INPUT:
            - user_id ('enter arbitrary number if new')
            - tfrec Recommender fitted model
            - number of jokes to compare
            
        OUTPUT:
            - List of (predicted rating, joke_id) for top
              jokes recommended for the user
              
        OPTIONAL:
            - Verbosity greater than 0 will print the
              recommended jokes
    """
    
    filename = 'jester_jokes.pkl' #No need to add as a parameter
    
    if new == 0:
        userpredict = model.predict(user)
    elif new == 1:
        userpredict = model.predict_new_user(user)
        
    jokes_for_user = sorted([(val, model.index_to_item_map_[i])\
                          for i, val in enumerate(userpredict)],\
                          reverse=True
                       )
    
    with open(filename, 'r') as f:
        up = pickle.Unpickler(f)
        jokelist = up.load()
    
    if verbosity > 0:
        for joke in jokes_for_user[:jokenum]:
            score = round(joke[0], 1)
            realscore = "New user"
            print userid
            print joke[1]
            
            if (joke[1] == "__unknown__") or (joke[1] == "__new_entry__"):
                if new == 0:
                    realscore = joke[1]
                joketext = "Not found"
            else:
                if new == 0:
                    realscore = joke_matrix.loc[userid, joke[1]]
                joketext = jokelist[joke[1]]
                
            print "-" * 50
            print "Index: {}\t Score: {}\t Real Score: {}".format(
                                                                  joke[1],
                                                                  score,
                                                                  realscore
                                                                 )
            
            print joketext
            
    return jokes_for_user

In [110]:
user_three = X[X[:, 0] == 700]
predict_jokes_for_user(user_three, 700, model, verbosity=1)

700
18
--------------------------------------------------
Index: 18	 Score: 7.7	 Real Score: 5.125
Q: If a person who speaks three languages is called "trilingual," and a person who speaks two languages is called "bilingual," what do you call a person who only speaks one language?

A: American!
700
69
--------------------------------------------------
Index: 69	 Score: 4.8	 Real Score: 0.0
Employer to applicant: "In this job we need someone who is responsible."

Applicant: "I'm the one you want. On my last job, every time anything went wrong, they said I was responsible."
700
66
--------------------------------------------------
Index: 66	 Score: 4.0	 Real Score: 0.0
Once upon a time, two brooms fell in love and decided to get married. Before the ceremony, the bride broom informed the groom broom that she was expecting a little whiskbroom. The groom broom was aghast!

"How is this possible?" he asked. "We've never swept together!"
700
__new_entry__
-------------------------------------

[(7.7228003, 18),
 (4.8356233, 69),
 (4.0065742, 66),
 (3.637743, '__new_entry__'),
 (2.2578909, 144),
 (1.91643, '__unknown__'),
 (1.7863848, 29),
 (1.722373, 16),
 (1.3911861, 42),
 (1.3106754, 148),
 (1.2847807, 60),
 (1.2172879, 96),
 (1.0438631, 109),
 (0.69476438, 7),
 (0.52396357, 8),
 (-0.33350694, 119),
 (-0.63891971, 106),
 (-2.2273121, 33),
 (-2.3053966, 15),
 (-2.7273655, 114),
 (-3.8159094, 129),
 (-5.5110168, 99),
 (-5.6595535, 17)]

In [111]:
X_test = jr2[['user_id', 'joke_id']].values
y_test = jr2['rating'].values

user_new = X_test[X_test[:, 0] == 18000]
predict_jokes_for_user(user_new, 18000, model, verbosity=1)

18000
99
--------------------------------------------------
Index: 99	 Score: 3.4	 Real Score: 0.0
Q: Whats the difference between greeting a queen and greeting the President of the United States?

A: You only have to get on one knee to greet the queen.
18000
29
--------------------------------------------------
Index: 29	 Score: 3.2	 Real Score: 7.062
Q: What's the difference between a lawyer and a plumber? 

A: A plumber works to unclog the system.
18000
__unknown__
--------------------------------------------------
Index: __unknown__	 Score: 3.2	 Real Score: __unknown__
Not found
18000
16
--------------------------------------------------
Index: 16	 Score: 3.2	 Real Score: 0.0
How many men does it take to screw in a light bulb?

One. Men will screw anything.
18000
66
--------------------------------------------------
Index: 66	 Score: 3.1	 Real Score: 0.0
Once upon a time, two brooms fell in love and decided to get married. Before the ceremony, the bride broom informed the groom bro

[(3.3892884, 99),
 (3.2215571, 29),
 (3.2038801, '__unknown__'),
 (3.1751614, 16),
 (3.1459892, 66),
 (3.0017977, 18),
 (2.4766316, 33),
 (2.3004045, 7),
 (1.9519881, '__new_entry__'),
 (1.8357029, 109),
 (1.8251524, 8),
 (-1.531932, 60),
 (-2.5158315, 129)]

In [219]:
def predict_test(model, filename='./test_predictions.csv'):
    
    joke_test = pd.read_csv("jester_test.csv", header=0)
    jt = joke_test.values
    
    predictions = model.predict(jt)
    
    joke_test['rating_target'] = predictions

    joke_test.to_csv(filename, index=False)

In [220]:
predict_test(model)

# Graphlab

In [146]:
import graphlab

In [205]:
joke_ratings2 = pd.read_csv("jester_train.csv", header=0)
joke_ratings2['item_id'] = joke_ratings2['joke_id']
del joke_ratings2['joke_id']

In [None]:
sf = graphlab.SFrame(joke_ratings2)
fr = graphlab.recommender.factorization_recommender

m = fr.create(sf,
               target='rating',
               num_factors=150,
               regularization=1e-12,
               linear_regularization=1e-12,
               nmf=False,
               max_iterations=100,
               solver='als',
               verbose=False
              )
recs = m.recommend()
print recs

In [None]:
def predict_graphlab_test(model, filename='./test_predictions.csv'):
    
    joke_test = pd.read_csv("jester_test.csv", header=0)
    joke_test['item_id'] = joke_test['joke_id']
    
    jt = sf = graphlab.SFrame(joke_test)

    predictions = model.predict(jt).to_numpy()
    
    del joke_test['item_id']
    joke_test['rating_target'] = predictions
    joke_test.to_csv('./test_predictions_temp.csv', index=False)

In [None]:
predict_graphlab_test(m)
%run scoring.py test_predictions.csv

# Grid Search

In [234]:
joke_test = pd.read_csv("jester_test.csv", header=0)
joke_test['item_id'] = joke_test['joke_id']

sf_test = graphlab.SFrame(joke_test)
sf_train = graphlab.SFrame(joke_ratings2)

In [235]:
import scoring

def scorer(model, train, test):
    """
    For each user, this scoring metric will select the 5% of jokes
    predicted to be most highly rated by that user. It then looks
    at the actual ratings (in the test data) that the user gave
    those jokes. Your score is the average of those ratings.

    Use this metric when reporting the score of your joke recommender.
    """
    joke_test = pd.read_csv("jester_test.csv", header=0)
    joke_test['item_id'] = joke_test['joke_id']
    
    jt = sf = graphlab.SFrame(joke_test)

    predictions = model.predict(jt).to_numpy()
    
    del joke_test['item_id']
    joke_test['rating_target'] = predictions
    joke_test.to_csv('./test_predictions_temp.csv', index=False)
    
    predictions = pd.read_csv('./test_predictions_temp.csv')
    score = scoring.score_top_5_percent(predictions)
    
    
    return {'average rankings': score}

In [236]:
scorer(m, 'tain', 'test')

{'average rankings': 2.279299794497302}

In [254]:
gs = graphlab.toolkits.model_parameter_search.grid_search

params = dict([('target', ['rating']),
               ('num_factors', [50, 8, 25, 100]),
               ('regularization', [0.01, 0.1, 1]),
               ('linear_regularization', [0.001, 0.1, 1]),
               ('nmf', [False, True]),
               ('max_iterations', [500, 1000, 3000]),
               ('solver', ['sgd', 'als']),
               ('sgd_step_size', [1e-05, 0.0001, 0.001, 0.1, 1]),
               ('verbose', [False]),
              ])

job = gs.create((sf_train, sf_test),
          graphlab.recommender.factorization_recommender.create,
          params,
          evaluator=scorer,
          environment=mycluster
         )



NameError: name 'mycluster' is not defined

In [248]:
job.get_results()

KeyboardInterrupt: 

In [253]:
config = graphlab.deploy.Ec2Config(aws_access_key_id='AKIAJNOKRDAJBCJPMEDQ',
                         aws_secret_access_key='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'
                         )


mycluster = graphlab.deploy.ec2_cluster.create('ec2', 's3://tylerandkeisukesbucket/', config)

mycluster.start()

[INFO] graphlab.connect.aws._ec2: Launching an m3.xlarge instance in the us-west-2c availability zone, with id: i-04a89355b0b32fa54. You will be responsible for the cost of this instance.
INFO:graphlab.connect.aws._ec2:Launching an m3.xlarge instance in the us-west-2c availability zone, with id: i-04a89355b0b32fa54. You will be responsible for the cost of this instance.
[INFO] graphlab.deploy._executionenvironment: Waiting for i-04a89355b0b32fa54 to start up.
INFO:graphlab.deploy._executionenvironment:Waiting for i-04a89355b0b32fa54 to start up.


RuntimeError: Unable to start host(s). Please terminate manually from the AWS console.

In [255]:
graphlab.deploy.ec2_cluster.load('s3://tylerandkeisukesbucket/')

NoAuthHandlerFound: No handler was ready to authenticate. 1 handlers were checked. ['HmacAuthV1Handler'] Check your credentials