In [5]:
import string
import re
import graphlab
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error

In [6]:
def clean_data(filename='../data/Jokes_labelling.txt'):
    """Read the joke file and return one cleaned, lower-cased string per joke.

    The file is read as text, lower-cased and split into lines. The first
    line is skipped (presumably a header row -- confirm against the data
    file). For every remaining line the second tab-separated field is taken
    as the joke text and its ``|||`` separators are replaced by spaces.

    Cleaning steps, in order:

    1. Remove the literal markup tokens ``<br />``, ``<p>``, ``&quot;`` and
       ``&#039;``.
    2. Remove every character that is not a period, whitespace or a word
       character, and remove underscores.
    3. Put a space after every period.

    Args:
        filename: Path of the tab-separated joke file. Defaults to the
            location that was previously hard-coded.

    Returns:
        list of str: the cleaned joke texts, in file order.

    Raises:
        IndexError: if a non-header line contains no tab character.
    """
    # Markup has to be stripped *before* the punctuation filter. The filter
    # deletes '<', '>', '/', '&', ';' and '#', so running it first would leave
    # the residue 'br', 'p', 'quot' and '039' glued into the joke text and the
    # replacements below could never match.
    markup_tokens = ('<br />', '<p>', '&quot;', '&#039;')

    with open(filename) as f:
        rows = f.read().lower().splitlines()

    jokes = []
    for row in rows[1:]:
        joke = " ".join(row.split('\t')[1].split('|||'))
        for token in markup_tokens:
            joke = joke.replace(token, '')
        # splitlines() already removed '\r' / '\n', and this filter removes
        # ':' as well, so no separate newline or "id:" prefix handling is
        # needed afterwards.
        joke = re.sub(r'([^\.\s\w]|_)+', '', joke).replace(".", ". ")
        jokes.append(joke)
    return jokes

In [7]:

#def load_train_test_data(filename_train, filename_test):
def load_train_test_data(ratings_path='../data/jester_dataset_2/jester_ratings.dat',
                         train_fraction=0.8, seed=None):
    """Read the ratings file and randomly split its rows into train and test.

    Every row is assigned independently: it goes to the training set when a
    uniform random draw in [0, 1) is below ``train_fraction``, otherwise to
    the test set. The split sizes are therefore only approximately
    ``train_fraction`` / ``1 - train_fraction``.

    Args:
        ratings_path: Path of the tab-separated ratings file. Defaults to the
            location that was previously hard-coded.
        train_fraction: Probability that a row lands in the training set.
        seed: Optional integer seed. When given, the split is reproducible
            and independent of numpy's global random state. When ``None``
            (the default) numpy's global random state is used, exactly as
            before.

    Returns:
        tuple: ``(train_data, test_data)`` as ``graphlab.SFrame`` objects.
    """
    ratings_data = pd.read_csv(ratings_path, sep='\t')

    # A private RandomState keeps a seeded split from disturbing (or being
    # disturbed by) other users of the global numpy generator.
    rng = np.random if seed is None else np.random.RandomState(seed)
    in_train = rng.rand(len(ratings_data)) < train_fraction

    train_data = graphlab.SFrame(ratings_data[in_train])
    test_data = graphlab.SFrame(ratings_data[~in_train])
    return train_data, test_data

def matrix_factorization_model(data, num_factors):
    """Create a GraphLab factorization recommender from ``data``.

    The model is created with ``rating`` as the target column, ``user_id``
    as the user column and ``joke_id`` as the item column.

    Args:
        data: Ratings passed straight through to the GraphLab ``create``
            call (the caller in this file passes an SFrame).
        num_factors: Forwarded as the ``num_factors`` option.

    Returns:
        The model object returned by GraphLab.
    """
    create_model = graphlab.recommender.factorization_recommender.create
    return create_model(
        data,
        target='rating',
        user_id='user_id',
        item_id='joke_id',
        num_factors=num_factors,
    )

def get_model_results(mf_model, data):
    """Print the model's predictions and factors, and return the factors.

    Args:
        mf_model: Fitted model exposing ``predict(data)`` and
            ``get('coefficients')``; the coefficients must have ``user_id``
            and ``joke_id`` entries whose ``factors`` column supports
            ``to_numpy()``.
        data: Ratings to predict on; passed to ``mf_model.predict``.

    Returns:
        tuple: ``(user_factors, joke_factors)``. ``user_factors`` is the
        user factor array as returned by ``to_numpy()``; ``joke_factors`` is
        the *transposed* joke factor array (presumably one row per latent
        factor and one column per joke -- confirm against the model).
    """
    predicted_rating = mf_model.predict(data)
    coeffs = mf_model.get('coefficients')
    user_factors = coeffs['user_id']['factors'].to_numpy()
    joke_factors = coeffs['joke_id']['factors'].to_numpy().T

    # The id columns and the dense user-by-joke reconstruction
    # (np.dot(user_factors, joke_factors) + intercept) used to be computed
    # here but were never used; the reconstruction allocated a full
    # users x jokes matrix only to discard it.
    print('predicted rating: {}'.format(predicted_rating))
    print('user factors: {}'.format(user_factors))
    print('joke factors: {}'.format(joke_factors))
    return user_factors, joke_factors

def grid_search_mf_model(data, params, num_folds = 5):
    """Run a k-fold grid search over factorization-recommender parameters.

    Args:
        data: Ratings handed to ``graphlab.cross_validation.KFold``.
        params: Parameter dictionary handed to ``graphlab.grid_search.create``.
        num_folds: Number of folds passed to ``KFold``.

    Returns:
        Whatever ``job.get_results()`` returns. The same object is printed,
        so callers can now also inspect the results programmatically instead
        of only reading them from the output.
    """
    folds = graphlab.cross_validation.KFold(data, num_folds)
    job = graphlab.grid_search.create(
        folds, graphlab.recommender.factorization_recommender.create, params)
    results = job.get_results()
    # Function-call form of print: consistent with the rest of the file and
    # identical output for a single argument under Python 2.
    print(results)
    return results

def test_rating_rmse(mf_model, test_data):
    predicted_test_rating = mf_model.predict(test_data)
    test_rmse = np.sqrt(mean_squared_error(predicted_test_rating, test_data['rating']))
    return predicted_test_rating, test_rmse

def find_latent_features(text, user_factors, joke_factors, joke_ids=None, top_n=10):
    """Print and return the highest-loading jokes of every latent factor.

    For each latent factor the jokes are ranked by their factor value and the
    ``top_n`` highest are listed, strongest first.

    Args:
        text: Sequence of joke texts.
        user_factors: 2-D array; only ``user_factors.shape[1]`` is used, as
            the number of latent factors to report.
        joke_factors: 2-D array with one row per latent factor and one
            column per joke (the transposed layout the caller in this file
            passes).
        joke_ids: Optional sequence giving the joke id of every column of
            ``joke_factors``. When supplied, column ``c`` is looked up as
            ``text[joke_ids[c] - 1]`` (assumes 1-based ids that index
            ``text`` -- confirm against the data). When omitted, column
            ``c`` is looked up as ``text[c]``, i.e. the columns are assumed
            to be in the same order as ``text``.
        top_n: How many jokes to list per factor.

    Returns:
        list of list: for every latent factor, the selected joke texts in
        descending order of factor value.
    """
    # argsort yields 0-based column positions in ascending order; flip to
    # descending and keep the first `top_n`. Clamping makes top_n <= 0 give
    # empty lists rather than every joke (a bare [-0:] slice selects all).
    keep = max(0, min(int(top_n), joke_factors.shape[1]))
    ranked_positions = np.argsort(joke_factors, axis=1)[:, ::-1][:, :keep]

    ids = None if joke_ids is None else list(joke_ids)

    top_jokes = []
    for factor in range(user_factors.shape[1]):
        jokes_for_factor = []
        for position in ranked_positions[factor]:
            position = int(position)
            # The positions are 0-based. Subtracting 1 from them directly
            # (as was done before) shifted every joke by one and wrapped
            # position 0 around to the last joke in `text`.
            text_index = position if ids is None else int(ids[position]) - 1
            jokes_for_factor.append(text[text_index])
        print(jokes_for_factor)
        top_jokes.append(jokes_for_factor)
    return top_jokes

if __name__ == '__main__':
    # Load the cleaned joke texts and the train/test split of user ratings.
    text = clean_data()
    train_data, test_data = load_train_test_data()

    # Grid search over the number of latent factors.
    params = {
        'user_id': 'user_id',
        'item_id': 'joke_id',
        'target': 'rating',
        'num_factors': [2, 4, 6, 8],
    }
    grid_search_mf_model(train_data, params, num_folds=5)

    # Build the matrix factorization model.
    # NOTE(review): the factor count is fixed at 8 here; it is not taken
    # from the grid-search results above.
    mf_model = matrix_factorization_model(train_data, 8)
    user_factors, joke_factors = get_model_results(mf_model, train_data)

    # Predict the ratings on the held-out test data and report the RMSE.
    predicted_test_rating, test_rmse = test_rating_rmse(mf_model, test_data)
    print('predicted test rating: {}'.format(predicted_test_rating))
    print('test rmse: {}'.format(test_rmse))

    # List the jokes that load most strongly on each latent factor.
    find_latent_features(text, user_factors, joke_factors)











    #


[INFO] graphlab.deploy.job: Validating job.
[INFO] graphlab.deploy.job: Creating a LocalAsync environment called 'async'.
[INFO] graphlab.deploy.map_job: Validation complete. Job: 'Model-Parameter-Search-Nov-21-2017-03-26-5900000' ready for execution
[INFO] graphlab.deploy.map_job: Job: 'Model-Parameter-Search-Nov-21-2017-03-26-5900000' scheduled.
[INFO] graphlab.deploy.job: Validating job.
[INFO] graphlab.deploy.map_job: A job with name 'Model-Parameter-Search-Nov-21-2017-03-26-5900000' already exists. Renaming the job to 'Model-Parameter-Search-Nov-21-2017-03-26-5900000-738f1'.
[INFO] graphlab.deploy.map_job: Validation complete. Job: 'Model-Parameter-Search-Nov-21-2017-03-26-5900000-738f1' ready for execution
[INFO] graphlab.deploy.map_job: Job: 'Model-Parameter-Search-Nov-21-2017-03-26-5900000-738f1' scheduled.
[INFO] graphlab.deploy.job: Validating job.
[INFO] graphlab.deploy.map_job: Validation complete. Job: 'Model-Parameter-Search-Nov-21-2017-03-26-5900001' ready for execution


+---------+-------------+--------+---------+----------------------+
| item_id | num_factors | target | user_id |       model_id       |
+---------+-------------+--------+---------+----------------------+
| joke_id |      8      | rating | user_id | [19, 18, 15, 17, 16] |
| joke_id |      2      | rating | user_id |   [1, 0, 3, 2, 4]    |
| joke_id |      6      | rating | user_id | [11, 10, 13, 12, 14] |
| joke_id |      4      | rating | user_id |   [9, 8, 5, 7, 6]    |
+---------+-------------+--------+---------+----------------------+
+----------------------+--------------------+------------------------+
| mean_validation_rmse | mean_training_rmse | mean_training_recall@5 |
+----------------------+--------------------+------------------------+
|    5.00307448707     |   3.02898736494    |    0.0618822521335     |
|    5.00047999873     |   3.69368862721    |    0.0729669909063     |
|    5.00268152467     |   3.19812982249    |    0.0716571826426     |
|    5.00320363846     |   3.4

predicted rating: [-2.14621202694319, -3.617419168819411, -3.410405084957106, -7.455473110546095, -16.123871490825636, -6.587657616008742, -5.621249959339125, -5.9561999295844865, -7.153216526378615, 1.7912305141792464, -1.0723653291358781, -3.4090430592193437, -8.424137518276197, 1.8847626592025923, 6.913835361133592, 7.389894797931688, 6.6954631353721785, 6.171342327724473, 7.7662087227211165, 2.8242824221954512, 2.67413648357548, 2.51606784714319, 4.73115499271013, 3.602174773345964, 6.601862981449144, 6.411162092815416, 7.249295070301073, 6.325973227153795, 7.849288060794847, 8.057376458774584, 2.739910617004411, 0.6071099485263991, 7.607539131771104, 5.657790973316209, 5.050221874843614, 9.388224079738634, 3.8954559112892317, 5.678210809360521, 7.852153852115648, 8.153966977726, 6.757111623416917, 1.8944786931381392, 7.479228212963121, 0.44887907756425616, 7.157579257617967, 3.2835836107835936, 0.8749645198688674, 2.369883194099443, 5.825430109630601, -1.5037215505733323, 7.424101