In [3]:
import graphlab as gl
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
from nltk import ne_chunk, pos_tag, word_tokenize
from nltk.tree import Tree

This non-commercial license of GraphLab Create for academic use is assigned to surbhi.jain@sjsu.edu and will expire on November 21, 2018.


[INFO] graphlab.cython.cy_server: GraphLab Create v2.1 started. Logging: /tmp/graphlab_server_1511465701.log


In [24]:

def score(df_true, df_pred):

    df = pd.concat([df_pred,
                    df_true], axis=1)

    g = df.groupby('user_id')

    top_5 = g.pred_rating.apply(
        lambda x: x >= x.quantile(.95)
    )

    return df_true[top_5==1].mean()['true_rating']

def extract_key_words(text):
    chunked = ne_chunk(pos_tag(word_tokenize(text)))
    prev = None
    continuous_chunk = []
    current_chunk = []
    for i in chunked:
        if type(i) == Tree:
            current_chunk.append(" ".join([token for token, pos in i.leaves()]))
        elif current_chunk:
            named_entity = " ".join(current_chunk)
            if named_entity not in continuous_chunk:
                continuous_chunk.append(named_entity)
                current_chunk = []
        else:
            continue
    
    return continuous_chunk
    

def clean_joke(joke):
    joke = re.sub(r'([^\.\s\w]|_)+', '', joke).replace(".", ". ")
    joke = joke.replace('\r', '') 
    joke = joke.replace('\n', '')
    joke = joke.replace('<br />', '')
    joke = joke.replace('<p>', '')
    joke = joke.replace('&quot;', '')
    joke = joke.replace('&#039;', '')
    joke = " ".join(extract_key_words(joke))
    return joke

def load_joke_classes_and_text():
    data = pd.read_csv("../data/Jokes_labelling.txt", delimiter="\t")
    data['Jokes'] = data['Jokes'].map(lambda j: clean_joke(j))
    data.drop('joke_category', axis=1, inplace=True)
    cat_feats = pd.get_dummies(data['joke_category_reduced'], prefix='cat')
    data = pd.concat([data['joke_id'], data['Jokes'], cat_feats], axis=1)
    
    data_sf = gl.SFrame(data)
    
    return data_sf
    

def load_data():
    # Input data
    sf = gl.SFrame("../data/ratings.dat", format='tsv')

    # Data to test predictions on
    df_sample = pd.read_csv("../data/sample_submission.csv")
    sf_sample = gl.SFrame(df_sample)

    return sf, sf_sample, df_sample

def load_joke_classes_text_and_glove_vectors():
    id_vectors = pd.read_csv("../data/Jokes_id_with_vectors.txt", delimiter="\t")
    data = pd.read_csv("../data/Jokes_labelling.txt", delimiter="\t")
    cat_feats = pd.get_dummies(data['joke_category_reduced'], prefix='cat')
    

    all_data = pd.merge(data, id_vectors, on='joke_id', how='inner').set_index('joke_id').reset_index()
    
    X = pd.concat([all_data, cat_feats], axis=1)
    #print X.columns
    #print X.describe(include='all')
    X.drop(['Jokes','joke_category', 'joke_category_reduced','Unnamed: 301'], axis=1, inplace=True)
    X = X.fillna(0)
    X = gl.SFrame(X)
    
    cat_feats['joke_id'] = range(1,151)
    cat_feats = gl.SFrame(cat_feats)
    return X, cat_feats

def recommendation_modules(sf, num_factors, regularization = None):
    
    #joke_data_and_category = load_joke_classes_and_text()
    joke_vector_and_cat, joke_cat = load_joke_classes_text_and_glove_vectors()
    
    
    
    '''
    ranking_model = gl.recommender.ranking_factorization_recommender.create(observation_data=sf,
                                                     user_id="user_id",
                                                     item_id="joke_id",
                                                     target='rating',
                                                     solver='auto',
                                                     num_factors = num_factors,
                                                     regularization = regularization,
                                                     verbose = False,
                                                     random_seed = 42)
    
    factorization_model = gl.recommender.factorization_recommender.create(observation_data=sf,
                                                     user_id="user_id",
                                                     item_id="joke_id",
                                                     target='rating',
                                                     solver='auto',
                                                     num_factors = num_factors,
                                                     regularization = regularization,
                                                     verbose = False,
                                                     random_seed = 42)
    '''
    item_sim_model = gl.recommender.item_similarity_recommender.create(observation_data=sf,
                                                     user_id="user_id",
                                                     item_id="joke_id",
                                                     target='rating',
                                                     #solver='auto',
                                                     #num_factors = num_factors,
                                                     #regularization = regularization,
                                                     verbose = False,
                                                     #random_seed = 42, 
                                                     similarity_type='jaccard')
    
    item_sim_model_with_categories = gl.recommender.item_similarity_recommender.create(observation_data=sf,
                                                     user_id="user_id",
                                                     item_id="joke_id",
                                                     target='rating',
                                                     #solver='auto',
                                                     #num_factors = num_factors,
                                                     #regularization = regularization,
                                                     verbose = False,
                                                     #random_seed = 42,
                                                     similarity_type='jaccard',
                                                     item_data=  joke_cat                                      
                                                     )
    item_sim_model_with_vectors_and_categories = gl.recommender.item_similarity_recommender.create(observation_data=sf,
                                                     user_id="user_id",
                                                     item_id="joke_id",
                                                     target='rating',
                                                     #solver='auto',
                                                     #num_factors = num_factors,
                                                     #regularization = regularization,
                                                     verbose = False,
                                                     #random_seed = 42,
                                                     similarity_type='jaccard',
                                                     item_data=  joke_vector_and_cat                                      
                                                     )
    
    
    
    return item_sim_model, item_sim_model_with_categories, item_sim_model_with_vectors_and_categories
    #return ranking_model, factorization_model, item_sim_model_pearson


In [26]:
if __name__ == "__main__":
    sf, sf_sample, df_sample = load_data()

    training_data, validation_data = gl.recommender.util.random_split_by_user(sf, 'user_id', 'joke_id')

    df_true = pd.DataFrame()
    df_pred = pd.DataFrame()

    df_true['user_id'] = validation_data['user_id']
    df_true['joke_id'] = validation_data['joke_id']

    df_true['true_rating'] = validation_data['rating']

    # Plot scores vs num_factors
    num_factors = range(2,100)
    num_factors = [2, 4, 8, 16, 32, 50, 64, 80, 100]
    #num_factors = [2, 4]
    scores = []
    for n in num_factors:
        for m in recommendation_modules(training_data, num_factors = n):
            #m = create_factorization_recommender(training_data, num_factors = n)
            df_pred['pred_rating'] = m.predict(validation_data)
            rc = score(df_true, df_pred)
            scores.append(rc)
            print 'Num Factors:', n, ' Score:', rc
        print "\n\n\n"
    plt.plot(num_factors, scores)
    plt.xlabel('Number of Latent Features')
    plt.ylabel('Score')
    plt.title('Score vs Number of Latent Features')
    plt.show()


------------------------------------------------------
Inferred types from first 100 line(s) of file as 
column_type_hints=[int,int,float]
If parsing fails due to incorrect types, you can correct
the inferred type list above and pass it to read_csv in
the column_type_hints argument
------------------------------------------------------


Num Factors: 2  Score: 1.53297483766
Num Factors: 2  Score: 1.39583333333
Num Factors: 2  Score: 1.38078327922






Num Factors: 4  Score: 1.561620671
Num Factors: 4  Score: 1.37736742424
Num Factors: 4  Score: 1.4053030303






Num Factors: 8  Score: 1.51234442641
Num Factors: 8  Score: 1.47602137446
Num Factors: 8  Score: 1.38876488095






Num Factors: 16  Score: 1.60044642857
Num Factors: 16  Score: 1.37432359307
Num Factors: 16  Score: 1.3304586039






Num Factors: 32  Score: 1.50656114719
Num Factors: 32  Score: 1.39174107143
Num Factors: 32  Score: 1.34307359307






Num Factors: 50  Score: 1.57727949134
Num Factors: 50  Score: 1.29464285714
Num Factors: 50  Score: 1.34591450216






Num Factors: 64  Score: 1.49188311688
Num Factors: 64  Score: 1.28121617965
Num Factors: 64  Score: 1.41734307359






Num Factors: 80  Score: 1.51924377706
Num Factors: 80  Score: 1.37489853896
Num Factors: 80  Score: 1.32700892857






Num Factors: 100  Score: 1.58850784632
Num Factors: 100  Score: 1.38274485931
Num Factors: 100  Score: 1.39126758658






ValueError: x and y must have same first dimension, but have shapes (9,) and (27,)