In [1]:
import glob
import itertools
import os
import pickle
from collections import defaultdict

import numpy as np
import pandas as pd
from scipy import sparse
from sklearn.decomposition import NMF
from sklearn.metrics import mean_squared_error

from blob import blobConn
path = "/Users/yang_home/Documents/learning/AI_dev/"

In [2]:
# read data
filenames = glob.glob(path + 'news-portal-user-interactions-by-globocom/clicks/*.csv' )
#filenames = glob.glob(path + 'news-portal-user-interactions-by-globocom/clicks/clicks_hour_[0-1][0-9][0-9].csv' )
print(len(filenames))

# Collect the per-file frames and concatenate once at the end: calling
# pd.concat inside the loop re-copies everything read so far on every iteration.
hourly_frames = []
for file in filenames:
    f = pd.read_csv(file)
    # The file name ends in "_<hour index>.csv"; integer-dividing the hour
    # index by 24 gives the day number of the clicks in that file.
    f['day_number'] = int(file.split('_')[-1].split('.')[0]) // 24
    hourly_frames.append(f)

# pd.concat raises on an empty list, so keep the original "empty frame" result
# when the glob matched nothing.
click = pd.concat(hourly_frames, axis = 0) if hourly_frames else pd.DataFrame()
click.head()

385


Unnamed: 0,user_id,session_id,session_start,session_size,click_article_id,click_timestamp,click_environment,click_deviceGroup,click_os,click_country,click_region,click_referrer_type,day_number
0,93863,1507865792177843,1507865792000,2,96210,1507865832925,4,3,2,1,21,2,12
1,93863,1507865792177843,1507865792000,2,158094,1507865862925,4,3,2,1,21,2,12
2,294036,1507865795185844,1507865795000,2,20691,1507865819095,4,3,20,1,9,2,12
3,294036,1507865795185844,1507865795000,2,96210,1507865849095,4,3,20,1,9,2,12
4,77136,1507865796257845,1507865796000,2,336245,1507866133178,4,3,2,1,25,2,12


In [29]:
def get_rmse(actual, pred):
    """Root-mean-squared error restricted to the non-zero entries of `actual`.

    Both arguments are arrays; `pred` is indexed with the positions where
    `actual` is non-zero, so zero entries of `actual` never contribute.
    """
    observed = actual.nonzero()  # positions of the non-zero terms
    y_true = actual[observed].flatten()
    y_pred = pred[observed].flatten()
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)

def get_sparse_matrix(data, shape=None):
    """Build a CSC sparse matrix from a (row, column, value) dataframe.

    Params:
    data: dataframe whose first three columns are, in order, the row index,
        the column index and the value (here: user id, article id, click).
    shape: optional two-item sequence (n_rows, n_cols). When omitted (or
        falsy) the shape is inferred as (max row index + 1, max col index + 1).

    Returns:
    scipy.sparse.csc_matrix of the requested / inferred shape.

    Raises:
    ValueError: if `shape` is not given and `data` has no rows, because the
        shape cannot be inferred from nothing.
    """
    row = data.iloc[:, 0].values
    col = data.iloc[:, 1].values
    val = data.iloc[:, 2].values

    if not shape:
        # Only look at the index maxima when they are actually needed, so an
        # empty dataframe with an explicit shape yields an all-zero matrix
        # instead of failing on max() of an empty sequence.
        if len(row) == 0:
            raise ValueError('cannot infer the matrix shape from an empty dataframe; pass shape explicitly')
        shape = (row.max() + 1, col.max() + 1)

    return sparse.csc_matrix((val, (row, col)), shape=(shape[0], shape[1]))


def NMFModel(R, **params_NMF):
    """Fit an NMF factorisation of R (a sparse matrix).

    Keyword arguments are forwarded unchanged to the NMF constructor.
    Returns a 3-tuple: the reconstructed matrix (product of the two factors),
    the component matrix H, and the fitted estimator.
    """
    estimator = NMF(**params_NMF)
    user_factors = estimator.fit_transform(R)
    item_factors = estimator.components_
    reconstruction = user_factors @ item_factors
    return reconstruction, item_factors, estimator


def get_interaction(raw_data):
    """Collapse raw click rows into one binary record per (user, article) pair.

    Rows are grouped by user_id and click_article_id; the `click` column is 1
    whenever the pair has at least one row. Returns the columns
    user_id, click_article_id, click.
    """
    pair_counts = raw_data.groupby(['user_id', 'click_article_id'], as_index=False).size()
    flagged = pair_counts.assign(click=lambda counts: (counts['size'] >= 1).astype(int))
    return flagged[['user_id', 'click_article_id', 'click']]

def sample_by_index(data, user_limit, article_limit):
    """Keep only the rows whose user and article ids are within the given limits.

    The full click data is too large to train on, so this selects a slice of it.

    Params:
    data: user article click interaction
    user_limit: inclusive upper bound on user_id
    article_limit: inclusive upper bound on click_article_id
    """
    user_in_range = data['user_id'] <= user_limit
    article_in_range = data['click_article_id'] <= article_limit
    return data[user_in_range & article_in_range]


In [10]:
# Chronological split: day_number <= 10 is used for training, later days for testing.
train0, test0 = click[click['day_number']<=10], click[click['day_number']>10]
# further select user and article to reduce computation time
max_user = 9999
max_article = 9999
train1, test1 = sample_by_index(train0,max_user,max_article), sample_by_index(test0,max_user,max_article)

train = get_interaction(train1)
test = get_interaction(test1)
print(f'train shape is {train.shape}, test shape is {test.shape}')

R_train = get_sparse_matrix(train,shape=[max_user+1, max_article+1])
R_test = get_sparse_matrix(test, shape=[max_user+1, max_article+1])

# A sparse matrix exposes .shape directly; converting to a dense array just to
# print the shape allocates the full (max_user+1) x (max_article+1) matrix for nothing.
print(R_train.shape)
print(R_test.shape)

params_NMF = {
                'n_components' : 20,
                #'alpha_W' : 0.01,
                'l1_ratio' : 0, 
                'max_iter' : 200
            }

# NMFModel returns the reconstructed matrix, the component matrix H and the fitted estimator.
R_pred, H, model = NMFModel(R_train, **params_NMF)

# get_rmse indexes with the non-zero positions of its first argument, so the
# test matrix has to be dense here.
perf = get_rmse(R_test.toarray(), R_pred)
print(perf)

train shape is (1106, 3), test shape is (677, 3)
(10000, 10000)
(10000, 10000)


# Grid search



In [152]:
def cv_fold(interaction_df,max_user, max_article, fold):
    """Split the interactions into train/test by holding out one quadrant.

    The user id range and the article id range are each cut in half at
    max // 2, giving four (user half, article half) quadrants numbered 0-3.
    Rows falling inside quadrant `fold` form the test set; every other row
    goes to the train set. Returns (train, test).
    """
    half_user = max_user // 2
    half_article = max_article // 2

    lower_users, upper_users = (0, half_user), (half_user, max_user + 1)
    lower_articles, upper_articles = (0, half_article), (half_article, max_article + 1)
    quadrants = {
        0: (lower_users, lower_articles),
        1: (lower_users, upper_articles),
        2: (upper_users, lower_articles),
        3: (upper_users, upper_articles),
    }
    (user_lo, user_hi), (article_lo, article_hi) = quadrants[fold]

    users = interaction_df['user_id']
    articles = interaction_df['click_article_id']

    # Inside the quadrant: lower bound inclusive, upper bound exclusive.
    held_out = (
        (users >= user_lo) & (users < user_hi)
        & (articles >= article_lo) & (articles < article_hi)
    )
    # Outside the quadrant on at least one of the two axes.
    kept = (
        (users < user_lo) | (users >= user_hi)
        | (articles < article_lo) | (articles >= article_hi)
    )

    return interaction_df[kept], interaction_df[held_out]

def grid_search_train(params, max_user, max_article):
    """Grid-search NMF hyper-parameters with 4-fold block cross-validation.

    Reads the module-level `click` dataframe, keeps day_number <= 10 as the
    training period, restricts it to ids up to max_user / max_article, and for
    every parameter combination fits NMF on each cv_fold train split and
    scores it (RMSE) on the matching held-out split.

    Params:
    params: dict with list values under the keys 'n_components', 'alpha_W',
        'l1_ratio' and 'max_iter'; every combination is evaluated.
    max_user: inclusive upper bound on user_id.
    max_article: inclusive upper bound on click_article_id.

    Returns:
    (best_params, gs_df): gs_df has one row per combination (settings, average
    RMSE and the per-fold RMSEs); best_params is the same table sorted by
    ascending average RMSE.
    """
    train0, test0 = click[click['day_number']<=10], click[click['day_number']>10]

    train1, test1 = sample_by_index(train0,max_user,max_article), sample_by_index(test0,max_user,max_article)

    train = get_interaction(train1)
    # The later-days split is only reported here; it is not used for the search.
    test = get_interaction(test1)
    print(f'train shape is {train.shape}, test shape is {test.shape}')

    n_folds = 4  # cv_fold defines exactly four quadrants (0-3)
    matrix_shape = [max_user + 1, max_article + 1]

    grid_search_res = []
    # columns: max_user, max_article, n_components, alpha, l1_ratio, max_iter, rmse
    # itertools.product iterates with the rightmost list varying fastest, i.e.
    # the same order as nesting the four loops with n_components outermost.
    grid = itertools.product(params['n_components'], params['alpha_W'],
                             params['l1_ratio'], params['max_iter'])
    for n_round, (n_component, alpha_W, l1_ratio, max_iter) in enumerate(grid, start=1):
        print(f'round {n_round}')
        params_gs = {
                        'n_components' : n_component,
                        'init' : 'random', 
                        'random_state' : 0, 
                        'alpha_W' : alpha_W,
                        'l1_ratio' : l1_ratio,
                        'max_iter' : max_iter
                    }
        # generate the folds for train and cross validation
        err = []
        for i in range(n_folds):
            print(f'fold {i}')
            X, Y = cv_fold(train, max_user, max_article,i)
            R_X = get_sparse_matrix(X, shape=matrix_shape)
            R_Y = get_sparse_matrix(Y, shape=matrix_shape)

            # NMFModel returns (reconstruction, H, estimator); only the
            # reconstruction is needed for scoring.
            R_hat, _, _ = NMFModel(R_X, **params_gs)

            rmse = get_rmse(R_Y.toarray(), R_hat)
            err.append(rmse)
        avg_rmse = sum(err) / len(err)
        print(f'this is average rmse: {avg_rmse}')

        grid_search_res.append([max_user, max_article,
                params_gs['n_components'], params_gs['alpha_W'], params_gs['l1_ratio'], params_gs['max_iter'],
                avg_rmse] + err)

    gs_df = pd.DataFrame(grid_search_res, columns=['max_user','max_article',
                                      'n_components','alpha_W','l1_ratio','max_iter', 
                                      'avg_rmse','fold_1_rmse','fold_2_rmse','fold_3_rmse','fold_4_rmse'])
    best_params = gs_df.sort_values(by='avg_rmse')

    return best_params, gs_df


In [156]:
# Grid search train
# Only the number of components varies; the other settings are held at a single value.
params = dict(
    n_components=[50, 100],
    alpha_W=[0],
    l1_ratio=[0],
    max_iter=[200],
)

# Search over users 0..5000 and articles 0..100000.
best_params, record_df = grid_search_train(params, 5000, 100000)




train shape is (15676, 3), test shape is (6570, 3)
round 1
fold 0
fold 1




fold 2
fold 3
this is average rmse: 0.9960317485571636
round 2
fold 0




fold 1




fold 2




fold 3




this is average rmse: 0.9973239730110631


In [157]:
# Display the grid-search log returned by grid_search_train: one row per
# parameter combination with its average and per-fold RMSE.
record_df

Unnamed: 0,max_user,max_article,n_components,alpha_W,l1_ratio,max_iter,avg_rmse,fold_1_rmse,fold_2_rmse,fold_3_rmse,fold_4_rmse
0,5000,100000,50,0,0,200,0.996032,0.994264,0.997619,0.994784,0.99746
1,5000,100000,100,0,0,200,0.997324,0.996156,0.99834,0.99642,0.998379


In [217]:
# Settings selected from the grid search, to be used on a larger slice of the data.
opt_params = dict(n_components=50, alpha_W=0, l1_ratio=0, max_iter=200)
max_user, max_article = 10000, 100000

# Chronological split: day_number <= 10 for training, later days for testing.
train0 = click[click['day_number'] <= 10]
test0 = click[click['day_number'] > 10]

train1 = sample_by_index(train0, max_user, max_article)
test1 = sample_by_index(test0, max_user, max_article)

train = get_interaction(train1)
test = get_interaction(test1)
R_train = get_sparse_matrix(train, shape=[max_user + 1, max_article + 1])
#R_test = get_sparse_matrix(test, shape=[max_user+1, max_article+1])
#R_pred, estimator = NMFModel(R_train, **opt_params)
#performance = get_rmse(R_test.toarray(),R_pred)
#performance


In [22]:
#np.save(path + 'recommender_system/model/NMF_sparse2.npy', R_pred_sparse)

## make prediction

In [172]:
# save model
def save_model(model, file_path=None):
    """Pickle `model` to disk.

    Params:
    model: the object to serialise.
    file_path: destination file. When omitted, the model is written to
        'recommender_system/model/NMF_model_v1.pickle' under the module-level
        `path`, which is the location the original version always used.
    """
    if file_path is None:
        file_path = path + 'recommender_system/model/NMF_model_v1.pickle'
    # The with-block closes the file on exit; no explicit close() is needed.
    with open(file_path, 'wb') as f:
        pickle.dump(model, f)
# load model
def load_mdoel(file_path):
    """Load and return a pickled model from `file_path`.

    NOTE: unpickling can execute arbitrary code, so only load files that were
    produced by a trusted source (e.g. save_model in this project).
    """
    # The with-block closes the file on exit; no explicit close() is needed.
    with open(file_path, 'rb') as f:
        return pickle.load(f)


# Correctly spelled alias; the original (misspelled) name is kept so existing callers still work.
load_model = load_mdoel

In [28]:
# make prediction for user with history
# One row per (user, article) pair over the whole click log; `click` holds the
# number of rows for that pair.
all_user_article_interaction = (
    click.groupby(['user_id', 'click_article_id'], as_index=False)
    .size()
    .rename(columns={'size': 'click'})
)


In [52]:
# new user
# NOTE(review): this imports NMF_recommendation from a local `model` module, while a
# function with the same name is also defined in a later cell of this notebook —
# confirm which implementation is meant to be used here.
from model import NMF_recommendation

# `model`, `H` and `R_pred` are the names bound by the NMFModel(...) call in the
# baseline training cell earlier in the notebook.
new_id = 322482
xx = NMF_recommendation(all_user_article_interaction, model,H,R_pred, new_id)
xx

array([9999, 3329, 3336, 3335, 3334])

In [42]:
# Interactions whose article id is below 10000.
all_user_article_interaction.loc[lambda pairs: pairs['click_article_id'] < 10000]

Unnamed: 0,user_id,click_article_id,click
248,10,5341,1
249,10,7744,1
669,23,2137,1
720,24,5341,1
872,26,4658,1
...,...,...,...
2949745,322482,5349,1
2950173,322666,1973,1
2950560,322829,5583,1
2950561,322829,5595,1


In [38]:
def NMF_recommendation(data, estimator, H, R_pred, user_id):
    """Return the ids of the five highest-scoring articles for a user.

    Params:
    data: user/article interaction dataframe (already generated) with the
        columns user_id and click_article_id.
    estimator: fitted model exposing transform(); only used for users that
        have no row in R_pred.
    H: component matrix multiplied with the transformed history.
    R_pred: 2-D array of predicted scores, one row per user index and one
        column per article index.
    user_id: user to recommend for (anything int() accepts).

    Returns:
    An array with five article indices ordered from highest to lowest score,
    or the string '404' when the user is outside R_pred and has no usable
    click history in `data`.
    """
    user_id = int(user_id)
    n_users, n_articles = R_pred.shape[0], R_pred.shape[1]

    # Explicit lower bound: a negative id would otherwise silently index
    # R_pred from the end and return another user's recommendations.
    if 0 <= user_id < n_users:
        # user in prediction matrix
        return np.argsort(R_pred[user_id])[::-1][:5]

    # user not in prediction matrix:
    # convert click history to a one-row preference matrix, use existing model.
    # Valid column indices are 0 .. n_articles - 1; including n_articles itself
    # (as `<=` did) makes the assignment below raise IndexError.
    in_range = data['click_article_id'].between(0, n_articles - 1)
    user_df = data[(data['user_id'] == user_id) & in_range]
    if len(user_df) == 0:
        # completely new with no click history
        return '404'

    history = np.zeros((1, n_articles), dtype=int)
    history[0, user_df['click_article_id'].values] = 1

    new_record = estimator.transform(history)
    new_R_hat = np.dot(new_record, H)
    return np.argsort(new_R_hat[0])[::-1][:5]

In [226]:
# NMF_recommendation takes (data, estimator, H, R_pred, user_id); H is the fitted
# estimator's component matrix, the same quantity NMFModel returns as its second value.
# NOTE(review): `estimator` and this `R_pred` are only assigned in commented-out
# code in this notebook — confirm they are defined before running this cell.
rec_test = NMF_recommendation(all_user_article_interaction, estimator, estimator.components_, R_pred, 12345)
rec_test

(1, 100001)
(10001, 100001)


array([30970,  2647, 58565, 36080, 95977])

## Appendix

In [None]:
"""
def cv_matrix(X, fold):
    '''
    Given a matrix X, the function creates 4 sets of train + test matrices
    where each train matrix is masked with zeros in 0.25 of the values, and the
    test matrix is masked zeros in 0.75 of them.
    X - numpy array
    fold - is an integer from 0-3.
    Returns the masked data and also the masks for train and test
    '''
    # Create a dict with the slicing indices
    rows = X.shape[0]
    cols = X.shape[1]
    mid_rows = int(rows/2)
    mid_cols = int(cols/2)
    
    idx_dict = {
                0: [[0,mid_rows],[0, mid_cols]],
                1: [[0,mid_rows],[mid_cols, cols]],
                2: [[mid_rows, rows], [0, mid_cols]],
                3: [[mid_rows, rows], [mid_cols, cols]]
    }
    
    idexes = idx_dict[fold]
    # Create masks
    train_mask = np.full((rows, cols), 1)
    train_mask[idexes[0][0]:idexes[0][1], idexes[1][0]:idexes[1][1]] = 0
    test_mask = 1 - train_mask
    
    
    # Create X_train
    X_train = X.copy()
    X_train[train_mask==0] = 0
    
    # Create X_test
    X_test = X.copy()
    X_test[train_mask==1] = 0
        
    return X_train, X_test, train_mask, test_mask   
"""