In [2]:
import numpy as np
import pandas as pd
import os
import glob
import pickle
from collections import defaultdict
from blob import blobConn
from sklearn.decomposition import NMF
from scipy import sparse
from sklearn.metrics import mean_squared_error

path = "/Users/yang_home/Documents/learning/AI_dev/"

In [3]:
# read data
filenames = glob.glob(path + 'news-portal-user-interactions-by-globocom/clicks/*.csv' )
#filenames = glob.glob(path + 'news-portal-user-interactions-by-globocom/clicks/clicks_hour_[0-1][0-9][0-9].csv' )
print(len(filenames))
click = pd.DataFrame()
for file in filenames:
    f = pd.read_csv(file)
    # add day number, 16 days of data in total
    f['day_number'] = int(file.split('_')[-1].split('.')[0]) // 24
    click = pd.concat([click, f], axis = 0)
click.head()

385


Unnamed: 0,user_id,session_id,session_start,session_size,click_article_id,click_timestamp,click_environment,click_deviceGroup,click_os,click_country,click_region,click_referrer_type,day_number
0,93863,1507865792177843,1507865792000,2,96210,1507865832925,4,3,2,1,21,2,12
1,93863,1507865792177843,1507865792000,2,158094,1507865862925,4,3,2,1,21,2,12
2,294036,1507865795185844,1507865795000,2,20691,1507865819095,4,3,20,1,9,2,12
3,294036,1507865795185844,1507865795000,2,96210,1507865849095,4,3,20,1,9,2,12
4,77136,1507865796257845,1507865796000,2,336245,1507866133178,4,3,2,1,25,2,12


In [52]:
print(len(click.click_article_id.unique()))
print(click.click_article_id.max())

print(len(click.user_id.unique()))
print(click.user_id.max())

46033
364046
322897
322896


In [39]:
from sklearn.metrics import mean_squared_error

def get_rmse(pred, actual):
    """Pred and actual are arrays
    """
    print(f'actual non zero {len(actual.nonzero())}')
    print(f'pred non zero {len(pred.nonzero())}')

    pred = pred[actual.nonzero()].flatten()     # Ignore zero terms
    actual = actual[actual.nonzero()].flatten() # Ignore zero terms
    return np.sqrt(mean_squared_error(pred, actual))

def get_sparse_matrix(data, shape=None):
    """data is the user article click dataframe"""
    row = data.iloc[:,0].values
    col = data.iloc[:,1].values
    val = data.iloc[:,2].values
    n_user = len(data.user_id.unique())
    n_article = len(data.click_article_id.unique())
    max_user_idx = max(row)
    max_article_idx = max(col)
    if not shape:
        sparse_mat = sparse.csc_matrix((val, (row, col)), shape=(max_user_idx+1, max_article_idx+1))

    else:
        sparse_mat = sparse.csc_matrix((val, (row, col)), shape=(shape[0], shape[1]))

    return sparse_mat


def NMFModel(R, **params_NMF):
    """R is sparse matrix"""
    model = NMF(**params_NMF)
    W = model.fit_transform(R)
    #H = model.components_
    R_hat = model.inverse_transform(model.transform(R))
    return R_hat

'''
def NMFTrain(R):
    #train, test split
    # train model
    R_hat = NMFModel(R)
    R_array = R.toarray()
    # evaluate test result
    rmse = get_rmse(R_hat, R_array)
'''

def get_interaction(raw_data):
    user_article_click = raw_data.groupby(['user_id','click_article_id'],as_index=False).size()
    user_article_click.rename(columns = {'size':'click'},inplace=True)
    return user_article_click

def sample_by_index(data, user_limit, article_limit):
    ''' the click data is too big
    this function is to get a portion of users and articles to train the model
    Params:
    data: user article click interaction
    user_limit: user_id upper limit. (user_id start from 0 to 323896)
    article_limit: article_id upper limit. (article_id start from 0 to 300k)
    '''
    df = data[(data['user_id']<=user_limit) & (data['click_article_id']<=article_limit)]
    return df


In [45]:
train0, test0 = click[click['day_number']<=10], click[click['day_number']>10]
# further select user and article to reduce computation time
max_user = 10000
max_article = 100000
train1, test1 = sample_by_index(train0,max_user,max_article), sample_by_index(test0,max_user,max_article)

train = get_interaction(train1)
test = get_interaction(test1)
print(f'train shape is {train.shape}, test shape is {test.shape}')

R_train = get_sparse_matrix(train,shape=[max_user+1, max_article+1])
R_test = get_sparse_matrix(test, shape=[max_user+1, max_article+1])

print(R_train.toarray().shape)
print(R_test.toarray().shape)



train shape is (29904, 3), test shape is (12278, 3)
(10001, 100001)
(10001, 100001)


In [53]:
params_NMF = {
                'n_components' : 10,
                #'alpha_W' : 0.01,
                'l1_ratio' : 0, 
                'max_iter' : 200
            }

R_pred = NMFModel(R_train, **params_NMF)


In [54]:
perf = get_rmse(R_pred,R_test.toarray())
print(perf)
res.append([params_NMF, perf])


(array([    1,     2,     5, ..., 10000, 10000, 10000]), array([36162, 30760, 50405, ..., 38625, 96210, 97616]))
1.036346361177813


In [55]:
res

[[{'n_components': 10, 'l1_ratio': 0, 'max_iter': 200}, 1.0169295709720507],
 [{'n_components': 50, 'l1_ratio': 0, 'max_iter': 200}, 1.017592297333309],
 [{'n_components': 200, 'l1_ratio': 0, 'max_iter': 200}, 1.018296430847357],
 [{'n_components': 30, 'l1_ratio': 0, 'max_iter': 200}, 1.0173389514411704],
 [{'n_components': 10, 'l1_ratio': 0, 'max_iter': 100}, 1.0363461268469882],
 [{'n_components': 10, 'l1_ratio': 0, 'max_iter': 200}, 1.036346361177813]]

672
498


154

Unnamed: 0,user_id,article_id,click
2950705,322894,168401,1
2950706,322895,63746,1
2950707,322895,289197,1
2950708,322896,30760,1
2950709,322896,157507,1
