In [69]:
import os
import sys
from time import gmtime, strftime

from keras.models import load_model
from sklearn.metrics import precision_recall_fscore_support
import numpy as np
import pandas as pd
import math
import keras.backend as K
from scipy import sparse
from scipy.sparse import vstack

sys.path.append('../../data')
sys.path.append('../../src/models')
from recommenders.cf_recommender import CFRecommender

# Run configuration. The trailing comments keep the sys.argv-based
# alternatives that these hard-coded values replace.

# Number of top-scoring projects to keep as recommendations.
k = 5 # was: int(sys.argv[1])
# File stem (without '.h5') of the saved autoencoder to load below.
autoencoder_model = 'train_autoencoder_128_cdae_users_projects' # was: str(sys.argv[2]); alternative: 'train_autoencoder_32_cdae_users_projects'
# Data source name; not referenced by any of the cells shown here.
dataSource = 'users_projects' # was: str(sys.argv[3]); alternative: 'movies'

# Load the autoencoder to use
model = load_model('../../data/autoencoders/' + autoencoder_model + '.h5')
# Project metadata table; it is filtered on its 'project_id' column further down.
projects = pd.read_pickle('../../data/processed/project_data')

In [70]:
def load_users_projects(data_dir='../../data/processed'):
    """Load the collaborative-filtering labels and the train/val/test matrices.

    Args:
        data_dir: Directory containing ``cf_projects.pkl`` and the
            ``train_sparse.npz`` / ``val_sparse.npz`` / ``test_sparse.npz``
            files. Defaults to the location this notebook has always used.

    Returns:
        A 6-tuple ``(train_labels, train_x, val_labels, val_x, test_labels,
        test_x)``. The three ``*_labels`` entries are the *same* object
        unpickled from ``cf_projects.pkl``; each ``*_x`` entry is whatever
        ``scipy.sparse.load_npz`` returns for the matching split file.
    """
    # NOTE: unpickling can execute arbitrary code; only point data_dir at trusted files.
    cf = pd.read_pickle(os.path.join(data_dir, 'cf_projects.pkl'))

    # One file per split, loaded in train / val / test order.
    train_x, val_x, test_x = (
        sparse.load_npz(os.path.join(data_dir, split + '_sparse.npz'))
        for split in ('train', 'val', 'test')
    )

    # Every split shares the same label object.
    return cf, train_x, cf, val_x, cf, test_x

def load_profile_labels(data_dir='../../data/processed'):
    """Load the pickled profile labels.

    Args:
        data_dir: Directory containing ``cf_profiles.pkl``. Defaults to the
            location this notebook has always used.

    Returns:
        The object unpickled from ``cf_profiles.pkl``.
    """
    # NOTE: unpickling can execute arbitrary code; only point data_dir at trusted files.
    return pd.read_pickle(os.path.join(data_dir, 'cf_profiles.pkl'))

In [71]:
# Load our time-consistent collaborative filtering data.
# train_labels, val_labels and test_labels all refer to the same object returned by load_users_projects().
train_labels, train_x, val_labels, val_x, test_labels, test_x = load_users_projects()

In [72]:
# Project-local recommender helper, configured with k; its generate_y() is used for evaluation further down.
recommender = CFRecommender(k)

#### Create our user profile

In [73]:
# Column of train_x that holds the profile under inspection.
profile_idx = 0

# Dense (1, n) row with this profile's training interactions: these are the
# projects used to make the prediction.
profile_col = train_x.getcol(profile_idx).toarray().reshape(1, -1)
# Index values of the labels frame, as a plain array.
labels = np.asarray(train_labels.index)

In [74]:
# Labels at the non-zero positions of the profile row, i.e. the projects this profile has already done.
labels[np.nonzero(profile_col)[1]]

array([ 57, 207, 210, 238, 406, 718, 759, 761, 784, 812, 829])

In [75]:
# Make a prediction for this profile.
# NOTE(review): the model is fed two inputs, the (1, n) profile row and the
# full labels array; confirm this matches the inputs the loaded model expects.
predictions = model.predict([profile_col, labels])

In [76]:
# How many of the predicted scores for this profile are non-zero.
np.count_nonzero(predictions[0])

1029

#### Get the Top-K Predictions

In [77]:
# Get the indices of all projects that have already been done by this profile.
# nonzero() on the 2-D row returns a (row_indices, column_indices) tuple.
done_idx = profile_col.nonzero()

In [78]:
# Inspect the (row_indices, column_indices) tuple.
done_idx

(array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
 array([ 57, 207, 210, 238, 406, 718, 759, 761, 784, 812, 829]))

In [79]:
# Set all the done projects to 0 in the predictions (so we don't pick them again).
# This mutates `predictions` in place, so cells that displayed it earlier are now stale.
# NOTE(review): 0 only excludes these entries from the top-k if every other
# score is positive; confirm the model's output range.
predictions[done_idx] = 0

In [80]:
# Recount after masking: the done projects no longer contribute non-zero scores.
np.count_nonzero(predictions[0])

1018

In [81]:
# Rank every project by predicted score and keep the k best, highest first.
flat_scores = predictions.reshape(train_labels.shape[0])
ascending_order = flat_scores.argsort()
# The last k entries of the ascending order are the top scores; reverse them.
y_pred_indices = ascending_order[-k:][::-1]

### Investigate the projects that were picked

In [67]:
# Positions of the top-k projects, best score first.
y_pred_indices

array([513, 600, 228, 745, 589])

In [86]:
# Predicted scores of the selected projects, in the same best-first order.
predictions[0][y_pred_indices]

array([0.05665603, 0.0566352 , 0.05638409, 0.0561032 , 0.05594108],
      dtype=float32)

In [85]:
# Label values for the selected positions, flattened to a 1-D array of ids.
project_ids = train_labels.iloc[y_pred_indices].values.flatten()
# isin() is a membership filter, so the rows below come back in the order of
# `projects`, not in ranked order.
is_recommended = projects['project_id'].isin(project_ids)
projects[is_recommended]

Unnamed: 0,UN_regions,country,description,error,guid,origin,regions,tags,title,topics,url,project_id
370,[],,The Tucson Bird Count is an annual project in ...,,230bc5d1-e929-5aec-843f-bbe9c2ba4e7b,scistarter,"[{'geometry': {'type': 'MultiPolygon', 'coordi...","[arizona, bird, birdwatching, desert, reconcil...",Tucson Bird Count,"[Nature & Outdoors, Birds]",https://scistarter.com/project/606-Tucson-Bird...,606
839,[],USA,"FLOW, which stands for Follow and Learn about ...",,ae184466-e5ea-5fb0-bbd1-0c5219cf2ab9,scistarter,"[{'geometry': {'type': 'MultiPolygon', 'coordi...","[california, education, hab, harmful algal blo...",FLOW Program,"[Science Policy, Education, Nature & Outdoors,...",https://scistarter.com/project/1316-FLOW-Progr...,1316
957,[],,As part of the White House’s Precision Medicin...,,wilsoncenter:167-None,scistarter,"[{'geometry': {'type': 'MultiPolygon', 'coordi...","[crowdsourcing, genomics, informatics, innovat...",precisionFDA,[Health & Medicine],https://scistarter.com/project/1675-precisionF...,1675
976,[],,Invasive weedy plants are a widespread problem...,,wilsoncenter:188-None,wilsoncenter,"[{'geometry': {'type': 'MultiPolygon', 'coordi...","[aquatic plants, Aquatic plants, invasive terr...",Invasive Plant Atlas of the MidSouth,[],https://scistarter.com/project/1694-Invasive-P...,1694
1168,[],,We are a multidisciplinary research team based...,,2cd9706f-d81f-596a-bd9a-413c24344713,scistarter,[],"[chagas disease, disease, kissing bug, parasit...",Kissing Bug Citizen Science Program,"[Ecology & Environment, Biology, Insects & Pol...",https://scistarter.com/project/17016-Kissing-B...,17016


In [91]:
# Shape of the train_x rows for the selected projects (one row per selected project).
train_x[y_pred_indices].shape

(5, 880)

In [98]:
# Print the number of stored entries (nnz) in train_x for each recommended
# project's row. The fancy-indexed submatrix is built once up front instead of
# being re-sliced on every loop iteration.
picked_rows = train_x[y_pred_indices]
for i in range(picked_rows.shape[0]):
    print(picked_rows.getrow(i).nnz)

2
2
2
17
12


In [47]:
# Look up the project_ids of the top recommendations (rows of train_labels, in ranked order).
recommendations = train_labels.iloc[y_pred_indices]

#### Generate the y_pred and y_true for evaluation

In [99]:
# Ask the recommender for the y_true / y_pred pair for this profile, passing
# only this profile's column of the test and validation matrices.
test_col = test_x.getcol(profile_idx)
val_col = val_x.getcol(profile_idx)
y_true, y_pred = recommender.generate_y(
    recommendations,
    train_labels,
    test_col,
    val_x=val_col,
)

#### Get precision and recall

In [100]:
# Binary precision / recall / F-score for this single profile (positive class = 1).
# NOTE(review): the truncated warning recorded under this cell looks like
# sklearn's ill-defined-metric warning for recall (raised when y_true has no
# positive samples); confirm before trusting recall and fscore here.
precision, recall, fscore, support = precision_recall_fscore_support(y_true, y_pred, average='binary', pos_label=1)

  'recall', 'true', average, warn_for)
