In [1]:
from UserProfileHelper import UserProfileHelper 
import pandas as pd
import time
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

### Method that generates the profile for a user

In [2]:
def generateProfile(projects, userProjects):
    """Build the text 'profile' of one user from the titles of their projects.

    Parameters
    ----------
    projects : pandas.DataFrame
        Project table; the 'project_id' and 'title' columns are read.
    userProjects : pandas.Series
        One user's row of the adjacency table. Its index labels are project
        ids (anything ``int()`` accepts) and a value of 1 marks a project the
        user interacted with.

    Returns
    -------
    str
        The titles of all interacted projects joined by single spaces, in the
        order the project ids appear in ``userProjects``. Empty string when
        the user has no interactions or none of the ids exist in ``projects``.
    """
    # Keep only the entries flagged with 1; their index labels are the project ids
    interacted = userProjects[userProjects == 1]

    # Collect every matching title, one project id at a time, so the order of
    # the user's projects is preserved in the resulting text
    titles = [
        title
        for project_id in interacted.index
        for title in projects.loc[projects['project_id'] == int(project_id), 'title'].tolist()
    ]

    return ' '.join(titles)

### Setup for TF-IDF vectorization

In [3]:
# Setup our TF-IDF model: word uni-, bi- and tri-grams with English stop words removed.
# min_df=1 keeps every term that occurs in at least one document, i.e. nothing is pruned.
# The integer value 0 used previously selected the same terms, but recent scikit-learn
# releases validate parameters and only accept an integer min_df >= 1 (or a float in [0, 1]).
tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 3), min_df=1, stop_words='english')

### Load Projects Data

In [4]:
# Location of the pickled projects table (a relative path)
projectsFile = '../Data/project_data'
# NOTE(review): unpickling can execute arbitrary code — only load pickle files from a trusted source
all_projects = pd.read_pickle(projectsFile)

In [5]:
# Hold out the project we are going to test with, so that its title can never
# become part of a profile text built from filtered_projects
test_project_id = 169
filtered_projects = all_projects.loc[all_projects['project_id'].ne(test_project_id)]

### Load the projects and profiles links

In [6]:
# Load the profile/project adjacency table. Each row is later passed to generateProfile,
# which treats the column labels as project ids and a value of 1 as "interacted".
# NOTE(review): unpickling can execute arbitrary code — only load pickle files from a trusted source
profilesProjects = pd.read_pickle('../Data/useful_profile_project_adj')

In [7]:
# Build the 'profile_titles' text for each profile (one row = one profile) from the
# titles of the non-held-out projects, then turn the index into a regular column
profilesProjects = (
    profilesProjects
    .assign(profile_titles=lambda adjacency: adjacency.apply(
        lambda row: generateProfile(filtered_projects, row), axis=1))
    .reset_index()
)

In [8]:
# Preview the first rows to confirm the profile_titles column was added
profilesProjects.head()

project,profile,4.0,5.0,6.0,7.0,8.0,19.0,20.0,22.0,24.0,...,20568.0,20569.0,20571.0,20572.0,20573.0,20577.0,20591.0,20603.0,20650.0,profile_titles
0,001bedb58aa43c8d3596b5b522ba1040,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Journey North Pollinators.info Bumble Bee Phot...
1,0021e5df03d7feb6ba9558cc2828d616,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Project Implicit MAPPER AgeGuess The Royal Soc...
2,00300cba5401183830a6a82b80c8ff7f,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Nature's Notebook ISeeChange GLOBE Observer: C...
3,0033882471572a66322d0747c6a4b12d,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Project Squirrel Stream Selfie
4,00536e1575193e409e255cd02ed9d205,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,GRB cosmology project Backyard Worlds: Planet 9


### Generate Predictions 

In [9]:
# One text document per profile, in the same order as the rows of profilesProjects
profile_documents = profilesProjects['profile_titles'].tolist()

# Look up the title of the held-out project that we are trying to predict
held_out_titles = all_projects.loc[all_projects['project_id'] == test_project_id, 'title']
new_project_title = str(held_out_titles.iloc[0])
print(new_project_title)

# The new project's title goes last, after all the profile documents
fields = [*profile_documents, new_project_title]

Globe at Night


In [10]:
# Check that the last field is the project title we are trying to predict.
# This matters because the similarity row for the new project is later read with index -1.
fields[-1] == new_project_title

True

### Generate cosine similarities for ALL profiles

In [28]:
# Vectorise every document (all profiles plus the new project title) with TF-IDF
tfidf_matrix = tf.fit_transform(fields)
print(type(tfidf_matrix))

# Pairwise cosine similarity between all documents. Y=None and dense_output=True
# are the function's defaults, so they are left implicit here.
cosine_similarities = cosine_similarity(tfidf_matrix)

<class 'scipy.sparse.csr.csr_matrix'>


In [12]:
# Check that the cosine similarity is symmetrical
def check_symmetric(a, rtol=1e-05, atol=1e-08):
    """Return True when ``a`` is a square 2-D matrix equal to its transpose.

    Parameters
    ----------
    a : array-like
        Matrix to test; converted with ``np.asarray``.
    rtol, atol : float
        Relative and absolute tolerances forwarded to ``np.allclose``.

    Returns
    -------
    bool
        False for anything that is not a square 2-D array, otherwise the
        result of comparing ``a`` with ``a.T`` element-wise within tolerance.
    """
    a = np.asarray(a)
    # Guard against non-square input: np.allclose broadcasts its arguments, so a
    # 1xN matrix compared with its Nx1 transpose could be reported as symmetric,
    # and other non-square shapes would raise a broadcasting error.
    if a.ndim != 2 or a.shape[0] != a.shape[1]:
        return False
    return np.allclose(a, a.T, rtol=rtol, atol=atol)
# Run the symmetry check on the full pairwise similarity matrix
check_symmetric(cosine_similarities)

True

### Extract the top 10 predictions

In [13]:
# The last row of the similarity matrix belongs to the new project's title; dropping
# its final entry (the title compared with itself) leaves one score per profile
predictions = cosine_similarities[-1, :-1]

In [14]:
# There should be exactly one prediction per profile row
print(predictions.shape, profilesProjects.shape, sep='\n')

(4866,)
(4866, 1222)


In [15]:
# Row positions of the 10 profiles with the highest similarity, best match first:
# sort ascending, then walk backwards over the last ten entries
top_10_idx = np.argsort(predictions)[:-11:-1]
print(top_10_idx)

[2092 2252 4099 1745 3961 2461  672 4312  602 2846]


In [16]:
# Translate the top-10 row positions into profile IDs (same best-first order)
topMatchedProfiles = profilesProjects['profile'].iloc[top_10_idx].tolist()

In [17]:
# Display the profile IDs of the 10 best matches, most similar first
topMatchedProfiles

['6d5c9634642fc196d0bd9ff4363fdc74',
 '76adf5149e8b84299df0fdd846225bbb',
 'd7e88c594631815d34955309f22ec329',
 '5afd76d8d1dccf78c2612404f1a7eebc',
 'd072e2f6220eeface469d7030df08e74',
 '829fba241248a3edf995c7ea601dcfc7',
 '24384a5d0f2239b9c56fd7cc85ce00fe',
 'e24d9e0917deafdac2ce9bd16d6f2bdd',
 '2074f4ce0ab57aafc32d20f6d59d6ce8',
 '94310374ddbe2cd489c9ab09e92d86b3']

### Extract all > 0.1 predictions

In [18]:
# Binarise the similarities: 1 where the score is above 0.1, otherwise 0
pred = (predictions > 0.1).astype(int)
# Row positions of the profiles that cleared the threshold, in ascending order
pred_idx = np.flatnonzero(pred)
print(pred_idx)

[ 602  612  672 1111 1649 1745 2092 2252 2461 2846 3665 3961 4099 4312
 4423]


In [19]:
# Translate the above-threshold row positions into profile IDs
matchedProfiles = profilesProjects['profile'].iloc[pred_idx].tolist()

In [20]:
# Display the profile IDs whose similarity exceeds the 0.1 threshold, in row order
matchedProfiles

['2074f4ce0ab57aafc32d20f6d59d6ce8',
 '2141c910dea1e351510b821203bc1261',
 '24384a5d0f2239b9c56fd7cc85ce00fe',
 '3a2ceee160a0ae1f5798c965dac3e2e1',
 '55c3a0cfdfdb149092f28db9b76bb2da',
 '5afd76d8d1dccf78c2612404f1a7eebc',
 '6d5c9634642fc196d0bd9ff4363fdc74',
 '76adf5149e8b84299df0fdd846225bbb',
 '829fba241248a3edf995c7ea601dcfc7',
 '94310374ddbe2cd489c9ab09e92d86b3',
 'c03caceeb4ac5ec796271393194e5b90',
 'd072e2f6220eeface469d7030df08e74',
 'd7e88c594631815d34955309f22ec329',
 'e24d9e0917deafdac2ce9bd16d6f2bdd',
 'e8f6cb3b7cb818ee8e0fee62937a2f48']

# Precision and Recall

In [21]:
# Confirm which project is being evaluated
print(test_project_id)

169


In [22]:
# Extract the true values for this project: the adjacency column of the held-out project,
# one entry per profile (presumably 1 = participated, 0 = did not — matches the `== 1`
# test in generateProfile; confirm against the data)
y_true = profilesProjects[test_project_id]

In [23]:
# Get the predicted values for this project: the 0/1 vector obtained by thresholding the
# similarities at 0.1. It has one entry per profile, in the same order as the rows of
# profilesProjects, so it lines up with y_true.
y_pred = pred

In [24]:
# Check the sizes: both vectors must have the same shape before they can be scored
print(y_true.shape == y_pred.shape)

True


In [25]:
from sklearn.metrics import precision_recall_fscore_support
# average='macro' takes the unweighted mean of the per-class scores, so both classes
# count equally regardless of how many samples each has.
# The displayed tuple is (precision, recall, f-score, support).
precision_recall_fscore_support(y_true, y_pred, average='macro')

(0.790599876314162, 0.5068573335413131, 0.47302859264587055, None)

### Review the IDs of the profiles for y_true and y_pred

In [26]:
# Get the row positions of profiles that actually participated in the project.
# y_true is a pandas Series; it is converted to a plain NumPy array first so that NumPy
# does not dispatch to a Series method — which appears to be the source of the warning
# this cell used to emit when np.argwhere was called on the Series directly.
y_true_idx = np.flatnonzero(np.asarray(y_true))

# Get the IDs of the profiles that actually participated in this project
trueProfiles = profilesProjects.iloc[y_true_idx]['profile'].tolist()
print(trueProfiles)

['001bedb58aa43c8d3596b5b522ba1040', '00536e1575193e409e255cd02ed9d205', '009e5dfe604e3dc2568717dccc438392', '00d4005887bcce01071866724babfe28', '012c7142fd399d7eaf7cb0e933d27593', '02ef22d6e0457a15e725755722fd1ae9', '031620af1a9bf80899f701432ea1fe78', '0348d3fd3368aec4c52f54ca6347555d', '040cac92698a77a5e1e999fdd65d3121', '0455a6da9d6491a526c87062681b22a5', '04b445275781a10310b19fed6ee94172', '0504ffa689ac23ca0ab0d5553f07cbfd', '0510761b7894865537070a1511629c99', '055d2fe39f4c06a8c6c3ae1c4c336d33', '0573cab0f18af5b59dce30bcbac61bc8', '0574aac1b04ac557c1ec936a2f94c0e8', '06218b96c81734acf53ca0be9428758f', '064ed8e79a321aa1b352c799348eba6e', '075b86d756561f8eedce820564db87f3', '07c5f66554a2e9697ed7a413bae2247c', '07f91ce1ac65934c95a83f78ec82d65e', '0858addc162ee5d95aa5d21887984e7f', '087df1d77a390412129203fa5b39ce1b', '094906e6a2d185ea0d1d9e644386a85b', '095c506355e8ffb386433f18f57f9d00', '0972d5ac1f7bdfbb0cf5605fbfbfdd77', '09d134a07f769f2da2b425b5129c3e10', '0a31886a18b1637f3a1333461c

  return getattr(obj, method)(*args, **kwds)


In [27]:
# Get the row positions of the profiles that were predicted to take part in the project
y_pred_idx = np.flatnonzero(y_pred)

# Look up the IDs of those predicted profiles
predProfiles = profilesProjects['profile'].iloc[y_pred_idx].tolist()
print(predProfiles)

['2074f4ce0ab57aafc32d20f6d59d6ce8', '2141c910dea1e351510b821203bc1261', '24384a5d0f2239b9c56fd7cc85ce00fe', '3a2ceee160a0ae1f5798c965dac3e2e1', '55c3a0cfdfdb149092f28db9b76bb2da', '5afd76d8d1dccf78c2612404f1a7eebc', '6d5c9634642fc196d0bd9ff4363fdc74', '76adf5149e8b84299df0fdd846225bbb', '829fba241248a3edf995c7ea601dcfc7', '94310374ddbe2cd489c9ab09e92d86b3', 'c03caceeb4ac5ec796271393194e5b90', 'd072e2f6220eeface469d7030df08e74', 'd7e88c594631815d34955309f22ec329', 'e24d9e0917deafdac2ce9bd16d6f2bdd', 'e8f6cb3b7cb818ee8e0fee62937a2f48']
