In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import sparse

# ML Libraries
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import random
from sklearn.neural_network import MLPClassifier
from sklearn.metrics.pairwise import cosine_similarity
%matplotlib inline

# Create a model that uses one-hot encoding of the topics

### Load in the project data

In [2]:
# Load the project table; the preview below shows it carries project metadata
# plus 0/1 `topics_*` indicator columns.
# NOTE(review): unpickling can execute arbitrary code — only load this file
# from a trusted source.
all_projects = pd.read_pickle("../Data/project_topics_data")

In [3]:
# Preview the first rows to sanity-check the loaded columns
all_projects.head()

Unnamed: 0,UN_regions,country,description,error,guid,origin,regions,tags,title,topics,...,topics_Sound,topics_Psychology,topics_Chemistry,topics_Animals,topics_Food,topics_Astronomy_and_Space,topics_Archeology_and_Cultural,topics_Geography,topics_Geology_and_Earth_Science,topics_Biology
0,[],,Foldit is a revolutionary new computer game en...,,5f80760d-8398-5091-b3c6-f34c39216e88,scistarter,[],"[dna, protein]",Foldit,"[topics_Computers_and_Technology, topics_Biolo...",...,0,0,1,0,0,0,0,0,0,1
1,[],,Firefly Watch combines an annual summer evenin...,,c9664e0c-819a-5a42-b4bb-5f25d83a486d,scistarter,"[{'geometry': {'type': 'MultiPolygon', 'coordi...","[boston, cambridge, fireflies, insects, lightn...",Firefly Watch,"[topics_Nature_and_Outdoors, topics_Animals, t...",...,0,0,0,1,0,0,0,0,0,1
2,[],,Galaxy Zoo needs your help to classify galaxie...,,11f65e99-b463-5e01-ac11-ae59c021dfe7,scistarter,[],"[astronomy & space, space, space science, zoon...",Galaxy Zoo,"[topics_Astronomy_and_Space, topics_Computers_...",...,0,0,0,0,0,1,0,0,0,0
3,[],,Pay attention to the plants and animals in you...,,wilsoncenter:27-107,scistarter,"[{'geometry': {'type': 'MultiPolygon', 'coordi...","[android, animal, animals, app, biology, clima...",Nature's Notebook,"[topics_Ocean,_Water,_Marine_and_Terrestrial, ...",...,0,0,0,1,0,0,0,0,0,1
4,[],,A recent issue of Make magazine (http://makezi...,,ae91e967-6eec-5aef-ab3a-7d86ceff737a,scistarter,[],[],Laser Harp: Build It Yourself,"[topics_Computers_and_Technology, topics_Sound...",...,1,0,0,0,0,0,0,0,0,0


### Get arrays for topics

In [4]:
# Names of the topic indicator columns: every column whose name contains 'topics_'
topic_cols = list(filter(lambda name: 'topics_' in name, all_projects.columns))

In [5]:
# Pack each project's topic indicators into one list per row (a topic vector)
all_projects['combined'] = all_projects.loc[:, topic_cols].values.tolist()

In [6]:
# Number of rows (projects) in the loaded table
len(all_projects)

1781

In [7]:
# Hold one project out for evaluation; it is excluded from the table that
# the user profiles are built from.
test_project_id = 169
filtered_projects = all_projects.loc[all_projects['project_id'].ne(test_project_id)]

### Generate user profile topic vectors

In [8]:
# Load the profile/project interaction table.
# NOTE(review): presumably one row per profile and one column per project id,
# with 1 marking an interaction (that is how generateProfile reads it) — confirm.
# NOTE(review): unpickling can execute arbitrary code — only load trusted files.
profilesProjects = pd.read_pickle('../Data/useful_profile_project_adj')

In [9]:
def generateProfile(projects, userProjects):
    """Build a user's topic-count profile from the projects they interacted with.

    Parameters
    ----------
    projects : pandas.DataFrame
        Must contain a 'project_id' column and a 'combined' column holding
        one equal-length topic vector (list of numbers) per project.
    userProjects : pandas.Series
        One user's interaction row; the index labels are project ids and a
        value of 1 marks a project the user interacted with.

    Returns
    -------
    list
        Element-wise sum of the topic vectors of all matching projects.
        If the user has no matching project in ``projects``, a zero vector
        of the same width is returned so that all profiles can be stacked
        into a rectangular array (an empty list only if ``projects`` itself
        is empty).
    """
    # Get the ids of projects that the user has interacted with
    project_ids = [int(project_id) for project_id in userProjects[userProjects == 1].index]

    # Select every matching project in a single pass instead of re-filtering
    # the whole DataFrame once per project id
    matches = projects['project_id'].isin(project_ids)
    project_topics = projects.loc[matches, 'combined'].tolist()

    if not project_topics:
        # No interactions left (e.g. the user's only project was filtered out):
        # fall back to an all-zero profile of the right width
        if len(projects) == 0:
            return []
        return [0] * len(projects['combined'].iloc[0])

    # Make a 'user profile' by summing the topic vectors position by position
    return [sum(x) for x in zip(*project_topics)]

In [10]:
# Build a topic-count vector for every profile row, using only the projects
# that remain after the held-out project was removed
profilesProjects['profile_topics'] = profilesProjects.apply(
    lambda user_row: generateProfile(filtered_projects, user_row),
    axis=1,
)
# Move the current index into a regular column so rows can be addressed by position
profilesProjects = profilesProjects.reset_index()

In [11]:
# Preview the first few generated profile topic vectors
profilesProjects['profile_topics'].head()

0    [2, 3, 4, 0, 0, 0, 4, 6, 4, 2, 1, 10, 9, 0, 0,...
1    [1, 0, 1, 0, 0, 0, 0, 0, 3, 1, 0, 0, 0, 0, 1, ...
2    [1, 2, 3, 0, 1, 1, 3, 2, 2, 1, 2, 3, 3, 0, 1, ...
3    [0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 2, 2, 0, 1, ...
4    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
Name: profile_topics, dtype: object

### Construct our list of vectors to find similarity

In [42]:
# Collect every user profile's topic-count vector into a list of lists
fields = profilesProjects['profile_topics'].tolist()

# Topic vector of the held-out project that we are trying to predict
new_project_topics = all_projects.loc[
    all_projects['project_id'] == test_project_id, 'combined'
].iloc[0]

# Append the project vector as the final entry, after all the profiles
fields.append(new_project_topics)

In [45]:
# Stack the per-row vectors into a single NumPy array (one row per vector)
fields = np.array(list(map(np.array, fields)))

In [46]:
# Expect (number of profiles + 1 held-out project, number of topic columns)
fields.shape

(4867, 27)

In [47]:
# Convert the topic matrix to CSR form. The input is passed straight through:
# the previous extra np.array(...) wrapper made a redundant copy and, if this
# cell were re-run on the already-sparse matrix, would wrap it in an object array.
fields = sparse.csr_matrix(fields)

### Generate cosine similarities for the vectors

In [48]:
# Pairwise cosine similarity between every row of `fields` and every other row
# (Y defaults to None and dense_output to True, so the result is a dense square matrix).
# NOTE(review): only the last row is consumed by the following cell; the full
# matrix is kept here so the variable's shape is unchanged.
cosine_similarities = cosine_similarity(fields)

### Make predictions

In [49]:
# Last row = similarity of the held-out project to every vector; drop the final
# entry, which is the project's similarity with itself
predictions = cosine_similarities[-1, :-1]

In [50]:
# One prediction per profile row is expected: compare the leading dimensions
print(predictions.shape, profilesProjects.shape, sep='\n')

(4866,)
(4866, 1222)


# Precision and Recall

In [51]:
# Show which project id is being evaluated
print(test_project_id)

169


In [52]:
# Ground truth: the interaction column of the held-out project, one value per profile
y_true = profilesProjects.loc[:, test_project_id]

In [62]:
# Binarise the similarities: 1 where the cosine similarity exceeds 0.5, else 0
y_pred = (predictions > 0.5).astype(int)

In [63]:
# Check that the true and predicted label vectors have the same shape
# (prints True when they line up one-to-one)
print(y_true.shape == y_pred.shape)

True


In [64]:
# NOTE(review): this import would conventionally live in the imports cell at the top.
from sklearn.metrics import precision_recall_fscore_support
# Returns (precision, recall, f-score, support) for the thresholded predictions;
# with average='macro' the support entry is None, as the output below shows.
precision_recall_fscore_support(y_true, y_pred, average='macro')

(0.5200611934416226, 0.5383792429254713, 0.44395599814311537, None)

### Review the ids of profiles

In [65]:
# Positional indices of profiles that actually participated in the project.
# The Series is converted to a plain array first: np.argwhere on a pandas
# Series is unreliable across pandas/numpy versions, whereas flatnonzero on an
# ndarray always yields the 1-D positions of the non-zero entries.
y_true_idx = np.flatnonzero(np.asarray(y_true))

# Get the IDs of the profiles that really picked this project
trueProfiles = profilesProjects['profile'].iloc[y_true_idx].tolist()
print(len(trueProfiles))
print(trueProfiles)

749
['001bedb58aa43c8d3596b5b522ba1040', '00536e1575193e409e255cd02ed9d205', '009e5dfe604e3dc2568717dccc438392', '00d4005887bcce01071866724babfe28', '012c7142fd399d7eaf7cb0e933d27593', '02ef22d6e0457a15e725755722fd1ae9', '031620af1a9bf80899f701432ea1fe78', '0348d3fd3368aec4c52f54ca6347555d', '040cac92698a77a5e1e999fdd65d3121', '0455a6da9d6491a526c87062681b22a5', '04b445275781a10310b19fed6ee94172', '0504ffa689ac23ca0ab0d5553f07cbfd', '0510761b7894865537070a1511629c99', '055d2fe39f4c06a8c6c3ae1c4c336d33', '0573cab0f18af5b59dce30bcbac61bc8', '0574aac1b04ac557c1ec936a2f94c0e8', '06218b96c81734acf53ca0be9428758f', '064ed8e79a321aa1b352c799348eba6e', '075b86d756561f8eedce820564db87f3', '07c5f66554a2e9697ed7a413bae2247c', '07f91ce1ac65934c95a83f78ec82d65e', '0858addc162ee5d95aa5d21887984e7f', '087df1d77a390412129203fa5b39ce1b', '094906e6a2d185ea0d1d9e644386a85b', '095c506355e8ffb386433f18f57f9d00', '0972d5ac1f7bdfbb0cf5605fbfbfdd77', '09d134a07f769f2da2b425b5129c3e10', '0a31886a18b1637f3a1333

In [66]:
# Positional indices of profiles that are predicted to pick the project
y_pred_idx = np.flatnonzero(y_pred)

# Look up the profile IDs at those positions
predProfiles = profilesProjects['profile'].iloc[y_pred_idx].tolist()
print(len(predProfiles))
print(predProfiles)

2575
['001bedb58aa43c8d3596b5b522ba1040', '00300cba5401183830a6a82b80c8ff7f', '007350803bc77f892db9c666fc106e95', '0075788f90e355569d544d9c63078c9e', '0094d73eeabb1806ea8c2535cb92b126', '0096d19926386978726c09c33ab63c86', '009e5dfe604e3dc2568717dccc438392', '00be614f60877db80975ae8bb2f5f7a9', '00d4005887bcce01071866724babfe28', '010aa4391835c4ff279a85093e0e527a', '012c7142fd399d7eaf7cb0e933d27593', '0132ed8d86c34003806ca57d9cb0980e', '013b67e4f87be5a44443686a2173c8ad', '01562e889b51b84044304cbc0e3b8bca', '01addf5d53be7b35ca8fda7bd140585c', '01b5f42d8970e242737997104edbb3d4', '01c6be05bd7a7c0d049b5e814b44289e', '01f37cecfed7792a428293146f88efdc', '01fc03843e310b53fe00c76c6a2f1e60', '022cd309e4fb530d583b2cb5d26e5758', '022e66f355aafedcf0fcc2e3c74a90c4', '024adb9b397682f16b42512da85cb9de', '024c907f88d1b61a128240318436a377', '0250082e9105f5a50f3e6283a761f3e3', '02a31b00efb72e1df9c5ed7c76bde267', '02cde6acf3a30f883d585c53b8412de7', '02ef22d6e0457a15e725755722fd1ae9', '02fd16b58b02bb42b558d