# Build content-based filters

In [1]:
import pandas as pd
import numpy as np
import matplotlib as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer

### Load Projects

In [2]:
# Load the project table from a pickle file; the path is relative to this notebook.
# NOTE(review): unpickling can execute arbitrary code, so only load this file
# if it comes from a trusted source.
projects = pd.read_pickle("../../data/raw/project_data")

In [3]:
# Show the row(s) whose project_id is 588.
projects.loc[projects['project_id'] == 588]

Unnamed: 0,UN_regions,country,description,error,guid,origin,regions,tags,title,topics,url,project_id
355,[],,Help scientist improve maps of Mars and partic...,,9a672783-d2b1-59e3-b0a6-c5036119a035,scistarter,[],"[crater, mars, martian, nasa, rover]",Be a Martian,"[Astronomy & Space, Climate & Weather, Compute...",https://scistarter.com/project/588-Be-a-Martia...,588


### Set Up the TF-IDF Model

In [4]:
# TF-IDF vectorizer with default settings; it is fitted in the next cell.
v = TfidfVectorizer()

### Generate TF-IDF Vectors

In [5]:
# Fit a separate vectorizer per text column. Re-fitting one shared vectorizer
# replaces its vocabulary, which would leave the columns of `title_idf`
# impossible to map back to terms (or to reuse for transforming new titles).
title_vectorizer = TfidfVectorizer()
title_idf = title_vectorizer.fit_transform(projects['title'])

# `v` ends up fitted on the descriptions, the same final state as before.
desc_idf = v.fit_transform(projects['description'])

### Find similarity in projects based on description

In [6]:
# Bag-of-words term counts for every project description.
count = CountVectorizer()
count_matrix = count.fit_transform(projects['description'])

# Pairwise cosine similarity between all descriptions; with a single argument
# the matrix is compared against itself.
cosine_sim = cosine_similarity(count_matrix)

In [7]:
titles = pd.Series(projects['title'])

def recommendations(title, cosine_sim = cosine_sim, top_n = 10):
    """Return the titles of the projects most similar to the given project.

    Parameters
    ----------
    title : str
        Exact title of a project in ``projects['title']``. If several projects
        share the title, the first one (in row order) is used.
    cosine_sim : 2-D array-like
        Pairwise similarity matrix whose rows and columns follow the row order
        of ``projects``. Defaults to the matrix that exists when this cell runs.
    top_n : int, default 10
        Maximum number of recommendations to return.

    Returns
    -------
    list
        Titles of the ``top_n`` most similar projects, most similar first. The
        queried project itself is never included.

    Raises
    ------
    IndexError
        If no project has the given title.
    """
    # `cosine_sim` is addressed by row *position*, while the index of `titles`
    # holds the DataFrame's labels, which are not guaranteed to be 0..n-1.
    # Resolve the match to a position instead of using the label directly.
    matches = np.flatnonzero((titles == title).values)
    if matches.size == 0:
        raise IndexError(f"no project with title {title!r}")
    pos = int(matches[0])

    # Similarity of the queried project to every project, keyed by row position.
    scores = pd.Series(cosine_sim[pos])

    # Drop the project itself explicitly rather than assuming it sorts first
    # (ties with identical descriptions would break that assumption), then
    # rank with a stable sort so tied scores keep their row order.
    ranked = scores.drop(pos).sort_values(ascending = False, kind = 'mergesort')
    top_positions = list(ranked.index[:top_n])

    # Map the positions back to titles in a single positional lookup.
    return projects['title'].iloc[top_positions].tolist()

In [8]:
# Spot-check: print the title of the project at row position 5, then its recommendations.
print(projects['title'].iloc[5])
print(recommendations(projects['title'].iloc[5]))

North American Amphibian Monitoring Program
['Spokane Area Amphibian Monitoring ', 'FrogWatch USA™', 'Frog Listening Network', 'Wisconsin Frog and Toad Survey', 'Saving Elephants by Helping People', 'DC/Baltimore Cricket Crawl', 'NYC Cricket Crawl', 'BirdTrack', 'Loudoun Amphibian Monitoring Program', 'Butterflies & Moths of North America']


In [9]:
# Spot-check: print the title of the project at row position 100, then its recommendations.
print(projects['title'].iloc[100])
print(recommendations(projects['title'].iloc[100]))

Cape Cod Osprey Project
['Cities at Night', 'The American Chestnut Foundation', 'LCA Koala Program', 'Our Radioactive Ocean', 'Backyard Biodiversity Project: Pools', 'British Trust for Ornithology The Nest Record Scheme', 'BiodiversiTREE', 'Maui Coastal Marine Debris Monitoring Program', 'Chimp&See', 'Drought Risk and You (DRY)']
