In [34]:
## Import necessary packages
import pandas as pd
from scipy import sparse
from sklearn.metrics.pairwise import cosine_similarity

In [49]:
## Read dataset and adjust first column
## Replace NaN's with 0 for now
ratings = pd.read_csv("hobbies.csv",index_col=0)
ratings = ratings.fillna(0)
ratings

Unnamed: 0,Acting,Action Figures,Adventure park,Aerobics,Air hockey,Air sports,Airbrushing,Aircraft,Aircraft spotting,Airsoft,...,Worldbuilding.1,Wrestling.1,Writing.1,Writing Music,Writing Songs,Yo-yoing.1,Yoga.1,YoYo.1,Ziplining,Zumba.1
user 1,4,5,1,2,3,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
user 2,5,3,5,2,2,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
user 3,1,5,1,4,1,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
user 4,2,4,4,2,5,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
user 5,1,2,3,1,4,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [50]:
## Method which standardises data values to be more
## accurately represent a user's opinion.
def standardise(row):
    new_row = (row - row.mean()) / (row.max() - row.min())
    return new_row

## Apply standardise method to new table
ratings_std = ratings.apply(standardise)

## Create item similarity matrix and use transposed table 
## since we want similarity between items which need to be in rows.
item_similarity = cosine_similarity(ratings_std.T)
print(item_similarity)

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [45]:
## Create data frame from item similarity matrix
## Values represented as percentages on how similar they are to another item
item_similarity_df = pd.DataFrame(item_similarity,index=ratings.columns,columns=ratings.columns)
item_similarity_df

Unnamed: 0,Acting,Action Figures,Adventure park,Aerobics,Air Hockey
Acting,1.0,0.06333,0.353888,-0.201008,-0.174078
Action Figures,0.06333,1.0,-0.664557,0.735147,-0.363803
Adventure park,0.353888,-0.664557,1.0,-0.484795,0.353553
Aerobics,-0.201008,0.735147,-0.484795,1.0,-0.721688
Air Hockey,-0.174078,-0.363803,0.353553,-0.721688,1.0


In [48]:
## Create recommendation method
def get_similar_item(movie_name, user_rating):
    ## Scale movie by user's rating
    similar_score = item_similarity_df[movie_name]*(user_rating-2.5)
    similar_score = similar_score.sort_values(ascending=False)
    
    return similar_score

print(get_similar_item("Acting", 5))

Acting            2.500000
Adventure park    0.884720
Action Figures    0.158325
Air Hockey       -0.435194
Aerobics         -0.502519
Name: Acting, dtype: float64


In [9]:
action_lover = [("football",5), ("basketball", 4), ("running", 1)]
similar_movies = pd.DataFrame()
for movie, rating in action_lover:
    similar_movies = similar_movies.append(get_similar_item(movie, rating),ignore_index=True)
    
similar_movies.head()
similar_movies.sum().sort_values(ascending=False)

football      3.989790
netball       3.665337
basketball    2.648597
cycling      -3.096648
walking      -3.774488
running      -4.391385
dtype: float64