In [1]:
## Import necessary packages
import pandas as pd
from scipy import sparse
from sklearn.metrics.pairwise import cosine_similarity

In [38]:
## Load the ratings table; the first CSV column becomes the row index.
## Missing ratings are filled with 0 as a temporary placeholder.
ratings = pd.read_csv("test.csv", index_col=0).fillna(0)
ratings

Unnamed: 0,Ballet,Baseball,Basketball,Boxing,Cycling,Dancing,Fencing,Field Hockey,Football,Golf,...,Rugby,Skiing,Snowboarding,Surfing,Swimming,Tennis,Volleyball,Water Polo,Weight Lifting,Wrestling
user 1,1,3,3,3,3,1,4,3,5,2,...,4,3,3,2,1,3,3,4,2,2
user 2,5,4,5,4,2,4,3,5,3,1,...,3,1,2,4,5,1,2,2,3,1


In [40]:
## Method which standardises data values so that they more
## accurately represent a user's opinion.
def standardise(row):
    """Mean-centre a Series and scale it by its value range.

    Computes ``(row - row.mean()) / (row.max() - row.min())``.

    Parameters
    ----------
    row : pandas.Series
        Numeric values to standardise (passed in by ``DataFrame.apply``).

    Returns
    -------
    pandas.Series
        The centred, range-scaled values. When every value in ``row`` is
        identical the range is zero; the centred values (all zeros) are
        returned as-is instead of dividing 0 by 0, which would yield NaN.
    """
    centred = row - row.mean()
    spread = row.max() - row.min()
    if spread == 0:
        ## Constant input: there is no variation to scale, and 0/0 would
        ## produce NaN values.
        return centred
    return centred / spread

## Standardise the ratings table. apply() runs down each column by default
## (axis=0), so each activity's ratings are centred and scaled across users.
ratings_std = ratings.apply(standardise, axis=0)

## Build the item-item similarity matrix. The table is transposed first
## because cosine_similarity compares rows and the items sit in columns.
item_similarity = cosine_similarity(ratings_std.transpose())
print(item_similarity)

[[ 1.  1.  1.  1. -1.  1. -1.  1. -1. -1.  1. -1.  1. -1.  1.  1. -1. -1.
  -1.  1.  1. -1. -1. -1.  1. -1.]
 [ 1.  1.  1.  1. -1.  1. -1.  1. -1. -1.  1. -1.  1. -1.  1.  1. -1. -1.
  -1.  1.  1. -1. -1. -1.  1. -1.]
 [ 1.  1.  1.  1. -1.  1. -1.  1. -1. -1.  1. -1.  1. -1.  1.  1. -1. -1.
  -1.  1.  1. -1. -1. -1.  1. -1.]
 [ 1.  1.  1.  1. -1.  1. -1.  1. -1. -1.  1. -1.  1. -1.  1.  1. -1. -1.
  -1.  1.  1. -1. -1. -1.  1. -1.]
 [-1. -1. -1. -1.  1. -1.  1. -1.  1.  1. -1.  1. -1.  1. -1. -1.  1.  1.
   1. -1. -1.  1.  1.  1. -1.  1.]
 [ 1.  1.  1.  1. -1.  1. -1.  1. -1. -1.  1. -1.  1. -1.  1.  1. -1. -1.
  -1.  1.  1. -1. -1. -1.  1. -1.]
 [-1. -1. -1. -1.  1. -1.  1. -1.  1.  1. -1.  1. -1.  1. -1. -1.  1.  1.
   1. -1. -1.  1.  1.  1. -1.  1.]
 [ 1.  1.  1.  1. -1.  1. -1.  1. -1. -1.  1. -1.  1. -1.  1.  1. -1. -1.
  -1.  1.  1. -1. -1. -1.  1. -1.]
 [-1. -1. -1. -1.  1. -1.  1. -1.  1.  1. -1.  1. -1.  1. -1. -1.  1.  1.
   1. -1. -1.  1.  1.  1. -1.  1.]
 [-1. -1. -1. -1.  

In [43]:
## Wrap the item similarity matrix in a data frame labelled by activity on
## both axes. Values are cosine similarities (between -1 and 1), where a
## higher value means two items are rated more alike.
item_similarity_df = pd.DataFrame(
    data=item_similarity,
    index=ratings.columns,
    columns=ratings.columns,
)
item_similarity_df

Unnamed: 0,Ballet,Baseball,Basketball,Boxing,Cycling,Dancing,Fencing,Field Hockey,Football,Golf,...,Rugby,Skiing,Snowboarding,Surfing,Swimming,Tennis,Volleyball,Water Polo,Weight Lifting,Wrestling
Ballet,1.0,1.0,1.0,1.0,-1.0,1.0,-1.0,1.0,-1.0,-1.0,...,-1.0,-1.0,-1.0,1.0,1.0,-1.0,-1.0,-1.0,1.0,-1.0
Baseball,1.0,1.0,1.0,1.0,-1.0,1.0,-1.0,1.0,-1.0,-1.0,...,-1.0,-1.0,-1.0,1.0,1.0,-1.0,-1.0,-1.0,1.0,-1.0
Basketball,1.0,1.0,1.0,1.0,-1.0,1.0,-1.0,1.0,-1.0,-1.0,...,-1.0,-1.0,-1.0,1.0,1.0,-1.0,-1.0,-1.0,1.0,-1.0
Boxing,1.0,1.0,1.0,1.0,-1.0,1.0,-1.0,1.0,-1.0,-1.0,...,-1.0,-1.0,-1.0,1.0,1.0,-1.0,-1.0,-1.0,1.0,-1.0
Cycling,-1.0,-1.0,-1.0,-1.0,1.0,-1.0,1.0,-1.0,1.0,1.0,...,1.0,1.0,1.0,-1.0,-1.0,1.0,1.0,1.0,-1.0,1.0
Dancing,1.0,1.0,1.0,1.0,-1.0,1.0,-1.0,1.0,-1.0,-1.0,...,-1.0,-1.0,-1.0,1.0,1.0,-1.0,-1.0,-1.0,1.0,-1.0
Fencing,-1.0,-1.0,-1.0,-1.0,1.0,-1.0,1.0,-1.0,1.0,1.0,...,1.0,1.0,1.0,-1.0,-1.0,1.0,1.0,1.0,-1.0,1.0
Field Hockey,1.0,1.0,1.0,1.0,-1.0,1.0,-1.0,1.0,-1.0,-1.0,...,-1.0,-1.0,-1.0,1.0,1.0,-1.0,-1.0,-1.0,1.0,-1.0
Football,-1.0,-1.0,-1.0,-1.0,1.0,-1.0,1.0,-1.0,1.0,1.0,...,1.0,1.0,1.0,-1.0,-1.0,1.0,1.0,1.0,-1.0,1.0
Golf,-1.0,-1.0,-1.0,-1.0,1.0,-1.0,1.0,-1.0,1.0,1.0,...,1.0,1.0,1.0,-1.0,-1.0,1.0,1.0,1.0,-1.0,1.0


In [42]:
## Create recommendation method
def get_similar_item(activity_name, user_rating, neutral_rating=2.5):
    """Rank every activity by similarity to one rated activity.

    The similarity column for ``activity_name`` is multiplied by how far
    ``user_rating`` sits from ``neutral_rating``, so a rating above neutral
    promotes similar activities and a rating below neutral demotes them.

    Parameters
    ----------
    activity_name : str
        Column label in ``item_similarity_df`` to look up.
    user_rating : int or float
        The rating the user gave ``activity_name``.
    neutral_rating : int or float, optional
        Rating treated as "no preference"; defaults to 2.5.
        NOTE(review): the sample ratings shown in this notebook look like a
        1-5 scale, whose midpoint would be 3 - confirm the intended value.

    Returns
    -------
    pandas.Series
        Scores indexed by activity, sorted from highest to lowest.

    Raises
    ------
    KeyError
        If ``activity_name`` is not a column of ``item_similarity_df``.
    """
    if activity_name not in item_similarity_df.columns:
        ## Same exception type as a plain column lookup, but the message
        ## says what went wrong.
        raise KeyError(
            "Unknown activity {!r}: not a column of item_similarity_df".format(activity_name)
        )

    ## Scale activity by user's rating
    similar_score = item_similarity_df[activity_name] * (user_rating - neutral_rating)
    return similar_score.sort_values(ascending=False)

## Print the second-ranked entry of the JSON-encoded scores as '"name":score'.
## Both braces are stripped (the second replace previously repeated "{" and
## never removed the closing "}").
football_scores_json = get_similar_item("Football", 2).to_json()
ranked_entries = football_scores_json.replace("{", "").replace("}", "").split(",")
print(ranked_entries[1])

KeyError: 'Football'

In [7]:
## Example profile: (activity, rating) pairs given by a single user
ball_games_lover = [("Football",5), ("Rugby", 2), ("Walking", 1)]

## Collect one row of scores per rated activity and build the frame in one go.
## DataFrame.append was removed in pandas 2.0, and growing a frame row by row
## copies it on every iteration.
score_rows = [get_similar_item(activity, rating) for activity, rating in ball_games_lover]
similar_activities = pd.DataFrame(score_rows).reset_index(drop=True)

## Total score per activity across all rated activities, best first
similar_activities.sum().sort_values(ascending=False)

Football          3.478094
GaelicFootball    2.209769
Rugby             1.336512
IceHockey         0.676085
Aerobics         -2.908372
Walking          -3.181888
dtype: float64