In [1]:
## Import necessary packages
import pandas as pd
from scipy import sparse
from sklearn.metrics.pairwise import cosine_similarity

In [61]:
## Read dataset and adjust first column
## Replace NaN's with 0 for now
ratings = pd.read_csv("test.csv",index_col=0)
ratings = ratings.fillna(0)
ratings

Unnamed: 0,Ballet,Baseball,Basketball,Boxing,Cycling,Dancing,Fencing,FieldHockey,Football,Golf,...,Rugby,Skiing,Snowboarding,Surfing,Swimming,Tennis,Volleyball,WaterPolo,WeightLifting,Wrestling
user1,1,3,3,3,3,1,4,3,5,2,...,4,3,3,2,1,3,3,4,2,2
user2,5,4,5,4,2,4,3,5,3,1,...,3,1,2,4,5,1,2,2,3,1


In [62]:
## Method which standardises data values to be more
## accurately represent a user's opinion.
def standardise(row):
    new_row = (row - row.mean()) / (row.max() - row.min())
    return new_row

## Apply standardise method to new table
ratings_std = ratings.apply(standardise)

## Create item similarity matrix and use transposed table 
## since we want similarity between items which need to be in rows.
item_similarity = cosine_similarity(ratings_std.T)
print(item_similarity)

[[ 1.  1.  1.  1. -1.  1. -1.  1. -1. -1.  1. -1.  1. -1.  1.  1. -1. -1.
  -1.  1.  1. -1. -1. -1.  1. -1.]
 [ 1.  1.  1.  1. -1.  1. -1.  1. -1. -1.  1. -1.  1. -1.  1.  1. -1. -1.
  -1.  1.  1. -1. -1. -1.  1. -1.]
 [ 1.  1.  1.  1. -1.  1. -1.  1. -1. -1.  1. -1.  1. -1.  1.  1. -1. -1.
  -1.  1.  1. -1. -1. -1.  1. -1.]
 [ 1.  1.  1.  1. -1.  1. -1.  1. -1. -1.  1. -1.  1. -1.  1.  1. -1. -1.
  -1.  1.  1. -1. -1. -1.  1. -1.]
 [-1. -1. -1. -1.  1. -1.  1. -1.  1.  1. -1.  1. -1.  1. -1. -1.  1.  1.
   1. -1. -1.  1.  1.  1. -1.  1.]
 [ 1.  1.  1.  1. -1.  1. -1.  1. -1. -1.  1. -1.  1. -1.  1.  1. -1. -1.
  -1.  1.  1. -1. -1. -1.  1. -1.]
 [-1. -1. -1. -1.  1. -1.  1. -1.  1.  1. -1.  1. -1.  1. -1. -1.  1.  1.
   1. -1. -1.  1.  1.  1. -1.  1.]
 [ 1.  1.  1.  1. -1.  1. -1.  1. -1. -1.  1. -1.  1. -1.  1.  1. -1. -1.
  -1.  1.  1. -1. -1. -1.  1. -1.]
 [-1. -1. -1. -1.  1. -1.  1. -1.  1.  1. -1.  1. -1.  1. -1. -1.  1.  1.
   1. -1. -1.  1.  1.  1. -1.  1.]
 [-1. -1. -1. -1.  

In [63]:
## Create data frame from item similarity matrix
## Values represented as percentages on how similar they are to another item
item_similarity_df = pd.DataFrame(item_similarity,index=ratings.columns,columns=ratings.columns)
item_similarity_df

Unnamed: 0,Ballet,Baseball,Basketball,Boxing,Cycling,Dancing,Fencing,FieldHockey,Football,Golf,...,Rugby,Skiing,Snowboarding,Surfing,Swimming,Tennis,Volleyball,WaterPolo,WeightLifting,Wrestling
Ballet,1.0,1.0,1.0,1.0,-1.0,1.0,-1.0,1.0,-1.0,-1.0,...,-1.0,-1.0,-1.0,1.0,1.0,-1.0,-1.0,-1.0,1.0,-1.0
Baseball,1.0,1.0,1.0,1.0,-1.0,1.0,-1.0,1.0,-1.0,-1.0,...,-1.0,-1.0,-1.0,1.0,1.0,-1.0,-1.0,-1.0,1.0,-1.0
Basketball,1.0,1.0,1.0,1.0,-1.0,1.0,-1.0,1.0,-1.0,-1.0,...,-1.0,-1.0,-1.0,1.0,1.0,-1.0,-1.0,-1.0,1.0,-1.0
Boxing,1.0,1.0,1.0,1.0,-1.0,1.0,-1.0,1.0,-1.0,-1.0,...,-1.0,-1.0,-1.0,1.0,1.0,-1.0,-1.0,-1.0,1.0,-1.0
Cycling,-1.0,-1.0,-1.0,-1.0,1.0,-1.0,1.0,-1.0,1.0,1.0,...,1.0,1.0,1.0,-1.0,-1.0,1.0,1.0,1.0,-1.0,1.0
Dancing,1.0,1.0,1.0,1.0,-1.0,1.0,-1.0,1.0,-1.0,-1.0,...,-1.0,-1.0,-1.0,1.0,1.0,-1.0,-1.0,-1.0,1.0,-1.0
Fencing,-1.0,-1.0,-1.0,-1.0,1.0,-1.0,1.0,-1.0,1.0,1.0,...,1.0,1.0,1.0,-1.0,-1.0,1.0,1.0,1.0,-1.0,1.0
FieldHockey,1.0,1.0,1.0,1.0,-1.0,1.0,-1.0,1.0,-1.0,-1.0,...,-1.0,-1.0,-1.0,1.0,1.0,-1.0,-1.0,-1.0,1.0,-1.0
Football,-1.0,-1.0,-1.0,-1.0,1.0,-1.0,1.0,-1.0,1.0,1.0,...,1.0,1.0,1.0,-1.0,-1.0,1.0,1.0,1.0,-1.0,1.0
Golf,-1.0,-1.0,-1.0,-1.0,1.0,-1.0,1.0,-1.0,1.0,1.0,...,1.0,1.0,1.0,-1.0,-1.0,1.0,1.0,1.0,-1.0,1.0


In [66]:
## Create recommendation method
def get_similar_item(activity_name, user_rating):
    ## Scale activity by user's rating
    similar_score = item_similarity_df[activity_name]*(user_rating-2.5)
    similar_score = similar_score.sort_values(ascending=False)
    
    return similar_score

['"Ballet":0.5', '"Gymnastics":0.5', '"Rowing":0.5', '"MartialArts":0.5', '"Baseball":0.5', '"IceHockey":0.5', '"Swimming":0.5', '"Surfing":0.5', '"FieldHockey":0.5', '"Dancing":0.5', '"WeightLifting":0.5', '"Boxing":0.5', '"Basketball":0.5', '"Snowboarding":-0.5', '"Tennis":-0.5', '"WaterPolo":-0.5', '"Volleyball":-0.5', '"Karting":-0.5', '"Skiing":-0.5', '"Rugby":-0.5', '"Handball":-0.5', '"Golf":-0.5', '"Football":-0.5', '"Fencing":-0.5', '"Cycling":-0.5', '"Wrestling":-0.5}']


In [68]:
ball_games_lover = [("Football",5), ("Rugby", 3), ("Basketball", 3)]
similar_activities = pd.DataFrame()
for activity, rating in ball_games_lover:
    similar_activities = similar_activities.append(get_similar_item(activity, rating),ignore_index=True)
    
similar_activities.head()
similar_activities.sum().sort_values(ascending=False)

Karting          2.5
Golf             2.5
WaterPolo        2.5
Volleyball       2.5
Tennis           2.5
Snowboarding     2.5
Skiing           2.5
Rugby            2.5
Handball         2.5
Wrestling        2.5
Football         2.5
Fencing          2.5
Cycling          2.5
Gymnastics      -2.5
IceHockey       -2.5
Baseball        -2.5
MartialArts     -2.5
Rowing          -2.5
FieldHockey     -2.5
Dancing         -2.5
Surfing         -2.5
Swimming        -2.5
Boxing          -2.5
Basketball      -2.5
WeightLifting   -2.5
Ballet          -2.5
dtype: float64