In [8]:
## Import necessary packages
import pandas as pd
from scipy import sparse
from sklearn.metrics.pairwise import cosine_similarity

In [26]:
## Load the ratings table, using the first CSV column as the row index,
## and substitute 0 for missing values (NaN) for now
ratings = pd.read_csv("test.csv", index_col=0).fillna(0)
ratings

Unnamed: 0,Ballet,Baseball,Basketball,Boxing,Cycling,Dancing,Fencing,Field_hockey,Football,Golf,...,Rugby,Skiing,Snowboarding,Surfing,Swimming,Tennis,Volleyball,Water_polo,Weight_lifting,Wrestling
user1,1,3,3,3,3,1,4,3,5,2,...,4,3,3,2,1,3,3,4,2,2
user2,1,4,5,4,2,4,3,5,3,1,...,3,1,2,4,5,1,2,2,3,1
user3,5,1,2,3,1,1,0,2,3,3,...,2,2,3,3,2,4,1,2,4,5
user4,2,5,4,1,1,1,3,2,4,1,...,4,3,3,2,2,4,2,2,2,2
user5,2,1,2,4,2,1,4,1,1,3,...,2,1,1,4,2,4,1,3,1,3
user6,2,3,2,4,2,4,4,1,1,3,...,2,5,1,4,2,4,1,3,1,3


In [27]:
## Method which standardises data values: they are mean-centred
## and scaled by their value range.
def standardise(row):
    """Mean-centre ``row`` and divide it by its range (max - min).

    Parameters
    ----------
    row : pandas.Series
        Numeric values to standardise.

    Returns
    -------
    pandas.Series
        ``(row - row.mean()) / (row.max() - row.min())``, or zeros when
        every value is identical.

    NOTE(review): the notebook calls this through ``ratings.apply(standardise)``;
    with ``apply``'s default axis each input is one *column* of the table, so
    despite the parameter name the values are standardised per column -
    confirm this is intended.
    """
    spread = row.max() - row.min()
    if spread == 0:
        # Constant input would otherwise be 0 / 0 = NaN; treat "no deviation
        # from the mean" as zeros so downstream similarity code gets finite values.
        return row * 0.0
    return (row - row.mean()) / spread

## Run standardise over every column of the ratings table
ratings_std = ratings.apply(standardise, axis=0)

## Build the item-to-item similarity matrix. The table is transposed
## first so that the items, which must be in rows, are compared.
item_similarity = cosine_similarity(ratings_std.transpose())
print(item_similarity)

[[ 1.00000000e+00 -5.79537864e-01 -5.37086156e-01 -1.35596798e-01
  -6.91905363e-01 -3.50823208e-01 -8.77058019e-01 -3.91076944e-01
  -1.55485768e-01  5.29751456e-01 -2.48069469e-01  3.50823208e-01
   0.00000000e+00  4.64095481e-01  6.91905363e-01 -4.80384461e-01
  -5.29751456e-01 -4.47960820e-02  2.53359392e-01 -2.30326720e-02
  -2.32047740e-01  5.23570314e-01 -6.10170216e-01 -4.43760157e-01
   5.61758162e-01  9.28190962e-01]
 [-5.79537864e-01  1.00000000e+00  7.89542034e-01 -5.16131906e-01
  -2.76394996e-02  3.22329186e-01  3.22329186e-01  4.42231994e-01
   4.54545455e-01 -8.67639859e-01  2.27921153e-01 -8.05822964e-01
  -1.61164593e-01 -6.09144904e-01  2.76394996e-02  8.82734830e-01
   7.40668172e-01  2.88104066e-01  2.75105321e-01 -3.59753112e-01
   3.04572452e-01 -3.77964473e-01  5.60611911e-01 -2.03858877e-01
  -8.89882597e-02 -7.61431130e-01]
 [-5.37086156e-01  7.89542034e-01  1.00000000e+00 -2.70500890e-01
   0.00000000e+00  3.06186218e-01  1.02062073e-01  8.40168050e-01
   4.9

In [28]:
## Wrap the item similarity matrix in a data frame labelled by item on both axes
## Each value is the cosine similarity between the row item and the column item
item_similarity_df = pd.DataFrame(
    data=item_similarity,
    index=ratings.columns,
    columns=ratings.columns,
)
item_similarity_df

Unnamed: 0,Ballet,Baseball,Basketball,Boxing,Cycling,Dancing,Fencing,Field_hockey,Football,Golf,...,Rugby,Skiing,Snowboarding,Surfing,Swimming,Tennis,Volleyball,Water_polo,Weight_lifting,Wrestling
Ballet,1.0,-0.579538,-0.537086,-0.135597,-0.691905,-0.350823,-0.877058,-0.391077,-0.155486,0.529751,...,-0.529751,-0.04479608,0.253359,-0.023033,-0.232048,0.52357,-0.6101702,-0.4437602,0.561758,0.928191
Baseball,-0.579538,1.0,0.789542,-0.516132,-0.027639,0.322329,0.3223292,0.442232,0.454545,-0.86764,...,0.740668,0.2881041,0.275105,-0.359753,0.304572,-0.377964,0.5606119,-0.2038589,-0.088988,-0.761431
Basketball,-0.537086,0.789542,1.0,-0.270501,0.0,0.306186,0.1020621,0.840168,0.493464,-0.964901,...,0.643268,-0.3127716,0.321634,-0.160817,0.694365,-0.783349,0.5809475,-0.3872983,0.270501,-0.810093
Boxing,-0.135597,-0.516132,-0.270501,1.0,0.49241,0.552158,0.2208631,0.075755,-0.622918,0.493013,...,-0.667017,-0.16921,-0.725018,0.841021,0.333914,-0.329617,-0.3492151,0.2793721,-0.170732,0.041739
Cycling,-0.691905,-0.027639,0.0,0.49241,1.0,0.171499,0.6859943,0.235294,0.138197,0.045038,...,0.225189,0.08759357,-0.225189,0.045038,-0.129641,-0.365636,0.5423261,0.8677218,-0.416655,-0.453743
Dancing,-0.350823,0.322329,0.306186,0.552158,0.171499,1.0,0.25,0.342997,-0.402911,-0.131306,...,-0.262613,0.255377,-0.525226,0.656532,0.661438,-0.533002,-0.1581139,-0.1581139,-0.110432,-0.377964
Fencing,-0.877058,0.322329,0.102062,0.220863,0.685994,0.25,1.0,-0.085749,-0.161165,-0.131306,...,0.262613,0.255377,-0.525226,0.131306,-0.094491,-0.1066,0.3162278,0.6324555,-0.883452,-0.661438
Field_hockey,-0.391077,0.442232,0.840168,0.075755,0.235294,0.342997,-0.08574929,1.0,0.52515,-0.720604,...,0.450377,-0.4379679,0.360302,-0.045038,0.713024,-0.950654,0.5965588,-0.2169305,0.530288,-0.615794
Football,-0.155486,0.454545,0.493464,-0.622918,0.138197,-0.402911,-0.1611646,0.52515,1.0,-0.613696,...,0.86764,-0.04115772,0.909964,-0.86764,-0.152286,-0.274883,0.8664002,0.1019294,0.444941,-0.304572
Golf,0.529751,-0.86764,-0.964901,0.493013,0.045038,-0.131306,-0.1313064,-0.720604,-0.613696,1.0,...,-0.793103,0.2011958,-0.448276,0.37931,-0.496292,0.615882,-0.6643638,0.3321819,-0.203005,0.794067


In [29]:
## Create recommendation method
def get_similar_item(activity_name, user_rating, neutral_rating=2.5):
    """Score every item by its similarity to ``activity_name``, weighted by the rating.

    Parameters
    ----------
    activity_name : str
        Column label to look up in ``item_similarity_df``.
    user_rating : int or float
        The rating the user gave ``activity_name``.
    neutral_rating : int or float, default 2.5
        Rating treated as neutral. Ratings above it give a positive weight,
        ratings below it a negative weight, so similar items are pushed down
        when the user disliked ``activity_name``.

    Returns
    -------
    pandas.Series
        Similarity scores scaled by ``user_rating - neutral_rating``,
        sorted from highest to lowest.

    Raises
    ------
    KeyError
        If ``activity_name`` is not a column of ``item_similarity_df``.
    """
    ## Scale activity by how far the user's rating sits from neutral
    weight = user_rating - neutral_rating
    similar_score = item_similarity_df[activity_name] * weight

    return similar_score.sort_values(ascending=False)

In [30]:
## Example user profile: (activity, rating) pairs
ball_games_lover = [("Weight_lifting",1), ("Wrestling", 1), ("Ballet", 5), ("Dancing", 5)]

## One row of weighted similarity scores per rated activity.
## DataFrame.append was removed in pandas 2.0, so the frame is built once
## from the collected Series; columns are aligned on the activity labels.
activity_scores = [get_similar_item(activity, rating) for activity, rating in ball_games_lover]
similar_activities = pd.DataFrame(activity_scores).reset_index(drop=True)

## Total score per activity across all rated activities, best match first
similar_activities.sum().sort_values(ascending=False)

Dancing           2.355536
Surfing           1.813813
Swimming          1.269033
Boxing            1.234891
Gymnastics        1.046149
Skiing            0.973902
Rowing            0.892283
Fencing           0.749690
Baseball          0.632607
Basketball        0.232138
Golf              0.109520
Field_hockey      0.008059
Cycling           0.004579
Martial_arts     -0.004579
Ice_hockey       -0.227147
Handball         -0.408804
Water_polo       -0.487284
Tennis           -0.556577
Wrestling        -0.562695
Ballet           -0.611982
Weight_lifting   -0.809945
Volleyball       -1.039411
Rugby            -1.056643
Karting          -1.432485
Football         -1.606546
Snowboarding     -1.754635
dtype: float64