# One part of the first part (a simple recommender system)

In [1]:
import pandas as pd
from scipy import sparse 
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Load the user x movie rating table; the first CSV column becomes the index.
# Missing ratings (NaN) are replaced with 0 so the later arithmetic has no gaps.
ratings = pd.read_csv("dataset/dataset2.csv", index_col=0).fillna(0)
ratings

Unnamed: 0,action1,action2,action3,romantic1,romantic2,romantic3
user 1,4.0,5.0,3.0,0.0,2.0,1.0
user 2,5.0,3.0,3.0,2.0,2.0,0.0
user 3,1.0,0.0,0.0,4.0,5.0,4.0
user 4,0.0,2.0,1.0,4.0,0.0,3.0
user 5,1.0,0.0,2.0,3.0,3.0,4.0


In [3]:
# Standardize one Series of ratings:
#   new rating = (original rating - mean of the ratings) / (max rating - min rating)
# This centres the values on 0 and scales them by their range.
# Note: the notebook calls this through ratings.apply(standardize) with the default axis,
# so each call receives one movie's column (ratings from every user), not one user's row.
def standardize(row):
    """Mean-centre a Series of ratings and scale it by its range.

    Parameters
    ----------
    row : pandas.Series
        The ratings to standardize (one row or one column of the rating table,
        depending on the axis used by the caller).

    Returns
    -------
    pandas.Series
        ``(row - row.mean()) / (row.max() - row.min())``. When every value in
        ``row`` is identical the range is 0, and a Series of zeros is returned
        instead of the NaN values that a 0/0 division would produce.
    """
    spread = row.max() - row.min()
    if spread == 0:
        # All ratings are equal: there is no deviation from the mean to express,
        # and dividing by a zero range would yield NaN, which breaks the
        # cosine-similarity step later on.
        return pd.Series(0.0, index=row.index, name=row.name)
    return (row - row.mean()) / spread

# Standardize the whole rating table.
# axis=0 (the pandas default) hands standardize one column at a time, i.e. the
# ratings are centred and scaled per movie.
# NOTE(review): the description of standardize talks about per-user scaling;
# that would need axis=1 - confirm which one is intended.
ratings_standardize = ratings.apply(standardize, axis=0)
ratings_standardize

Unnamed: 0,action1,action2,action3,romantic1,romantic2,romantic3
user 1,0.36,0.6,0.4,-0.65,-0.08,-0.35
user 2,0.56,0.2,0.4,-0.15,-0.08,-0.6
user 3,-0.24,-0.4,-0.6,0.35,0.52,0.4
user 4,-0.44,0.0,-0.266667,0.35,-0.48,0.15
user 5,-0.24,-0.4,0.066667,0.1,0.12,0.4


In [4]:
# Item-to-item collaborative filtering: measure how similar the movies are to
# each other with the imported cosine_similarity.
# cosine_similarity compares rows, so the table is transposed first to put one
# movie on each row.
item_similarity = cosine_similarity(ratings_standardize.transpose())
print(item_similarity)

[[ 1.          0.70668875  0.81368151 -0.79941088 -0.02539184 -0.91410609]
 [ 0.70668875  1.          0.72310153 -0.84515425 -0.5189993  -0.84337386]
 [ 0.81368151  0.72310153  1.         -0.84794611 -0.3799803  -0.80218063]
 [-0.79941088 -0.84515425 -0.84794611  1.          0.14803913  0.72374686]
 [-0.02539184 -0.5189993  -0.3799803   0.14803913  1.          0.39393939]
 [-0.91410609 -0.84337386 -0.80218063  0.72374686  0.39393939  1.        ]]


In [5]:
# Wrap the raw similarity array in a DataFrame labelled with the movie names on
# both axes, so the similarity of any two movies can be looked up by title.
item_similarity_dataframe = pd.DataFrame(
    data=item_similarity,
    index=ratings.columns,
    columns=ratings.columns,
)
item_similarity_dataframe

Unnamed: 0,action1,action2,action3,romantic1,romantic2,romantic3
action1,1.0,0.706689,0.813682,-0.799411,-0.025392,-0.914106
action2,0.706689,1.0,0.723102,-0.845154,-0.518999,-0.843374
action3,0.813682,0.723102,1.0,-0.847946,-0.37998,-0.802181
romantic1,-0.799411,-0.845154,-0.847946,1.0,0.148039,0.723747
romantic2,-0.025392,-0.518999,-0.37998,0.148039,1.0,0.393939
romantic3,-0.914106,-0.843374,-0.802181,0.723747,0.393939,1.0


In [7]:
# Score every movie against one movie the user has rated:
#   score = similarity to the rated movie * (user rating - neutral rating)
# Subtracting the neutral rating (2.5 by default) makes a low rating produce a
# negative weight, so movies similar to a disliked movie are pushed to the bottom
# instead of being recommended.
def get_similar_movies(movie_name, user_rating, neutral_rating=2.5, similarity=None):
    """Rank all movies by similarity to ``movie_name``, weighted by the rating.

    Parameters
    ----------
    movie_name : str
        Column label of the rated movie in the similarity table.
    user_rating : float
        The rating the user gave to ``movie_name``.
    neutral_rating : float, optional
        Rating treated as "neither liked nor disliked". Ratings above it give a
        positive weight, ratings below it a negative one. Defaults to 2.5.
    similarity : pandas.DataFrame, optional
        Item-item similarity table indexed and labelled by movie name. Defaults
        to the notebook-level ``item_similarity_dataframe``.

    Returns
    -------
    pandas.Series
        One score per movie, sorted from highest to lowest.

    Raises
    ------
    KeyError
        If ``movie_name`` is not a column of the similarity table.
    """
    if similarity is None:
        # Fall back to the similarity table built earlier in the notebook.
        similarity = item_similarity_dataframe

    weight = user_rating - neutral_rating
    similar_score = similarity[movie_name] * weight
    return similar_score.sort_values(ascending=False)

# Example: a low rating (1) for 'romantic3' should rank the movies least similar to it first.
print(get_similar_movies(movie_name='romantic3', user_rating=1))

action1      1.371159
action2      1.265061
action3      1.203271
romantic2   -0.590909
romantic1   -1.085620
romantic3   -1.500000
Name: romantic3, dtype: float64


In [10]:
# Same idea with several ratings from one user: each (movie, rating) pair gives
# one row of scores, and summing the rows per movie gives the overall ranking.
romantic_girl = [('action2', 2), ('romantic2', 4), ('romantic1', 5)]

# Collect all score rows first and build the frame once. DataFrame.append was
# removed in pandas 2.0, and growing a frame inside a loop copies it on every
# iteration. reset_index keeps the plain 0..n-1 row labels the old
# append(ignore_index=True) produced.
similar_movies = pd.DataFrame(
    [get_similar_movies(movie, rating) for movie, rating in romantic_girl]
).reset_index(drop=True)

# Total score per movie, best recommendation first.
similar_movies.sum().sort_values(ascending=False)

romantic1    3.144636
romantic3    2.821963
romantic2    2.129597
action1     -2.389959
action3     -3.051386
action2     -3.391385
dtype: float64