In [6]:
## Imports
# Data processing
import pandas as pd
import numpy as np
import scipy.stats
from sklearn.metrics.pairwise import cosine_similarity

In [105]:
# Read datasets

users_colnames=['userId', 'gender', 'age', 'profession']
movies_colnames=['movieId', 'year', 'title']
ratings_colnames=['userId', 'movieId', 'rating']

users = pd.read_csv('cse2525-reccommender-systems-challenge/users.csv', sep=';', names=users_colnames)
movies = pd.read_csv('cse2525-reccommender-systems-challenge/movies_v2.csv', sep=';', names=movies_colnames) # changed line 3601 from dataset : ';' -> '_'
ratings = pd.read_csv('cse2525-reccommender-systems-challenge/ratings.csv', sep=';', names=ratings_colnames)

# Check
users.head()
movies.head()
ratings.head()


Unnamed: 0,userId,gender,age,profession
0,1,F,1,10
1,2,M,56,16
2,3,M,25,15
3,4,M,45,7
4,5,M,25,20


In [50]:
# Count the # of ratings per movie
ratings_by_movie_id = ratings.groupby(['movieId'])['movieId']\
    .count()\
    .reset_index(name='counts')

ratings_by_movie_id.head()


Unnamed: 0,movieId,counts
0,1,1896
1,2,635
2,3,443
3,4,155
4,5,270


In [97]:
# Number of rows remaining after saving only the ones that have `>= threshold` ratings
movies_threshold = 100
ratings_by_movie_id_processed = ratings_by_movie_id[ratings_by_movie_id['counts'] >= movies_threshold] # drop in another variable
len(ratings_by_movie_id_processed)


1940

In [98]:
# Titles of the most popular (in # of ratings) movies
relevant_movie_ids = ratings_by_movie_id.sort_values(by='counts', ascending=False)['movieId'].array
# relevant_movie_ids.array
# type(relevant_movie_ids.array)
joined_ids = list(set(movies['movieId']) & set(relevant_movie_ids))
joined_ids = [x - 1 for x in joined_ids] # 0-indexed
movies.loc[joined_ids]

Unnamed: 0,movieId,year,title
0,1,1995,Toy_Story_(1995)
1,2,1995,Jumanji_(1995)
2,3,1995,Grumpier_Old_Men_(1995)
3,4,1995,Waiting_to_Exhale_(1995)
4,5,1995,Father_of_the_Bride_Part_II_(1995)
...,...,...,...
3689,3690,1943,"Phantom_of_the_Opera,_The_(1943)"
3690,3691,1984,Runaway_(1984)
3701,3702,2000,Meet_the_Parents_(2000)
3702,3703,2000,Requiem_for_a_Dream_(2000)


In [99]:
# Create utility (user-movie) matrix
# rows - users, cols - movies
matrix = ratings.pivot_table(index='userId', columns='movieId', values='rating')
print(type(matrix))
matrix.head()

<class 'pandas.core.frame.DataFrame'>


movieId,1,2,3,4,5,6,7,8,9,10,...,3697,3698,3699,3700,3701,3702,3703,3704,3705,3706
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,


In [96]:
# Statistics on ratings values

print('The ratings dataset has', ratings['userId'].nunique(), 'unique users')
print('The ratings dataset has', ratings['movieId'].nunique(), 'unique movies')
print('The ratings dataset has', ratings['rating'].nunique(), 'unique ratings')
print('The unique ratings are', sorted(ratings['rating'].unique()))

The ratings dataset has 6040 unique users
The ratings dataset has 3695 unique movies
The ratings dataset has 5 unique ratings
The unique ratings are [1, 2, 3, 4, 5]


In [107]:
# Statistics on users values

print('The users dataset has', users['userId'].nunique(), 'unique users')
print('The users dataset has',users['gender'].nunique(), 'unique genders')
print('The users dataset has', users['age'].nunique(), 'unique age values')
print('The users dataset has', users['profession'].nunique(), 'unique professions values')

The users dataset has 6040 unique users
The users dataset has 2 unique genders
The users dataset has 7 unique age values
The users dataset has 21 unique professions values


In [108]:
# Statistics on movies values

print('The movies dataset has', movies['movieId'].nunique(), 'unique ids')
print('The movies dataset has',movies['year'].nunique(), 'unique year values')
print('The movies dataset has', movies['title'].nunique(), 'unique titles')
print('The movies dataset has year values in range', movies['year'].min(), ' - ', movies['year'].max())

The movies dataset has 3706 unique ids
The movies dataset has 82 unique year values
The movies dataset has 3688 unique titles
The movies dataset has year values in range 0  -  2000


In [101]:
# Data normalization step
# Rating > user_avg => positive value; otherwise negative value
matrix_norm = matrix.subtract(matrix.mean(axis=1), axis = 'rows')
matrix_norm.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,3697,3698,3699,3700,3701,3702,3703,3704,3705,3706
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.808511,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,


In [161]:
# User similarity matrix using alternative #1: Pearson correlation
user_similarity = matrix_norm.T.corr()
user_similarity.head()

userId,1,2,3,4,5,6,7,8,9,10,...,6031,6032,6033,6034,6035,6036,6037,6038,6039,6040
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,0.642857,-0.5625,0.333333,-0.172516,0.1139606,,-0.583333,0.644094,0.163446,...,-0.19803,0.852803,1.0,,0.381246,-0.150021,-0.394771,,0.061199,-5.0620030000000004e-17
2,0.642857,1.0,-0.208013,0.220863,-0.102728,2.7626380000000003e-17,0.2626,-0.015445,0.13525,-0.105465,...,-0.75,0.338062,0.326183,,0.338558,0.381157,0.183019,-0.5,0.483046,0.03193962
3,-0.5625,-0.208013,1.0,0.774597,-0.437621,-0.3492151,0.790569,-0.528594,0.108465,0.016264,...,-0.5,-0.342997,-0.711556,,0.559017,0.158237,-0.041345,1.0,0.315063,-0.4678087
4,0.333333,0.220863,0.774597,1.0,0.0,,-0.207514,0.534522,0.866025,-0.371479,...,-0.645497,,-0.038576,,-0.013558,0.489046,0.301511,-1.0,0.0,0.5358259
5,-0.172516,-0.102728,-0.437621,0.0,1.0,-0.7211103,-0.216966,0.168732,-0.033128,0.021635,...,0.073324,1.0,0.116775,,0.514356,0.142739,-0.074062,-1.0,1.0,0.325407


In [156]:
# User similarity matrix using alternative #2: cosine_similarity
# values in [-1, 1]; -1 : opposite, 1: really close
user_similarity_cosine = cosine_similarity(matrix_norm.fillna(0))
print(type(user_similarity_cosine)) # ndarray
user_similarity_cosine


<class 'numpy.ndarray'>


array([[ 1.        ,  0.03565432, -0.04347203, ...,  0.        ,
         0.0138397 ,  0.00903407],
       [ 0.03565432,  1.        , -0.01496278, ..., -0.01598902,
         0.02835177, -0.00176974],
       [-0.04347203, -0.01496278,  1.        , ...,  0.0414533 ,
         0.05292534, -0.04647062],
       ...,
       [ 0.        , -0.01598902,  0.0414533 , ...,  1.        ,
         0.04679397, -0.03514554],
       [ 0.0138397 ,  0.02835177,  0.05292534, ...,  0.04679397,
         1.        ,  0.05144363],
       [ 0.00903407, -0.00176974, -0.04647062, ..., -0.03514554,
         0.05144363,  1.        ]])

In [157]:
# Try finding similar users to user w/ `userID`
# Let's use `cosine_similarity` for this

userId = 1
print(type(user_similarity_cosine))
user_similarity_cosine = user_similarity_cosine[:, userId]
user_similarity_cosine = np.delete(user_similarity_cosine, userId, axis=0)
print(len(user_similarity_cosine))
user_similarity_cosine

<class 'numpy.ndarray'>
6039


array([ 0.03565432, -0.01496278, -0.02398623, ..., -0.01598902,
        0.02835177, -0.00176974])

In [162]:
# Try finding similar users to user w/ `userID`
# Let's use `Pearson correlation` for this

# Pick a user ID
picked_userid = 1
# Remove picked user ID from the candidate list
user_similarity.drop(index=picked_userid, inplace=True)
# Take a look at the data
user_similarity.head()

userId,1,2,3,4,5,6,7,8,9,10,...,6031,6032,6033,6034,6035,6036,6037,6038,6039,6040
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2,0.642857,1.0,-0.208013,0.220863,-0.102728,2.7626380000000003e-17,0.2626,-0.015445,0.13525,-0.105465,...,-0.75,0.338062,0.326183,,0.338558,0.381157,0.183019,-0.5,0.483046,0.03194
3,-0.5625,-0.2080126,1.0,0.774597,-0.437621,-0.3492151,0.790569,-0.528594,0.108465,0.016264,...,-0.5,-0.342997,-0.711556,,0.559017,0.158237,-0.041345,1.0,0.315063,-0.467809
4,0.333333,0.2208631,0.774597,1.0,0.0,,-0.207514,0.534522,0.866025,-0.371479,...,-0.645497,,-0.038576,,-0.013558,0.489046,0.301511,-1.0,0.0,0.535826
5,-0.172516,-0.1027277,-0.437621,0.0,1.0,-0.7211103,-0.216966,0.168732,-0.033128,0.021635,...,0.073324,1.0,0.116775,,0.514356,0.142739,-0.074062,-1.0,1.0,0.325407
6,0.113961,2.7626380000000003e-17,-0.349215,,-0.72111,1.0,1.0,-0.590932,-0.216007,0.333863,...,-1.0,0.92582,,,-0.419573,-0.16525,-0.272166,-0.866025,-0.693375,0.197545


In [188]:
# Tests to decide on tuning the parameters in next code cell `user_similarity_threshold` and `k`
for_userId = user_similarity[user_similarity[userId] > user_similarity_threshold][userId]
print(type(for_userId))
print(len(for_userId))
print(len(for_userId[for_userId >= .7]))


<class 'pandas.core.series.Series'>
337
337


In [246]:
# Choose `k` most similar users based on the cosine_similarity distance
# Parameters to be tuned:
# k - # of similar users
# user_similarity_threshold

# Number of similar users - first 150 are 1.0, first 160 -> 0.94
k = 200
# User similarity threshold
user_similarity_threshold = 0.3
# Get top `k` similar users
print(user_similarity.shape)
similar_users = user_similarity[user_similarity[userId] > user_similarity_threshold][userId].sort_values(ascending=False)[:k] # Print out top `k` similar users
print(f'The similar users for user {userId} are', similar_users)

(6039, 6040)
The similar users for user 5498 are userId
4400    1.0
2019    1.0
4700    1.0
916     1.0
1990    1.0
       ... 
5867    1.0
2220    1.0
4515    1.0
298     1.0
5883    1.0
Name: 5498, Length: 200, dtype: float64


  similar_users = user_similarity[user_similarity[userId] > user_similarity_threshold][userId].sort_values(ascending=False)[:k] # Print out top `k` similar users


In [247]:
# Narrow item pool in 2 steps
# Step 1. Remove movies already watched by targeted `userId`

# Movies that the target user has watched
picked_userid_watched = matrix_norm[matrix_norm.index == picked_userid].dropna(axis=1, how='all')
picked_userid_watched

movieId,1,48,145,254,514,518,575,581,582,594,...,2489,2558,2587,2593,2600,2711,2890,2899,2970,3178
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.808511,0.808511,0.808511,-0.191489,0.808511,-0.191489,-0.191489,-0.191489,0.808511,-0.191489,...,-0.191489,-0.191489,-0.191489,-0.191489,0.808511,-0.191489,0.808511,-0.191489,-0.191489,-0.191489


In [248]:
# Step 2. Keep only the movies that similar users have watched

# Remove movies that none of the similar users have watched
similar_user_movies = matrix_norm[matrix_norm.index.isin(similar_users.index)].dropna(axis=1, how='all')
similar_user_movies

movieId,1,2,3,5,6,7,8,10,11,12,...,3692,3693,3694,3695,3696,3697,3700,3702,3703,3706
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
7,,,,,-0.357143,,,,,,...,,,,,,,,,,
170,,,,,,,,,,,...,,,,,,,,,,
174,,,,,,,,-0.693182,,,...,,,,,,,,,,
189,,,,,,,,,,,...,,,,,,,,,,
234,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5842,,,,,-1.081633,,,,,,...,,,,,,,,,,
5867,,,,,,,,,,,...,,,,,,,,,,
5883,,,,,,,,,,,...,,,,,,,,,,
5940,,,,,,,,,,,...,,,,,,,,,,


In [249]:
# Remove already watched movies
similar_user_movies.drop(picked_userid_watched.columns,axis=1, inplace=True, errors='ignore') # drop cols if exits without error msgs.
similar_user_movies

movieId,2,3,5,6,7,8,10,11,12,13,...,3692,3693,3694,3695,3696,3697,3700,3702,3703,3706
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
7,,,,-0.357143,,,,,,,...,,,,,,,,,,
170,,,,,,,,,,,...,,,,,,,,,,
174,,,,,,,-0.693182,,,,...,,,,,,,,,,
189,,,,,,,,,,,...,,,,,,,,,,
234,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5842,,,,-1.081633,,,,,,,...,,,,,,,,,,
5867,,,,,,,,,,,...,,,,,,,,,,
5883,,,,,,,,,,,...,,,,,,,,,,
5940,,,,,,,,,,,...,,,,,,,,,,


In [250]:
# Reccomend Items to targetUser
# weighted avg of user similarity score & movie rating
# User w/ high similarity -> higher weights; in other words, similarity score weights the resuts

item_score = {}

for movieId in similar_user_movies.columns:
  movie_rating = similar_user_movies[movieId]
  # Stores the score
  total = 0
  # Stores the number of scores
  count = 0

  for userId in similar_users.index:
    if not pd.isna(movie_rating[userId]): #     # If the movie has rating
      score = similar_users[userId] * movie_rating[userId]
      total += score
      count +=1
  item_score[movieId] = total / count# Convert dictionary to pandas dataframe

item_score = pd.DataFrame(item_score.items(), columns=['movie', 'movie_score'])

# Sort the movies by score
ranked_item_score = item_score.sort_values(by='movie_score', ascending=False)

# Select top `m` movies
m = 1000
ranked_item_score.head(m)


Unnamed: 0,movie,movie_score
506,1014,2.009524
52,98,2.009524
1761,3129,1.900000
2036,3541,1.900000
341,684,1.796591
...,...,...
480,968,-0.041981
1078,1954,-0.042366
11,16,-0.043687
195,352,-0.045455


In [251]:
# Test - Print titles of recommended movies\
print(type(ranked_item_score))
print(movies['movieId'])
print(type(ranked_item_score['movie']))
movieIds_indices = ranked_item_score['movie'][:m].array
print(movieIds_indices)
movies.iloc[movieIds_indices]['title']
# joined_indices = list(set(movies.movieId) & set(movieIds_indices))
# print(len(joined_indices))
# print(movies.iloc[joined_indices]['title'])

<class 'pandas.core.frame.DataFrame'>
0          1
1          2
2          3
3          4
4          5
        ... 
3701    3702
3702    3703
3703    3704
3704    3705
3705    3706
Name: movieId, Length: 3706, dtype: int64
<class 'pandas.core.series.Series'>
<PandasArray>
[1014,   98, 3129, 3541,  684, 2616,  833, 3333,  882, 2591,
 ...
 1905, 2222, 2960, 1337, 1931,  968, 1954,   16,  352, 2483]
Length: 1000, dtype: int64


  movieIds_indices = ranked_item_score['movie'][:m].array


IndexError: positional indexers are out-of-bounds

In [252]:
# Predicts scores - user's ratings
# get average user's w/ `userId` movie rating score

avg_rating = matrix[matrix.index == picked_userid].T.mean()[picked_userid]
print(f'The average movie rating for user {picked_userid} is {avg_rating:.2f}')

# Calcuate the predicted rating
ranked_item_score['predicted_rating'] = ranked_item_score['movie_score'] + avg_rating# Take a look at the data
print(m)
ranked_item_score.head(m)

The average movie rating for user 1 is 4.19
1000


Unnamed: 0,movie,movie_score,predicted_rating
506,1014,2.009524,6.201013
52,98,2.009524,6.201013
1761,3129,1.900000,6.091489
2036,3541,1.900000,6.091489
341,684,1.796591,5.988080
...,...,...,...
480,968,-0.041981,4.149508
1078,1954,-0.042366,4.149123
11,16,-0.043687,4.147802
195,352,-0.045455,4.146035
