In [50]:
import numpy as np
import pickle

In [51]:
# Folder that holds the precomputed artifacts loaded in the cells below.
data_folder = "data/"

# Locations of the saved ID mappings, the title lookup and the pivoted ratings matrix.
user_id_path = data_folder + "user_id_mappings.npy"
anime_id_path = data_folder + "anime_id_mappings.npy"
anime_title_path = data_folder + "anime_title_mappings.pickle"
pivoted_array_path = data_folder + "pivoted_array.npy"
# Location of the saved ALS model; passed to ALSModel.load in the ALS section.
model_path = "model"

In [52]:
# Indexed by row position of the pivoted matrix; used later to look up the user_id of the closest user.
user_id_mappings = np.load(user_id_path)

In [53]:
# Array of anime IDs; the position of an ID in this array is used as its column index in the ratings matrix.
# NOTE(review): allow_pickle=True can execute arbitrary code when the file is untrusted — only load trusted data.
anime_id_mappings = np.load(anime_id_path, allow_pickle=True)

In [54]:
# Title lookup, indexed by anime ID when the recommendations are printed.
# NOTE(review): pickle.load can execute arbitrary code — only use it on files you trust.
with open(anime_title_path, 'rb') as f:
  anime_title_mappings = pickle.load(f)

Since computing the pivot matrix is extremely intensive, we compute it once and store it to disk, and simply load it here.

In [55]:
# Precomputed ratings matrix; each row is treated as one existing user's ratings by the functions below.
pivoted_array = np.load(pivoted_array_path)

# Cosine Similarity

### Why and how do we use cosine similarity?
- Cosine similarity is our first choice because of the abundance of data in the given dataset.
- We begin by asking the new user for 5 Anime ratings of their choice.
- We use these 5 ratings to find the closest existing user in the database (*this is where cosine similarity is used*).
- We assume that this closest existing user has rated at least 10 other anime in addition to the 5 given by the new user.
- We rank these animes by the user score and use them as recommendations for the new user.
- If the assumption is false, we resort to using ALS.

In [56]:
def transform_new_ratings(new_user_ratings):
	"""Expand sparse ``(position, score)`` pairs into a dense ratings vector.

	The vector has one slot per column of the global ``pivoted_array`` and
	is zero wherever no score was supplied.

	NOTE(review): each score is written at ``position - 1``; confirm whether
	callers pass 1-based positions or 0-based column indices.
	"""
	column_count = pivoted_array.shape[1]
	dense_ratings = np.zeros(column_count)

	for position, value in new_user_ratings:
		dense_ratings[position - 1] = value

	return dense_ratings


`cosine_similarity(u, v)`: computes the cosine of the angle between two vectors, which serves as a measure of similarity between these vectors in a high-dimensional space. It ranges from -1 to 1, where:
  - 1 indicates that the vectors point in exactly the same direction (their magnitudes may still differ).
  - -1 indicates that the vectors are diametrically opposed (i.e., they point in opposite directions).
  - 0 indicates orthogonality (i.e., the vectors are perpendicular to each other).
  - Values between 0 and 1 represent varying degrees of similarity between the vectors.

In [57]:
def cosine_similarity(u, v):
	"""Return the cosine of the angle between vectors ``u`` and ``v``.

	Both inputs are converted to float64 before any arithmetic so that
	small integer dtypes (for example ``uint8`` rating rows) cannot wrap
	around inside the dot product.

	Returns 0.0 when either vector has zero norm, because the angle is
	undefined in that case.
	"""
	u = np.asarray(u, dtype=np.float64)
	v = np.asarray(v, dtype=np.float64)

	norm_u = np.linalg.norm(u)
	norm_v = np.linalg.norm(v)
	if norm_u == 0 or norm_v == 0:
		return 0.0
	return np.dot(u, v) / (norm_u * norm_v)

`find_closest_user(pivoted_array, new_user_array, new_user_only)`: finds the user most similar to the new user in terms of preferences by calculating the cosine similarity between the `new_user_only` vector and every other vector in the user-item association matrix.

In [58]:
def find_closest_user(pivoted_array, new_user_array, new_user_only):
	"""Find the existing user whose ratings best match the new user's.

	Similarity is the cosine similarity restricted to the columns the new
	user actually rated, i.e. the column indices listed in ``new_user_only``.

	Args:
		pivoted_array: 2-D ratings array, one row per existing user.
		new_user_array: Unused; kept so existing callers keep working. The
			comparison vector is built from ``new_user_only`` instead.
		new_user_only: List of ``(column_index, score)`` pairs.

	Returns:
		``(row_index, similarity)`` for the best match. Ties go to the
		lowest row index.

	Raises:
		ValueError: If ``pivoted_array`` has no rows.
	"""
	# Both of these are the same for every row, so build them once up front.
	rated_columns = [pair[0] for pair in new_user_only]
	target_scores = np.array([pair[1] for pair in new_user_only])

	scored_rows = (
		(row_index, cosine_similarity(target_scores, row[rated_columns]))
		for row_index, row in enumerate(pivoted_array)
	)

	# max() keeps the first maximal element, so ties go to the earliest row.
	return max(scored_rows, key=lambda pair: pair[1])

`top_20_for_user(pivoted_array, user_index)`: finds the top 20 Anime titles that the user has rated the highest (there may not be 20, but we deal with that later).

In [59]:
def top_20_for_user(pivoted_array, user_index):
  """Return up to 20 column indices of the titles this user rated highest.

  Indices are ordered by descending rating; equal ratings keep their
  original column order. Entries whose rating is 0 are treated as unrated
  and trimmed from the end, so the result may be shorter than 20 (or
  empty). Rows with fewer than 20 columns are handled as well.

  Args:
    pivoted_array: 2-D ratings array, one row per user.
    user_index: Row index of the user to inspect.

  Returns:
    A list of column indices (plain ints).
  """
  user_ratings = pivoted_array[user_index]

  ranked = sorted(range(len(user_ratings)), key=lambda idx: user_ratings[idx], reverse=True)[:20]

  # Count over the actual slice rather than a fixed 20 positions, so short
  # rows no longer raise IndexError.
  rated_count = sum(1 for idx in ranked if user_ratings[idx] != 0)

  return ranked[:rated_count]

The user must enter five anime IDs and their corresponding ratings. Mappings between 'anime_id' and 'title' can be found in `data/anime_title_mappings.csv`.

In [61]:
# Number of ratings the new user has to provide.
required_rating_count = 5
new_user_ratings = []

# Keep prompting until enough valid ratings are collected, so an unknown
# anime ID no longer leaves the list short.
while len(new_user_ratings) < required_rating_count:
    anime_id = int(input("Enter the anime ID: "))
    rating = int(input("Enter the rating out of 10: "))

    # Look the ID up once; the matching position is the index stored with the rating.
    matching_positions = np.where(anime_id_mappings == anime_id)[0]
    if len(matching_positions) > 0:
        new_user_ratings.append((matching_positions[0], rating))
    else:
        print("Anime ID not found in the mappings.")

print("New user ratings:", new_user_ratings)

# Keep the (index, rating) pairs under their own name; new_user_ratings is
# rebound to a dense array in a later cell.
new_user_only = new_user_ratings

New user ratings: [(7252, 8), (8695, 7), (8015, 8), (559, 6), (99, 8)]


In [62]:
# NOTE(review): this rebinds new_user_ratings from the list of pairs to a dense array, so re-running
# this cell would feed the dense array back into transform_new_ratings; new_user_only keeps the pairs.
new_user_ratings = transform_new_ratings(new_user_ratings)

In [63]:
new_user_only

[(7252, 8), (8695, 7), (8015, 8), (559, 6), (99, 8)]

In [64]:
closest_user = find_closest_user(pivoted_array, new_user_ratings, new_user_only)

In [65]:
closest_user_index = closest_user[0]
print(closest_user_index)

106326


In [66]:
# get the closest user's highest rated animes
pivoted_array[closest_user_index]

array([8, 8, 0, ..., 0, 8, 0], dtype=uint8)

In [67]:
do_als = False

In [68]:
top_20 = top_20_for_user(pivoted_array, closest_user_index)

In [69]:
# now remove already watched from the top_20 for new user
# Build the set of already-rated indices once instead of rebuilding a list for every candidate.
already_rated = {int(pair[0]) for pair in new_user_only}
new_user_recommendations = [index for index in top_20 if index not in already_rated]

In [70]:
print(new_user_recommendations)

[4090, 4775, 5785, 6680, 6753, 7702, 7926, 11, 176, 517, 765, 791, 1386, 1523, 1551, 2613, 3262, 3299, 3321, 3360]


In [71]:
# Fall back to ALS unless the closest user supplied a full top 20 and at
# least 10 of those titles are new to this user.
enough_from_neighbour = len(top_20) == 20 and len(new_user_recommendations) >= 10
if not enough_from_neighbour:
    do_als = True

In [72]:
print("Recommmendation for new user:")
# Number the list from 1 so it matches the numbering of the ALS listing.
for rank, anime_index in enumerate(new_user_recommendations, start=1):
  print(f"{rank}.", anime_title_mappings[anime_id_mappings[anime_index]])

Recommmendation for new user:
0. Angel Beats!
1. Steins;Gate
2. Yahari Ore no Seishun Love Comedy wa Machigatteiru.
3. Shigatsu wa Kimi no Uso
4. Yahari Ore no Seishun Love Comedy wa Machigatteiru. Zoku
5. Kimi no Na wa.
6. Yahari Ore no Seishun Love Comedy wa Machigatteiru. Zoku OVA
7. One Piece
8. Sen to Chihiro no Kamikakushi
9. Digimon Adventure
10. Suzumiya Haruhi no Yuuutsu
11. Nana
12. Death Note
13. Byousoku 5 Centimeter
14. Clannad Movie
15. Ookami to Koushinryou
16. Natsume Yuujinchou
17. Clannad: After Story
18. Toradora!
19. Suzumiya Haruhi no Yuuutsu (2009)


# ALS

### When will ALS be used?
- At this point we have already found the closest existing user to the new user, based on the new user's ratings of five anime.
- If the closest user doesn't have at least 10 new recommendations, we fall back to using ALS.
- ALS is used to predict 5 anime for the closest user found using cosine similarity.
- Essentially, we use ALS if cosine similarity fails to give new recommendations.

In [73]:

# Stop here when the cosine-similarity recommendations were sufficient.
# NOTE(review): inside a notebook kernel, exit() may shut the kernel down (discarding all state)
# rather than just halting this run — confirm this is intended.
if not do_als:
    exit(0)

In [74]:
from pyspark.ml.recommendation import ALS, ALSModel
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

In [75]:
spark = SparkSession.builder.appName('spark').master('local').getOrCreate()


In [76]:

# Load the previously saved ALS model from model_path.
model = ALSModel.load(model_path)

# NOTE(review): this estimator is configured but not used anywhere else in the visible notebook —
# presumably a leftover from training; confirm before removing.
als = ALS(userCol="user_id", itemCol="anime_id", ratingCol="my_score", coldStartStrategy="drop")

In [77]:
# Use a forward slash so the path resolves on POSIX systems as well as on
# Windows (a backslash is only a separator on Windows).
path = "files/final_animedataset.csv"

# Read only the columns needed for the ALS lookup and the title mapping.
df = spark.read.csv(path, header=True).select(
    F.col('user_id'), F.col('anime_id'), F.col('my_score'), F.col('title'))

In [78]:
# Drop rows missing any required field, then cast the columns (read as
# strings from the CSV) to the numeric types listed here.
required_columns = ["user_id", "anime_id", "my_score", "title"]
column_types = {"my_score": "double", "anime_id": "int", "user_id": "int"}

df = df.dropna(subset=required_columns)

for column_name, spark_type in column_types.items():
    df = df.withColumn(column_name, F.col(column_name).cast(spark_type))

In [79]:
anime_mapping_df = df.select("anime_id", "title").distinct()

In [80]:
user_id = int(user_id_mappings[closest_user_index])

user_id_list = [(user_id,)]

user_df = spark.createDataFrame(user_id_list, ["user_id"])

user_df.show()

+-------+
|user_id|
+-------+
|5173589|
+-------+



In [81]:
user_subset_recs = model.recommendForUserSubset(user_df, 5)

In [82]:
user_subset_recs.show()

+-------+--------------------+
|user_id|     recommendations|
+-------+--------------------+
|5173589|[{4282, 9.707604}...|
+-------+--------------------+



In [83]:
for row in user_subset_recs.collect():
    print(row)

Row(user_id=5173589, recommendations=[Row(anime_id=4282, rating=9.70760440826416), Row(anime_id=5460, rating=9.638671875), Row(anime_id=6438, rating=9.539900779724121), Row(anime_id=8609, rating=9.52868366241455), Row(anime_id=2514, rating=9.521927833557129)])


In [84]:
# The result may contain no row at all (e.g. if the model has nothing for
# this user), so guard the lookup instead of indexing [0] blindly.
rec_rows = user_subset_recs.collect()
rec = rec_rows[0]['recommendations'] if rec_rows else []

In [85]:
print(rec)

[Row(anime_id=4282, rating=9.70760440826416), Row(anime_id=5460, rating=9.638671875), Row(anime_id=6438, rating=9.539900779724121), Row(anime_id=8609, rating=9.52868366241455), Row(anime_id=2514, rating=9.521927833557129)]


In [86]:
# Pull just the anime IDs out of the recommendation rows, keeping their order.
anime_ids_list = [entry['anime_id'] for entry in rec]

print(anime_ids_list)

[4282, 5460, 6438, 8609, 2514]


In [87]:
print("Recommmendation for new user using ALS:")
# Titles are looked up directly by anime ID; numbering starts at 1.
for position, a_id in enumerate(anime_ids_list, start=1):
  print(f"{position}.", anime_title_mappings[a_id])

Recommmendation for new user using ALS:
1. Kara no Kyoukai 5: Mujun Rasen
2. Detective Conan Movie 13: The Raven Chaser
3. Detective Conan OVA 09: The Stranger in 10 Years...
4. Detective Conan OVA 10: Kid in Trap Island
5. Detective Conan OVA 04: Conan and Kid and Crystal Mother
