In [44]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy import sparse
from api import get_all_accounts, get_collected_by_public_key, get_release_by_public_key

In [None]:
#------------------
#  CREATE DATASET
#  https://stackoverflow.com/questions/59245143/how-to-turn-a-column-of-lists-in-pandas-to-a-sparse-dataframe-of-the-unique-valu
#
#  this cell takes 10 minutes to run. outputs nina_user_collections.csv which can be found in repo
#  only run if current data is necessary
#------------------

accounts, _ = get_all_accounts(3000)
df = {}
print(len(accounts))
count = 0
for acc in accounts:
  count += 1
  if not count % 100: print(count)
  try:
    df[acc] = [[release.public_key for release in get_collected_by_public_key(acc)]]
    if not len(df[acc][0]): del df[acc]
  except:
    pass

print("done w requests")

df = pd.DataFrame.from_dict(df,orient='index')

mlb = MultiLabelBinarizer()
df = pd.DataFrame(mlb.fit_transform(df[0]),columns=mlb.classes_, index=df.index)

df.to_csv('nina_user_collections.csv', index=True)



In [45]:
#------------------
# LOAD THE DATASET
#------------------

data = pd.read_csv('nina_user_collections.csv', index_col=0)
data_items = data.drop(data.columns[0],axis=1)


In [46]:
#------------------------
# ITEM-ITEM CALCULATIONS
#------------------------

# As a first step we normalize the account vectors to unit vectors.

# magnitude = sqrt(x2 + y2 + z2 + ...)
magnitude = np.sqrt(np.square(data_items).sum(axis=1))

# unitvector = (x / magnitude, y / magnitude, z / magnitude, ...)
data_items = data_items.divide(magnitude, axis='index')

def calculate_similarity(data_items):
    """Calculate the column-wise cosine similarity for a sparse
    matrix. Return a new dataframe matrix with similarities.
    """
    data_sparse = sparse.csr_matrix(data_items)
    similarities = cosine_similarity(data_sparse.transpose())
    sim = pd.DataFrame(data=similarities, index= data_items.columns, columns= data_items.columns)
    return sim

# Build the similarity matrix (this is item_matrix.csv in the repo)
data_matrix = calculate_similarity(data_items)

# Lets get the top 11 similar releases for https://ninaprotocol.com/9ybyxyNaKi9cm7jENmy26XBCazkEq3nu2KvQ2nTRumJS
print(data_matrix.loc['9ybyxyNaKi9cm7jENmy26XBCazkEq3nu2KvQ2nTRumJS'].nlargest(11))

2P6JUnmrPTVZSSeDvJCe3EFdnAV7PvAPNjT3wecWqtZs    1.0
36dALkT9L88WR3PydN3QHBFpHzBgDt3qGfvaXnW9nSUp    1.0
3TyXX1jeUCrfsyDy21kTMSoxfZC2mLVdYk5vkCJBZaBx    1.0
3VDiNspnzWEQsUa1wRPSaBvMbtxHe5c8rVgv5dkzHqjU    1.0
4RpgVX7U58pLZPEKJb95pCvxxJiL1bFGyGQNF8Hgqtrh    1.0
5fSpw7aUcFLFSej75qnuZY8wwVy4sUmSExdWDxX3JVD2    1.0
6ALby4N8adRqFWB9mZw3fgWj8z1uzrd1z8VTDXwDH11D    1.0
6XERbjDd2myURWo21LYF74kXVLYqmUb8818Ptk5Z924o    1.0
74iE8coUgAJRpAA5f43UDKUsWTxuHSDbHZbB9GQXAyuA    1.0
7uoNjvp6cdSVQzE4ErGqCCUpWnx7J7pmFXHJEU4bzGzu    1.0
94K4QzryBjXXcE1fyDiu2S1CeHMpwZd22ecVx8cfNNnX    1.0
Name: 9ybyxyNaKi9cm7jENmy26XBCazkEq3nu2KvQ2nTRumJS, dtype: float64


In [47]:
#------------------------
# USER-ITEM CALCULATIONS
#------------------------

# Construct a new dataframe with the 10 closest neighbours (most similar)
# for each release.

data_neighbours = pd.DataFrame(index=data_matrix.columns, columns=range(1,11))
print(data_neighbours.shape)
print(len(data_matrix.columns))
for release in data_matrix.columns:
  data_neighbours.loc[release] = data_matrix.loc[:, release].sort_values(ascending=False)[:10].index



(1570, 10)
1570


In [48]:
def get_recommendations(user_key, num_recs):
  # Get the releases the user has collected.
  user_collection = [release.public_key for release in get_collected_by_public_key(user_key)]

  # Construct the neighbourhood from the most similar releases to the
  # ones our user has already collected.
  most_similar_to_collection = data_neighbours.loc[user_collection]
  similar_list = most_similar_to_collection.values.tolist()
  similar_list = list(set([item for sublist in similar_list for item in sublist]))

  neighbourhood = data_matrix[similar_list].loc[similar_list]

  # A user vector containing only the neighbourhood releases and
  # the known user owned releases.

  user_vector = data.loc[user_key][similar_list]

  # Calculate the score.
  score = neighbourhood.dot(user_vector).div(neighbourhood.sum(axis=1))

  # Drop the releases the user already owns.
  score = score.drop(user_collection)

  # print(user_collection)
  return score.nlargest(num_recs)

In [49]:
#------------------------
# EXAMPLE:
#------------------------

# Get top 5 recommendations for a user
recs = get_recommendations('MW8D5zJcQ35orcYizKgGzx3tv8YM276ibznvesAvnas', 5)
recs_info = [get_release_by_public_key(key) for key in recs.index.values]
for release in recs_info:
  print(f'{release.metadata.name}: {release.metadata.external_url}')

Teu - Um yes hello be well goodbye: https://ninaprotocol.com/ATvYwzdNMfSoaF59GXpF8S3Z2zUX3Eo6Aeg1pb6VXuL1
Music For Graphic Designers - Italia: https://ninaprotocol.com/7MDavy863pJqvydrEBracZXSNnFtzXuzpaEspWGPoC5v
Mifella - Milady You Are My Baby: https://ninaprotocol.com/6q4Xu44V79BmDAy6Cg3q1VQBTNGrZWAUvvEbNSfdXF1t
The Transcendence Orchestra - Serviceable villain / An ancient city and several nuclear explosions: https://ninaprotocol.com/J1LkFQ76xe6u8ZB4i4dJNRPKvo9DdbYwVWmCuhAwEFSJ
PANZERSCHOKOLADE - body speaking: https://ninaprotocol.com/EzF5ya9KjFWaShugKYiDm97a5nva2Np8gWwVK2iEN4Sw


In [None]:
#------------------------
# REFERENCES:
# https://medium.com/radon-dev/item-item-collaborative-filtering-with-binary-or-unary-data-e8f0b465b2c3
# http://www.salemmarafi.com/code/collaborative-filtering-with-python/
# https://realpython.com/build-recommendation-engine-collaborative-filtering/
#------------------------