In [None]:
import pandas as pd


# Load the raw play log; later cells pivot it on its post_id / user_id / play
# columns.
post_data = pd.read_csv("data.csv")

# Preview only the first rows instead of rendering the whole frame.
post_data.head()



In [16]:
from scipy.sparse import csr_matrix

# Reshape the long-format log into a post x user table: one row per post_id,
# one column per user_id, cells holding `play`. Pairs that never occur come
# out of the pivot as NaN and are replaced with 0.
wide_post_data = (
    post_data
    .pivot(index='post_id', columns='user_id', values='play')
    .fillna(0)
)

# CSR copy of the same values, used to fit the nearest-neighbour model.
wide_post_data_sparse = csr_matrix(wide_post_data.values)

In [19]:
import numpy as np


def save_sparse_csr(filename, array):
    """
    Write a CSR matrix to disk as a NumPy .npz archive.

    Inputs:
        filename: destination path, passed straight to np.savez
        array: matrix exposing the CSR attributes data, indices, indptr and
            shape
    Return: None

    The archive stores four entries named 'data', 'indices', 'indptr' and
    'shape', which is the layout load_sparse_csr reads back.
    """
    csr_components = {
        "data": array.data,
        "indices": array.indices,
        "indptr": array.indptr,
        "shape": array.shape,
    }
    np.savez(filename, **csr_components)


def load_sparse_csr(filename):
    """
    Rebuild a CSR matrix from an .npz archive written by save_sparse_csr.

    Inputs:
        filename: path of an archive holding the entries 'data', 'indices',
            'indptr' and 'shape'
    Return: scipy.sparse.csr_matrix with the stored contents and shape
    """
    # np.load on an .npz returns a lazily-read archive that keeps the file
    # open; read every array inside the context manager so the handle is
    # closed before returning.
    with np.load(filename) as loader:
        data = loader['data']
        indices = loader['indices']
        indptr = loader['indptr']
        # 'shape' is stored as a small integer array; hand scipy a plain tuple.
        shape = tuple(int(dim) for dim in loader['shape'])
    return csr_matrix((data, indices, indptr), shape=shape)

# Persist the sparse post x user matrix next to the notebook.
save_sparse_csr(
    filename="post_data_sparse_matrix.npz",
    array=wide_post_data_sparse,
)


In [20]:
# Export the dense post x user table (index included) as CSV.
wide_post_data.to_csv(path_or_buf="post_record.csv")

In [21]:
from sklearn.neighbors import NearestNeighbors

# Brute-force neighbour search using cosine distance between post rows.
model_knn = NearestNeighbors(algorithm='brute', metric='cosine')

# Fit on the sparse post x user matrix; each fitted sample is one post.
model_knn.fit(wide_post_data_sparse)


In [39]:
# Pick a random row position to use as the query post.
query_index = np.random.choice(wide_post_data.shape[0])

# kneighbors expects a 2-D input, so the single row is reshaped to (1, n_users).
query_vector = wide_post_data.iloc[query_index, :].values.reshape(1, -1)
distances, indices = model_knn.kneighbors(query_vector, n_neighbors=6)

neighbor_distances = distances.flatten()
neighbor_positions = indices.flatten()

# Entry 0 is treated as the query post itself: it only supplies the header.
print("Recommendations with binary data for {0}:\n".format(
    wide_post_data.index[query_index]))
for rank in range(1, len(neighbor_distances)):
    print("{0}: {1}, with distances of {2}".format(
        rank,
        wide_post_data.index[neighbor_positions[rank]],
        neighbor_distances[rank]))


Recommendations with binary data for 643918706e3f5b0c72edeeae:

1: 643918656e3f5b0c72edeead, with distances of 0.29289321881345254
2: 6437b1cb4f8a110a39fce506, with distances of 0.29289321881345254
3: 6441635b13792129775e0534, with distances of 1.0
4: 6441637d13792129775e0537, with distances of 1.0
5: 6441635113792129775e0533, with distances of 1.0


In [42]:
import string
from fuzzywuzzy import fuzz


def print_post_recommendations(post_id, post_data_matrix, knn_model, k):
    """
    Print the nearest-neighbour posts for a given post id.

    Inputs:
        post_id: id of the post to look up; it is matched against the labels
            of post_data_matrix.index (a fuzz.ratio score of 100 is required)
        post_data_matrix: post_data_matrix (not the sparse one, the pandas
            dataframe). Neighbour positions returned by the model are mapped
            back to ids through its index, so it is assumed to hold the same
            rows, in the same order, as the data knn_model was fitted on.
        knn_model: our previously fitted knn_model (anything exposing
            kneighbors(X, n_neighbors=...))
        k: the number of nearest neighbors to print
    Prints: the matched post's row followed by up to k recommendations, or a
        "didn't match" message when post_id is not found
    Return: None
    """
    # Locate the row position of the first index label matching post_id.
    query_index = None
    for position, candidate in enumerate(post_data_matrix.index):
        if fuzz.ratio(candidate, post_id) == 100:
            query_index = position
            break

    # Without a match there is nothing to query; stop instead of indexing
    # the frame with None.
    if query_index is None:
        print("Your post didn\'t match any post id in the data. Try again")
        return None

    query_row = post_data_matrix.iloc[query_index, :]
    print(query_row)

    # Request one extra neighbour because the query post is normally its own
    # closest match, but never more neighbours than there are rows.
    n_neighbors = min(k + 1, len(post_data_matrix))
    distances, indices = knn_model.kneighbors(
        query_row.values.reshape(1, -1), n_neighbors=n_neighbors
    )

    print("Recommendations for {0}\n".format(
        post_data_matrix.index[query_index]))

    # Drop the query post by position rather than assuming it is entry 0:
    # with tied distances it can appear anywhere in the result.
    recommendations = [
        (position, distance)
        for position, distance in zip(indices.flatten(), distances.flatten())
        if position != query_index
    ][:k]

    for rank, (position, distance) in enumerate(recommendations, start=1):
        print("{0}: {1}, with distance of {2}".format(
            rank, post_data_matrix.index[position], distance))
    return None


In [None]:
# Show the 6 nearest posts for one known post id.
print_post_recommendations(
    post_id="6437b1cb4f8a110a39fce506",
    post_data_matrix=wide_post_data,
    knn_model=model_knn,
    k=6,
)
