In [3]:
import numpy as np
import pandas as pd
import re
import time
from datasketch import MinHash, MinHashLSHForest

Reference:
https://www.learndatasci.com/tutorials/building-recommendation-engine-locality-sensitive-hashing-lsh-python/

In [5]:
# Load the papers dataset from the working directory. The functions below read
# its 'text' column (to build the forest) and its 'title' column (to report hits).
papers = pd.read_csv("papers.csv")

# Preprocess the data
def preprocess(text):
    """Turn raw text into a list of lowercase word tokens.

    Every character that is neither a word character (``\\w``) nor whitespace
    is removed, the remainder is lower-cased and then split on whitespace.

    Args:
        text: The string to tokenise. ``None`` and the float ``NaN`` (the value
            pandas uses for a missing CSV cell) are treated as empty text.

    Returns:
        list of str: The tokens in their original order; an empty list when
        the text is empty or missing.

    Raises:
        TypeError: If ``text`` is any other non-string value (unchanged from
            the behaviour of ``re.sub``).
    """
    # A missing cell arrives as None or NaN; NaN is the only float that is not
    # equal to itself. Without this guard re.sub raises TypeError and a single
    # missing row aborts the whole indexing run.
    if text is None or (isinstance(text, float) and text != text):
        return []

    without_punctuation = re.sub(r'[^\w\s]', '', text)
    return without_punctuation.lower().split()




# Create Minhash Forest for Queries
def get_forest(data, perms):
    """Build and index a MinHash LSH Forest over the documents in ``data``.

    One MinHash signature is computed per entry of ``data['text']`` from the
    tokens produced by ``preprocess``. Each signature is stored in the forest
    under its 0-based position in iteration order, so a key returned by a
    later query is a row position, not an index label.

    Args:
        data: Object whose ``'text'`` entry iterates over the document strings.
        perms: Number of permutations, passed to every ``MinHash`` and to the
            forest itself.

    Returns:
        The indexed ``MinHashLSHForest``.

    Side effects:
        Prints the elapsed build time in seconds.
    """
    began_at = time.time()

    def _signature(document):
        # Fold every token of one document into a fresh MinHash.
        sketch = MinHash(num_perm=perms)
        for word in preprocess(document):
            sketch.update(word.encode('utf8'))
        return sketch

    # Phase 1: compute all signatures.
    signatures = [_signature(document) for document in data['text']]

    # Phase 2: register them under their row position, then index the forest.
    forest = MinHashLSHForest(num_perm=perms)
    for position, sketch in enumerate(signatures):
        forest.add(position, sketch)
    forest.index()

    print('It took %s seconds to build forest.' %(time.time()-began_at))

    return forest


# Evaluate Queries
def predict(text, database, perms, num_results, forest):
    """Query the forest with ``text`` and return the titles of the matches.

    The query text is tokenised with ``preprocess`` and hashed into a MinHash
    using ``perms`` permutations. The keys returned by ``forest.query`` are
    used as positional row indices into ``database``.

    Args:
        text: Query string.
        database: Table supporting ``.iloc[...]`` with a ``'title'`` column.
        perms: Number of permutations for the query MinHash.
        num_results: Result count passed to ``forest.query``.
        forest: The forest to query.

    Returns:
        ``database.iloc[keys]['title']`` for the returned keys, in the order
        the forest returned them, or ``None`` when the forest returns no keys.

    Side effects:
        Prints the elapsed query time in seconds, but only when at least one
        key was returned.
    """
    began_at = time.time()

    words = preprocess(text)
    query_sketch = MinHash(num_perm=perms)
    for word in words:
        query_sketch.update(word.encode('utf8'))

    hit_positions = np.array(forest.query(query_sketch, num_results))
    if not len(hit_positions):
        # Nothing matched: there are no rows to look up.
        return None

    matched_rows = database.iloc[hit_positions]
    titles = matched_rows['title']

    print('It took %s seconds to query forest.' %(time.time()-began_at))

    return titles

In [16]:
# Number of permutations per MinHash signature. The same value is passed to
# both get_forest and predict below.
permutations = 128

# Build the forest over the 'text' column of the loaded papers.
forest = get_forest(papers, permutations)

# Query the forest with one title and print the titles it returns.
num_recommendations = 5
title = 'Bayesian Query Construction for Neural Network Models'
result = predict(title, papers, permutations, num_recommendations, forest)
print('\n Top Recommendation(s) is(are) \n', result)

It took 18.330833196640015 seconds to build forest.
It took 0.0020868778228759766 seconds to query forest.

 Top Recommendation(s) is(are) 
 7169    Asymptotics of Gradient-based Neural Network T...
3       Bayesian Query Construction for Neural Network...
6633    Neural Network Model Selection Using Asymptoti...
652     Global Optimisation of Neural Network Models v...
859         Neural Network Based Model Predictive Control
Name: title, dtype: object


In [14]:
# Print every paper title, one per line, to pick one to use as a query.
# NOTE(review): this prints the whole 'title' column; consider
# papers['title'].head(n) to keep the cell output short.

for i in papers['title']:
    print(i)

Self-Organization of Associative Database and Its Applications
A Mean Field Theory of Layer IV of Visual Cortex and Its Application to Artificial Neural Networks
Storing Covariance by the Associative Long-Term Potentiation and Depression of Synaptic Strengths in the Hippocampus
Bayesian Query Construction for Neural Network Models
Neural Network Ensembles, Cross Validation, and Active Learning
Using a neural net to instantiate a deformable model
Plasticity-Mediated Competitive Learning
ICEG Morphology Classification using an Analogue VLSI Neural Network
Real-Time Control of a Tokamak Plasma Using Neural Networks
Pulsestream Synapses with Non-Volatile Analogue Amorphous-Silicon Memories
Learning to Play the Game of Chess
Multidimensional Scaling and Data Clustering
An experimental comparison of recurrent neural networks
Training Multilayer Perceptrons with the Extended Kalman Algorithm
Interference in Learning Internal Models of Inverse Dynamics in Humans
Active Learning with Statistica