In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
import numpy as np
from scipy import sparse
import joblib

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [None]:
jazz_df = pd.read_csv('jazz2.csv').fillna('')
meta_df = pd.read_csv('meta.csv').fillna('')

jazz_df = jazz_df.merge(meta_df,on='url')

vectorizer = joblib.load('vectorizer.pkl')
vectors = sparse.load_npz('jazz.npz')
dense_vectors = vectors.todense().tolist()

nbrs = NearestNeighbors(n_neighbors=24,algorithm='brute',radius=2)
nbrs.fit(dense_vectors)

names = np.load('names.npy',allow_pickle=True)

term_df = pd.DataFrame(dense_vectors,columns=names)



https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [9]:
from sklearn.decomposition import PCA
pca = PCA(n_components=2)

In [18]:

def jazz_search(search_string):
    
    distances,indices = nbrs.kneighbors(vectorizer.transform([search_string]))
    mask = distances[0] < 2
    inds = indices[0][mask]

    vecs = pca.fit_transform(vectors[mask])

    points = [{'x': coordinate[0], 'y':coordinate[1]} for coordinate in vecs]

    print(points)
        
    ret_df = jazz_df.iloc[inds][['url','title','image_url','pagetype','genre']]
    
    return {'records': ret_df.to_records().tolist(), 'points':points}

In [19]:
jazz_search('john coltrane')['records']

[{'x': -0.9507963285345687, 'y': -0.19014411390449107}, {'x': -0.9171410659514696, 'y': -0.03762848048997373}, {'x': -0.12156791196667363, 'y': -0.007581546510239595}, {'x': 0.13846644707754205, 'y': 0.1384382659270665}, {'x': 0.09784192499041998, 'y': 0.0931804198878749}, {'x': 0.0898688227787068, 'y': 0.1634581585894272}, {'x': 0.13158511478974022, 'y': 0.09783947912531543}, {'x': 0.05533838244762222, 'y': 0.014448325629620635}, {'x': 0.18799912361115492, 'y': 0.09628465790000573}, {'x': 0.1253742070266082, 'y': 0.07321050093310934}, {'x': 0.17433389620627737, 'y': 0.08722788942173512}, {'x': 0.18293735836145575, 'y': 0.09750111199489171}, {'x': 0.18308728450398967, 'y': 0.09130121292197262}, {'x': 0.18820532843524176, 'y': 0.10359367272575264}, {'x': 0.21484843559073494, 'y': -0.17177904754481646}, {'x': 0.20025041521286607, 'y': -0.18917517948638185}, {'x': 0.17895318615870967, 'y': 0.09423086394452816}, {'x': 0.17701562596817905, 'y': -0.20756192935864995}, {'x': 0.164306177954942

[(177,
  'https://en.wikipedia.org/wiki/Ken_Burns_Jazz:_John_Coltrane',
  'Ken Burns Jazz: John Coltrane',
  'https://upload.wikimedia.org/wikipedia/en/thumb/5/5e/Ken_Burns_Jazz_John_Coltrane.jpg/220px-Ken_Burns_Jazz_John_Coltrane.jpg',
  'album',
  'jazz'),
 (185,
  'https://en.wikipedia.org/wiki/First_Meditations_(for_quartet)',
  'First Meditations (for quartet)',
  'https://upload.wikimedia.org/wikipedia/en/thumb/4/4a/Coltrane1stMeditationsLP.JPG/220px-Coltrane1stMeditationsLP.JPG',
  'album',
  'avant-garde jazz, free jazz, modal jazz'),
 (195,
  'https://en.wikipedia.org/wiki/Bye_Bye_Blackbird_(John_Coltrane_album)',
  'Bye Bye Blackbird (John Coltrane album)',
  'https://upload.wikimedia.org/wikipedia/en/d/da/Bye_Bye_Blackbird.jpg',
  'album',
  'avant-garde jazzmodal jazzhard bop'),
 (143,
  'https://en.wikipedia.org/wiki/The_John_Coltrane_Quartet_Plays',
  'The John Coltrane Quartet Plays',
  'https://upload.wikimedia.org/wikipedia/en/thumb/6/67/The_John_Coltrane_Quartet_Plays