In [1]:
import numpy as np
from scipy import sparse
import joblib
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity, pairwise_distances
from sklearn.metrics.pairwise import paired_cosine_distances

# The closer the returned score is to 1, the better the paper matches
# (this reading assumes the rows of both inputs are L2-normalized).
def cosine_similarity_maximizer(array1, array2):
    """Score every row of ``array2`` by its mean dot product with the rows of ``array1``.

    Despite the name, the score is the *mean* (not the minimum or maximum) of
    the dot products between one ``array2`` row and all ``array1`` rows. When
    the rows of both inputs are L2-normalized, each dot product is a cosine
    similarity, so the score is the average cosine similarity.

    Parameters
    ----------
    array1 : 2-D dense array or scipy sparse matrix
        The local paper vectors, one vector per row.
    array2 : 2-D dense array or scipy sparse matrix
        The arXiv paper vectors, one vector per row, with the same number of
        columns as ``array1``.

    Returns
    -------
    numpy.ndarray
        1-D array with one score per row of ``array2``, in row order.
    """
    # The ``@`` operator is matrix multiplication for both dense and sparse
    # inputs. The result has one row per array2 vector and one column per
    # array1 vector.
    dot_prod = array2 @ array1.T
    # Average over the array1 axis in a single vectorized call instead of
    # slicing the product row by row in Python.
    row_means = dot_prod.mean(axis=1)
    # Sparse and np.matrix results come back as an (n, 1) matrix; flatten to 1-D.
    return np.asarray(row_means).ravel()

# Let pandas render up to 1000 columns and 1000 rows before truncating a frame.
pd.set_option("display.max_columns", 1000)
pd.set_option("display.max_rows", 1000)

# Load the arXiv metadata (one CSV per category) followed by the local papers.
df_arxiv1, df_arxiv2, df_arxiv3 = map(
    pd.read_csv,
    (
        './../arxiv_data/arxiv_physics_30000.csv',
        './../arxiv_data/arxiv_fluid_30000.csv',
        './../arxiv_data/arxiv_particle_30000.csv',
    ),
)
df_local = pd.read_csv('./../training_data/local_papers.csv')

# Stack the three category frames row-wise, keeping the union of their columns.
# The original row labels are kept, so index values repeat across categories.
df_arxiv = pd.concat(
    [df_arxiv1, df_arxiv2, df_arxiv3],
    join='outer',
)

# Load the normalized vectors
local_array = sparse.load_npz('./../training_data/normalized_train_vectors.npz')
arxiv_array1 = sparse.load_npz('./../arxiv_data/normalized_arxiv_paper_vectors_physics.npz')
arxiv_array2 = sparse.load_npz('./../arxiv_data/normalized_arxiv_paper_vectors_fluid.npz')
arxiv_array3 = sparse.load_npz('./../arxiv_data/normalized_arxiv_paper_vectors_particle.npz')

# Stack the per-category matrices row-wise without leaving sparse storage.
# Densifying each matrix first allocates every zero entry only to discard it again.
# The physics -> fluid -> particle order matches the order used to build df_arxiv,
# which the positional score assignment later relies on.
arxiv_array = sparse.csr_matrix(
    sparse.vstack([arxiv_array1, arxiv_array2, arxiv_array3], format='csr')
)
sparse.save_npz('./../arxiv_data/normalized_arxiv_paper_vectors.npz', matrix=arxiv_array)

# Set to False to reuse the scores already saved on disk instead of recomputing
# them; recomputing is only needed when creating a new recommender.
CREATE_NEW_RECOMMENDER = True

if CREATE_NEW_RECOMMENDER:
    cos_maximizer = cosine_similarity_maximizer(local_array, arxiv_array)
    joblib.dump(cos_maximizer, './../arxiv_data/cosine_similarity_maximizer')

# Always read the scores back from disk so both modes continue from the same artifact.
# NOTE: joblib.load unpickles the file, so only load artifacts this project produced.
cos_maximizer = joblib.load('./../arxiv_data/cosine_similarity_maximizer')

# Fail early with a clear message if the (possibly cached) scores no longer
# line up with the metadata rows.
if len(cos_maximizer) != len(df_arxiv):
    raise ValueError(
        'cosine_similarity_maximizer has %d scores but df_arxiv has %d rows; '
        'recompute the scores with CREATE_NEW_RECOMMENDER = True'
        % (len(cos_maximizer), len(df_arxiv))
    )

# The scores are a plain array, so they are attached to the rows by position.
df_arxiv['cosine_similarity_maximizer'] = cos_maximizer

# Rank the papers from highest to lowest score.
# NOTE(review): drop_duplicates() compares every column, including the score.
# Confirm this removes the duplicates it is meant to remove.
df_arxiv_arranged = df_arxiv.sort_values(by='cosine_similarity_maximizer', ascending=False).drop_duplicates()

df_arxiv_arranged.to_csv('./../arxiv_data/df_arxiv_arranged.csv', index=False)

# NOTE: joblib.load unpickles the file, so only load artifacts this project produced.
vocab = joblib.load('./../trained_models/cvec_vocabulary')

In [2]:
# Show the arXiv papers ranked by cosine_similarity_maximizer, highest score first.
df_arxiv_arranged

Unnamed: 0,title,id,authors,abstract,journal,published,cosine_similarity_maximizer
3632,Particle accretion onto planets in discs with ...,http://arxiv.org/abs/1803.08730v2,"['Giovanni Picogna', 'Moritz H. R. Stoll', 'Wi...",The growth process of proto-planets can be s...,"A&A 616, A116 (2018)",2018-03-23 10:49:31+00:00,0.320196
11482,Slipping motion of large neutrally-buoyant par...,http://arxiv.org/abs/1306.1388v1,"['Mamadou Cisse', 'Holger Homann', 'Jeremie Bec']",Direct numerical simulations are used to inv...,,2013-06-06 12:16:09+00:00,0.319152
4096,Restricted Euler dynamics along trajectories o...,http://arxiv.org/abs/1608.02464v1,"['Perry L. Johnson', 'Charles Meneveau']",The fate of small particles in turbulent flo...,,2016-08-08 14:50:03+00:00,0.311234
1084,Interface-resolved simulations of small inerti...,http://arxiv.org/abs/1906.01249v5,"['Pedro Costa', 'Luca Brandt', 'Francesco Pica...",We present a direct comparison between inter...,,2019-06-04 07:49:19+00:00,0.311131
3627,Particle dynamics in discs with turbulence gen...,http://arxiv.org/abs/1607.02322v1,"['Moritz H. R. Stoll', 'Wilhelm Kley']",Among the candidates for generating turbulen...,"A&A 594, A57 (2016)",2016-07-08 11:40:16+00:00,0.310576
...,...,...,...,...,...,...,...
12088,Response to arXiv:1005.2615,http://arxiv.org/abs/1005.3723v1,"['J. I. Collar', 'D. N. McKinsey']",Brief response to a Reply [arXiv:1005.2615] ...,,2010-05-20 14:58:48+00:00,0.008555
2457,Thomas precession angle and spinor algebra,http://arxiv.org/abs/physics/0010082v2,"['Shao-Hsuan Chiu', 'T. K. Kuo']",See math-ph/0205036 for an expanded version.\n,,2000-10-31 20:04:51+00:00,0.008545
11193,"Vertex Lie algebras, vertex Poisson algebras a...",http://arxiv.org/abs/math/0102127v1,"['C. Dong', 'H. Li', 'G. Mason']",The notions of vertex Lie algebra and vertex...,,2001-02-16 22:26:10+00:00,0.008273
255,Seiberg-Witten Invariants-An Expository Account,http://arxiv.org/abs/math/9809044v1,"['Kapil Paranjape', 'Vishwambhar Pati']","In this note, we give an exposition of the c...",,1998-09-09 05:32:14+00:00,0.008138
