In [1]:
from tensorflow.keras import layers
from tensorflow import keras
import tensorflow as tf
from sklearn.model_selection import train_test_split

from ast import literal_eval

import pandas as pd
import numpy as np




In [2]:
import streamlit as st

In [3]:
df = pd.read_csv("Final_arxiv1.csv")

In [4]:
df

Unnamed: 0,title,category_id,abstract,authors
0,Sparsity-certifying Graph Decompositions,['cs.CG'],"We describe a new algorithm, the $(k,\ell)$-...",Ileana Streinu and Louis Theran
1,A limit relation for entropy and channel capac...,['cs.IT'],"In a quantum mechanical model, Diosi, Feldma...","I. Csiszar, F. Hiai and D. Petz"
2,Intelligent location of simultaneously active ...,"['cs.NE', 'cs.AI']",The intelligent acoustic emission locator is...,T. Kosel and I. Grabec
3,Intelligent location of simultaneously active ...,"['cs.NE', 'cs.AI']",Part I describes an intelligent acoustic emi...,T. Kosel and I. Grabec
4,On-line Viterbi Algorithm and Its Relationship...,['cs.DS'],"In this paper, we introduce the on-line Vite...","Rastislav \v{S}r\'amek, Bro\v{n}a Brejov\'a, T..."
...,...,...,...,...
59995,Inventions on using sound and speech in GUI,['cs.HC'],Voice Recognition (VR) facilitates a human i...,Umakant Mishra
59996,Why a Global Time is Needed in a Dependable SoS,['cs.DC'],A system-of-systems (SoS) is a large informa...,Hermann Kopetz
59997,Inventions on Using Colors in Graphical User I...,['cs.HC'],Color is an important aspect of any graphica...,Umakant Mishra
59998,Inventions on GUI Aesthetics,['cs.HC'],"Aesthetics or ""look and feel"" is one of the ...",Umakant Mishra


In [5]:
# !pip install -U -q sentence-transformers

In [6]:
# frame = pd.read_csv('arxiv_merged_dataframe.csv'/)

In [7]:
# category_id >> terms

In [8]:
print(df.columns)

Index(['title', 'category_id', 'abstract', 'authors'], dtype='object')


In [9]:
df.drop(columns=['category_id','abstract','authors'], axis=True ,inplace=True)

In [10]:
# print(df1)

In [11]:
df['title'] = df['title'].str.replace('\n', '')

# Sentence Transformers

In [12]:
from sentence_transformers import SentenceTransformer, util

  from .autonotebook import tqdm as notebook_tqdm


In [13]:
model = SentenceTransformer('all-MiniLM-L6-v2')

In [18]:
sentences = df['title']

In [19]:
# embedding = model.encode(sentences)

In [69]:
embedding.shape

(60000, 384)

In [70]:
c = 0

for sentence, embeddings in zip(sentences, embedding):
    print("sentence :", sentence)
    print("embedding :", len(embeddings))
    print("")
    if c >= 5:
        break
    c+=1

sentence : Sparsity-certifying Graph Decompositions
embedding : 384

sentence : A limit relation for entropy and channel capacity per unit cost
embedding : 384

sentence : Intelligent location of simultaneously active acoustic emission sources:  Part I
embedding : 384

sentence : Intelligent location of simultaneously active acoustic emission sources:  Part II
embedding : 384

sentence : On-line Viterbi Algorithm and Its Relationship to Random Walks
embedding : 384

sentence : Real Options for Project Schedules (ROPS)
embedding : 384



# Save Files

In [71]:
import pickle

In [72]:
with open("models/embedding.pkl", 'wb') as f:
    pickle.dump(embedding,f)

with open("models/sentences.pkl", 'wb') as f:
    pickle.dump(sentences,f)

with open("models/rec_model.pkl", 'wb') as f:
    pickle.dump(model,f)

In [18]:
with open("models/rec_model.pkl", 'wb') as f:
    pickle.dump(model,f)

# Recommendation for similar papers

In [13]:
import pickle

In [14]:
embedding = pickle.load(open("models/embedding.pkl",'rb'))
sentence = pickle.load(open("models/sentences.pkl",'rb'))
rec_model = pickle.load(open("models/rec_model.pkl",'rb'))

In [15]:
import torch

In [16]:
def recommendation(input_paper):
    cosine_score = util.cos_sim(embedding, rec_model.encode(input_paper))
    top_similar_papers = torch.topk(cosine_score, dim = 0, k=5, sorted=True)

    paper_list = []
    for i in top_similar_papers.indices:
        paper_list.append(sentences[i.item()])

    return paper_list

In [70]:
help(util.cos_sim)

Help on function cos_sim in module sentence_transformers.util:

cos_sim(a: torch.Tensor, b: torch.Tensor)
    Computes the cosine similarity cos_sim(a[i], b[j]) for all i and j.
    :return: Matrix with res[i][j]  = cos_sim(a[i], b[j])



In [20]:
input_paper = input("Enter research paper title : ")

recommended_paper = recommendation(input_paper)
recommended_paper

Enter research paper title :  Attention is all you need


['Focus of Attention for Linear Predictors',
 'The dynamic pattern of human attention',
 'Attention-Sensitive Alerting',
 'Persistence and Success in the Attention Economy',
 '6th International Symposium on Attention in Cognitive Systems 2013']

In [21]:
papers = pd.DataFrame(recommended_paper)

In [22]:
# a = "Attention is all you need"
# myemb = rec_model.encode(a)

In [23]:
papers

Unnamed: 0,0
0,Focus of Attention for Linear Predictors
1,The dynamic pattern of human attention
2,Attention-Sensitive Alerting
3,Persistence and Success in the Attention Economy
4,6th International Symposium on Attention in Co...


In [24]:
# util.cos_sim(embedding, myemb)

In [25]:
# myemb

In [26]:
# !pip install sentence_transformers==2.2.2

# Data Cleaninig and Processing

In [27]:
arxiv_data = pd.read_csv("Final_arxiv1.csv")

In [28]:
arxiv_data = arxiv_data[~arxiv_data['title'].duplicated()]

In [29]:
arxiv_data.shape

(59910, 4)

In [30]:
type(arxiv_data['category_id'])

pandas.core.series.Series

In [31]:
print(sum(arxiv_data['category_id'].value_counts()==1))

1383


In [32]:
print(arxiv_data['category_id'].nunique())

2626


In [33]:
arxiv_data_filtered = arxiv_data.groupby('category_id').filter(lambda x: len(x) > 1)

In [34]:
        arxiv_data.shape

(59910, 4)

In [35]:
# arxiv_data_filtered['category_id']

In [36]:
arxiv_data_filtered['category_id'] = arxiv_data_filtered['category_id'].apply(lambda x: literal_eval(x))

In [37]:
arxiv_data_filtered['category_id'].values[:3]

array([list(['cs.CG']), list(['cs.IT']), list(['cs.NE', 'cs.AI'])],
      dtype=object)

In [106]:
# arxiv_data.isnull().sum()

In [107]:
# arxiv_data.duplicated().sum()

In [108]:
# arxiv_data.columns

In [110]:
# labels_columns = arxiv_data['category_id'].apply(literal_eval)
# labels = labels_columns.explode().unique()

In [111]:
# len(labels)|

In [112]:
# arxiv_data[arxiv_data['title'].duplicated()]

# Train & Test Split

In [38]:
train_df, test_df = train_test_split(arxiv_data_filtered, test_size=0.1, stratify=arxiv_data_filtered['category_id'].values )

In [39]:
test_df.shape

(5853, 4)

In [40]:
val_df = test_df.sample(frac = 0.5)
test_df.drop(val_df.index, inplace=True)

In [41]:
test_df

Unnamed: 0,title,category_id,abstract,authors
42180,A Fast Template Based Heuristic For Global Mul...,[cs.CE],Advances in bio-technology have made availab...,"Srikrishnan Divakaran, Arpit Mithal, and Namit..."
13378,Extension of Wirtinger Calculus in RKH Spaces ...,[cs.LG],"Over the last decade, kernel methods for non...","Pantelis Bouboulis, Sergios Theodoridis"
10061,A Tighter Bound for the Determinization of Vis...,"[cs.FL, cs.LO]","Visibly pushdown automata (VPA), introduced ...",Nguyen Van Tang (AIST)
20749,A Coinductive Calculus for Asynchronous Side-e...,[cs.LO],We present an abstract framework for concurr...,"Sergey Goncharov and Lutz Schr\""oder"
41539,Byzantine Vector Consensus in Complete Graphs,[cs.DC],Consider a network of n processes each of wh...,"Nitin H. Vaidya, Vijay K. Garg"
...,...,...,...,...
19757,A Note on the Sum of Correlated Gamma Random V...,[cs.IT],The sum of correlated gamma random variables...,Jose F. Paris
46323,A Note on Cyclic Codes from APN Functions,[cs.IT],"Cyclic codes, as linear block error-correcti...","Chunming Tang, Yanfeng Qi, Maozhi Xu"
46410,Hybrid Optical and Electrical Network Flows Sc...,[cs.NI],"Hybrid intra-data centre networks, with opti...",Ibrahim Kabiru Musa and Stuart Walker
14770,Weighted Automata and Recurrence Equations for...,"[cs.FL, cs.DM]",Let $\mathcal{P}(\Sigma^*)$ be the semiring ...,"Edoardo Carta-Gerardino, Parisa Babaali"


In [42]:
val_df.shape, test_df.shape

((2926, 4), (2927, 4))

In [43]:
train_df.shape

(52674, 4)

In [44]:
terms =  tf.ragged.constant(train_df['category_id'])

In [45]:
lookup = tf.keras.layers.StringLookup(output_mode='multi_hot')
lookup.adapt(terms)
vocab = lookup.get_vocabulary()





In [46]:
sample_label = train_df['category_id'].iloc[0]
print(sample_label)
label_binarized = lookup([sample_label])
print(label_binarized)

['cs.LG']
tf.Tensor(
[[0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]], shape=(1, 41), dtype=float32)


In [47]:
max_seqlen = 150
batch_size = 128
padding_token = "<pad>"
auto = tf.data.AUTOTUNE

def make_dataset(dataframe, is_train=True):
    labels = tf.ragged.constant(dataframe['category_id'].values)
    label_binarized = lookup(labels).numpy()
    dataset = tf.data.Dataset.from_tensor_slices((dataframe['abstract'].values, label_binarized))
    dataset = dataset.shuffle(batch_size*10) if is_train else dataset

    return dataset.batch(batch_size)

In [48]:
train_dataset = make_dataset(train_df, is_train=True)
val_dataset = make_dataset(val_df, is_train=True)
test_dataset = make_dataset(test_df, is_train=True)

In [49]:
def invert_multi_hot(encoded_labels):
    hot_indices = np.argwhere(encoded_labels==1.0)[...,0]
    return np.take(vocab, hot_indices)


text_batch, label_batch = next(iter(train_dataset))
for i, text in enumerate(text_batch[:5]):
    label = label_batch[1].numpy()[None, ...]
    print(text)
    print(invert_multi_hot(label[0]))

tf.Tensor(b'  Given an undirected graph $G$, the Minimum Sum Coloring problem (MSCP) is to\nfind a legal assignment of colors (represented by natural numbers) to each\nvertex of $G$ such that the total sum of the colors assigned to the vertices is\nminimized. This paper presents a memetic algorithm for MSCP based on a tabu\nsearch procedure with two neighborhoods and a multi-parent crossover operator.\nExperiments on a set of 77 well-known DIMACS and COLOR 2002-2004 benchmark\ninstances show that the proposed algorithm achieves highly competitive results\nin comparison with five state-of-the-art algorithms. In particular, the\nproposed algorithm can improve the best known results for 17 instances. We also\nprovide upper bounds for 18 additional instances for the first time.\n', shape=(), dtype=string)
['cs.DL']
tf.Tensor(b"  Most researchers acknowledge an intrinsic hierarchy in the scholarly journals\n('journal rank') that they submit their work to, and adjust not only their\nsubmissi

In [50]:
vocabulary = set()
train_df['abstract'].str.split().apply(vocabulary.update)
vocabulary_size = len(vocabulary)

# Text Vectorization

In [51]:
text_vectorizer = layers.TextVectorization(max_tokens=vocabulary_size, ngrams=2, output_mode='tf_idf')
text_vectorizer.adapt(train_dataset.map(lambda text, label: text))

In [52]:
text_vectorizer_weights = text_vectorizer.get_weights()

In [53]:
text_vectorizer_weights

[array([8.539302 , 0.6997619, 0.7040981, ..., 9.773322 , 9.485659 ,
        9.773322 ], dtype=float32),
 array([b'the', b'of', b'a', ..., b'traditional packet',
        b'traditional notions', b'traditional noncooperation'],
       dtype=object),
 0]

In [54]:
with open("models/text_vectorizer_weights.pkl", 'wb') as f:
    pickle.dump(text_vectorizer_weights,f)

In [55]:
train_dataset = train_dataset.map(lambda text, label: (text_vectorizer(text), label), num_parallel_calls=auto).prefetch(auto)
val_dataset = val_dataset.map(lambda text, label: (text_vectorizer(text), label), num_parallel_calls=auto).prefetch(auto)
test_dataset = test_dataset.map(lambda text, label: (text_vectorizer(text), label), num_parallel_calls=auto).prefetch(auto)

# Model Training

We are using Multi-Layer perceptron for Subject area prediction

In [56]:
model = keras.Sequential([
    layers.Dense(512, activation='relu'),
    layers.Dropout(0.5),
    layers.Dense(256, activation='relu'),
    layers.Dropout(0.5),
    layers.Dense(lookup.vocabulary_size(), activation='sigmoid')   
])

In [57]:
 model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['binary_accuracy'])




In [58]:
from tensorflow.keras.callbacks import EarlyStopping
es = EarlyStopping(patience=5, restore_best_weights=True)

In [59]:
# model.fit(
#     train_dataset,
#     validation_data=val_dataset, 
#     epochs=20,
#     callbacks=[es]
# )

# Save Model and Text Vectorizer
Model, Vectorizer and vocabulary

In [61]:
# model.save('models/model1.h5')

save_text_vectorizer_config = text_vectorizer.get_config()
with open('models/text_vectorizer_weights.pkl','wb') as f:
    pickle.dump(save_text_vectorizer_config, f)

with open('models/vocab.pkl', 'wb') as f:
    pickle

In [62]:
with open('models/vocab.pkl', 'wb') as f:
    pickle.dump(vocab,f)

# Load Model and Text Vectorizer:

In [63]:
load_model = keras.models.load_model('models/model1.h5')

In [65]:
with open("models/text_vectorizer_config.pkl", 'rb') as f:
    save_text_vectorizer_config = pickle.load(f)

load_text_vectorizer = text_vectorizer.from_config(save_text_vectorizer_config)

with open("models/vocab.pkl", 'rb') as f:
    loaded_vocab = pickle.load(f)

In [67]:
with open("models/training_model.pkl", 'rb') as f:
    weighs = pickle.load(f)
    load_text_vectorizer.set_weights(weighs)

TypeError: object of type 'Sequential' has no len()

In [68]:
load_text_vectorizer.load_weights("models/training_model.pkl")

AttributeError: 'TextVectorization' object has no attribute 'load_weights'

# Model Evaluation

In [72]:
# load_model

In [73]:
# labe

# Model Prediction

In [78]:
def invert_multi_hot(encoded_hot):
    hot_indeces = np.argwhere(encoded_hot == 1.0)[...,0]
    return np.take(loaded_vocab, hot_indeces)

def pred_categories(abstract, model, vectorizer, label_lookup):
    preprocessed_abstract = vectorizer([abstract])
    predictions = model.predict(preprocessed_abstract)
    preduicted_labels = label_lookup(np.round(predictions).astype(int)[0])
    return predicted_labels

In [79]:
new_abstract = "ooooo"
predicted_category = pred_categories(new_abstract, load_model, load_text_vectorizer, invert_multi_hot)



NameError: name 'predicted_labels' is not defined

In [227]:
arxiv_data['abstract'][10]

'  In some particular cases we give criteria for morphic sequences to be almost\nperiodic (=uniformly recurrent). Namely, we deal with fixed points of\nnon-erasing morphisms and with automatic sequences. In both cases a\npolynomial-time algorithm solving the problem is found. A result more or less\nsupporting the conjecture of decidability of the general problem is given.\n'

In [228]:
new_abstract = "In some particular cases we give criteria for morphic sequences to be almost\nperiodic (=uniformly recurrent). Namely, we deal with fixed points of\nnon-erasing morphisms and with automatic sequences. In both cases a\npolynomial-time algorithm solving the problem is found. A result more or less\nsupporting the conjecture of decidability of the general problem is given.\n"

predicted_category = pred_categories(new_abstract, load_model, load_text_vectorizer, invert_multi_hot)

print("Predicted Categories:", predicted_category)

FailedPreconditionError: Exception encountered when calling layer 'string_lookup_11' (type StringLookup).

{{function_node __wrapped__LookupTableFindV2_device_/job:localhost/replica:0/task:0/device:CPU:0}} Table not initialized. [Op:LookupTableFindV2] name: 

Call arguments received by layer 'string_lookup_11' (type StringLookup):
  • inputs=<tf.RaggedTensor [[b'in', b'some', b'particular', b'cases', b'we', b'give', b'criteria',
  b'for', b'morphic', b'sequences', b'to', b'be', b'almost', b'periodic',
  b'uniformly', b'recurrent', b'namely', b'we', b'deal', b'with',
  b'fixed', b'points', b'of', b'nonerasing', b'morphisms', b'and',
  b'with', b'automatic', b'sequences', b'in', b'both', b'cases', b'a',
  b'polynomialtime', b'algorithm', b'solving', b'the', b'problem', b'is',
  b'found', b'a', b'result', b'more', b'or', b'less', b'supporting',
  b'the', b'conjecture', b'of', b'decidability', b'of', b'the',
  b'general', b'problem', b'is', b'given', b'in some', b'some particular',
  b'particular cases', b'cases we', b'we give', b'give criteria',
  b'criteria for', b'for morphic', b'morphic sequences', b'sequences to',
  b'to be', b'be almost', b'almost periodic', b'periodic uniformly',
  b'uniformly recurrent', b'recurrent namely', b'namely we', b'we deal',
  b'deal with', b'with fixed', b'fixed points', b'points of',
  b'of nonerasing', b'nonerasing morphisms', b'morphisms and',
  b'and with', b'with automatic', b'automatic sequences', b'sequences in',
  b'in both', b'both cases', b'cases a', b'a polynomialtime',
  b'polynomialtime algorithm', b'algorithm solving', b'solving the',
  b'the problem', b'problem is', b'is found', b'found a', b'a result',
  b'result more', b'more or', b'or less', b'less supporting',
  b'supporting the', b'the conjecture', b'conjecture of',
  b'of decidability', b'decidability of', b'of the', b'the general',
  b'general problem', b'problem is', b'is given']]>