<a href="https://colab.research.google.com/github/panik-79/arxiv_paper_recommendation_model/blob/main/rec.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [250]:
from tensorflow.keras import layers
from tensorflow import keras
import tensorflow as tf

from sklearn.model_selection import train_test_split

from ast import literal_eval
# is used for safely evaluating strings containing Python literals or container displays
# (e.g., lists, dictionaries) to their corresponding Python objects.

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

In [251]:
arxiv_data = pd.read_csv("arxiv_data_210930-054931.csv")

In [252]:
# remove duplicate entries based on the "titles" (terms) column
# This filters the DataFrame, keeping only the rows where the titles are not duplicated.
arxiv_data = arxiv_data[~arxiv_data['titles'].duplicated()]
print(f"There are {len(arxiv_data)} rows in the deduplicated dataset.")
# There are some terms with occurrence as low as 1.
print(sum(arxiv_data['terms'].value_counts()==1))
# how many unique terms
print(arxiv_data['terms'].nunique())

There are 41105 rows in the deduplicated dataset.
2503
3401


In [253]:
# getting unique labels
labels_column = arxiv_data['terms'].apply(literal_eval)
labels = labels_column.explode().unique()
print("labels :",labels)
print("lenght :",len(labels))

labels : ['cs.LG' 'cs.AI' 'cs.CR' ... 'D.1.3; G.4; I.2.8; I.2.11; I.5.3; J.3'
 '68T07, 68T45, 68T10, 68T50, 68U35' 'I.2.0; G.3']
lenght : 1177


In [254]:
# Filtering the rare terms. (it keeps only those rows where the "terms" value occurs more than once in the original DataFrame.)
arxiv_data_filtered = arxiv_data.groupby('terms').filter(lambda x: len(x) > 1)
arxiv_data_filtered.shape

(38602, 3)

In [255]:
# It evaluates the given string containing a Python literal or container display (e.g., a list or dictionary) and returns the corresponding Python object.
arxiv_data_filtered['terms'] = arxiv_data_filtered['terms'].apply(lambda x: literal_eval(x))
arxiv_data_filtered['terms'].values[:3]

array([list(['cs.LG']), list(['cs.LG', 'cs.AI']),
       list(['cs.LG', 'cs.CR', 'stat.ML'])], dtype=object)

In [256]:
test_split = 0.25

# Initial train and test split.
# The stratify parameter ensures that the splitting is done in a way that preserves the same distribution of labels (terms) in both the training and test sets.
train_df, test_df = train_test_split(arxiv_data_filtered,test_size=test_split,stratify=arxiv_data_filtered["terms"].values,)

# Splitting the test set further into validation
# and new test sets.
val_df = test_df.sample(frac=0.5)
test_df.drop(val_df.index, inplace=True)

print(f"Number of rows in training set: {len(train_df)}")
print(f"Number of rows in validation set: {len(val_df)}")
print(f"Number of rows in test set: {len(test_df)}")

Number of rows in training set: 28951
Number of rows in validation set: 4826
Number of rows in test set: 4825


In [257]:
from sklearn.preprocessing import MultiLabelBinarizer

# Convert the terms column to a list of lists
terms_list = train_df['terms'].tolist()

# Initialize MultiLabelBinarizer
mlb = MultiLabelBinarizer()

# Fit MultiLabelBinarizer on the terms list to build the vocabulary
mlb.fit(terms_list)

# Transform the terms list into a multi-hot encoded representation
multi_hot_encoded = mlb.transform(terms_list)

# Get the vocabulary (terms)
vocab = mlb.classes_

print("Vocabulary:\n")
print(vocab)
len(vocab)


Vocabulary:

['14J60 (Primary) 14F05, 14J26 (Secondary)' '60L10, 60L20' '62H30' '62H35'
 '62H99' '65D19' '68' '68Q32' '68T01' '68T05' '68T07' '68T10' '68T30'
 '68T45' '68T99' '68Txx' '68U01' '68U10'
 'E.5; E.4; E.2; H.1.1; F.1.1; F.1.3' 'F.2.2; I.2.7' 'G.3'
 'H.3.1; H.3.3; I.2.6; I.2.7' 'H.3.1; I.2.6; I.2.7' 'I.2' 'I.2.0; I.2.6'
 'I.2.1' 'I.2.10' 'I.2.10; I.2.6' 'I.2.10; I.4.8' 'I.2.10; I.4.8; I.5.4'
 'I.2.10; I.4; I.5' 'I.2.10; I.5.1; I.4.8' 'I.2.1; J.3' 'I.2.6'
 'I.2.6, I.5.4' 'I.2.6; I.2.10' 'I.2.6; I.2.7'
 'I.2.6; I.2.7; H.3.1; H.3.3' 'I.2.6; I.2.8' 'I.2.6; I.2.9' 'I.2.6; I.5.1'
 'I.2.6; I.5.4' 'I.2.7' 'I.2.8' 'I.2; I.2.6; I.2.7' 'I.2; I.4; I.5'
 'I.2; I.5' 'I.2; J.2' 'I.3.7' 'I.4' 'I.4.0' 'I.4.1' 'I.4.3' 'I.4.4'
 'I.4.5' 'I.4.6' 'I.4.6; I.4.8' 'I.4.8' 'I.4.9' 'I.4.9; I.5.4' 'I.4; I.5'
 'I.5.2' 'I.5.4' 'K.3.2' 'astro-ph.IM' 'cond-mat.dis-nn'
 'cond-mat.mtrl-sci' 'cond-mat.soft' 'cond-mat.stat-mech' 'cs.AI' 'cs.AR'
 'cs.CC' 'cs.CE' 'cs.CG' 'cs.CL' 'cs.CR' 'cs.CV' 'cs.CY' 'cs.DB' 'cs

164

In [258]:
import nltk
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
nltk.download("stopwords")
import re

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [259]:
ps = PorterStemmer()
len(arxiv_data_filtered)
arxiv_data_filtered['abstracts'][0]

"Graph neural networks (GNNs) have been widely used to learn vector\nrepresentation of graph-structured data and achieved better task performance\nthan conventional methods. The foundation of GNNs is the message passing\nprocedure, which propagates the information in a node to its neighbors. Since\nthis procedure proceeds one step per layer, the range of the information\npropagation among nodes is small in the lower layers, and it expands toward the\nhigher layers. Therefore, a GNN model has to be deep enough to capture global\nstructural information in a graph. On the other hand, it is known that deep GNN\nmodels suffer from performance degradation because they lose nodes' local\ninformation, which would be essential for good model performance, through many\nmessage passing steps. In this study, we propose multi-level attention pooling\n(MLAP) for graph-level classification tasks, which can adapt to both local and\nglobal structural information in a graph. It has an attention pooling 

In [260]:
stop_words = set(stopwords.words('english'))

corpus_train = []

for abstract in train_df['abstracts']:
    # Remove newline characters and lowercasing
    abstract = re.sub('\n', ' ', abstract).lower()
    abstract = re.sub('[^a-zA-Z]', ' ', abstract)
    # Tokenize the abstract
    words = abstract.split()
    # Stemming and removing stopwords
    stemmed_abstract = [ps.stem(word) for word in words if word not in stop_words]
    # Join the stemmed words back into a sentence
    stemmed_abstract = ' '.join(stemmed_abstract)
    corpus_train.append(stemmed_abstract)

In [261]:
# Binarize all labels using the MultiLabelBinarizer
train_labels_binarized = mlb.transform(train_df["terms"])
test_labels_binarized = mlb.transform(test_df["terms"])
val_labels_binarized = mlb.transform(val_df["terms"])

In [262]:
train_df["abstracts"] = corpus_train

In [263]:
train_df.head()

Unnamed: 0,terms,titles,abstracts
15787,[cs.CV],Fingerprint Presentation Attack Detection: A S...,vulner autom fingerprint recognit system prese...
55156,"[cs.CV, cs.LG]",Making Better Mistakes: Leveraging Class Hiera...,deep neural network improv imag classif dramat...
21925,[cs.LG],A Generative Model to Synthesize EEG Data for ...,predict seizur occur vital bring normalci live...
16723,"[stat.ML, cs.LG, cs.NE, stat.ME]",Scalable Out-of-Sample Extension of Graph Embe...,sever popular graph embed techniqu represent l...
37562,[cs.CV],Depth Based Semantic Scene Completion with Pos...,semant scene complet ssc refer task infer sema...


In [264]:
import tensorflow as tf

# Define the maximum sequence length and batch size
max_seqlen = 150
batch_size = 128

# Define the function to create the dataset
def make_dataset(dataframe, labels, is_train):

    # Convert the abstracts column to a TensorFlow dataset
    abstracts_dataset = tf.data.Dataset.from_tensor_slices(dataframe['abstracts'].values)

    # Create a dataset of labels using the binarized representation
    labels_dataset = tf.data.Dataset.from_tensor_slices(labels)

    # Zip the abstracts and labels datasets together to create a single dataset of tuples
    dataset = tf.data.Dataset.zip((abstracts_dataset, labels_dataset))

    # Shuffle the dataset
    dataset = dataset.shuffle(buffer_size=len(dataframe)) if is_train else dataset

    return dataset.batch(batch_size)


In [265]:
train_dataset = make_dataset(train_df, train_labels_binarized, is_train=True)
validation_dataset = make_dataset(val_df, train_labels_binarized, is_train=False)
test_dataset = make_dataset(test_df, train_labels_binarized, is_train=False)

In [266]:
# Define a function to invert the multi-hot encoded labels
def invert_multi_hot(label):
    return [vocab[i] for i, val in enumerate(label) if val == 1]

# Iterate through batches of the training dataset and print the abstract text along with the corresponding labels
text_batch, label_batch = next(iter(train_dataset))
for i, text in enumerate(text_batch[:5]):
    label = label_batch[i].numpy()
    print(f"Abstract: {text}")
    print(f"Label(s): {invert_multi_hot(label)}")
    print(" ")


Abstract: b'incorpor multi scale featur fulli convolut neural network fcn key element achiev state art perform semant imag segment one common way extract multi scale featur feed multipl resiz input imag share deep network merg result featur pixelwis classif work propos attent mechan learn softli weight multi scale featur pixel locat adapt state art semant imag segment model jointli train multi scale input imag attent model propos attent model outperform averag max pool allow us diagnost visual import featur differ posit scale moreov show ad extra supervis output scale essenti achiev excel perform merg multi scale featur demonstr effect model extens experi three challeng dataset includ pascal person part pascal voc subset ms coco'
Label(s): ['cs.CV']
 
Abstract: b'recent cnn base method imag derain achiev excel perform term reconstruct error well visual qualiti howev method limit sens train fulli label data due variou challeng obtain real world fulli label imag derain dataset exist meth

In [267]:
vocabulary = set()
train_df["abstracts"].str.lower().str.split().apply(vocabulary.update)
vocabulary_size = len(vocabulary)
print(vocabulary_size)

32945


In [268]:
# Initializes a TextVectorization layer
text_vectorizer = layers.TextVectorization(max_tokens=vocabulary_size,ngrams=2,output_mode="tf_idf")
text_vectorizer.adapt(train_dataset.map(lambda text, label: text))

In [269]:
saved_text_vectorizer_config = text_vectorizer.get_config()
with open("text_vectorizer_config.pkl", "wb") as f:
    pickle.dump(saved_text_vectorizer_config, f)

weights = text_vectorizer.get_weights()
# Save the weights to a pickle file
with open("text_vectorizer_weights.pkl", "wb") as f:
    pickle.dump(weights, f)


In [270]:
auto = tf.data.experimental.AUTOTUNE
train_dataset = train_dataset.map(lambda text, label: (text_vectorizer(text), label), num_parallel_calls=auto).prefetch(auto)
validation_dataset = validation_dataset.map(lambda text, label: (text_vectorizer(text), label), num_parallel_calls=auto).prefetch(auto)
test_dataset = test_dataset.map(lambda text, label: (text_vectorizer(text), label), num_parallel_calls=auto).prefetch(auto)


In [271]:
import tensorflow as tf

# Assuming you have a train_dataset TensorFlow dataset object

# Define the number of elements to inspect
num_elements = 5  # You can change this to inspect more or fewer elements

# Take the first num_elements elements from the dataset
sample_data = train_dataset.take(num_elements)

# Iterate over the sample_data and print each element
for sample in sample_data:
    # Assuming the sample contains input features and labels
    input_features, labels = sample
    print("Input Features:")
    print(input_features)
    print("Labels:")
    print(labels)
    print("="*50)  # Separating each sample with '=' characters


Input Features:
tf.Tensor(
[[468.37527      0.93528265   2.0206993  ...   0.           0.
    0.        ]
 [535.286        0.93528265   0.         ...   0.           0.
    0.        ]
 [535.286        3.7411306    1.0103496  ...   0.           0.
    0.        ]
 ...
 [194.04118      0.93528265   0.         ...   0.           0.
    0.        ]
 [588.81464      4.676413     0.         ...   0.           0.
    0.        ]
 [368.00916      5.611696     0.         ...   0.           0.
    0.        ]], shape=(128, 32945), dtype=float32)
Labels:
tf.Tensor(
[[0 0 0 ... 0 1 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 1 1]
 [0 0 0 ... 0 1 0]
 [0 0 0 ... 0 1 0]], shape=(128, 164), dtype=int64)
Input Features:
tf.Tensor(
[[615.5789       6.5469785    4.0413985  ...   0.           0.
    0.        ]
 [528.595        0.93528265   0.         ...   0.           0.
    0.        ]
 [461.6842       0.           4.0413985  ...   0.           0.
    0.        ]
 ...
 [555.35925      

In [272]:
len(mlb.classes_)

164

In [273]:
from tensorflow.keras.callbacks import EarlyStopping

# Creating shallow_mlp_model (MLP) with dropout layers
model1 = keras.Sequential([
    # First hidden layer: 512 neurons, ReLU activation function, with dropout.
    layers.Dense(256, activation="relu"),
    layers.Dropout(0.6),

    # Second hidden layer: 256 neurons, ReLU activation function, with dropout.
    layers.Dense(128, activation="relu"),
    layers.Dropout(0.6),

    # Output layer: The number of neurons equals the vocabulary size (output vocabulary of the StringLookup layer), with a sigmoid activation function.
    layers.Dense(len(vocab), activation='sigmoid')
])

# Compile the model
model1.compile(loss="binary_crossentropy", optimizer='adam', metrics=['binary_accuracy'])

# Add early stopping
# Number of epochs with no improvement after which training will be stopped.
# Restore weights from the epoch with the best value of the monitored quantity.
early_stopping = EarlyStopping(patience=4,restore_best_weights=True)

# Train the model
# Add early stopping callback.verbose=1
history = model1.fit(train_dataset,validation_data=validation_dataset,epochs=20,callbacks=[early_stopping])



Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20


In [274]:
# model evaltuation on test and val dataset
_, binary_acc1 = model1.evaluate(test_dataset)
_, binary_acc2 = model1.evaluate(validation_dataset)

print(f"Categorical accuracy on the test set: {round(binary_acc1 * 100, 2)}%.")
print(f"Categorical accuracy on the validation set: {round(binary_acc2 * 100, 2)}%.")

Categorical accuracy on the test set: 98.94%.
Categorical accuracy on the validation set: 98.95%.


In [275]:
# Save the model, text_vectorizer, and vocab as pickle files.

with open('model1.pkl', 'wb') as f:
  pickle.dump(model1, f)

with open('vocab.pkl', 'wb') as f:
  pickle.dump(vocab, f)

In [276]:
from tensorflow import keras
import pickle

# Load the model
with open("model1.pkl", "rb") as f:
    loaded_model = pickle.load(f)

# Load the configuration of the text vectorizer
with open("text_vectorizer_config.pkl", "rb") as f:
    saved_text_vectorizer_config = pickle.load(f)

from tensorflow.keras.layers import TextVectorization
# Create a new TextVectorization layer with the saved configuration
loaded_text_vectorizer = TextVectorization.from_config(saved_text_vectorizer_config)

In [277]:
# Load the vocabulary
with open("vocab.pkl", "rb") as f:
  loaded_vocab = pickle.load(f)

In [278]:
# Define a function to invert the multi-hot encoded labels
def invert_multi_hot(label):
    return [vocab[i] for i, val in enumerate(label) if val == 1]


In [279]:
# prompt: generate predict_category function as per my implementation by modifying above code

def predict_category(abstract, model):
    # Preprocess the abstract using the loaded text vectorizer

    # Make predictions using the loaded model
    predictions = model.predict(abstract)

    # # Convert predictions to human-readable labels
    # predicted_labels = label_lookup(np.round(predictions).astype(int)[0])


    return predictions


In [280]:
import tensorflow as tf
import pickle

# Load the configuration of the text vectorizer
with open("text_vectorizer_config.pkl", "rb") as f:
    saved_text_vectorizer_config = pickle.load(f)

# Create a new TextVectorization layer with the saved configuration
loaded_text_vectorizer = tf.keras.layers.TextVectorization.from_config(saved_text_vectorizer_config)

# Load the saved weights into the new TextVectorization layer
with open("text_vectorizer_weights.pkl", "rb") as f:
    weights = pickle.load(f)
loaded_text_vectorizer.set_weights(weights)



tf.Tensor(
[[2.9039268e+03 9.3528265e-01 2.0206993e+00 ... 0.0000000e+00
  0.0000000e+00 0.0000000e+00]], shape=(1, 32945), dtype=float32)


In [341]:
# Preprocess the input abstract provided by the user
# input_abstract = "Graph neural networks (GNNs) have been widely used to learn vector\nrepresentation of graph-structured data and achieved better task performance\nthan conventional methods. The foundation of GNNs is the message passing\nprocedure, which propagates the information in a node to its neighbors. Since\nthis procedure proceeds one step per layer, the range of the information\npropagation among nodes is small in the lower layers, and it expands toward the\nhigher layers. Therefore, a GNN model has to be deep enough to capture global\nstructural information in a graph. On the other hand, it is known that deep GNN\nmodels suffer from performance degradation because they lose nodes' local\ninformation, which would be essential for good model performance, through many\nmessage passing steps. In this study, we propose multi-level attention pooling\n(MLAP) for graph-level classification tasks, which can adapt to both local and\nglobal structural information in a graph. It has an attention pooling layer for\neach message passing step and computes the final graph representation by\nunifying the layer-wise graph representations. The MLAP architecture allows\nmodels to utilize the structural information of graphs with multiple levels of\nlocalities because it preserves layer-wise information before losing them due\nto oversmoothing. Results of our experiments show that the MLAP architecture\nimproves the graph classification performance compared to the baseline\narchitectures. In addition, analyses on the layer-wise graph representations\nsuggest that aggregating information from multiple levels of localities indeed\nhas the potential to improve the discriminability of learned graph\nrepresentations."  # Replace with the user's input
input_abstract = input()
# Use the new TextVectorization layer to transform the preprocessed input abstract into a tensor
abstract_tensor = loaded_text_vectorizer(input_abstract)
print("1", type(abstract_tensor), abstract_tensor.shape)
abstract_tensor = tf.reshape(abstract_tensor, (1, -1))  # Reshape to add a batch dimension
print("2", type(abstract_tensor), abstract_tensor.shape)
print(abstract_tensor)

Semi-supervised learning is a classification method which makes use of both labeled data and unlabeled data for training. In this paper, we propose a semi-supervised learning algorithm using a Bayesian semi-supervised model. We make a general assumption that the observations will follow two multivariate normal distributions depending on their true labels after the same unknown transformation. We use B-splines to put a prior on the transformation function for each component. To use unlabeled data in a semi-supervised setting, we assume the labels are missing at random. The posterior distributions can then be described using our assumptions, which we compute by the Gibbs sampling technique. The proposed method is then compared with several other available methods through an extensive simulation study. Finally we apply the proposed method in real data contexts for diagnosing breast cancer and classify radar returns. We conclude that the proposed method has better prediction accuracy in a 

In [358]:
# Assuming `model1` is your trained model
# Assuming `abstract_tensor` is the tensor representing the input abstract

# Make predictions using the trained model
predictions = predict_category(abstract_tensor, loaded_model)

# Define a threshold for binary classification
threshold = 0.05

# Convert probabilities to binary predictions
binary_predictions = (predictions > threshold).astype(int)
print(type(binary_predictions))
print("Binary Predictions:")
print(binary_predictions)


<class 'numpy.ndarray'>
Binary Predictions:
[[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0
  0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0]]


In [361]:
def invert_multi_hot(encoded_labels):
    """Reverse a single multi-hot encoded label to a list of vocab terms."""
    hot_indices = np.argwhere(encoded_labels == 1)
    hot_indices = hot_indices[:, 1:]  # Remove the first column (index 0)
    terms_list = [loaded_vocab[index] for index in hot_indices.flatten()]
    return terms_list

predicted_terms = invert_multi_hot(binary_predictions)
print(predicted_terms)

['cs.AI', 'cs.CV', 'cs.LG', 'stat.ML']
