# Data Preprocess - Embedding

## Setup and Imports

In [None]:
import os

import numpy as np
import pandas as pd
import torch
from pinecone import Pinecone, ServerlessSpec
from sentence_transformers import SentenceTransformer
from torch.utils.data import DataLoader, Dataset

In [28]:
# Credentials must never live in the notebook: read the Pinecone key from the
# PINECONE_API_KEY environment variable instead. The key that used to be hard-coded
# here has been exposed in the file history and should be rotated.
# The value is None when the variable is unset; export it before running the
# Pinecone cells.
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")
# Size of each embedding vector; passed as the `dimension` of the Pinecone index.
VECTOR_DIM = 384

In [6]:
# Location of the labeled citation-network CSV (machine-specific absolute path).
DATASET_CSV = '/home/student/FinalProject/PaperFeedback/Datasets/acm_citation_network_v8_labeled.csv'

# Read the whole file into memory and display the available columns.
data = pd.read_csv(DATASET_CSV)
data.columns

Index(['Unnamed: 0', 'index', 'title', 'authors', 'year', 'venue',
       'references', 'abstract', 'id', 'topic'],
      dtype='object')

In [None]:
# Build the text that gets embedded: one "document" per paper, made of
# title + ' ' + abstract. Missing titles/abstracts are replaced by a single
# space so that no row is dropped.
# .assign() returns a new frame, so nothing is written into a slice of `data`
# (the previous column assignments raised SettingWithCopyWarning) and the cell
# gives the same result every time it is re-run.
abstracts_df = (
    data[['id', 'abstract', 'title']]
    .assign(
        abstract=lambda df: df['abstract'].fillna(' '),
        title=lambda df: df['title'].fillna(' '),
    )
    .assign(document=lambda df: df['title'] + ' ' + df['abstract'])
)

We do not drop null values in this phase, as we don't want to hurt the induced graph's topology in the GNN training phase. Instead, we represent each document as the concatenation of the paper's title and abstract, which significantly reduces the number of null documents.

## Create Embeddings for Abstracts

We compute vector embeddings using SentenceTransformer's model <code>all-MiniLM-L6-v2</code>. \
Dataset statistics: \
* number of vectors: 2,381,675
* vector dimensionality: 384
* batch size for encoding: 256

We use CUDA and a custom dataset to encode documents efficiently


In [24]:
class AbstractDataset(Dataset):
    """Map-style dataset that serves one (id, document) pair per DataFrame row.

    The wrapped frame must provide an 'id' and a 'document' column.
    """

    def __init__(self, abstracts_df):
        # Only a reference is kept; rows are fetched lazily in __getitem__.
        self.df = abstracts_df

    def __len__(self):
        """Return the number of rows in the wrapped frame."""
        return len(self.df)

    def __getitem__(self, index):
        """Return the 'id' and 'document' values of the row at position `index`."""
        record = self.df.iloc[index]
        return record['id'], record['document']

In [25]:
# Batched, order-preserving loader over the (id, document) pairs.
batch_size = 256
abstract_dataset = AbstractDataset(abstracts_df)
dataloader = DataLoader(
    abstract_dataset,
    batch_size=batch_size,
    shuffle=False,
    num_workers=8,
    pin_memory=True,
)

# Prefer the GPU when one is visible to torch, and report the choice.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print('GPU available' if device == 'cuda' else 'GPU is not available')

# Load the sentence encoder and place it on the selected device.
model = SentenceTransformer('all-MiniLM-L6-v2').to(device)


GPU available




In [26]:
from tqdm import tqdm

In [27]:
# Encode every document batch by batch. Ids and embeddings are accumulated in
# the same (unshuffled) loader order, so position i of `all_ids` matches row i
# of `all_embeddings`.
embedding_batches = []  # one numpy array per DataLoader batch
all_ids = []
with torch.no_grad():
    for ids_batch, abstract_batch in tqdm(dataloader):
        batch_embeddings = model.encode(abstract_batch, convert_to_tensor=True, show_progress_bar=False)
        # Keep the batch as a single array instead of unpacking it into one
        # Python object per row; this avoids millions of tiny list entries.
        embedding_batches.append(batch_embeddings.cpu().numpy())
        all_ids.extend(ids_batch)

# Join the per-batch arrays once along the row axis. An empty loader falls back
# to an empty array, matching what np.array([]) produced before.
all_embeddings = np.concatenate(embedding_batches) if embedding_batches else np.array([])


100%|██████████| 9304/9304 [1:29:35<00:00,  1.73it/s]


## Upsert Embeddings to VectorDB

We use PineconeDB for storing and interacting with our dataset. The defined similarity metric of the DB is cosine.

In [29]:
# One row per paper: the id coerced to a plain Python int, next to its embedding vector.
embedding_df = pd.DataFrame(
    dict(
        id=list(map(int, all_ids)),
        embeddings=[vector for vector in all_embeddings],
    )
)


In [30]:
# Client used for all Pinecone calls below.
pc = Pinecone(api_key=PINECONE_API_KEY)

# Creating an index that already exists fails, which would break re-running
# this cell (or "Restart & Run All") after the first successful run.
# Only create the index when it is not there yet.
if 'ann-embeddings' not in pc.list_indexes().names():
    pc.create_index(
        name='ann-embeddings',
        dimension=VECTOR_DIM,  # must match the size of the vectors that are upserted
        metric="cosine",  # similarity metric used by the index
        spec=ServerlessSpec(
            cloud="azure",
            region="eastus2"
        )
    )


In [31]:
# Build the (id, vector) tuples that are sent to Pinecone; ids are sent as strings.
# Zipping the two columns yields the same tuples as iterrows() but skips
# constructing a Series for every row, which is very slow on millions of rows.
vectors_to_upsert = [
    (str(paper_id), embedding)
    for paper_id, embedding in zip(embedding_df['id'], embedding_df['embeddings'])
]

In [32]:
# Handle to the 'ann-embeddings' index, obtained from the Pinecone client; used for the upserts below.
index = pc.Index('ann-embeddings')

In [33]:
import itertools

In [34]:
def chunks(vectors, batch_size=500):
    """Lazily split `vectors` into consecutive tuples of at most `batch_size` items.

    Items keep their original order and the last tuple may be shorter than
    `batch_size`. An empty iterable produces no tuples at all.
    """
    source = iter(vectors)
    # Two-argument iter(): keep taking slices until one comes back empty.
    yield from iter(lambda: tuple(itertools.islice(source, batch_size)), ())


# Send the vectors to the index in groups of at most 1000 per upsert call.
for upsert_batch in chunks(vectors_to_upsert, batch_size=1000):
    index.upsert(vectors=upsert_batch)