# Explore the Top2Vec model and its contents

In [5]:
import os
import json
import numpy as np 
import pandas as pd 
from top2vec import Top2Vec

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
# Load a saved Top2Vec model from the relative path below.
# NOTE(review): a later cell passes "../cs-only.top2vec" as its model path — confirm
# which file name/location is the intended one.
top2vec = Top2Vec.load("cs-papers-only.top2vec")

### Data

The model contains three types of data of interest: words, documents, and topics.

Of these, words and documents are represented as both text and embeddings while topics
are represented only as vectors.

In [6]:
# Words are in `.vocab`; slice so that only the first five entries are displayed.
top2vec.vocab[:5]

['the', 'of', 'and', 'to', 'in']

In [10]:
# Documents are in `.documents`; display the entry at index 100 as a sample.
top2vec.documents[100]

'  The Extended BP (EBP) Generalized EXIT (GEXIT) function introduced in\n\\cite{MMRU05} plays a fundamental role in the asymptotic analysis of sparse\ngraph codes. For transmission over the binary erasure channel (BEC) the\nanalytic properties of the EBP GEXIT function are relatively simple and well\nunderstood. The general case is much harder and even the existence of the curve\nis not known in general. We introduce some tools from non-linear analysis which\ncan be useful to prove the existence of EXIT like curves in some cases. The\nmain tool is the Krasnoselskii-Rabinowitz (KR) bifurcation theorem.\n'

In [14]:
# Word embeddings live in `.word_vectors`; take the first one and report
# its Python type and array shape.
word_vector_example = top2vec.word_vectors[0]
print(
    f"Type: {type(word_vector_example)}\nShape: {word_vector_example.shape}"
)

Type: <class 'numpy.ndarray'>
Shape: (300,)


In [16]:
# Document embeddings live in `.document_vectors`; take the first one and
# report its Python type and array shape.
doc_vector_example = top2vec.document_vectors[0]
print(
    f"Type: {type(doc_vector_example)}\nShape: {doc_vector_example.shape}"
)

Type: <class 'numpy.ndarray'>
Shape: (300,)


In [17]:
# Topic embeddings live in `.topic_vectors`; take the first one and report
# its Python type and array shape.
topic_vector_example = top2vec.topic_vectors[0]
print(
    f"Type: {type(topic_vector_example)}\nShape: {topic_vector_example.shape}"
)

Type: <class 'numpy.ndarray'>
Shape: (300,)


# Determining Structure of Data in Redis

### Metadata

* id: str
* title: str
* year: int
* authors: str
* categories: str (comma separated)
* abstract: str
* input: cleaned title + abstract

### Top2Vec Output

* words: str
* documents: str
* word_vectors: numpy array; shape: (300,)
* document_vectors: numpy array; shape: (300,)
* topic_vectors: numpy array; shape: (300,)

### Questions

**Q**: Can a vector index be created on a field of a model defined through redis-om? And  
   is there any downside to storing a large bytes object (the numpy array of the vector, serialized to bytes) in such a model?  

- If so, all the above properties can be in a single pydantic model.
- If not, we'll have a separate place for the vectors, which will be related to the
  metadata through the primary key.

**A**: Looks like this isn't possible just yet (see https://github.com/redis/redis-om-python/issues/343), so we'll keep the paper  
   metadata separate from the vectors, which will be stored in a hash. We can  
   follow Tyler's work [here](https://github.com/RedisVentures/redis-arXiv-search/blob/b40eab980e67ebb505635c7dd53ac2bb63d622da/backend/vecsim_app/models.py) and [here](https://github.com/RedisVentures/redis-arXiv-search/blob/main/backend/vecsim_app/load_data.py).

In [1]:
import os
from urllib.parse import quote

# Build the Redis connection URL without storing the password in the notebook.
# The "{}" slot in the template is filled from the REDIS_PASSWORD environment
# variable; without this substitution the literal "{}" would be sent as the
# password. The value is percent-encoded so characters such as "@", ":" or "/"
# cannot corrupt the URL structure.
# Set REDIS_PASSWORD before running this cell; if it is unset the password
# part of the URL is left empty.
# NOTE(review): this assignment sits ahead of the `topvecsim` imports, presumably
# because the connection settings are read at import time — keep that order.
os.environ["REDIS_OM_URL"] = (
    "redis://default:{}@redis-18891.c21900.ap-south-1-1.ec2.cloud.rlrcp.com:18891"
).format(quote(os.environ.get("REDIS_PASSWORD", ""), safe=""))

from topvecsim.models import Paper
from topvecsim.redis_utils import redis_conn
from topvecsim.search_index import SearchIndex
from topvecsim.load_data import load_all_data_from_disk

# Instantiate the project's SearchIndex helper; the vector-search cell further
# down calls its `vector_query` method to build the query.
search_index = SearchIndex()

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
result = await load_all_data_from_disk(
    model_path="../cs-only.top2vec",
    df_path="../arxiv-cs-only.pkl",
    limit_data={
        "papers": {
            "upper": 500
        }
    },
)

In [3]:
# Sample text search
print(f"Paper Category: {(await Paper.find(Paper.abstract % 'probability').first()).categories}")

# Sample topic search
print(f"Topic word 0: {(await redis_conn.hgetall(f'topic_vector:5'))[b'word_0']}")

Paper Category: quant-ph
Topic word 0: b'junctions'


In [4]:
# Sample vector search
query = search_index.vector_query([], [])
res = await redis_conn.ft("papers").search(
    query,
    query_params={
        "vec_param": await redis_conn.hget("topic_vector:5", "vector")
    }
)

print(res.docs[0])

Document {'id': 'paper_vector:1806.09066', 'payload': None, 'vector_score': '0.625671982765', 'paper_id': '1806.09066', 'paper_pk': '01GH8W60YSM8TB66DPT59MYHXH'}


In [3]:
# Get rid of all keys in the DB.
# The call is left commented out, presumably so that running the whole notebook
# cannot wipe the data by accident; uncomment it only for a deliberate reset.
# NOTE(review): the `True` output recorded for this cell suggests it was last
# executed with the call enabled — confirm and clear the stale output.
# await redis_conn.flushall()

True

In [1]:
from topvecsim.ml import train_save_top2vec

# Train through the project helper using the 2022 DataFrame pickle and write
# the result to "quick.top2vec". The log output recorded for this cell comes
# from top2vec's own logger.
model = train_save_top2vec(
    df_path="../arxiv-df-2022.pkl",
    save_path="quick.top2vec",
    speed="fast-learn",
    workers=16,
)

  from .autonotebook import tqdm as notebook_tqdm


2022-11-07 23:24:38,433 - top2vec - INFO - Pre-processing documents for training
2022-11-07 23:24:46,969 - top2vec - INFO - Creating joint document/word embedding
2022-11-07 23:27:05,081 - top2vec - INFO - Creating lower dimension embedding of documents
2022-11-07 23:27:31,029 - top2vec - INFO - Finding dense areas of documents
2022-11-07 23:27:32,345 - top2vec - INFO - Finding topics


## Test MinIO

In [28]:
import os

import boto3

BUCKET = 'topvecsim'
ENDPOINT = 'https://castle-minio.community.saturnenterprise.io'
# Credentials are read from the environment so that secrets never have to be
# typed into (and committed with) the notebook. When a variable is unset the
# value falls back to the empty string that used to be hard-coded here.
AWS_ACCESS_KEY_ID = os.environ.get('AWS_ACCESS_KEY_ID', '')
AWS_SECRET_ACCESS_KEY = os.environ.get('AWS_SECRET_ACCESS_KEY', '')
# NOTE(review): placeholder region name; presumably the MinIO endpoint does not
# care about its value — confirm.
REGIONNAME = 'random'

# Session token, botocore session and profile are left at their `None` defaults.
session = boto3.Session(
    aws_access_key_id=AWS_ACCESS_KEY_ID,
    aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
    region_name=REGIONNAME,
)

# Point the S3 resource at the custom endpoint instead of AWS.
s3resource = session.resource('s3', endpoint_url=ENDPOINT)

# Connectivity check: display the bucket's creation date as the cell output.
s3resource.Bucket(BUCKET).creation_date