In [2]:
import numpy as np 
from sentence_transformers import SentenceTransformer

  from tqdm.autonotebook import tqdm, trange


In [4]:
# Load the sentence-embedding model. Cells that call `model.encode` without
# reloading it use whichever model was loaded most recently.
model = SentenceTransformer("all-MiniLM-L6-v2")
# Alternative model, kept commented out for quick switching:
# model = SentenceTransformer("all-mpnet-base-v2")
print(f'Model max token length is {model.max_seq_length}. Sentence length longer than this is truncated.')

# A typical abstract is about 100-250 words, so this limit should serve the purpose.
# NOTE(review): the limit is counted in tokens, not words -- confirm that the
# longest abstracts still fit without being truncated.

Model max token length is 256. Sentence length longer than this is truncated.


In [10]:
# Toy sanity check: score a few everyday activities against an
# "outdoor activities" query and print one "<document> : <score>" line each.
query = ['recommend some outdoor activities']
documents = [
    'go visit the museum of natural history',
    'stay at home and play video games',
    'go for a hiking trip in the Appalacians',
    'watch a movie with friends',
    'go swimming in a lake'
]

# Load the model in this cell so it does not rely on one left over from an earlier cell.
model = SentenceTransformer("all-MiniLM-L6-v2")
doc_embedding = model.encode(documents)
query_embedding = model.encode(query)

# There is a single query, so row 0 of `scores` holds one score per document.
scores = model.similarity(query_embedding, doc_embedding)
for text, score in zip(documents, scores[0]):
    print(f'{text} : {score.item():.2f}')


go visit the museum of natural history : 0.34
stay at home and play video games : 0.33
go for a hiking trip in the Appalacians : 0.46
watch a movie with friends : 0.23
go swimming in a lake : 0.39


In [14]:
# Domain check: score a handful of research-article titles against a
# graphene query and print one "<title>: <score>" line each.
query = ['research articles about graphene']
documents = [
    'Phases and phase transitions in a dimerized spin-12 XXZ chain ',
    'Strongly interacting Hofstadter states in magic-angle twisted bilayer graphene',
    'Constraints imposed by symmetry on pairing operators for the iron pnictides',
    'Interplay between tetragonal magnetic order, stripe magnetism, and superconductivity in iron-based materials',
    'Visualizing the nonlinear coupling between strain and electronic nematicity in the iron pnictides by elasto-scanning tunneling spectroscopy',
    'Strong-coupling expansion of multi-band interacting models: Mapping onto the transverse-field J1-J2 Ising model'
]

# Load the model in this cell so it does not rely on one left over from an earlier cell.
model = SentenceTransformer("all-MiniLM-L6-v2")
doc_embedding = model.encode(documents)
query_embedding = model.encode(query)

# There is a single query, so row 0 of `scores` holds one score per title.
scores = model.similarity(query_embedding, doc_embedding)
for title, score in zip(documents, scores[0]):
    print(f'{title}: {score.item():.2f}')


Phases and phase transitions in a dimerized spin-12 XXZ chain : 0.13
Strongly interacting Hofstadter states in magic-angle twisted bilayer graphene: 0.47
Constraints imposed by symmetry on pairing operators for the iron pnictides: 0.12
Interplay between tetragonal magnetic order, stripe magnetism, and superconductivity in iron-based materials: 0.13
Visualizing the nonlinear coupling between strain and electronic nematicity in the iron pnictides by elasto-scanning tunneling spectroscopy: 0.21
Strong-coupling expansion of multi-band interacting models: Mapping onto the transverse-field J1-J2 Ising model: 0.10


In [6]:
# Re-score already-embedded documents against a new query. Only the query is
# encoded here; `model` and `doc_embedding` are whatever an earlier cell left
# in the kernel, so the result depends on which cells ran before this one.
query = ['recent articles about graphene']
query_embedding = model.encode(query)
scores = model.similarity(query_embedding,doc_embedding)
print(scores)

tensor([[0.2516, 0.5488, 0.2235, 0.2155, 0.2648, 0.1547]])


In [60]:
# Same query and same code as the preceding cell; `model` and `doc_embedding`
# are again taken from the kernel state rather than defined here.
# NOTE(review): the recorded scores differ from the preceding cell's output even
# though the code is identical -- presumably `model` and/or `doc_embedding` were
# changed between the two executions. Confirm which model produced each output.
query = ['recent articles about graphene']
query_embedding = model.encode(query)
scores = model.similarity(query_embedding,doc_embedding)
print(scores)

tensor([[0.2544, 0.5624, 0.2697, 0.2475, 0.3872, 0.2009]])


In [63]:
# Query about publication date rather than topic. As in the cells above, only
# the query is encoded; `model` and `doc_embedding` come from earlier cells.
query = ['old articles before 2016']
query_embedding = model.encode(query)
scores = model.similarity(query_embedding,doc_embedding)
print(scores)

tensor([[0.0795, 0.0780, 0.0957, 0.0070, 0.0383, 0.0119]])


* Reference — Sentence Transformers guide to computing embeddings: https://sbert.net/examples/applications/computing-embeddings/README.html

In [105]:
# Truncation demo: encode one long document in full and again using only its
# leading characters, then compare the two embeddings.
#
# The document is a single string written as two adjacent literals, which
# Python joins at compile time. A double-quoted literal cannot continue across
# a physical line break, so the text must not be written as one wrapped literal.
# NOTE(review): the pieces are joined without inserting a newline character --
# confirm against the original notebook that none was intended at the seam.
doc = [
    (
        "Venture capital (VC) and hedge funds are both types of investment firms, but they differ significantly in terms of their strategies, goals, and the types of investments they make. **Investment Focus**: VCs invest in early-stage companies, often startups, that have high growth potential. These companies are usually in the technology, healthcare, or other innovative sectors **Risk and Return**: Venture capital investments are high-risk because they involve funding companies that may not yet be profitable. However, if the companies succeed, the returns can be very high. **Investment Horizon**: VCs typically have a long-term investment horizon, often holding investments for several years until the company goes public (IPO) or is acquired. *Involvement**: Venture capitalists often take an active role in the companies they invest in, offering advice, guidance, and sometimes even taking seats on the board of directors.  **Fund Structure**: VCs usually raise funds from institutional investors or wealthy individuals and invest that capital into a portfolio of startups. *Investment Focus**: Hedge funds invest in a wide range of financial assets, including stocks, bonds, derivatives, currencies, and sometimes even commodities or real estate. Their goal is to maximize returns while minimizing risk, often through complex strategies. *Risk and Return**: Hedge funds can range from low-risk to very high-risk, depending on their strategies. They aim to generate consistent returns regardless of market conditions. Investment Horizon**: Hedge funds usually have a shorter investment horizon compared to VC, often holding positions for months or even weeks Involvement**: Hedge fund managers generally do not get involved in the management of the companies they invest in. Their focus is on market performance rather than the operational success of any particular company. "
        "Fund Structure**: Hedge funds raise capital from institutional investors and wealthy individuals, similar to VCs, but they often employ leverage and use complex trading strategies to enhance returns.**Stage of Investment**: VCs invest in early-stage companies, while hedge funds typically invest in more mature markets and assets. **Time Horizon**: VCs have a longer-term focus, while hedge funds may have shorter-term strategies  **Involvement**: VCs are more involved in the companies they invest in, while hedge funds focus on financial performance **Risk Profile**: Venture capital is inherently riskier due to the nature of startups, whereas hedge funds use various strategies to manage and mitigate risk. Both play crucial roles in the financial ecosystem but cater to different investor needs and objectives. "
    )
]

# Number of leading characters kept for the shortened copy of the document.
PREFIX_CHARS = 1550

# Note: this rebinds `doc_embedding`, which the query cells above also read;
# re-run the cell that embeds `documents` before re-running those queries.
doc_embedding = model.encode(doc)
doc_embedding1 = model.encode([doc[0][:PREFIX_CHARS]])

# A similarity of 1.0 between the full text and its prefix demonstrates that
# the text beyond the prefix is truncated away in the encoding process.
model.similarity(doc_embedding, doc_embedding1)

tensor([[1.0000]])