In [1]:
pip install -U sentence-transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sentence-transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[K     |████████████████████████████████| 85 kB 3.1 MB/s 
[?25hCollecting transformers<5.0.0,>=4.6.0
  Downloading transformers-4.23.1-py3-none-any.whl (5.3 MB)
[K     |████████████████████████████████| 5.3 MB 68.8 MB/s 
Collecting sentencepiece
  Downloading sentencepiece-0.1.97-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[K     |████████████████████████████████| 1.3 MB 50.4 MB/s 
[?25hCollecting huggingface-hub>=0.4.0
  Downloading huggingface_hub-0.10.1-py3-none-any.whl (163 kB)
[K     |████████████████████████████████| 163 kB 57.9 MB/s 
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 38.1 MB/s 
Building wheels for collected 

In [None]:
from sentence_transformers import SentenceTransformer, util
import torch
import pandas as pd

# Load the sentence-embedding model used for this demo.
embedder = SentenceTransformer('all-MiniLM-L6-v2')

# Corpus with example sentences
corpus = [
    'A man is eating food.',
    'A man is eating a piece of bread.',
    'The girl is carrying a baby.',
    'A man is riding a horse.',
    'A woman is playing violin.',
    'Two men pushed carts through the woods.',
    'A man is riding a white horse on an enclosed ground.',
    'A monkey is playing drums.',
    'A cheetah is running behind its prey.',
]
# Encode the corpus once; every query below is compared against these embeddings.
corpus_embeddings = embedder.encode(corpus, convert_to_tensor=True)

# Query sentences:
queries = [
    'A man is eating pasta.',
    'Someone in a gorilla costume is playing a set of drums.',
    'A cheetah chases prey on across a field.',
]

# Report at most 5 matches per query, or fewer when the corpus is smaller than that.
top_k = min(5, len(corpus))
for query in queries:
    query_embedding = embedder.encode(query, convert_to_tensor=True)

    # Cosine similarity of the query against every corpus sentence ([0] takes the
    # row for this single query); torch.topk keeps the top_k highest scores
    # together with their corpus indices.
    cos_scores = util.cos_sim(query_embedding, corpus_embeddings)[0]
    top_results = torch.topk(cos_scores, k=top_k)

    print("\n\n======================\n\n")
    print("Query:", query)
    # The header reports the number of results actually shown (top_k), which is
    # below 5 whenever the corpus has fewer than 5 sentences.
    print("\nTop {} most similar sentences in corpus:".format(top_k))

    for score, idx in zip(top_results[0], top_results[1]):
        print(corpus[idx], "(Score: {:.4f})".format(score))

    # Alternatively, util.semantic_search performs cosine similarity + top-k in one call:
    #
    #   hits = util.semantic_search(query_embedding, corpus_embeddings, top_k=top_k)
    #   hits = hits[0]      # Get the hits for the first query
    #   for hit in hits:
    #       print(corpus[hit['corpus_id']], "(Score: {:.4f})".format(hit['score']))

In [None]:
# Look up the 5 best corpus matches for `query_embedding` with util.semantic_search.
# The call returns one list of hits per query; only the first list is kept.
hits = util.semantic_search(query_embedding, corpus_embeddings, top_k=5)[0]
for match in hits:
    matched_sentence = corpus[match['corpus_id']]
    print(matched_sentence, "(Score: {:.4f})".format(match['score']))

In [None]:
import pandas as pd
from sentence_transformers import SentenceTransformer, util
import torch

# Read the app-feature spreadsheet into a DataFrame.
df = pd.read_excel(r'app_features.xlsx')

# Rebind `embedder` to this model; any other sentence-transformers model name can be used here.
embedder = SentenceTransformer('multi-qa-mpnet-base-dot-v1')

# Show the loaded spreadsheet contents.
print(df)

In [32]:
# Convert the spreadsheet rows into a plain list of lists (one inner list per row).
app_features_list = df.to_numpy().tolist()

In [51]:
# Split each spreadsheet row into two parallel lists:
# the second cell of every row goes to the values, the first cell to the names.
app_features_values = [feature_row[1] for feature_row in app_features_list]
app_features_name = [feature_row[0] for feature_row in app_features_list]

In [None]:
# Display the list built from the second cell of every spreadsheet row.
app_features_values

In [None]:
# Display the list built from the first cell of every spreadsheet row.
app_features_name

In [54]:
# Load the reviews CSV (path is relative to the notebook's working directory) into a DataFrame.
review_dataset = pd.read_csv('zoom_reviews1.csv')

In [55]:
# Positional index of the column to keep (zero-based, i.e. the fourth column).
# NOTE(review): presumably this is the review-text column — confirm against the CSV header.
cols = [3]

reviews_df = pd.DataFrame(review_dataset)

# Translate the position into its column label, then select by label so the
# result stays a DataFrame containing just that column.
selected_column_labels = reviews_df.columns[cols]
zoom_reviews_df = reviews_df[selected_column_labels]

In [56]:
# Turn the selected review column into a list of rows; each row is a list of cell values.
review_list = zoom_reviews_df.to_numpy().tolist()

In [None]:
# Print the first cell of every review row, one per line.
for review_row in review_list:
    first_cell = review_row[0]
    print(first_cell)

In [58]:
# Number of matches to report per review: 5, or fewer when there are fewer feature values.
top_k = min(len(app_features_values), 5)

# Embed every entry of app_features_values once, as a tensor, so the
# embeddings can be reused for each review.
feature_embeddings = embedder.encode(app_features_values, convert_to_tensor=True)

In [None]:
# For every review, rank the app features by similarity and print the top_k feature names.
for review in review_list:
    # Each entry of review_list is a row (a list of cell values); take the first
    # cell so the text itself is embedded and printed rather than its list wrapper.
    review_text = review[0]

    # Skip missing values (pandas represents empty CSV fields as NaN); they are
    # not text and cannot be embedded.
    if pd.isna(review_text):
        continue

    review_embedding = embedder.encode(review_text, convert_to_tensor=True)

    # NOTE(review): if `embedder` is the 'multi-qa-mpnet-base-dot-v1' model, its name
    # suggests it is tuned for dot-product scoring — consider util.dot_score here
    # instead of cosine similarity; confirm against the model card before changing.
    cos_scores = util.cos_sim(review_embedding, feature_embeddings)[0]
    top_results = torch.topk(cos_scores, k=top_k)

    print("\n\n======================\n\n")
    print("Query:", review_text)
    # The header reports the number of results actually shown (top_k).
    print("\nTop {} most similar sentences in corpus:".format(top_k))

    # Scores are computed against feature_embeddings, but the matching entry of
    # app_features_name (same index) is what gets printed.
    for score, idx in zip(top_results[0], top_results[1]):
        print(app_features_name[idx], "(Score: {:.4f})".format(score))

In [None]:
# Print the matched feature values (rather than names) with their scores.
# Relies on `top_results` left behind by a previously executed cell.
best_scores, best_indices = top_results[0], top_results[1]
for score, idx in zip(best_scores, best_indices):
    score_label = "(Score: {:.4f})".format(score)
    print(app_features_values[idx], score_label)