In [None]:
# Bring up the services from the local docker-compose file in detached mode.
# NOTE(review): presumably the compose file expects the "arangodb-net" network;
# uncomment the next line once if it does not exist yet — TODO confirm.
# !docker network create arangodb-net
!docker-compose up -d

In [None]:
!pip3 install torch
!pip3 install transformers
!pip3 install sentence-transformers
!pip3 install bertviz
!pip3 install pyarango
!pip3 install "python-arango>=5.0"
!pip3 install pandas
!pip3 install ipywidgets

#in case error IP
# jupyter nbextension enable --py widgetsnbextension
# jupyter nbextension install --py widgetsnbextension


In [None]:
import itertools
import json
import requests
import sys
import oasis
import time
import textwrap

from pyArango.connection import *
from arango import ArangoClient
from transformers import AutoTokenizer, AutoModel
from sentence_transformers import SentenceTransformer
import torch.nn.functional as F
import torch
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
from bertviz import head_view

In [None]:
# One checkpoint name drives both the tokenizer and the encoder.
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# output_attentions=True is requested so the attention weights can be visualised later.
model = AutoModel.from_pretrained(model_name, output_attentions=True)

In [None]:
# Tokenize a sample sentence into PyTorch tensors and display the result.
example_text = "This is an input sentence!"
tokenized = tokenizer(example_text, return_tensors="pt")
tokenized

In [None]:
# Round-trip the first (only) sequence of token ids back to text.
first_sequence_ids = tokenized["input_ids"][0].tolist()
tokenizer.decode(first_sequence_ids)

In [None]:
# Run the tokenized example through the model and show the final-layer hidden states.
model_output = model(**tokenized)
last_hidden = model_output.last_hidden_state
print(last_hidden)

In [None]:
# Visualise the attention heads for a single sentence with bertviz.
sentence = "Jack was tired so he went to sleep"
tokenized_sent = tokenizer(sentence, return_tensors="pt")
preds = model(**tokenized_sent)

# Read the attentions by name: the positional `preds[-1]` only happens to be the
# attentions while no other optional outputs are enabled on the model.
attention = preds.attentions
tokens = tokenizer.convert_ids_to_tokens(tokenized_sent["input_ids"][0].tolist())
head_view(attention, tokens)

In [None]:
# NOTE: this rebinds `model` - from here on it is the SentenceTransformer,
# no longer the DistilBERT AutoModel assigned to the same name earlier.
model = SentenceTransformer("paraphrase-TinyBERT-L6-v2")

def embed_and_compare(inputs):
  """Print the cosine similarity for every unordered pair of ``inputs``.

  All inputs are embedded in one call to the module-level ``model``. Pairs are
  reported in index order ((0, 1), (0, 2), ...); an input is never compared
  with itself and each pair appears once.

  Args:
    inputs: indexable sequence of texts accepted by ``model.encode``.
  """
  vectors = torch.from_numpy(model.encode(inputs))

  # combinations() yields exactly the index pairs with first < second.
  for first, second in itertools.combinations(range(vectors.shape[0]), 2):
    print(f"1st input: {inputs[first]}")
    print(f"2nd input: {inputs[second]}")

    similarity = F.cosine_similarity(vectors[first], vectors[second], dim=0).item()
    print(f"Cosine similarity: {similarity:.3f}")
    print("\n")

In [None]:
# Single-word comparison: two near-synonyms and one antonym.
terms = ["happy", "cheerful", "sad"]
embed_and_compare(terms)

In [None]:
# Sentence-level comparison: three similar sentences and one unrelated one.
sentences = (
    "This is an input sentence,"
    "Totally unrelated thing.,"
    "This is an input query.,"
    "This is another sentence!"
).split(",")
embed_and_compare(sentences)

In [None]:
# Alternative: connect to a temporary ArangoDB Oasis (cloud) database instead of the local one
# login = oasis.getTempCredentials(tutorialName="WordEmbeddings", credentialProvider="https://tutorials.arangodb.cloud:8529/_db/_system/tutorialDB/tutorialDB")
# login = oasis.getTempCredentials(tutorialName="WordEmbeddings")
# database = oasis.connect_python_arango(login)

In [None]:
# Connection settings for the ArangoDB instance running on this machine.
# NOTE(review): credentials are hard-coded here; consider loading them from the
# environment before sharing this notebook.
login = dict(
    dbName="_system",
    hostname="localhost",
    port="8529",
    username="root",
    password="rootpassword",
)
database = oasis.connect_python_arango_local(login)

In [None]:
# Echo the connection details in use (URL, user, password, database).
print(f"http://{login['hostname']}:{login['port']}")
print(f"Username: {login['username']}")
print(f"Password: {login['password']}")
print(f"Database: {login['dbName']}")

In [None]:
# Test the connection to the database.
# The ArangoDB command-line tools are disabled in this build, but they can be used on Linux (e.g. via Docker):

# !docker run --rm -v /python_embedd/imdb_dump:/dump arangodb arangorestore \
#  --server.endpoint http://host.docker.internal:8529 --server.password rootpassword

In [None]:
# !docker pull arangodb/arangodb
# !docker run --rm -v /python_embedd/imdb_dump:/dump arangodb/arangodb arangorestore \
#   --server.endpoint http://{login["hostname"]}:{login["port"]} \
#   --server.username {login["username"]} \
#   --server.database {login["dbName"]} \
#   --server.password {login["password"]} \
#   --default-replication-factor 3 \
#   --input-directory /dump


In [None]:
# Restore the IMDB dump into the local server by running arangorestore inside a
# throwaway (--rm) arangodb container. The host directory /python_embedd/imdb_dump
# is mounted at /dump and used as the input directory.
# The endpoint uses host.docker.internal, presumably so the container can reach
# port 8529 on the Docker host - TODO confirm on Linux hosts.
# Optionally pull the image first on the local Docker: !docker pull arangodb/arangodb
!docker run --rm -v  /python_embedd/imdb_dump:/dump arangodb arangorestore \
  --server.endpoint http://host.docker.internal:8529 \
  --server.username root \
  --server.database _system \
  --server.password rootpassword \
  --default-replication-factor 3 \
  --input-directory /dump


In [None]:
# Fetch the _id and description of every movie that has a real overview.
movie_descriptions_aql = """
FOR d IN imdb_vertices 
   FILTER d.type == "Movie"
   FILTER d.description != "No overview found."
   RETURN {
     _id: d._id,
     description: d.description
    }
"""
cursor = database.aql.execute(movie_descriptions_aql)
movie_descriptions = [doc for doc in cursor]

# Hold the records in a DataFrame for convenience and drop rows with missing values.
movies_df = pd.DataFrame(movie_descriptions).dropna()
     

In [None]:
# Embed every movie description in batches and store one vector per row.
batch_size = 32

all_embs = []

for i in tqdm(range(0, len(movies_df), batch_size)):
  # Positional slicing, so gaps in the index left by dropna() do not matter.
  descr_batch = movies_df.iloc[i:i+batch_size].description.tolist()
  all_embs.append(model.encode(descr_batch))

all_embs = np.concatenate(all_embs)
assert len(all_embs) == len(movies_df), "expected exactly one embedding per movie"

# Store each embedding as a plain Python list (JSON-serialisable for ArangoDB).
# A Series built on the frame's own index replaces the vsplit/squeeze round trip
# and keeps every vector aligned with its row.
movies_df["word_emb"] = pd.Series(all_embs.tolist(), index=movies_df.index)

In [None]:
# Write the embeddings back onto the matching documents, BATCH_SIZE rows per request.
BATCH_SIZE = 250
movie_collection = database["imdb_vertices"]

for start in range(0, len(movies_df), BATCH_SIZE):
  # Positional, end-exclusive slicing: label-based .loc slices are inclusive
  # (boundary rows would be sent twice) and, after dropna(), rows whose label is
  # >= len(movies_df) would never be reached.
  update_batch = (
    movies_df.iloc[start:start + BATCH_SIZE][["_id", "word_emb"]].to_dict("records")
  )
  movie_collection.update_many(update_batch)
     

In [None]:
# Look up the title and description of the reference movie.
reference_movie_aql = """
  FOR m in imdb_vertices
    FILTER m._id == "imdb_vertices/28685"
    RETURN { "title": m.title, "description": m.description }
"""
cursor = database.aql.execute(reference_movie_aql)

# Print every document the cursor yields.
for movie_doc in cursor:
  print(movie_doc)

In [None]:
# Rank all movies that have a stored embedding by cosine similarity to the
# reference movie's embedding and keep the 50 best matches.
similar_to_movie_aql = """
LET descr_emb = (
  FOR m in imdb_vertices
    FILTER m._id == "imdb_vertices/28685"
    FOR j in RANGE(0, 767)
      RETURN TO_NUMBER(NTH(m.word_emb,j))
)

LET descr_mag = (
  SQRT(SUM(
    FOR i IN RANGE(0, 767)
      RETURN POW(TO_NUMBER(NTH(descr_emb, i)), 2)
  ))
)

LET dau = (

    FOR v in imdb_vertices
    FILTER HAS(v, "word_emb")

    LET v_mag = (SQRT(SUM(
      FOR k IN RANGE(0, 767)
        RETURN POW(TO_NUMBER(NTH(v.word_emb, k)), 2)
    )))

    LET numerator = (SUM(
      FOR i in RANGE(0,767)
          RETURN TO_NUMBER(NTH(descr_emb, i)) * TO_NUMBER(NTH(v.word_emb, i))
    ))

    LET cos_sim = (numerator)/(descr_mag * v_mag)

    RETURN {"movie": v._id, "title": v.title, "cos_sim": cos_sim}

    )

FOR du in dau
    SORT du.cos_sim DESC
    LIMIT 50
    RETURN {"movie": du.title, "cos_sim": du.cos_sim} 
"""
cursor = database.aql.execute(similar_to_movie_aql)

# Print the matches in ranked order.
for match in cursor:
  print(match)

In [None]:
# Embed a free-text search term with the sentence-transformer.
search_term = "world"
# .tolist() turns the encoded vector into plain Python floats for the AQL query.
search_emb = model.encode(search_term).tolist()
# Show the dimension and a short preview rather than dumping every component.
print(f"{len(search_emb)}-dim embedding, first 5 values: {search_emb[:5]}")

In [None]:
# Rank all movies that have a stored embedding by cosine similarity to the
# search-term embedding and keep the 50 best matches.
#
# The embedding is passed as a bind variable rather than spliced into the query
# text: the query stays constant and no Python float repr has to be valid AQL.
# @last_idx is derived from the embedding so the vector length is not hard-coded.
search_similarity_aql = """
LET descr_emb = @descr_emb

LET descr_size = (
  SQRT(SUM(
    FOR i IN RANGE(0, @last_idx)
      RETURN POW(TO_NUMBER(NTH(descr_emb, i)), 2)
  ))
)

LET dau = (

    FOR v in imdb_vertices
    FILTER HAS(v, "word_emb")

    LET v_size = (SQRT(SUM(
      FOR k IN RANGE(0, @last_idx)
        RETURN POW(TO_NUMBER(NTH(v.word_emb, k)), 2)
    )))

    LET numerator = (SUM(
      FOR i in RANGE(0, @last_idx)
          RETURN TO_NUMBER(NTH(descr_emb, i)) * TO_NUMBER(NTH(v.word_emb, i))
    ))

    LET cos_sim = (numerator)/(descr_size * v_size)

    RETURN {"movie": v._id, "title": v.title, "cos_sim": cos_sim}

    )

FOR du in dau
    SORT du.cos_sim DESC
    LIMIT 50
    RETURN {"movie": du.title, "cos_sim": du.cos_sim}
"""
cursor = database.aql.execute(
  search_similarity_aql,
  bind_vars={"descr_emb": search_emb, "last_idx": len(search_emb) - 1},
)

# Iterate through the result cursor
for doc in cursor:
  print(doc)