In [1]:
import os
import numpy as np
import torch
from scipy.spatial.distance import cdist
from transformers import AutoModel, AutoTokenizer

from gait import Layers, FEL

In [2]:
model_name = "chatdb/natural-sql-7b"
model_path = "/Users/mraad/Downloads/natural-sql-7b.Q8_0.gguf"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [3]:
def generate_embedding(text):
    inputs = tokenizer(text, return_tensors="pt", padding="max_length", truncation=False, max_length=384, )
    with torch.no_grad():
        outputs = model(**inputs)
    # Use the hidden state of the last layer and average pooling
    embeddings = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
    return embeddings

In [4]:
layers = Layers.load(os.path.expanduser("~/data/NorthSea.json"))
fel = FEL(layers)

In [5]:
fel.create_line_1()

FELLine(line='Locate all pipelines where current phase is like place', fel=FEL1(layer='Pipelines', where="current_phase LIKE '%PLACE%'"))

In [6]:
data = []

line1 = [fel.create_line_1() for _ in range(10)]
line2 = [fel.create_line_2() for _ in range(10)]

data.extend([_.line for _ in line1])
data.extend([_.line for _ in line2])

len(data)

20

In [7]:
# texts = [
#     "What is the capital of France?",
#     "Explain SQL joins.",
#     "How to use embeddings in AI?",
#     "What is reinforcement learning?",
# ]

# texts = [text for text, _ in data]

# Generate embeddings for the dataset
dataset_embeddings = np.array([generate_embedding(_) for _ in data])

In [8]:
dataset_embeddings.shape

(20, 4096)

In [148]:
len(tokenizer(layers.create_line_1(), return_tensors="pt", padding="max_length", truncation=False, max_length=384)[
        "input_ids"][0])

384

In [9]:
query = fel.create_line_1()
print(query)

line='Find all wellbores with has casing lot is less than 1 or depth in meters = 94.48799986' fel=FEL1(layer='Wellbores', where='casing_lot < 1 or water_depth = 94.48799986')


In [10]:
# query = "Please explain deep learning"
query = fel.create_line_1()
query_embedding = generate_embedding(query.line)

# Find nearest embedding using cosine similarity
distances = cdist([query_embedding], dataset_embeddings, metric="cosine")
nearest_idx = np.argmin(distances)

# Output the nearest match
print("Input Query:", query)
print("Most Similar Text:", data[nearest_idx])
print("Similarity Score (Cosine Distance):", 1 - distances[0][nearest_idx])
print("FEL:", data[nearest_idx][1])

Input Query: line='Show all discoveries where discovery type not exceeding 1 or where field name is edvard grieg' fel=FEL1(layer='Discoveries', where="discovery_type < 1 or field_name = 'EDVARD GRIEG'")
Most Similar Text: List all discoveries with discovery type is oil and discovery well bore name ending with 3
Similarity Score (Cosine Distance): 0.9996901889064381
FEL: i
