In [1]:
import json

# Read the NER annotation export. The context manager guarantees the file
# handle is closed even if JSON parsing fails (a bare open() leaves it open).
# JSON is UTF-8 by specification, so decode explicitly instead of relying on
# the platform's default encoding.
with open('../annotations.json', encoding='utf-8') as f:
    training_data = json.load(f)

In [2]:
import spacy
from spacy.tokens import DocBin
from tqdm import tqdm

# Container that collects the annotated Doc objects before they are saved.
db = DocBin()
# Blank English pipeline (nothing pretrained is loaded); it is used to turn
# raw text into tokenized Doc objects.
nlp = spacy.blank("en")

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Start from an empty DocBin every time this cell runs. Appending to a DocBin
# that was created in an earlier cell makes this cell non-idempotent: a second
# run would write every document to disk twice.
db = DocBin()

for text, annot in tqdm(training_data['annotations']):
    doc = nlp.make_doc(text)  # tokenize only; no pipeline components are run
    ents = []
    for start, end, label in annot["entities"]:
        # "contract" snaps the character offsets inward to token boundaries;
        # char_span returns None when no whole token fits inside the range.
        span = doc.char_span(start, end, label=label, alignment_mode="contract")
        if span is None:
            # Report which annotation was dropped so it can be fixed at the source.
            print(f"Skipping entity {label!r} at chars [{start}, {end}): {text[start:end]!r}")
        else:
            ents.append(span)
    doc.ents = ents
    db.add(doc)

db.to_disk("../models/training_data.spacy")  # serialized docs consumed by `spacy train`

100%|██████████| 9/9 [00:00<00:00, 998.22it/s]


In [4]:
! python -m spacy init config config.cfg --lang en --pipeline ner --optimize efficiency


[38;5;1m✘ The provided output file already exists. To force overwriting the
config file, set the --force or -F flag.[0m



In [5]:
# Train the pipeline described by config.cfg and write the results to ../ner-results/.
# NOTE(review): --paths.train and --paths.dev point at the same file, so the
# ENTS_F / ENTS_P / ENTS_R scores printed during training are measured on the
# training examples themselves, not on held-out data.
! python -m spacy train config.cfg --output ../ner-results/ --paths.train ../models/training_data.spacy --paths.dev ../models/training_data.spacy

[38;5;4mℹ Saving to output directory: ..\ner-results[0m
[38;5;4mℹ Using CPU[0m
[1m
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['tok2vec', 'ner'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  --------  ------  ------  ------  ------
  0       0          0.00     34.56    0.00    0.00    0.00    0.00
 29     200         86.44   1460.11  100.00  100.00  100.00    1.00
 65     400          1.40      1.60  100.00  100.00  100.00    1.00
112     600          0.00      0.00  100.00  100.00  100.00    1.00
171     800          0.00      0.00  100.00  100.00  100.00    1.00
239    1000          0.00      0.00  100.00  100.00  100.00    1.00
335    1200          0.00      0.00  100.00  100.00  100.00    1.00
435    1400          0.00      0.00  100.00  100.00  100.00    1.00
559    1600          0.00      0.00  100.00  100.00  100.00    1.00
759    1800          0.00      0.00  1

[2023-12-11 23:20:10,344] [INFO] Set up nlp object from config
[2023-12-11 23:20:10,353] [INFO] Pipeline: ['tok2vec', 'ner']
[2023-12-11 23:20:10,357] [INFO] Created vocabulary
[2023-12-11 23:20:10,359] [INFO] Finished initializing nlp object
[2023-12-11 23:20:10,499] [INFO] Initialized pipeline components: ['tok2vec', 'ner']


In [10]:
import pandas as pd

# The CSV uses semicolons, not commas, as its field delimiter.
df = pd.read_csv("../vectortest.csv", delimiter=';')
df.shape  # (rows, columns) sanity check

(10, 2)

In [11]:
# Display the loaded frame in full.
df

Unnamed: 0,name,description
0,MEAS_ACCL_STRAY_PAYMENT_PREDICTION_PAYERS_AMT,"Payment prediction for stray payers, not in ac..."
1,MEAS_ACCL_SETTLEMENT_PAYMENT_PREDICTION_PAYERS...,Payment prediction for settlement payers
2,MEAS_ACCH_AMT_PAYMENT_CP,Amount paid within the observation period
3,MEAS_APPL_AMT_APPROVED,Approved amount
4,MEAS_APPL_AMT_DISCOUNT,Discount amount
5,MEAS_APPL_AMT_PAYMENTS_TOTAL,Total payments amount so far
6,MEAS_APLL_AMT_WRITEOFF,Balance written off
7,MEAS_APLL_AMT_ENTRY_PRINCIPAL,Principal on arrangement creation
8,MEAS_APLL_AMT_ENTRY_BALANCE,Balance on arrangement creation
9,MEAS_APLL_AMT_INSTALMENTS_FUTURE,Total future instalments amount


In [12]:
from sentence_transformers import SentenceTransformer

In [13]:
# Checkpoint name handed to SentenceTransformer to build the text encoder.
EMBEDDING_MODEL_NAME = "all-mpnet-base-v2"
encoder = SentenceTransformer(EMBEDDING_MODEL_NAME)

In [14]:
# Embed every description. encode() is documented to take a string or a list
# of strings; a pandas Series only works while its index happens to be the
# default 0..n-1, because the input is indexed by integer inside encode().
# Passing a plain list removes that hidden dependency on the frame's index.
vectors = encoder.encode(list(df.description))

In [15]:
# Embedding width: the size of the second axis of the encoded matrix.
dim = vectors.shape[1]
dim

768

Step 2: Build a FAISS index for the vectors

In [16]:
import faiss

# Flat FAISS index over `dim`-dimensional vectors, compared by L2 distance.
index = faiss.IndexFlatL2(dim)

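Step 3: Add the source vectors to the index. Note that no explicit normalization is applied here: the vectors go into the L2 index exactly as the encoder produced them, so if the similarity ranking is meant to rely on unit-length vectors, both the source and the query vectors must be normalized first.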

In [17]:
# Clear the index before adding so this cell can be re-run safely: add()
# appends, so running it twice would store every vector twice and make
# search() return duplicate hits.
index.reset()
index.add(vectors)

In [18]:
# Free-text request to match against the descriptions. The literal is written
# as two adjacent string pieces; the second piece begins with four spaces that
# are part of the query text.
search_query = (
    "Create report shows number settlement applications submitted month , approved amount entry principal balance , "
    "    September 2020 per DCA application type . The report produced Earth portfolio ."
)
# Alternative queries kept for experimentation:
#   "looking for places to visit during the holidays"
#   "An apple a day keeps the doctor away"
vec = encoder.encode(search_query)  # a single string yields a single 1-D embedding
vec.shape

(768,)

In [19]:
import numpy as np
# index.search() is given a 2-D array, so wrap the single embedding as one row.
svec = np.array(vec).reshape((1, -1))
svec.shape

(1, 768)

Step 5: Search the FAISS index for the vectors most similar to the query

In [20]:
# Number of nearest neighbours to retrieve for the query.
TOP_K = 5
distances, I = index.search(svec, k=TOP_K)
I  # positions of the hits, in the order returned by the search

array([[1, 5, 2, 3, 0]], dtype=int64)

In [21]:
# FAISS reports hits as 0-based positions in insertion order, not as DataFrame
# labels, so select rows positionally with .iloc; .loc only gives the same rows
# while the frame keeps its default 0..n-1 index. FAISS pads the result with -1
# when fewer than k vectors are available; drop those so .iloc does not
# interpret them as "last row".
hit_positions = I[0][I[0] >= 0]
df.iloc[hit_positions]

Unnamed: 0,name,description
1,MEAS_ACCL_SETTLEMENT_PAYMENT_PREDICTION_PAYERS...,Payment prediction for settlement payers
5,MEAS_APPL_AMT_PAYMENTS_TOTAL,Total payments amount so far
2,MEAS_ACCH_AMT_PAYMENT_CP,Amount paid within the observation period
3,MEAS_APPL_AMT_APPROVED,Approved amount
0,MEAS_ACCL_STRAY_PAYMENT_PREDICTION_PAYERS_AMT,"Payment prediction for stray payers, not in ac..."
