In [1]:
import json

# Read the annotation file. The context manager closes the handle as soon as
# parsing finishes (or fails); JSON is decoded explicitly as UTF-8 so the result
# does not depend on the platform's default encoding.
with open('../annotations.json', encoding='utf-8') as f:
    training_data = json.load(f)

In [2]:
import spacy
from spacy.tokens import DocBin
from tqdm import tqdm

# Create a blank English pipeline via spacy.blank (nothing is loaded from disk here);
# it is used below only to tokenize the annotated texts with make_doc.
nlp = spacy.blank("en")
# DocBin that collects the annotated Doc objects and is later saved with to_disk.
db = DocBin()

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Convert every (text, annotation) pair into a spaCy Doc carrying entity spans
# and collect the docs in `db`.
for text, annot in tqdm(training_data['annotations']):
    doc = nlp.make_doc(text)
    ents = []
    for start, end, label in annot["entities"]:
        # "contract" snaps the character offsets inward to token boundaries;
        # char_span returns None when no whole token fits inside the range.
        span = doc.char_span(start, end, label=label, alignment_mode="contract")
        if span is None:
            # Say which annotation was lost so it can be fixed in the source data.
            print(f"Skipping entity {label!r} at [{start}:{end}] {text[start:end]!r}: no token-aligned span")
        else:
            ents.append(span)
    # doc.ents rejects overlapping or duplicate spans; keep the longest
    # non-overlapping ones and report how many were dropped instead of crashing.
    kept = spacy.util.filter_spans(ents)
    if len(kept) != len(ents):
        print(f"Dropped {len(ents) - len(kept)} overlapping entity span(s) in text starting {text[:50]!r}")
    doc.ents = kept
    db.add(doc)

# Persist the collected docs in spaCy's binary training format.
db.to_disk("../models/training_data.spacy")

100%|██████████| 9/9 [00:00<00:00, 998.22it/s]


In [4]:
# Generate a training config (config.cfg) for an English pipeline with an `ner` component,
# optimized for efficiency.
# NOTE: as the captured output shows, the command refuses to overwrite an existing
# config.cfg unless --force / -F is passed, so on a re-run the existing file is kept.
! python -m spacy init config config.cfg --lang en --pipeline ner --optimize efficiency


[38;5;1m✘ The provided output file already exists. To force overwriting the
config file, set the --force or -F flag.[0m



In [5]:
# Train the pipeline described by config.cfg and write the results under ../ner-results/.
# NOTE(review): --paths.dev points at the same file as --paths.train, so the ENTS_F /
# ENTS_P / ENTS_R scores in the training log are measured on the training data itself
# and say nothing about generalization. Confirm whether a separate dev set is intended.
! python -m spacy train config.cfg --output ../ner-results/ --paths.train ../models/training_data.spacy --paths.dev ../models/training_data.spacy

[38;5;4mℹ Saving to output directory: ..\ner-results[0m
[38;5;4mℹ Using CPU[0m
[1m
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['tok2vec', 'ner'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  --------  ------  ------  ------  ------
  0       0          0.00     34.56    0.00    0.00    0.00    0.00
 29     200         86.44   1460.11  100.00  100.00  100.00    1.00
 65     400          1.40      1.60  100.00  100.00  100.00    1.00
112     600          0.00      0.00  100.00  100.00  100.00    1.00
171     800          0.00      0.00  100.00  100.00  100.00    1.00
239    1000          0.00      0.00  100.00  100.00  100.00    1.00
335    1200          0.00      0.00  100.00  100.00  100.00    1.00
435    1400          0.00      0.00  100.00  100.00  100.00    1.00
559    1600          0.00      0.00  100.00  100.00  100.00    1.00
759    1800          0.00      0.00  1

[2023-12-11 23:20:10,344] [INFO] Set up nlp object from config
[2023-12-11 23:20:10,353] [INFO] Pipeline: ['tok2vec', 'ner']
[2023-12-11 23:20:10,357] [INFO] Created vocabulary
[2023-12-11 23:20:10,359] [INFO] Finished initializing nlp object
[2023-12-11 23:20:10,499] [INFO] Initialized pipeline components: ['tok2vec', 'ner']


In [6]:
# Load the pipeline saved in the `model-best` folder of the training output directory.
nlp_ner = spacy.load('../ner-results/model-best') 

In [7]:
# Run the trained pipeline over one long, multi-paragraph block of report requests.
# The text is passed as a single string (newlines included) and the result is bound to `doc`.
doc = nlp_ner('''Generate a report detailing the September 2020 data for settlement applications within the Earth portfolio. This report should include information on the number of rejected applications, the approved settlement amounts, the average time it took for rejections, as well as the entry principal and entry balance, categorized by DCA (Debt Collection Agency) and application type.
Generate a report presenting data for September 2020 within the Earth portfolio, focusing on approved settlement applications. This report should include the count of approved applications, the approved settlement amounts, the written-off balance, the average time taken for approvals, both the average and median durations, as well as the entry principal and balance. This information should be categorized by DCA (Debt Collection Agency) and application type. 	
Generate a report for the Earth portfolio, covering September 2020, that provides insights into pending settlement applications as of the end of the previous month. The report should encompass the count of applications in statuses such as Review, Working, Quality Control, For approval, and Approved. Additionally, it should include information on the approved amounts, the average number of days these applications have been pending, as well as details on the entry principal and balance. This data should be categorized based on DCA (Debt Collection Agency) and application type.
Generate a report for the Earth portfolio, specifically for September 2020, highlighting settlement applications submitted during that month. The report should include information on the count of applications, their approved amounts, as well as details regarding entry principal and balance. These data points should be organized and presented based on DCA (Debt Collection Agency) and application type.
Generate a report for the Mirror portfolio, focusing on the bucket roll rates for September 2020. The report should present data in a matrix format, with rows representing the application bucket of the previous month categorized into groups 1, 2, 3, 4, 5, 6, 7-12, 13+, and null. Columns should represent the application bucket of the current month, also categorized into the same groups.

Within this matrix, the report should measure and display the number of active and running applications for each combination of the previous month's bucket and the current month's bucket. This will provide insights into how applications move between different bucket categories from one month to the next.
Furnish an analysis for the Earth portfolio in September 2020, categorizing settlements of types Settlement, Preapproved, Resch to Sett, Out of Mandate Had Settlement, and null based on their duration. The duration should be divided into the following bins: up to 6 months, 7 to 12 months, 13 to 36 months, 37 to 72 months, 72 to 108 months, and more than 109 months.
Furnish a report for September 2020 within the Earth portfolio, detailing the following information per asset class and DCA (Debt Collection Agency):

The number of customers with active and running settlements for types Settlement, Preapproved, Resch to Sett, Out of Mandate Had Settlement, and null.

The corresponding number of applications for each of these settlement types.

The initial settlement amount for these applications.

The discount amount applied to each settlement.

Details on future installment payments associated with these settlements.

This report will provide a comprehensive overview of the customer base, applications, settlement amounts, and installment information broken down by asset class and DCA for September 2020 within the Earth portfolio.
Generate a report for September 2020 within the Earth portfolio, focusing on accounts that are not in running settlements. This report should include the following information categorized by DCA (Debt Collection Agency):

The number of accounts not in running settlements.

The expected monthly payments at the end of the examined month for these accounts.

The actual monthly payments made by these accounts during September 2020.

This report will provide insights into accounts that are not currently in settlements, their expected and actual monthly payments, and how this information  varies across different DCAs within the Earth portfolio.

Create a report for the Earth portfolio, specifically for September 2020, focusing on accounts that have running settlements. This report should present the following data categorized by DCA (Debt Collection Agency):

The number of accounts with active running settlements.

The expected monthly payments at the end of the examined month for these accounts.

The actual monthly payments made by these accounts during September 2020.

This report will offer insights into accounts that are currently in running settlements, providing information on their expected and actual monthly payment behavior, and how this varies across different DCAs within the Earth portfolio.''')

# Three shorter request texts run through the same pipeline and kept as R3, R1 and R6.
# A backslash at the end of a line inside these strings joins it with the next line,
# so that next line's leading indentation becomes part of the text the model sees.
R3 = nlp_ner('''Provide the number of customers with active and running settlements of types Settlement, Preappoved, Resch to Sett,\
       Out of Mandate Had Settlement and null on the Earth portfolio, the corresponding number of applications,\
       the initial settlement amount, discount amount and future instalments per asset class and DCA, for September 2020.''')
R1 = nlp_ner('''Provide a report that displays the number of accounts with running settlements for the Earth portfolio, \
             their expected monthly payments at the end of the examined month and their actual monthly payments per DCA, for September 2020. ''')
R6 = nlp_ner('''Create a report that shows the number of settlement applications submitted during the month, \
             their approved amount and their entry principal and balance, for September 2020  per DCA and application type. \
             The report should be produced on Earth portfolio.''')

In [8]:
# Render the entities recognized in R3 inline in the notebook (entity style, Jupyter output).
spacy.displacy.render(R3, style="ent", jupyter=True)

In [9]:
from functions import extract_named_entities

# Three request texts to run through the project helper `extract_named_entities`.
# NOTE(review): each string is continued with a trailing backslash, so the indentation
# of the following source line is embedded in the text as a long run of spaces.
# Confirm this whitespace is intended input for the model.
descriptions = ['Generate a report detailing the September 2020 data for settlement applications within the Earth portfolio. \
                This report should include information on the number of rejected applications, the approved settlement amounts, \
                the average time it took for rejections, as well as the entry principal and entry balance, \
                categorized by DCA (Debt Collection Agency) and application type.',
                'Generate a report presenting data for September 2020 within the Earth portfolio, focusing on approved settlement applications. \
                    This report should include the count of approved applications, the approved settlement amounts, the written-off balance, \
                        the average time taken for approvals, both the average and median durations, as well as the entry principal and balance. \
                            This information should be categorized by DCA (Debt Collection Agency) and application type.',
                            'Provide the number of customers with active and running settlements of types Settlement, Preappoved, Resch to Sett,\
                                  Out of Mandate Had Settlement and null on the Earth portfolio, the corresponding number of applications, \
                                    the initial settlement amount, discount amount and future instalments per asset class and DCA, for September 2020.']
# The helper receives the texts and the trained pipeline; judging by the displayed output it
# presumably returns one list of (entity text, label) pairs per description. Verify in `functions`.
entities = extract_named_entities(descriptions, nlp_ner)

# Show the result for the third description.
entities[2]

[('number', 'MT_ITEMS'),
 ('portfolio', 'MT_LOV'),
 ('number', 'MT_ITEMS'),
 ('amount', 'MT_MONEY'),
 ('discount amount', 'MT_MONEY'),
 ('instalments', 'MT_MONEY'),
 ('asset class', 'MT_LOV'),
 ('DCA', 'MT_LOV'),
 ('September 2020', 'MT_DATE')]

In [10]:
import pandas as pd

# Load the semicolon-separated table; the frame displayed further down has
# `name` and `description` columns.
df = pd.read_csv("../vectortest.csv", sep=';')
# Display (rows, columns) of the loaded table.
df.shape

(10, 2)

In [11]:
# Display the whole table; it is small (10 rows according to the shape printed above).
df

Unnamed: 0,name,description
0,MEAS_ACCL_STRAY_PAYMENT_PREDICTION_PAYERS_AMT,"Payment prediction for stray payers, not in ac..."
1,MEAS_ACCL_SETTLEMENT_PAYMENT_PREDICTION_PAYERS...,Payment prediction for settlement payers
2,MEAS_ACCH_AMT_PAYMENT_CP,Amount paid within the observation period
3,MEAS_APPL_AMT_APPROVED,Approved amount
4,MEAS_APPL_AMT_DISCOUNT,Discount amount
5,MEAS_APPL_AMT_PAYMENTS_TOTAL,Total payments amount so far
6,MEAS_APLL_AMT_WRITEOFF,Balance written off
7,MEAS_APLL_AMT_ENTRY_PRINCIPAL,Principal on arrangement creation
8,MEAS_APLL_AMT_ENTRY_BALANCE,Balance on arrangement creation
9,MEAS_APLL_AMT_INSTALMENTS_FUTURE,Total future instalments amount


In [12]:
from sentence_transformers import SentenceTransformer

In [13]:
# Sentence-embedding model; the same encoder embeds both the table descriptions
# and the search query so the vectors are comparable.
encoder = SentenceTransformer("all-mpnet-base-v2")

In [14]:
# Embed every description. The column is passed as a plain list of strings, the
# documented input type of encode(); a pandas Series is indexed by label rather
# than by position, which is only equivalent while the index is the default 0..n-1.
vectors = encoder.encode(df.description.tolist())

In [15]:
# Embedding width (size of the second axis of `vectors`); used to size the FAISS index.
dim = vectors.shape[1]
dim

768

Step 2 : Build a FAISS Index for vectors

In [16]:
import faiss

# Flat FAISS index using L2 distance, created for vectors of width `dim`.
index = faiss.IndexFlatL2(dim)

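Step 3 : Add the source vectors to the index. The index measures similarity with L2 distance; note that the cell below adds the vectors exactly as returned by the encoder and does not normalize them explicitly.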

In [17]:
# Add the description embeddings to the index exactly as produced by the encoder
# (no normalization is applied in this cell).
index.add(vectors)

In [18]:
# Free-text query to look up in the index. The two adjacent literals are joined by
# the parser into one string; the leading spaces of the second piece are part of
# the query text.
search_query = (
    "Create report shows number settlement applications submitted month , approved amount entry principal balance , "
    "    September 2020 per DCA application type . The report produced Earth portfolio ."
)
# Embed the query with the same encoder used for the descriptions and show its shape.
vec = encoder.encode(search_query)
vec.shape

(768,)

In [19]:
import numpy as np
# Copy the query embedding and lay it out as a single-row 2-D array, the form
# handed to index.search in the next step.
svec = np.reshape(np.array(vec), (1, -1))
svec.shape

(1, 768)

Step 5: Search the FAISS index created above for the vectors most similar to the query

In [20]:
# Ask the index for the 5 nearest stored vectors to the query.
# `distances` holds the distances and `I` the ids of the matches, one row per query.
distances, I = index.search(svec, k=5)
I

array([[1, 5, 2, 3, 0]], dtype=int64)

In [21]:
# FAISS reports matches as row positions in insertion order (and pads with -1 when
# fewer than k results exist), so drop the padding and select by position, not by label.
hits = I[0]
df.iloc[hits[hits >= 0]]

Unnamed: 0,name,description
1,MEAS_ACCL_SETTLEMENT_PAYMENT_PREDICTION_PAYERS...,Payment prediction for settlement payers
5,MEAS_APPL_AMT_PAYMENTS_TOTAL,Total payments amount so far
2,MEAS_ACCH_AMT_PAYMENT_CP,Amount paid within the observation period
3,MEAS_APPL_AMT_APPROVED,Approved amount
0,MEAS_ACCL_STRAY_PAYMENT_PREDICTION_PAYERS_AMT,"Payment prediction for stray payers, not in ac..."
