# Install/import relevant libraries

In [1]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
import pickle
from scipy import spatial
import os
# Print the kernel's working directory; the relative paths used in this notebook resolve against it.
# NOTE(review): the printed absolute path is stored in the saved cell output.
print(os.getcwd())

C:\Users\yiehy\OneDrive\Desktop\cs425-nlc-project\4.Retrieval\untuned_bertbase


In [2]:
!pip install sentence-transformers

# Get embeddings for BERT base (msmarco-bert-base-dot-v5)

In [2]:
# Location of the question/answer dataset, relative to this notebook's working directory.
QNA_CSV_PATH = "../../0.Datasets/QnA.csv"
qna_df = pd.read_csv(QNA_CSV_PATH)

In [3]:
# Preview the first five rows of the loaded frame.
qna_df.head()

Unnamed: 0.1,Unnamed: 0,qid,docid,question,answer
0,0,0,18850,What is considered a business expense on a bus...,The IRS Guidance pertaining to the subject. I...
1,1,1,14255,Claiming business expenses for a business with...,Yes you can claim your business deductions if ...
2,2,2,308938,Transferring money from One business checking ...,You should have separate files for each of the...
3,3,3,296717,Having a separate bank account for business/in...,Having a separate checking account for the bus...
4,4,3,100764,Having a separate bank account for business/in...,"You don't specify which country you are in, so..."


In [4]:
# Summarise the columns: dtypes, non-null counts and memory usage.
qna_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17072 entries, 0 to 17071
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  17072 non-null  int64 
 1   qid         17072 non-null  int64 
 2   docid       17072 non-null  int64 
 3   question    17072 non-null  object
 4   answer      17072 non-null  object
dtypes: int64(3), object(2)
memory usage: 667.0+ KB


In [5]:
# Pretrained sentence encoder used for every embedding in this notebook.
MODEL_NAME = "sentence-transformers/msmarco-bert-base-dot-v5"
model = SentenceTransformer(MODEL_NAME)



In [6]:
# Break the answer texts into consecutive fixed-size batches for the encoder;
# the final batch may be shorter than chunk_size.
chunk_size = 10
answers_list = qna_df["answer"].tolist()
chunk_starts = range(0, len(answers_list), chunk_size)
chunks = [answers_list[start:start + chunk_size] for start in chunk_starts]

In [7]:
# Answer texts as a plain list; the ranking cell looks answers up in it by row position.
# NOTE(review): the chunking cell above already makes this same assignment, so this cell looks redundant.
answers_list = qna_df["answer"].tolist()

In [9]:
dimension_size = 768 # embedding dimensions from msmarco-bert-base-dot-v5 documentation
progress_every = 10 # report progress once every this many chunks (independent of chunk_size)

# Encode chunk by chunk, collecting the per-chunk arrays in a list. Stacking once at the
# end avoids re-copying the whole growing matrix on every iteration.
encoded_chunks = []
for i,chunk in enumerate(chunks):
    if i % progress_every == 0:
        print(i * chunk_size) # number of answers encoded before this chunk
    encoded_chunks.append(model.encode(chunk))

# The empty float32 seed keeps the result shaped (0, dimension_size) when there are no chunks.
sentence_embeddings = np.vstack([np.empty([0,dimension_size], dtype=np.float32), *encoded_chunks])

In [None]:
# Number of embedding rows; presumably this should equal len(answers_list) — verify.
len(sentence_embeddings)

In [29]:
# Persist the answer embeddings to disk so they can be reloaded without re-encoding.
with open("./embeddings_bertbase.pkl", 'wb') as embeddings_file:
    pickle.dump(sentence_embeddings, embeddings_file, protocol=pickle.HIGHEST_PROTOCOL)

# Show top 50 results for a query

In [12]:
# Read embeddings
# The context manager closes the file handle as soon as the array has been loaded.
# NOTE: unpickling can execute arbitrary code; only load files produced by this notebook.
with open("./embeddings_bertbase.pkl", 'rb') as f:
    sentence_embeddings = pickle.load(f)

In [4]:
# Example query that the ranking cell encodes and scores against the answer embeddings.
test_query = "Should i use a robo advisor or hire a financial advisor?"

In [9]:
%%time
question_embedding = model.encode(test_query)
answer_similiarity = {}
for i,embed in enumerate(sentence_embeddings):
    answer_similiarity[i]= np.dot(question_embedding, embed)
answer_similiarity = {k: v for k, v in sorted(answer_similiarity.items(), key=lambda item: item[1], reverse=True)}
top_50_index = list(answer_similiarity)[:50]
for i,index in enumerate(top_50_index):
    print(f"========== Rank {i+1} ==========")
    print(answers_list[index])
    print()

If your financial needs aren't complex, and mostly limited to portfolio management, consider looking into the newish thing called robo-advisers (proper term is "Automated investing services").  The difference is that robo-advisers use software to manage portfolios on a large scale, generating big economy of scale and therefore offering a much cheaper services than personal advisor would - and unless your financial needs are extremely complex, the state of the art of scaled up portfolio management is at the point that a human advisor really doesn't give you any value-add (and - as other answers noted - human advisor can easily bring in downsides such as conflict of interest and lack of fiduciary responsibility).  disclaimer: I indirectly derive my living from a company which derives a very small part of their income from a robo-adviser, therefore there's a possible small conflict of interest in my answer

If you are looking for an advisor to just build a portfolio and then manage it, a 



# Evaluate test result

In [24]:
# Location of the held-out test split, relative to this notebook's working directory.
TEST_CSV_PATH = "../../0.Datasets/train_test_split/test.csv"
test_df = pd.read_csv(TEST_CSV_PATH)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17072 entries, 0 to 17071
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  17072 non-null  int64 
 1   qid         17072 non-null  int64 
 2   docid       17072 non-null  int64 
 3   question    17072 non-null  object
 4   answer      17072 non-null  object
dtypes: int64(3), object(2)
memory usage: 667.0+ KB


In [25]:
# Summarise the test split: dtypes, non-null counts and memory usage.
test_df.info()

Unnamed: 0,qid,docid,question,answer
0,0,18850,What is considered a business expense on a bus...,The IRS Guidance pertaining to the subject. I...
1,1,14255,Claiming business expenses for a business with...,Yes you can claim your business deductions if ...
2,2,308938,Transferring money from One business checking ...,You should have separate files for each of the...
3,3,296717,Having a separate bank account for business/in...,Having a separate checking account for the bus...
4,3,100764,Having a separate bank account for business/in...,"You don't specify which country you are in, so..."


In [26]:
# Preview the first rows without the "Unnamed: 0" column. drop() returns a new frame,
# so test_df itself is left unchanged.
test_df.head().drop(columns=["Unnamed: 0"])

In [27]:
# split answers to chunks for model to encode
test_answers_list = test_df["answer"].tolist()
chunk_size = 10
chunks = [test_answers_list[i:(i+chunk_size)] for i in range(0, len(test_answers_list), chunk_size)]
dimension_size = 768 # embedding dimensions from msmarco bertbase documentation
progress_every = 10 # report progress once every this many chunks (independent of chunk_size)

# Encode chunk by chunk, collecting the per-chunk arrays in a list. Stacking once at the
# end avoids re-copying the whole growing matrix on every iteration.
encoded_chunks = []
for i,chunk in enumerate(chunks):
    if i % progress_every == 0:
        print(i * chunk_size) # number of answers encoded before this chunk
    encoded_chunks.append(model.encode(chunk))

# The empty float32 seed keeps the result shaped (0, dimension_size) when there are no chunks.
test_answer_embeddings = np.vstack([np.empty([0,dimension_size], dtype=np.float32), *encoded_chunks])

#save model embeddings
with open("./untuned_bertbase_test_answer_embeddings.pkl", 'wb') as f:
    pickle.dump(test_answer_embeddings, f, pickle.HIGHEST_PROTOCOL)

In [28]:
# Group the relevant docids under each question id, in order of first appearance.
question_answer_index_map = {}
for _, record in test_df.iterrows():
    # setdefault creates the empty list the first time a qid is seen, then we append to it.
    question_answer_index_map.setdefault(record["qid"], []).append(record["docid"])

In [1]:
# One list of relevant docids per question, in the map's insertion order.
labels = list(question_answer_index_map.values())

In [None]:
# question_map: qid -> question text; label_map: answer text -> docid.
# setdefault only writes when the key is new, so the first occurrence of each key wins.
question_map = {}
label_map = {}
for _, record in test_df.iterrows():
    question_map.setdefault(record["qid"], record["question"])
    label_map.setdefault(record["answer"], record["docid"])

In [2]:
# Load the test-answer embeddings written by the test encoding cell. Row i has to be the
# embedding of test_answers_list[i], because the winning row index is used below to look
# up the answer text.
# NOTE: unpickling can execute arbitrary code; only load files produced by this notebook.
with open("./untuned_bertbase_test_answer_embeddings.pkl", 'rb') as f:
    sentence_embeddings = pickle.load(f)

# For each test question, predict the docid of the single highest-scoring answer.
predictions = []
for count, question_text in enumerate(question_map.values(), start=1):
    if count%100==0:
        print(count)
    question_embedding = model.encode(question_text)
    # Dot-product score against every answer in one matrix-vector product; argmax returns
    # the first row holding the highest score, so no full sort is needed for a top-1 pick.
    scores = sentence_embeddings @ question_embedding
    response = int(np.argmax(scores))
    label_index = label_map[test_answers_list[response]]
    # Each prediction is wrapped in a one-element list.
    predictions.append([label_index])

In [None]:
# Bundle the gold labels with the model's predictions and write them out as a pickle.
results = {"labels":labels,"predictions":predictions}
with open("../../7.Evaluate/untuned_bertbase_results.pkl", 'wb') as results_file:
    pickle.dump(results, results_file, protocol=pickle.HIGHEST_PROTOCOL)