In [None]:
import numpy as np
import json
import pandas as pd
from numpy.linalg import norm
from scipy.stats import zscore
from sklearn.metrics.pairwise import cosine_similarity
import plotly.express as px
import ast
import voyageai
import time
from rouge import Rouge
import plotly.graph_objects as go
from matplotlib import pyplot as plt
from matplotlib_venn import venn3
from plotly.subplots import make_subplots
import math
from sentence_transformers import CrossEncoder
# Voyage AI client handed to embed(). No key is passed here, so credentials must come
# from the environment (presumably VOYAGE_API_KEY - confirm against the Voyage docs).
vo = voyageai.Client()
# Cross-encoder used further down to score (risk statement, risk factor) pairs.
ce = CrossEncoder('BAAI/bge-reranker-base')

In [None]:
def embed(text, vo):
    """Return the Voyage AI embedding of a text.

    Args:
        text: A single string, or a sequence of strings (only the first
            embedding is returned in that case).
        vo: A Voyage AI client exposing ``embed(texts, model=..., input_type=...)``.

    Returns:
        The first embedding vector in the client's response.

    Raises:
        ValueError: If no text is supplied.
    """
    # Voyage AI usage guide - https://docs.voyageai.com/docs/api-key-and-installation
    # The client's documented signature takes a list of texts, so wrap a lone string.
    texts = [text] if isinstance(text, str) else list(text)
    if not texts:
        raise ValueError("embed() needs at least one text to embed")
    result = vo.embed(texts, model="voyage-2", input_type="document")
    return result.embeddings[0]

In [None]:
# Risk Description
inp = """Our success depends in large part upon the strength of our skilled engineering professionals 
        and management team. If we fail to attract, retain, train and optimally utilize these personnel, 
        our business may be unable to grow and our revenue and profitability could decline. 
        Further, increases in wages and other employee benefit expenses for such personnel could prevent 
        us from sustaining our competitive advantage."""
# Collapse each line break together with the indentation that follows it into one space;
# removing only "\n" would leave long runs of spaces inside the statement.
inp = " ".join(inp.split())

# Embedding of the cleaned statement, reused by every search below
vec = embed(inp, vo)

In [None]:
# Lookup tables consumed by the search functions below
coord, matrx, fname = (
    pd.read_csv(csv_path)
    for csv_path in (
        'Input/Risk_Coords_Voyage.csv',
        'Input/Finetuned_Risk_Matrix.csv',
        'Input/Filenames.csv',
    )
)

# The risk-factor table is stored in three parts; read each and stack them
r1, r2, r3 = (
    pd.read_csv(csv_path)
    for csv_path in (
        'Input/Risk_Factors_Voyage_1.csv',
        'Input/Risk_Factors_Voyage_2.csv',
        'Input/Risk_Factors_Voyage_3.csv',
    )
)
# ignore_index gives the combined frame a fresh 0..n-1 index
risks = pd.concat([r1, r2, r3], ignore_index=True)

In [None]:
# Calculates the closeness of the input risk statement to each Risk Category

def cos_sim(npy, inp):
    """Rank the rows of ``inp`` by cosine similarity to the query vector ``npy``.

    Args:
        npy: Query embedding (sequence of floats).
        inp: DataFrame with an 'Embedding' column holding each vector as a
            bracketed, comma-separated string such as "[0.1, -0.2]".

    Returns:
        A copy of ``inp`` with 'Embedding' parsed into NumPy arrays and a
        'Similarity' column added, sorted by 'Similarity' descending with a
        fresh index. ``inp`` itself is not modified.
    """
    df = inp.copy()

    if df.empty:
        scores = np.empty(0, dtype=float)
    else:
        # Strip the surrounding brackets and parse the comma-separated floats
        df['Embedding'] = (
            df['Embedding'].str[1:-1].str.split(",")
            .apply(lambda parts: np.array([float(part) for part in parts]))
        )

        # One matrix operation replaces a Python-level loop over the rows
        matrix = np.vstack(df['Embedding'].tolist())
        query = np.asarray(npy, dtype=float)

        # Zero-length vectors get norm 1 so they score 0 instead of dividing by zero
        row_norms = np.linalg.norm(matrix, axis=1)
        row_norms[row_norms == 0] = 1.0
        query_norm = np.linalg.norm(query) or 1.0

        scores = (matrix / row_norms[:, None]) @ (query / query_norm)

    df['Similarity'] = scores
    return df.sort_values(by='Similarity', ascending=False).reset_index(drop=True)

In [None]:
# Similarity of the input statement to every risk category, most similar first
prof = cos_sim(vec, coord)

# Pad the radial axis a little beyond the observed similarity range
radial_range = [prof['Similarity'].min() - 0.02, prof['Similarity'].max() + 0.02]

fig = px.line_polar(
    prof,
    r=prof['Similarity'],
    theta=prof['Risk Name'],
    line_close=True,
)
fig.update_layout(
    template=None,
    polar={'radialaxis': {'range': radial_range, 'ticks': ''}},
)
fig.show()

In [None]:
# Matrix Search Retrieval Algorithm

def matrix_search(prof, risk, stmt, fname):
    """Rank risk factors by their category scores weighted with the query's profile.

    Every column of ``risk`` whose name ends with 'Risk' is multiplied by the
    'Similarity' that ``prof`` holds for the category of the same name; a row's
    'Weighted Similarity' is the largest of those products.

    Args:
        prof: DataFrame with 'Risk Name' and 'Similarity' columns.
        risk: DataFrame with 'Company', 'Year', 'Unique ID', 'Point', 'Info'
            and one score column per risk category (name ending in 'Risk').
        stmt: Unused; kept so existing calls keep working.
        fname: DataFrame joined on 'Company', 'Year' and 'Unique ID'.

    Returns:
        Rows of ``risk`` sorted by 'Weighted Similarity' descending, left-joined
        with ``fname``. ``risk`` itself is not modified.

    Raises:
        KeyError: If a category column of ``risk`` has no entry in ``prof``.
    """
    # Category name -> similarity of the query to that category
    weights = prof.set_index('Risk Name')['Similarity'].astype(float)

    # Use one column selection for both the weighting and the max, so a column
    # that merely contains 'Risk' (without ending in it) is never looked up
    r_cols = [col for col in risk.columns if col.endswith('Risk')]
    weighted = risk[r_cols].mul(weights.loc[r_cols], axis='columns')

    mrsk = risk[['Company', 'Year', 'Unique ID', 'Point', 'Info']].copy()
    mrsk['Weighted Similarity'] = weighted.max(axis=1)
    mrsk = mrsk.sort_values(by='Weighted Similarity', ascending=False).reset_index(drop=True)

    resl = pd.merge(mrsk, fname, on=['Company', 'Year', 'Unique ID'], how='left')
    return resl

In [None]:
# Time the matrix search. perf_counter() is a monotonic, high-resolution clock meant
# for measuring durations; time.time() is wall-clock and can jump.
start_mat = time.perf_counter()
rsk = matrix_search(prof, matrx, inp, fname)[:10]
end_mat = time.perf_counter()
mat_time = end_mat - start_mat
rsk.head()

In [None]:
# Complete Search Retrieval Algorithm (Takes longer to execute)

def complete_search(npy, inp, fname):
    """Rank every risk factor in ``inp`` by cosine similarity to the query vector.

    Args:
        npy: Query embedding (sequence of floats).
        inp: DataFrame with 'Company', 'Year', 'Unique ID', 'Point', 'Info' and a
            'Voyage_Embedding' column holding each vector as a bracketed,
            comma-separated string such as "[0.1, -0.2]".
        fname: DataFrame joined on 'Company', 'Year' and 'Unique ID'.

    Returns:
        The identifying columns plus 'Cosine Similarity', sorted descending and
        left-joined with ``fname``. ``inp`` itself is not modified.
    """
    df = inp.copy()

    if df.empty:
        scores = np.empty(0, dtype=float)
    else:
        # Remove the brackets, split the string on commas, then convert to float
        df['Voyage_Embedding'] = (
            df['Voyage_Embedding'].str[1:-1].str.split(",")
            .apply(lambda parts: np.array([float(part) for part in parts]))
        )

        # One matrix operation replaces a Python-level loop over the rows
        matrix = np.vstack(df['Voyage_Embedding'].tolist())
        query = np.asarray(npy, dtype=float)

        # Zero-length vectors get norm 1 so they score 0 instead of dividing by zero
        row_norms = np.linalg.norm(matrix, axis=1)
        row_norms[row_norms == 0] = 1.0
        query_norm = np.linalg.norm(query) or 1.0

        scores = (matrix / row_norms[:, None]) @ (query / query_norm)

    # Add similarity values to DataFrame
    df['Cosine Similarity'] = scores
    df = df.sort_values(by='Cosine Similarity', ascending=False).reset_index(drop=True)
    df = df[['Company', 'Year', 'Unique ID', 'Point', 'Info', 'Cosine Similarity']]
    resl = pd.merge(df, fname, on=['Company', 'Year', 'Unique ID'], how='left')
    return resl

In [None]:
# Time the complete search. perf_counter() is a monotonic, high-resolution clock meant
# for measuring durations; time.time() is wall-clock and can jump.
start_cmp = time.perf_counter()
sim = complete_search(vec, risks, fname)[:10]
end_cmp = time.perf_counter()
cmp_time = end_cmp - start_cmp
sim.head()

In [None]:
# Hybrid Search Retrieval Algorithm

def hybrid_search(prof, matrx, inp, vec, risks, fname, top_k=500):
    """Shortlist candidates with matrix_search, then rank them with complete_search.

    Args:
        prof: Risk-category profile passed to matrix_search.
        matrx: Risk matrix passed to matrix_search.
        inp: Input statement, forwarded to matrix_search.
        vec: Query embedding passed to complete_search.
        risks: DataFrame of risk factors with embeddings; must contain
            'Unique ID' and 'Point' plus the columns complete_search needs.
        fname: Filename lookup forwarded to both searches.
        top_k: Number of leading matrix_search rows kept as candidates
            (default 500).

    Returns:
        The complete_search result restricted to the shortlisted candidates.
    """
    mat = matrix_search(prof, matrx, inp, fname)

    # Keep each (Unique ID, Point) key once: repeated keys in the shortlist would
    # multiply the matching rows of `risks` in the inner merge below.
    shortlist = mat.head(top_k)[['Unique ID', 'Point']].drop_duplicates()
    mdrsk = pd.merge(risks, shortlist, on=['Unique ID', 'Point'], how='inner')

    return complete_search(vec, mdrsk, fname)

In [None]:
# Time the hybrid search. perf_counter() is a monotonic, high-resolution clock meant
# for measuring durations; time.time() is wall-clock and can jump.
start_hyb = time.perf_counter()
res = hybrid_search(prof, matrx, inp, vec, risks, fname)[:10]
end_hyb = time.perf_counter()
hyb_time = end_hyb - start_hyb
res.head()

In [None]:
# Shared ROUGE scorer instance; calc_rouge below calls its get_scores method.
rouge = Rouge()

# Rouge L Metric (https://en.wikipedia.org/wiki/ROUGE_(metric))
# Calculating Rouge-L Metric between each of the selected Risk Factors and the Risk Statement
def calc_rouge(reference, hypothesis):
    """Return the ROUGE-L F-score of ``hypothesis`` against ``reference``.

    Args:
        reference: The reference text (here, the input risk statement).
        hypothesis: The candidate text (here, a retrieved risk factor).

    Returns:
        The 'f' value of the 'rouge-l' entry reported by the scorer, or 0.0
        when either text is missing or has no scoreable content.
    """
    for text in (reference, hypothesis):
        # Missing DataFrame cells arrive as NaN (a float), and the scorer rejects
        # texts that are empty once whitespace and full stops are removed.
        if not isinstance(text, str) or not text.strip(" .\t\r\n"):
            return 0.0
    scores = rouge.get_scores(hypothesis, reference)
    return scores[0]['rouge-l']['f']

In [None]:
# ROUGE-L of each retrieved risk factor against the input statement, one column per search
rog = pd.DataFrame()
for method_label, hits in (
    ('Matrix Search', rsk),
    ('Complete Search', sim),
    ('Hybrid Search', res),
):
    rog[method_label] = hits['Info'].apply(lambda text: calc_rouge(inp, text))

In [None]:
# One line per search method, x = rank of the retrieved risk factor
fig = go.Figure()
for column, colour in (
    ('Matrix Search', 'blue'),
    ('Complete Search', 'red'),
    ('Hybrid Search', 'green'),
):
    trace = go.Scatter(
        x=rog.index,
        y=rog[column],
        mode='lines',
        name=column,
        line={'color': colour},
    )
    fig.add_trace(trace)

fig.update_layout(
    title='ROUGE-L Scores Comparison',
    xaxis_title='Risk Factor Closeness Rank',
    yaxis_title='ROUGE-L Score',
)

fig.show()

In [None]:
# Report how long each search took
for label, seconds in (
    ("Matrix Search Duration : ", mat_time),
    ("Complete Search Duration : ", cmp_time),
    ("Hybrid Search Duration : ", hyb_time),
):
    print(label, seconds, "s")

In [None]:
# Identify each retrieved risk factor by "<Unique ID>_<Point>" and collect one set per search
rsk_set, sim_set, res_set = (
    set(hits['Unique ID'] + '_' + hits['Point'].astype(str))
    for hits in (rsk, sim, res)
)

# Find intersections
int_3_sets = rsk_set & sim_set & res_set
int_2_sets = {
    'rsk_sim': rsk_set & sim_set,
    'rsk_res': rsk_set & res_set,
    'sim_res': sim_set & res_set
}

# Region sizes in the order venn3 expects: (Abc, aBc, ABc, abC, AbC, aBC, ABC)
# with A = matrix search, B = complete search, C = hybrid search
region_sizes = (
    len(rsk_set - sim_set - res_set),
    len(sim_set - rsk_set - res_set),
    len((sim_set & rsk_set) - res_set),
    len(res_set - rsk_set - sim_set),
    len((res_set & rsk_set) - sim_set),
    len((res_set & sim_set) - rsk_set),
    len(int_3_sets),
)

# Create Venn diagram
venn3(subsets=region_sizes,
      set_labels=('Matrix Search', 'Complete Search', 'Hybrid Search'))

plt.title("Venn Diagram of Risk Factors")
plt.show()

In [None]:
#print("Matrix Search Result top 10 Risk Factors")
#rsk.head(10)

In [None]:
#print("Complete Search Result top 10 Risk Factors")
#sim.head(10)

In [None]:
# Heading followed by the first ten hybrid-search rows (rendered as the cell output)
print("Hybrid Search Result top 10 Risk Factors")
res.iloc[:10]

In [None]:
# Prepare the (risk statement, risk factor) pairs for similarity calculation
pairs = [(inp, info) for info in res['Info']]

# Calculate the similarity scores
similarity_scores = ce.predict(pairs)

# Add the similarity scores as a new column. `res` is a slice of the search result, so
# build a new frame with assign() instead of writing into the slice; this avoids pandas'
# SettingWithCopyWarning and keeps the cell safe to re-run.
res = res.assign(**{'Cross-Encoder Similarity': similarity_scores})


In [None]:
# Heading followed by the three rows the cross-encoder scores highest
print("Top 3 Risk Factors")
res.sort_values(by='Cross-Encoder Similarity', ascending=False).iloc[:3]