In [13]:
import sys
sys.path.append('../src/')

import numpy as np
import pandas as pd

from openai_functions import generate_text, get_embedding
from similarity import get_similar_blogs, get_topics

In [2]:
from langchain.embeddings import HuggingFaceBgeEmbeddings
# Embedding backend configuration: BGE large (English, v1.5) running on CPU.
model_name = "BAAI/bge-large-en-v1.5"
model_kwargs = dict(device='cpu')
# Normalised embeddings are requested so a dot product can serve as cosine similarity.
encode_kwargs = dict(normalize_embeddings=True)

model = HuggingFaceBgeEmbeddings(
    encode_kwargs=encode_kwargs,
    model_kwargs=model_kwargs,
    model_name=model_name,
)

In [3]:
# Probe sentences for the embedding comparison: a blog-style title, a question
# that omits the phrase "twin transition", and the same question with it added.
sentence_1, sentence_2, sentence_3 = (
    "Are German companies ready for twin transition?",
    "What do German executives see as the biggest opportunities and challenges?",
    "What do German executives see as the biggest opportunities and challenges in twin transition?",
)

In [4]:
# Score sentence_1 against sentence_2 and sentence_3 using the vectors returned
# by get_embedding, with the dot product as the similarity measure.
# NOTE(review): the dot product equals cosine similarity only if get_embedding
# returns unit-length vectors — confirm.
#
# sentence_1 is embedded once and reused for both comparisons; embedding it a
# second time would only repeat an identical get_embedding call.
ada_embedding_1 = get_embedding(sentence_1)
ada_embedding_2 = get_embedding(sentence_2)
print(np.dot(ada_embedding_1, ada_embedding_2))

ada_embedding_3 = get_embedding(sentence_3)
print(np.dot(ada_embedding_1, ada_embedding_3))

0.8778036550398265
0.9350853309473685


In [5]:
# Same comparison as for the ada embeddings, but using the BGE model's
# embed_query. The model was configured with normalize_embeddings=True, so the
# dot product is used here as the cosine similarity.
#
# sentence_1 is embedded once and reused for both comparisons; recomputing it
# would only repeat an identical embed_query call.
bge_embedding_1 = model.embed_query(sentence_1)
bge_embedding_2 = model.embed_query(sentence_2)
print(np.dot(bge_embedding_1, bge_embedding_2))

bge_embedding_3 = model.embed_query(sentence_3)
print(np.dot(bge_embedding_1, bge_embedding_3))

0.6726142532838701
0.8588599228211886


In [55]:
# Load the processed blog table used by the cells below.
df = pd.read_csv(filepath_or_buffer='../data/processed/futurice/blogs2.csv')

In [63]:
# Mean of the 'average session duration' column over all loaded rows.
df.loc[:, 'average session duration'].mean()

119.10314145544227

In [64]:
# Standard deviation of the same column; ddof=0 selects the population form
# (divide by N rather than N - 1).
df.loc[:, 'average session duration'].std(ddof=0)

109.39267698309507

In [None]:
# Show the row(s) whose title is exactly this post's title.
df.loc[df['title'].eq('Generative AI tools at Futurice, May 2023')]

In [54]:
def get_nearest_titles(text, threshold,
                       csv_path='../data/processed/futurice/blogs2.csv',
                       embed_fn=None):
    """Return blog rows whose stored title embedding is similar to ``text``.

    The blog table is read from ``csv_path``, rows whose ``title`` equals
    ``text`` are dropped, and every remaining row is scored with the dot
    product between its stored title embedding and the embedding of ``text``.

    Parameters
    ----------
    text : str
        Query text (typically a blog title) to embed and compare against.
    threshold : float
        Minimum similarity a row must reach to be kept (inclusive).
    csv_path : str or file-like, optional
        Anything ``pd.read_csv`` accepts. Must provide the columns ``title``
        and ``bge embedded title``; the latter holds each embedding as the
        text of a Python list literal. Defaults to the notebook's blog file.
    embed_fn : callable, optional
        Maps a string to its embedding vector. Defaults to the notebook-level
        ``model.embed_query``.

    Returns
    -------
    pandas.DataFrame
        The matching rows with an added ``similarity`` column, sorted by
        ``similarity`` in descending order. The original row index is kept.

    Raises
    ------
    ValueError, SyntaxError
        If a stored embedding is not a plain Python literal.
    """
    import ast  # stdlib; imported here so the function is self-contained

    if embed_fn is None:
        # Fall back to the embedding model defined earlier in the notebook.
        embed_fn = model.embed_query

    blogs = pd.read_csv(csv_path)
    # .copy() so adding the score column below writes to an independent frame
    # instead of a slice of `blogs` (avoids SettingWithCopyWarning).
    candidates = blogs[blogs['title'] != text].copy()

    query_vec = np.asarray(embed_fn(text), dtype=float)

    # literal_eval only parses Python literals, so a crafted CSV cell cannot
    # execute code the way a bare eval() would.
    candidates['similarity'] = (
        candidates['bge embedded title']
        .map(lambda raw: float(np.dot(ast.literal_eval(raw), query_vec)))
        .astype(float)
    )

    hits = candidates[candidates['similarity'] >= threshold]
    return hits.sort_values(by='similarity', ascending=False)

In [None]:
# Find blog titles whose similarity to this query title is at least 0.8,
# then display the resulting table.
title = 'Are German companies ready for twin transition?'
df_similarity = get_nearest_titles(text=title, threshold=0.8)
df_similarity

In [9]:
# Build the topic-extraction prompt from the full text of the first row of
# df_similarity, then print whatever generate_text returns for it.
prompt = (
    f"Given the following blog text “{df_similarity.iloc[0]['full text']}” "
    "extract at most 5 important key topics, "
    "summarized into either 2 or 3 words for each key topic. "
    "Write it out as a python list and don't return anything else."
)

print(generate_text(prompt, temperature=0))


['Twin transition', 'Financial performance', 'Operational efficiency', 'Business practices', 'Sustainable technologies']


In [10]:
# Build the topic-extraction prompt from the full text of the second row of
# df_similarity, then print whatever generate_text returns for it.
prompt = (
    f"Given the following blog text “{df_similarity.iloc[1]['full text']}” "
    "extract at most 5 important key topics, "
    "summarized into either 2 or 3 words for each key topic. "
    "Write it out as a python list and don't return anything else."
)

print(generate_text(prompt, temperature=0))

['Twin transition challenges', 'Twin transition opportunities', 'Leadership', 'Complexity', 'Lack of resources']
