# In [5]:
import os
import re

import cohere
import numpy as np
import pandas as pd
from annoy import AnnoyIndex

# Read the Cohere API key from the CO_API_KEY environment variable so the
# secret does not have to be written into the source file. When the variable
# is unset this falls back to the empty placeholder the script used before.
api_key = os.environ.get('CO_API_KEY', '')
co = cohere.Client(api_key)

# A fragment ending in a digit or '$' may have been cut at a decimal point.
_NUMERIC_TAIL = re.compile(r'[\d$]$')
# A fragment starting with a digit may be the fractional part of that number.
_NUMERIC_HEAD = re.compile(r'\d')


def concatenate_broken_sentences(sentences):
    """Rejoin sentence fragments that a split on '.' cut at a decimal point.

    Splitting text such as "revenue of $1.2 billion" on '.' yields the
    fragments "revenue of $1" and "2 billion". A fragment is glued to its
    successor (with the '.' restored) only when it ends in a digit or '$'
    AND the successor starts with a digit, so a sentence that merely ends in
    a number ("... in 2020") is not merged with the sentence after it.

    This is a heuristic: a sentence ending in a number followed by a sentence
    that starts with a number is still joined.

    Args:
        sentences: Iterable of fragments produced by splitting on '.'.

    Returns:
        A list of whitespace-stripped sentences without their terminating
        period. Fragments pending at the end of the input are kept, and
        empty sentences are omitted.
    """
    fragments = list(sentences)
    last_position = len(fragments) - 1
    merged = []
    pending = []
    for position, fragment in enumerate(fragments):
        pending.append(fragment)
        if (
            position != last_position
            and _NUMERIC_TAIL.search(fragment)
            and _NUMERIC_HEAD.match(fragments[position + 1])
        ):
            # Still inside a number: wait for the fractional part.
            continue
        # Flushing here also covers the final fragment, so nothing that is
        # still pending is lost when the input ends.
        sentence = ".".join(pending).strip()
        pending = []
        if sentence:
            merged.append(sentence)
    return merged
# Sample financial-report passage used as the corpus for the retrieval demo.
# It is split into sentence fragments on '.' right after this literal, so the
# decimal numbers it contains are cut apart and must be rejoined afterwards.
text = """
In the fiscal year of 2021, XYZ Corp reported a total revenue of $1.2 billion, marking a 15% increase compared to the prior year. 
The company's cost of goods sold (COGS) amounted to $600 million, resulting in a gross profit of $600 million. 
The gross margin for the year stood at 50%, a slight dip from the 52% gross margin in 2020. 

Operating expenses for the year totaled $200 million, which includes $50 million in research and development, $100 million in sales and marketing, and $50 million in general and administrative expenses. 
The operating income, therefore, was $400 million, leading to an operating margin of 33.3%. 

Net income for the year came in at $350 million after accounting for interest and taxes, translating to a net profit margin of approximately 29.2%. 
Earnings per share (EPS) for the fiscal year was $1.75, up from $1.50 in the previous year. 

The company also generated $100 million in cash flow from operating activities and invested $50 million in capital expenditures. 
Free cash flow for the year was $50 million. 
The board of directors declared a dividend of $0.50 per share, totaling $25 million in dividend payouts. 

In terms of liquidity, the company ended the year with $150 million in cash and cash equivalents. 
Current assets totaled $300 million, while current liabilities were $100 million, resulting in a current ratio of 3.0. 
Long-term debt stood at $200 million, with a debt-to-equity ratio of 0.5. 

The company expects revenue to grow by 10% to 15% in the next fiscal year, targeting a gross margin of at least 51%.
"""
# Split the passage on periods and trim surrounding spaces/newlines.
# (The comprehension previously iterated over `texts`, which is not assigned
# until the next step, so a fresh run failed with NameError.)
initial_texts = [fragment.strip(' \n') for fragment in text.split('.')]
# Concatenate broken sentences (decimal numbers cut apart by the split) and
# drop empty entries, e.g. the one produced after the final period, so no
# blank document is embedded or indexed.
texts = [sentence for sentence in concatenate_broken_sentences(initial_texts) if sentence]
# Get the embeddings: one vector per sentence, in the same order as `texts`.
response = co.embed(texts=texts).embeddings
embeds = np.array(response)
# Create Annoy index sized to the embedding dimension; item ids are the
# positions in `texts`, which is how search results are mapped back to text.
search_index = AnnoyIndex(embeds.shape[1], 'angular')
for index, embed in enumerate(embeds):
    search_index.add_item(index, embed)
search_index.build(10)
search_index.save('test.ann')

# Search function for dense retrieval
def search(query):
    """Print and return the three indexed sentences nearest to *query*.

    The query is embedded with the module-level Cohere client, looked up in
    the module-level Annoy index, and the matching entries of `texts` are
    reported together with the distances the index returns.

    Args:
        query: The search text.

    Returns:
        A DataFrame with a 'texts' column (matched sentences) and a
        'distance' column, in the order returned by the index.
    """
    query_vector = co.embed(texts=[query]).embeddings[0]
    neighbor_ids, neighbor_distances = search_index.get_nns_by_vector(
        query_vector, n=3, include_distances=True
    )
    # Index item ids are positions in `texts`.
    matched_texts = [texts[item_id] for item_id in neighbor_ids]

    print(f"Query: '{query}'\nNearest neighbors:\n")
    for matched_text, distance in zip(matched_texts, neighbor_distances):
        print(f"Text: {matched_text}\nDistance: {distance}\n")

    return pd.DataFrame(data={'texts': matched_texts, 'distance': neighbor_distances})

# Dense retrieval: nearest neighbours of the query in the Annoy index.
query = "Tell me about dividend"
search(query)


# Rerank: score every sentence in `texts` against the query and keep the
# three most relevant ones.
# NOTE(review): confirm "rerank-english-02" is a model id the Cohere API
# accepts — TODO verify against the current model list.
query = "Tell me about the dividend"
MODEL_NAME = "rerank-english-02"
results = co.rerank(
    query=query,
    model=MODEL_NAME,
    documents=texts,
    top_n=3,
)

# Display reranked results, with ranks shown 1-based.
for rank, hit in enumerate(results, start=1):
    print(f"Document Rank: {rank}, Document Index: {hit.index}")
    print(f"Document: {hit.document['text']}")
    print(f"Relevance Score: {hit.relevance_score:.2f}")
    print("\n")