In [None]:
import os

import cohere
import numpy as np
import pandas as pd
import streamlit as st
from annoy import AnnoyIndex

# Read the Cohere API key from the environment instead of committing it to the
# notebook. Set it before starting Jupyter, e.g.:
#     export COHERE_API_KEY=<your key>
# The key that used to be hardcoded on this line has been exposed in source
# control and should be revoked.
# An unset variable yields an empty string.
# NOTE(review): presumably the first Cohere request then fails with an
# authentication error -- confirm against the installed SDK version.
api_key = os.environ.get('COHERE_API_KEY', '')

In [None]:
# Load the preprocessed documents; later cells read the `text` column.
df = pd.read_csv('cohere_text_preprocessing.csv')
# Preview only the first rows instead of rendering the whole frame.
df.head()

In [None]:
# Give every document an `id` equal to its row index so that the chunks
# produced later can be joined back to their source row.
df = df.assign(id=df.index)

In [None]:
def chunk_text(df, width=1500, overlap=500):
    """Split each row's text into overlapping fixed-width character chunks.

    Chunk start offsets are spaced ``width - overlap`` characters apart, so
    neighbouring chunks share up to ``overlap`` characters. The last chunk(s)
    of a text may be shorter than ``width``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an ``id`` column and a ``text`` column whose values
        support ``len()`` and slicing (i.e. strings).
    width : int, default 1500
        Maximum number of characters per chunk. Must be positive.
    overlap : int, default 500
        Number of characters by which consecutive chunk windows overlap.
        Must be smaller than ``width``.

    Returns
    -------
    pandas.DataFrame
        One row per chunk, in input-row order, with columns ``id`` (the source
        row's id), ``text_chunk`` and ``start_index`` (character offset of the
        chunk within the source text). A row whose text is empty contributes
        no chunks. The three columns are present even when there are no chunks.

    Raises
    ------
    ValueError
        If ``width`` is not positive or ``overlap`` is not smaller than
        ``width`` (the window would never advance).
    """
    step = width - overlap
    if width <= 0 or step <= 0:
        raise ValueError(
            f"width must be positive and overlap must be smaller than width "
            f"(got width={width}, overlap={overlap})"
        )

    # Collect plain records and build the frame once: concatenating inside the
    # loop copies the accumulated frame on every iteration.
    records = [
        {'id': row_id, 'text_chunk': text[start:start + width], 'start_index': start}
        for row_id, text in zip(df['id'], df['text'])
        for start in range(0, len(text), step)
    ]

    # Passing `columns` keeps the schema stable when `records` is empty.
    return pd.DataFrame(records, columns=['id', 'text_chunk', 'start_index'])



# Split every document into overlapping chunks and attach them to the source
# rows: one output row per chunk, in id order. Because this is a left join, a
# document that yields no chunk keeps a single row with missing chunk columns.
#
# This cell rebinds `df`, so first strip any chunk columns and duplicated ids
# left over from a previous run. That makes re-running the cell safe instead
# of re-chunking duplicated rows and producing `text_chunk_x`/`text_chunk_y`.
chunk_columns = ['text_chunk', 'start_index']
base_df = df.drop(columns=chunk_columns, errors='ignore').drop_duplicates(subset='id')
new_df = chunk_text(base_df)
df = base_df.merge(new_df, on='id', how='left')
# Preview only the first rows instead of rendering the whole frame.
df.head()


In [None]:

# Cohere client authenticated with the configured key
co = cohere.Client(api_key)

# Request one embedding per text chunk, in dataframe row order
chunk_texts = list(df['text_chunk'])
response = co.embed(texts=chunk_texts, model="large", truncate="RIGHT")

# Convert to an array and display its dimensions as the cell output
embeds = np.array(response.embeddings)
embeds.shape


In [None]:
# Angular-distance Annoy index sized to the embedding width
search_index = AnnoyIndex(embeds.shape[1], 'angular')

# Register each embedding under its row position in `embeds`
for item_id, vector in enumerate(embeds):
    search_index.add_item(item_id, vector)

# Build with 10 trees, then persist the index to disk
search_index.build(10)
search_index.save('search_index.ann')

In [None]:
# Write the chunked dataframe to disk, leaving out the pandas row index
df.to_csv(path_or_buf='cohere_text_final.csv', index=False)