## Read CSV file, Convert it into embeddings, store in Redis Vectore Store and Do Similarity Search 

In [2]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
import redis
import json

In [5]:
# Step 1: Load CSV and Preprocess
def load_and_prepare_data(csv_path):
    df = pd.read_csv(csv_path)
    # Create a description for embedding
    df['description'] = df.apply(
        lambda row: f"{row['bike_name']} in {row['city']} by {row['owner']} owner with {row['kms_driven']} km driven.",
        axis=1
    )
    return df

# Step 2: Generate Embeddings
def generate_embeddings(df, model):
    embeddings = model.encode(df['description'].tolist())
    return embeddings

# Step 3: Store Embeddings and Metadata in Redis
def store_embeddings_in_redis(df, embeddings, redis_client, index_name="bike_vector_index"):
    for i, embedding in enumerate(embeddings):
        bike_data = {
            "bike_name": df.loc[i, 'bike_name'],
            "price": df.loc[i, 'price'],
            "city": df.loc[i, 'city'],
            "kms_driven": df.loc[i, 'kms_driven'],
            "owner": df.loc[i, 'owner'],
            "age": df.loc[i, 'age'],
            "power": df.loc[i, 'power'],
            "brand": df.loc[i, 'brand']
        }
        redis_client.hset(
            f"{index_name}:{i}",
            mapping={
                "embedding": np.array(embedding).tobytes(),  # Store embedding as binary
                "metadata": json.dumps(bike_data)  # Metadata as JSON string
            }
        )
    redis_client.sadd(f"{index_name}_keys", *[f"{index_name}:{i}" for i in range(len(embeddings))])

# Step 4: Perform Similarity Search
def similarity_search(query_embedding, redis_client, index_name="bike_vector_index", top_k=3):
    # In a real scenario, use RediSearch or Approximate Nearest Neighbor libraries
    all_keys = redis_client.smembers(f"{index_name}_keys")
    results = []
    for key in all_keys:
        data = redis_client.hgetall(key)
        embedding = np.frombuffer(data['embedding'], dtype=np.float32)
        score = np.dot(query_embedding, embedding)  # Example: Cosine similarity
        results.append((score, json.loads(data['metadata'])))
    results.sort(reverse=True, key=lambda x: x[0])  # Sort by similarity score
    return results[:top_k]

# Step 5: Query Ollama for Additional Context
def query_ollama(prompt, context):
    from langchain.chat_models import Ollama
    from langchain.schema import ChatMessage

    model = Ollama(model="ollama3")
    response = model.chat(messages=[ChatMessage(content=context), ChatMessage(content=prompt)])
    return response

In [7]:
# Main Workflow
if __name__ == "__main__":
    # File and model initialization
    csv_path = r"D:\Projects\genai-poc\test_data_files\Used_Bikes_With_Header.csv"
    model = SentenceTransformer("all-MiniLM-L6-v2")
    redis_client = redis.Redis(host="localhost", port=6379, decode_responses=True)

    # Load and process data
    df = load_and_prepare_data(csv_path)
    embeddings = generate_embeddings(df, model)

    # Store embeddings and metadata in Redis
    store_embeddings_in_redis(df, embeddings, redis_client)

    # User query and search
    user_query = "Show me bikes available in Delhi"
    query_embedding = model.encode([user_query])[0]
    search_results = similarity_search(query_embedding, redis_client)

    # Prepare context for Ollama
    context = "\n".join([f"Bike: {result[1]['bike_name']}, Price: {result[1]['price']}, City: {result[1]['city']}" for result in search_results])
    response = query_ollama(user_query, context)

    print("Ollama Response:", response)

KeyError: 'embedding'