In [None]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
import pinecone
import uuid

In [None]:
def load_medical_data(csv_path):
    """Read the medicine CSV and strip the columns this pipeline never uses.

    Args:
        csv_path: Path (or any source accepted by ``pd.read_csv``) of the
            medicine details CSV.

    Returns:
        A DataFrame holding every column of the CSV except the image URL,
        manufacturer and the three review-percentage columns.

    Raises:
        KeyError: If any of the columns to be removed is absent from the CSV.
    """
    unused_columns = [
        'Image URL',
        'Manufacturer',
        'Excellent Review %',
        'Average Review %',
        'Poor Review %',
    ]
    raw_frame = pd.read_csv(csv_path)
    return raw_frame.drop(columns=unused_columns)

In [None]:
def preprocess_medical_data(df):
    """Add a normalised ``full_text`` column used as the embedding input.

    The medicine name, composition, uses and side effects are joined with
    single spaces, every character that is not an ASCII letter, digit or
    whitespace is removed, and the result is lower-cased.

    Args:
        df: DataFrame containing the columns 'Medicine Name', 'Composition',
            'Uses' and 'Side_effects'.

    Returns:
        The same DataFrame object that was passed in, now carrying the extra
        'full_text' column (the frame is modified in place).

    Raises:
        KeyError: If one of the four source columns is missing.
    """
    text_columns = ['Medicine Name', 'Composition', 'Uses', 'Side_effects']

    # Every column is NaN-filled before concatenation: a single NaN operand
    # would otherwise turn the whole concatenated string into NaN.
    combined = df[text_columns[0]].fillna('')
    for column in text_columns[1:]:
        combined = combined + ' ' + df[column].fillna('')

    # Raw string so that \s reaches the regex engine instead of being parsed
    # as an (invalid) Python string escape.
    combined = combined.str.replace(r'[^a-zA-Z0-9\s]', '', regex=True)
    df['full_text'] = combined.str.lower()

    return df

In [None]:
def generate_embeddings(df, model_name='pritamdeka/S-PubMedBert-MS-MARCO'):
    """Encode the ``full_text`` column with a SentenceTransformer model.

    Args:
        df: DataFrame with a 'full_text' column of strings.
        model_name: Hugging Face model identifier handed to
            ``SentenceTransformer``. Defaults to the PubMedBERT MS-MARCO
            model this notebook was written for.

    Returns:
        Whatever ``SentenceTransformer.encode`` returns for the list of
        texts, one embedding per row of ``df`` in row order.
    """
    # Function-scope import keeps this cell runnable on its own.
    import os

    # Credentials must never be written into the notebook. The access token
    # is taken from the HF_TOKEN environment variable; when it is unset (or
    # empty) None is passed and no explicit token is supplied.
    hf_token = os.environ.get('HF_TOKEN') or None

    embedding_model = SentenceTransformer(model_name, token=hf_token)
    embeddings = embedding_model.encode(df['full_text'].tolist())

    return embeddings

In [None]:
from pinecone.grpc import PineconeGRPC as Pinecone
from pinecone import ServerlessSpec
import time
import uuid

def store_in_pinecone(df, embeddings, index_name="medical-database", batch_size=100):
    """Upsert one vector per DataFrame row into a Pinecone serverless index.

    The index is created (cosine metric, AWS us-east-1) when it does not
    exist yet, and the function blocks until Pinecone reports it as ready.
    Each row is stored under a freshly generated random UUID together with
    its medicine name, composition, uses and side effects as metadata.

    Args:
        df: DataFrame with the columns 'Medicine Name', 'Composition',
            'Uses' and 'Side_effects'.
        embeddings: 2-D array with one embedding per row of ``df``, in the
            same row order. Its second dimension is used as the index
            dimension.
        index_name: Name of the Pinecone index. The default uses hyphens
            rather than underscores.
        batch_size: Number of vectors sent per upsert request.

    Raises:
        ValueError: If ``batch_size`` is not positive or if ``df`` and
            ``embeddings`` do not have the same number of rows.

    Notes:
        ``PINECONE_API_KEY`` is read from the notebook's global namespace and
        must be defined before this function is called.
        Because the IDs are random, every call produces new IDs.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")
    if len(df) != len(embeddings):
        raise ValueError(
            f"df has {len(df)} rows but embeddings has {len(embeddings)} rows"
        )

    pc = Pinecone(api_key=PINECONE_API_KEY)

    # list_indexes() yields index descriptions, not plain strings, so the
    # membership test has to run against the list of names.
    if index_name not in pc.list_indexes().names():
        pc.create_index(
            name=index_name,
            dimension=embeddings.shape[1],
            metric="cosine",
            spec=ServerlessSpec(
                cloud='aws',
                region='us-east-1'
            )
        )
        # Index creation is asynchronous; poll until it can accept writes.
        while not pc.describe_index(index_name).status['ready']:
            time.sleep(1)

    index = pc.Index(index_name)

    # metadata key -> source column in df
    metadata_columns = {
        'medicine_name': 'Medicine Name',
        'composition': 'Composition',
        'uses': 'Uses',
        'side_effects': 'Side_effects',
    }
    rows = df[list(metadata_columns.values())].to_dict('records')

    vectors = []
    # Rows and embeddings are paired by position, so the result does not
    # depend on the DataFrame's index labels.
    for row, embedding in zip(rows, embeddings):
        # Missing values (NaN/None) are left out of the metadata instead of
        # being sent as null-like values.
        metadata = {
            key: row[column]
            for key, column in metadata_columns.items()
            if pd.notna(row[column])
        }
        vectors.append((str(uuid.uuid4()), embedding.tolist(), metadata))

    for start in range(0, len(vectors), batch_size):
        index.upsert(vectors=vectors[start:start + batch_size])

In [None]:
# Location of the source CSV. NOTE(review): '/content/...' looks like a
# Google Colab upload path — adjust when running elsewhere.
csv_path = '/content/Medicine_Details.csv'
# Read the CSV; load_medical_data also drops the image URL, manufacturer and
# review-percentage columns.
df = load_medical_data(csv_path)

In [None]:
# Build the cleaned, lower-cased 'full_text' column. preprocess_medical_data
# returns the frame it was given, so `processed_df` and `df` are the same object.
processed_df = preprocess_medical_data(df)

In [None]:
# Encode every row's 'full_text' with the SentenceTransformer model that
# generate_embeddings loads; the result is indexed by row position downstream.
embeddings = generate_embeddings(processed_df)

In [None]:
# Upload the embeddings plus per-row metadata to the Pinecone index.
# NOTE(review): vector IDs are random UUIDs generated on every call, so
# re-running this cell presumably adds the rows again instead of replacing
# them — confirm before re-executing.
store_in_pinecone(processed_df, embeddings)