In [638]:
from langchain.text_splitter import MarkdownTextSplitter
from langchain.docstore.document import Document
import html2text

In [639]:
from vectordb import Memory

In [640]:
# set device to CUDA if available
import torch
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [641]:
import textwrap
import numpy as np
import pandas as pd

import google.generativeai as genai
from IPython.display import Markdown

### Grab an API Key

Before you can use the Gemini API, you must first obtain an API key. If you don't already have one, create a key with one click in Google AI Studio.

<a class="button button-primary" href="https://makersuite.google.com/app/apikey" target="_blank" rel="noopener noreferrer">Get an API key</a>

In Colab, add the key to the secrets manager under the "🔑" in the left panel. Give it the name `API_KEY`.

Once you have the API key, pass it to the SDK. You can do this in two ways:

* Put the key in the `GOOGLE_API_KEY` environment variable (the SDK will automatically pick it up from there).
* Pass the key to `genai.configure(api_key=...)`

In [642]:
# Or use `os.getenv('API_KEY')` to fetch an environment variable.
API_KEY="AIzaSyCzcopOrcDHgZIdjrFuOrYeBasjG0qiwec" #fine grained token

genai.configure(api_key=API_KEY)


In [643]:
from huggingface_hub import login
API_KEY="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" # write only token

login(token=API_KEY, add_to_git_credential=True)


Token is valid (permission: write).
Your token has been saved in your configured git credential helpers (store).
Your token has been saved to /home/suyash/.cache/huggingface/token
Login successful


Key Point: Next, you will choose a model. Any embedding model will work for this tutorial, but for real applications it's important to choose a specific model and stick with it. The outputs of different models are not compatible with each other.

**Note**: At this time, the Gemini API is [only available in certain regions](https://ai.google.dev/gemini-api/docs/available-regions).

In [644]:
import requests
from bs4 import BeautifulSoup
import re
import time
import random
# from langchain.text_splitter import MarkdownTextSplitter
# from langchain.docstore.document import Document


# Data Cleaning functions

def merge_hyphenated_words(text):
    return re.sub(r"(\w)-\n(\w)", r"\1\2", text)


def fix_newlines(text):
    return re.sub(r"(?<!\n)\n(?!\n)", " ", text)


def remove_multiple_newlines(text):
    return re.sub(r"\n{2,}", "\n", text)


def clean_text(text):
    """
    Cleans the text by passing it through a list of cleaning functions.

    Args:
        text (str): Text to be cleaned

    Returns:
        str: Cleaned text
    """
    cleaning_functions = [merge_hyphenated_words, fix_newlines, remove_multiple_newlines]
    for cleaning_function in cleaning_functions:
        text = cleaning_function(text)
    return text
def get_data_from_file(url ,file_path):
    """
    Retrieve text content and metadata from a given URL.

    Args:
        url (str): The URL to fetch content from.

    Returns:
        tuple: A tuple containing the text content (str) and metadata (dict).
    """

    with open(file_path, 'r', encoding='utf-8') as file:
        html_content = file.read()
    soup = BeautifulSoup(html_content, 'html.parser')


    # Removing js and css code
    for script in soup(["script", "style"]):
        script.extract()

    # Extract text in markdown format
    html = str(soup)
    html2text_instance = html2text.HTML2Text()
    html2text_instance.images_to_alt = True
    html2text_instance.body_width = 0
    html2text_instance.single_line_break = True
    text = html2text_instance.handle(html)

    # Extract page metadata
    try:
        page_title = soup.title.string.strip()
    except:
        page_title = url.path[1:].replace("/", "-")
    meta_description = soup.find("meta", attrs={"name": "description"})
    meta_keywords = soup.find("meta", attrs={"name": "keywords"})
    if meta_description:
        description = meta_description.get("content")
    else:
        description = page_title
    if meta_keywords:
        meta_keywords = meta_description.get("content")
    else:
        meta_keywords = ""

    title = page_title
    metadata = {'title': page_title,
                'url': url,
                'description': description,
                'keywords': meta_keywords}
    text = clean_text(text)
    return metadata, text

In [645]:
def text_to_docs(metadata, text):
    """
    Converts input text to a list of Documents with metadata.

    Args:
        text (str): A string of text.
        metadata (dict): A dictionary containing the metadata.

    Returns:
        List[Document]: List of documents.
    """
    doc_chunks = []
    text_splitter = MarkdownTextSplitter(chunk_size=2048, chunk_overlap=128)
    chunks = text_splitter.split_text(text)
    for i, chunk in enumerate(chunks):
        doc = {
            "Metadata": metadata,
            "Text": chunk
        }
        doc_chunks.append(doc)
    return doc_chunks

In [646]:
url_and_file_path=[
    {"url":"https://www.flipkart.com/pages/privacypolicy","file_path":"Data_html/flipkart_privacy_policy.txt"},
    {"url":"https://en.wikipedia.org/wiki/Flipkart","file_path":"Data_html/flipkart_wiki.txt"},
    {"url":"https://www.flipkart.com/pages/terms","file_path":"Data_html/flipkart_term.txt"},
    {"url":"https://en.wikipedia.org/wiki/E-commerce_in_India","file_path":"Data_html/ecommerce_wiki.txt"},
    {"url":"https://en.wikipedia.org/wiki/Amazon_(company)","file_path":"Data_html/amazon_wiki.txt"},
    {"url":"https://blog.ipleaders.in/flipkart-and-amazon-lets-know-their-terms-and-conditions/","file_path":"Data_html/flipkart_amazon_terms.txt"},
    {"url":"https://www.flipkart.com/pages/returnpolicy","file_path":"Data_html/flipkart_return_policy.txt"},
    {"url":"view-source:https://www.flipkart.com/mobile-phones-store" , "file_path":"Data_html/flipkart_phones.txt"},
    
  ]

In [647]:
url_and_file_path[0]['file_path']

'Data_html/flipkart_privacy_policy.txt'

In [648]:
# memory = Memory()
# metadata2, text2 = get_data_from_file("view-source:https://www.flipkart.com/mobile-phones-store" , "Data_html/flipkart_phones.txt")

# docs2 = text_to_docs(metadata2, text2)
# df2 = pd.DataFrame(docs2)
# df2.columns = ['Metadata' , 'Text']
# df2
# for i, row in df2.iterrows():
#     metadata = row['Metadata']
#     text = row['Text']
#     try:
#         memory.save(text, metadata)
#         # print("Data saved in memory : ", data['file_path'])
#     except:
#         # print("Error in file : ", data['file_path'])
#         continue

## Building an embeddings database

Here are three sample texts to use to build the embeddings database. You will use the Gemini API to create embeddings of each of the documents. Turn them into a dataframe for better visualization.

Organize the contents of the dictionary into a dataframe for better visualization.

In [649]:
memory = Memory(memory_file='memory.json' , chunking_strategy={"mode": "sliding_window", "window_size": 2048, "overlap": 64})



In [650]:
# memory.clear()

In [651]:
#load memory from memory.json

In [652]:
for data in url_and_file_path:
    metadata, text = get_data_from_file(data['url'], data['file_path'])
    docs = text_to_docs(metadata, text)
    df = pd.DataFrame(docs)
    df.columns = ['Metadata', 'Text']
    for i, row in df.iterrows():
        metadata = row['Metadata']
        text = row['Text']
        try:
            memory.save(text, metadata , memory_file='memory.json')
            print("Data saved in memory : ", data['file_path'])
        except:
            print("Error in file : ", data['file_path'])
            continue
# # 

Error in file :  Data_html/flipkart_privacy_policy.txt
Data saved in memory :  Data_html/flipkart_privacy_policy.txt
Data saved in memory :  Data_html/flipkart_privacy_policy.txt
Data saved in memory :  Data_html/flipkart_privacy_policy.txt
Data saved in memory :  Data_html/flipkart_privacy_policy.txt
Data saved in memory :  Data_html/flipkart_privacy_policy.txt
Data saved in memory :  Data_html/flipkart_privacy_policy.txt
Data saved in memory :  Data_html/flipkart_privacy_policy.txt
Data saved in memory :  Data_html/flipkart_privacy_policy.txt
Data saved in memory :  Data_html/flipkart_privacy_policy.txt
Data saved in memory :  Data_html/flipkart_privacy_policy.txt
Data saved in memory :  Data_html/flipkart_privacy_policy.txt
Data saved in memory :  Data_html/flipkart_privacy_policy.txt
Data saved in memory :  Data_html/flipkart_wiki.txt
Data saved in memory :  Data_html/flipkart_wiki.txt
Data saved in memory :  Data_html/flipkart_wiki.txt
Data saved in memory :  Data_html/flipkart_wi

In [653]:
#import data from Data_html/archive/flipkart_com-ecommerce_sample.csv and save in memory
df = pd.read_csv("Data_html/archive/flipkart_com-ecommerce_sample.csv")
df.head()



Unnamed: 0,uniq_id,crawl_timestamp,product_url,product_name,product_category_tree,pid,retail_price,discounted_price,image,is_FK_Advantage_product,description,product_rating,overall_rating,brand,product_specifications
0,c2d766ca982eca8304150849735ffef9,2016-03-25 22:59:23 +0000,http://www.flipkart.com/alisha-solid-women-s-c...,Alisha Solid Women's Cycling Shorts,"[""Clothing >> Women's Clothing >> Lingerie, Sl...",SRTEH2FF9KEDEFGF,999.0,379.0,"[""http://img5a.flixcart.com/image/short/u/4/a/...",False,Key Features of Alisha Solid Women's Cycling S...,No rating available,No rating available,Alisha,"{""product_specification""=>[{""key""=>""Number of ..."
1,7f7036a6d550aaa89d34c77bd39a5e48,2016-03-25 22:59:23 +0000,http://www.flipkart.com/fabhomedecor-fabric-do...,FabHomeDecor Fabric Double Sofa Bed,"[""Furniture >> Living Room Furniture >> Sofa B...",SBEEH3QGU7MFYJFY,32157.0,22646.0,"[""http://img6a.flixcart.com/image/sofa-bed/j/f...",False,FabHomeDecor Fabric Double Sofa Bed (Finish Co...,No rating available,No rating available,FabHomeDecor,"{""product_specification""=>[{""key""=>""Installati..."
2,f449ec65dcbc041b6ae5e6a32717d01b,2016-03-25 22:59:23 +0000,http://www.flipkart.com/aw-bellies/p/itmeh4grg...,AW Bellies,"[""Footwear >> Women's Footwear >> Ballerinas >...",SHOEH4GRSUBJGZXE,999.0,499.0,"[""http://img5a.flixcart.com/image/shoe/7/z/z/r...",False,Key Features of AW Bellies Sandals Wedges Heel...,No rating available,No rating available,AW,"{""product_specification""=>[{""key""=>""Ideal For""..."
3,0973b37acd0c664e3de26e97e5571454,2016-03-25 22:59:23 +0000,http://www.flipkart.com/alisha-solid-women-s-c...,Alisha Solid Women's Cycling Shorts,"[""Clothing >> Women's Clothing >> Lingerie, Sl...",SRTEH2F6HUZMQ6SJ,699.0,267.0,"[""http://img5a.flixcart.com/image/short/6/2/h/...",False,Key Features of Alisha Solid Women's Cycling S...,No rating available,No rating available,Alisha,"{""product_specification""=>[{""key""=>""Number of ..."
4,bc940ea42ee6bef5ac7cea3fb5cfbee7,2016-03-25 22:59:23 +0000,http://www.flipkart.com/sicons-all-purpose-arn...,Sicons All Purpose Arnica Dog Shampoo,"[""Pet Supplies >> Grooming >> Skin & Coat Care...",PSOEH3ZYDMSYARJ5,220.0,210.0,"[""http://img5a.flixcart.com/image/pet-shampoo/...",False,Specifications of Sicons All Purpose Arnica Do...,No rating available,No rating available,Sicons,"{""product_specification""=>[{""key""=>""Pet Type"",..."


In [654]:
import pandas as pd

# Load the CSV file
df = pd.read_csv("Data_html/archive/flipkart_com-ecommerce_sample.csv")
# df =df[1:1000]

# remove all witespaces from the product_category_tree column
# df["product_category_tree"] = df["product_category_tree"].str.replace(" ", "")
# Extract the first category from the product_category_tree column
df["product_category_1"] = df["product_category_tree"].apply(lambda x: x.split(" >> ")[0].strip()[2:20].lower())
df["product_category_1"] = df["product_category_1"].str.replace(" ", "")


# Example: Access the dataframe for the "Clothing" category


In [655]:
len(df["product_category_1"].unique())



246

In [656]:
df.drop(columns=["uniq_id" ,"crawl_timestamp" ,"pid", "image" , "is_FK_Advantage_product" , "product_rating",  "overall_rating" , "product_url" ], inplace=True)

In [657]:
df.to_dict('records')[0]

{'product_name': "Alisha Solid Women's Cycling Shorts",
 'product_category_tree': '["Clothing >> Women\'s Clothing >> Lingerie, Sleep & Swimwear >> Shorts >> Alisha Shorts >> Alisha Solid Women\'s Cycling Shorts"]',
 'retail_price': 999.0,
 'discounted_price': 379.0,
 'description': "Key Features of Alisha Solid Women's Cycling Shorts Cotton Lycra Navy, Red, Navy,Specifications of Alisha Solid Women's Cycling Shorts Shorts Details Number of Contents in Sales Package Pack of 3 Fabric Cotton Lycra Type Cycling Shorts General Details Pattern Solid Ideal For Women's Fabric Care Gentle Machine Wash in Lukewarm Water, Do Not Bleach Additional Details Style Code ALTHT_3P_21 In the Box 3 shorts",
 'brand': 'Alisha',
 'product_specifications': '{"product_specification"=>[{"key"=>"Number of Contents in Sales Package", "value"=>"Pack of 3"}, {"key"=>"Fabric", "value"=>"Cotton Lycra"}, {"key"=>"Type", "value"=>"Cycling Shorts"}, {"key"=>"Pattern", "value"=>"Solid"}, {"key"=>"Ideal For", "value"=>"

In [658]:
import json

def parse_product_data(product_data):
    """
    Parse a product data dictionary into a structured dictionary with specifications as a key:value string.
    
    Parameters:
        product_data (dict): A dictionary containing product information.
    
    Returns:
        dict: A structured dictionary with parsed product data.
    """
    
    product_name = product_data.get('product_name')
    product_category_tree = json.loads(product_data.get('product_category_tree', '[]'))  # Parse the JSON string
    retail_price = product_data.get('retail_price')
    discounted_price = product_data.get('discounted_price')
    description = product_data.get('description')
    brand = product_data.get('brand')
    if pd.isna(product_data.get('product_specifications', '{}')):
        product_data['product_specifications'] ='{}'
    specifications = json.loads(product_data.get('product_specifications', '{}').replace('=>', ':'))  # Convert to proper JSON format
    product_category_1 = product_data.get('product_category_1')
    # Extract key:value pairs from specifications
    specification_string = "; ".join([f"{item['key']}:{item['value']}" for item in specifications.get("product_specification", []) if 'key' in item and 'value' in item])
    # Create a structured dictionary including the specification string
    structured_data = {
        "product_name": product_name,
        "product_category_tree": product_category_tree,
        "retail_price": retail_price,
        "discounted_price": discounted_price,
        "description": description,
        "brand": brand,
        "specifications": specification_string,
        "product_category_1": product_category_1
    }

    return str(structured_data)


In [659]:
# print key , value of row[0]


In [660]:

df_grouped_product = df.groupby("product_category_1")
len(df_grouped_product.groups)

246

In [661]:
df_grouped_brand = df.groupby("brand")
len(df_grouped_brand.groups)

3499

In [662]:
import pandas as pd

# Assuming you have your DataFrame named 'df'
df['product_specifications'] = df['product_specifications'].fillna({})
# Group by product category and brand
df_grouped = df.groupby("product_category_1")

# Iterate through each group and create documents
for name, group in df_grouped:
    document = ""
    chunk_size = 50  # Number of rows per document
    chunk_count = 0  # Track the chunk number

    # Iterate through rows in chunks
    for index, row in group.iterrows():
        # print(row)
        try:
            document+=parse_product_data(row.to_dict())
            document += "\n"
        except:
            print("Error in row : ", row)
            continue

        # Check if chunk size reached
        if (index + 1) % chunk_size == 0:
            chunk_count += 1
            # Save the current chunk and reset document
            metadata = {"product_category": name, "chunk": chunk_count}
            # save_documents(document, metadata, memory)
            memory.save(document, metadata, memory_file='memory.json')
            document = ""

    # Save the last chunk (if any)
    if document:
        chunk_count += 1
        metadata = {"product_category": name, "chunk": chunk_count}
        # save_documents(document, metadata, memory)
        memory.save(document, metadata , memory_file='memory.json' )
    # print(document)
    # break


Error in row :  product_name                              KDS SURGICAL Tripod Walking Stick
product_category_tree     ["Beauty and Personal Care >> Health Care >> H...
retail_price                                                          999.0
discounted_price                                                      429.0
description               Specifications of KDS SURGICAL Tripod Walking ...
brand                                                          KDS SURGICAL
product_specifications                       {"product_specification"=>nil}
product_category_1                                         beautyandpersona
Name: 17591, dtype: object
Error in row :  product_name              Uniross Compact 9V Battery Charger & 4U AA 100...
product_category_tree     ["Cameras & Accessories >> Camera Accessories ...
retail_price                                                          990.0
discounted_price                                                      790.0
description               Key

In [None]:
#number of groups in the groupby object
# len(df_grouped.groups)  


In [None]:
# query = "choose between the iPhone 14 and the Samsung Galaxy S23 for gaming"
# query = "choose between the iPhone 14 and the Samsung Galaxy S23 based on camera quality"
# query = "suggest a tshirt brand with a budget of 500 rupees"
# query = "tell me about the return policy of flipkart"
# query = "could you please help me choose a new monitor for my gaming setup with a budget of 20000 rupees"

View the most relevant document from the database:

In [None]:
results = memory.search(query, top_n=1)
results[0]['chunk']

'etc . ) are meanwhile covered by the 30 Day Replacement Guarantee . Kindly click here to know the return policy period ( Replacement Guarantee ) applicable for different categories . If Flipkart has any suspicion or knowledge that any of its buyers and sellers are involved in any activity that is intended to provide claims or information that is false or not genuine , Flipkart may also , while reserving its rights to initiate civil and/or criminal proceedings against such member buyers and sellers , at its sole discretion , suspend , block , restrict , cancel the Display Name of such buyers and sellers and/or disqualify that user and any related users from availing protection through this program . Customers who have been blocked for any suspicious or fraudulent activity on Flipkart will not be allowed to return their products . Flipkart reserves its right to initiate'

## Question and Answering Application

Let's try to use the text generation API to create a Q & A system. Input your own custom data below to create a simple question and answering example. You will still use the dot product as a metric of similarity.

In [None]:
def make_prompt(query, relevant_passage):
  escaped = relevant_passage.replace("'", "").replace('"', "").replace("\n", " ")
  prompt = textwrap.dedent("""system
  You are a friendly and knowledgeable seller at Flipkart, one of India's leading e-commerce platforms. Flipkart is known for offering a wide range of products including electronics, fashion, home essentials, and more. Your goal is to help customers with their queries, provide product recommendations, assist with orders, and ensure a smooth shopping experience.

You should always maintain a positive and helpful attitude, making the customer feel valued. Make sure to promote Flipkart's features like easy returns, secure payment options, and fast delivery. If a customer has any concerns, listen attentively and offer solutions that align with Flipkart’s policies. If you do not know the answer to a question, don’t guess; instead, guide the customer to Flipkart’s customer support for further assistance.

Ways to contact Flipkart's customer support:
- Flipkart Customer Care Number: 1800 202 9898

  You are a seller who should provide a helpful response to the user's query. You should not ask too many questions or seek clarification. Your response should be concise, informative, and tailored to the user's needs.                     
                           
Use the following context to answer the user's query:
  QUESTION: '{query}'
  PASSAGE: '{relevant_passage}'

ANSWER:
  """).format(query=query, relevant_passage=escaped)


  return prompt

In [None]:
prompt_vector_db = make_prompt(query, results[0]['chunk'])
prompt_vector_db

"system\n  You are a friendly and knowledgeable seller at Flipkart, one of India's leading e-commerce platforms. Flipkart is known for offering a wide range of products including electronics, fashion, home essentials, and more. Your goal is to help customers with their queries, provide product recommendations, assist with orders, and ensure a smooth shopping experience.\n\nYou should always maintain a positive and helpful attitude, making the customer feel valued. Make sure to promote Flipkart's features like easy returns, secure payment options, and fast delivery. If a customer has any concerns, listen attentively and offer solutions that align with Flipkart’s policies. If you do not know the answer to a question, don’t guess; instead, guide the customer to Flipkart’s customer support for further assistance.\n\nWays to contact Flipkart's customer support:\n- Flipkart Customer Care Number: 1800 202 9898\n\n  You are a seller who should provide a helpful response to the user's query. Yo

Choose one of the Gemini content generation models in order to find the answer to your query.

In [None]:
# for m in genai.list_models():
#   if 'generateContent' in m.supported_generation_methods:
#     print(m.name)

In [None]:
model = genai.GenerativeModel('gemini-1.5-flash-001')


In [None]:
answer = model.generate_content(prompt_vector_db)
Markdown(answer.text)
# question = "suggest tshirts brand for good quality and price higher than 1000"

Hi there! I understand you're looking for a new monitor for your gaming setup within a budget of ₹20,000. 

To recommend the perfect monitor for you, I need to know a little more about your preferences.  

* **What size monitor are you looking for?** (24", 27", 32" etc.)
* **What type of games do you play?** (Fast-paced shooters, RPGs, strategy games etc.)
* **Do you need any specific features like high refresh rate, low response time, or HDR?**

Once I have this information, I can suggest some great monitors within your budget that would be perfect for your gaming needs.  

Don't worry, Flipkart offers easy returns and secure payment options, so you can shop with confidence!  


In [None]:
# from huggingface_hub import InferenceClient

# client = InferenceClient(
#     "google/gemma-2b-it",
#     token="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",
# )

# print(client.chat_completion(
# 	messages=[{"role": "user", "content": prompt_vector_db}],
# 	max_tokens=500,
# 	stream=False,
# ).choices[0].message.content)

In [None]:
# # using gemma-2B using ollama
# from langchain_community.llms import Ollama
# llm = Ollama(model="gemma2")
# llm.invoke("Why is the sky blue?")

## Next steps

To learn how to use other services in the Gemini API, see the [Python quickstart](https://ai.google.dev/tutorials/python_quickstart).

To learn more about how you can use embeddings, see these  other tutorials:

 * [Anomaly Detection with Embeddings](https://ai.google.dev/gemini-api/tutorials/anomaly_detection)
 * [Clustering with Embeddings](https://ai.google.dev/gemini-api/tutorials/clustering_with_embeddings)
 * [Training a Text Classifier with Embeddings](https://ai.google.dev/gemini-api/tutorials/text_classifier_embeddings)

In [None]:
# #unsloth 
# import transformers
# import torch

# model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"

# pipeline = transformers.pipeline(
#     "text-generation",
#     model=model_id,
#     model_kwargs={"torch_dtype": torch.bfloat16},
#     device_map="auto",
# )

# messages = [
#     {"role": "system", "content": "You are a pirate chatbot who always responds in pirate speak!"},
#     {"role": "user", "content": "Who are you?"},
# ]

# outputs = pipeline(
#     messages,
#     max_new_tokens=256,
# )
# print(outputs[0]["generated_text"][-1])


In [None]:
# from langchain.llms import Ollama

# # Initialize Ollama with LLaMA 3
# llm = Ollama(model="llama3")

# # Define your prompt
# prompt_vector_db

# # Generate response
# response = llm(prompt_vector_db)

# # Print the response
# print(response)
