In [None]:
# Show the currently installed protobuf version before changing anything.
!pip show protobuf

# Remove tensorflow, tensorflow-metadata and protobuf, then install tensorflow again.
!pip uninstall tensorflow tensorflow-metadata protobuf -y
!pip install tensorflow
# Second install pass using pip's legacy dependency resolver.
# NOTE(review): presumably a workaround for a protobuf version conflict — confirm it is still needed.
!pip install tensorflow --use-deprecated=legacy-resolver

# LangChain integration packages and the Chroma vector store used by the cells below.
!pip install langchain_community
!pip install langchain_text_splitters
!pip install langchain_openai
!pip install langchain_huggingface
!pip install chromadb

from langchain_community.document_loaders import TextLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma

from typing import List
from langchain.schema import Document  # Import Document if needed
import pandas as pd
import os

In [None]:
# Load the introduction text file (path looks like a mounted Google Drive folder — confirm).
# The result is indexed as data[0] by the splitting cell further down.
loader = TextLoader('/content/drive/MyDrive/Intro.txt')
data = loader.load()

In [None]:
# Display what the loader returned.
data

In [None]:
# Splitter built via from_tiktoken_encoder, so chunk sizes are counted with the
# 'cl100k_base' tiktoken encoding rather than in raw characters.
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=150,
    chunk_overlap=50,
    encoding_name='cl100k_base'
)

# Split the text of the first loaded document into a list of chunk strings.
texts = text_splitter.split_text(data[0].page_content)

In [None]:
# Display the chunks produced by the splitter.
texts

In [None]:
# Take the Hugging Face token from the environment and prompt for it only when
# it is missing, so the secret never lives in the notebook source.
# SECURITY: a real token used to be hard-coded on this line; treat it as leaked
# and revoke it in the Hugging Face account settings.
if not os.environ.get("HUGGINGFACEHUB_API_TOKEN"):
    from getpass import getpass  # local import: only needed for the prompt

    os.environ["HUGGINGFACEHUB_API_TOKEN"] = getpass("Hugging Face token: ")

In [None]:
# Embedding model used for the introduction-text collection.
# NOTE(review): the model name suggests a Korean sentence-embedding checkpoint,
# presumably chosen because the queries below are in Korean — confirm.
embeddings_model = HuggingFaceEmbeddings(
        model_name='jhgan/ko-sroberta-nli',
        model_kwargs={'device':'cpu'},  # run the model on CPU
        encode_kwargs={'normalize_embeddings':True}  # ask the encoder for normalized vectors
    )

In [None]:
# Sanity check: print the first chunk and show the first three components of its embedding.
print(texts[0])
embeddings_model.embed_query(texts[0])[:3]

In [None]:
# Length of the embedding vector for one chunk (the chunk is embedded again here).
len(embeddings_model.embed_query(texts[0]))

In [None]:
# Build (or reopen) the persisted 'history' collection from the text chunks.
# The collection is stored on disk under ./db/chromadb, so without explicit ids
# every re-run of this cell would add the same chunks again under fresh random
# ids and searches would start returning duplicates. Deterministic ids make the
# cell safe to re-run: existing entries are overwritten instead of duplicated.
chunk_ids = [f"intro-{i}" for i in range(len(texts))]

db = Chroma.from_texts(
    texts,
    embeddings_model,
    ids = chunk_ids,
    collection_name = 'history',
    persist_directory = './db/chromadb',
    collection_metadata = {'hnsw:space': 'cosine'}, # l2 is the default
)

db

In [None]:
# Korean query ("What is the Digital Management major?"): search the collection
# and print the text of the first returned chunk.
query = '디지털경영전공은 무엇인가요?'
docs = db.similarity_search(query)
print(docs[0].page_content)

In [None]:
# Display every document returned for the query above.
docs

In [None]:
# Second Korean query ("How are data analysis and big data related?"), same
# search-and-print pattern as the previous query cell.
query = '데이터분석과 빅데이터는 무슨 상관인가요'
docs = db.similarity_search(query)
print(docs[0].page_content)

In [None]:
# Load the Amazon product CSV, keep the first 100 rows, drop those without a
# rating_count, and derive two category columns from the '|'-separated
# 'category' string: the last segment (sub_category) and the first (main_category).
data = pd.read_csv('/content/drive/MyDrive/amazon.csv')
df = (
    data.iloc[:100]
    .dropna(subset=['rating_count'])
    .assign(
        sub_category=lambda frame: frame['category'].astype(str).str.split('|').str[-1],
        main_category=lambda frame: frame['category'].astype(str).str.split('|').str[0],
    )
)

In [None]:
# List the available columns, including the two derived category columns.
df.columns

In [None]:
# Lower-case the product names on a fresh frame (df itself is left untouched)
# and drop rows whose lower-cased name duplicates an earlier row.
df1 = (
    df.assign(product_name=df['product_name'].str.lower())
    .drop_duplicates(subset=['product_name'])
)

In [None]:
# Compare shapes before (df) and after (df1) removing duplicate product names.
print(df.shape)
print(df1.shape)

In [None]:
# First product name by position. A label lookup with [0] raises KeyError
# whenever the row labelled 0 was removed by the earlier dropna step, so use
# positional access instead.
df1['product_name'].iloc[0]

In [None]:
# Inspect the free-text product descriptions.
df1['about_product']

In [None]:
# Keep only the columns that are written to the RAG CSV below.
df2 = df1[['product_id','product_name', 'about_product','main_category','sub_category', 'actual_price','discount_percentage','rating','rating_count' ]]

In [None]:
# Preview the first rows of the trimmed table.
df2.head()

In [None]:
# Persist the trimmed table without the index column; init_llm() later reads this same path.
df2.to_csv('/content/drive/MyDrive/amazon_rag.csv', index=False)

In [None]:
from typing import List
from dotenv import load_dotenv
import os

from langchain.embeddings.huggingface_hub import HuggingFaceHubEmbeddings
from sentence_transformers import SentenceTransformer
from langchain.document_loaders.csv_loader import CSVLoader
from langchain.vectorstores import Chroma
from langchain.schema.document import Document


# Load variables from a .env file into the process environment.
# NOTE(review): the original comment says this exposes a LangChain API token —
# confirm which variables the .env file actually defines.
load_dotenv()

# def read_csv(file_path: str, source_column: str = "about_product") -> List[Document]:
def read_csv(file_path: str, source_column: str = "product_name") -> List[Document]:
    """Load a CSV file as a list of Documents via ``CSVLoader``.

    Args:
        file_path (str): Location of the CSV file to load.
        source_column (str, optional): Column name forwarded to ``CSVLoader``
            as ``source_column``. Defaults to "product_name".

    Returns:
        List[Document]: Whatever ``CSVLoader.load()`` produces for the file.

    Raises:
        FileNotFoundError: If nothing exists at ``file_path``.
    """
    if os.path.exists(file_path):
        return CSVLoader(file_path=file_path, source_column=source_column).load()

    raise FileNotFoundError(f"File does not exist: {file_path}")

In [None]:
# Make sure the Hugging Face token is available without hard-coding it: reuse
# the value already in the environment, otherwise prompt for it.
# SECURITY: a real token used to be hard-coded on this line; treat it as leaked
# and revoke it in the Hugging Face account settings.
if not os.environ.get("HUGGINGFACEHUB_API_TOKEN"):
    from getpass import getpass  # local import: only needed for the prompt

    os.environ["HUGGINGFACEHUB_API_TOKEN"] = getpass("Hugging Face token: ")

In [None]:
from langchain_huggingface import HuggingFaceEmbeddings
# Reference: https://python.langchain.com/docs/integrations/providers/huggingface/#huggingfaceembeddings
# Alternative model kept for experimentation:
# model_name = "intfloat/multilingual-e5-large-instruct"
# Note: the "Load the embeddings" cell further down reassigns model_name before it is used.
model_name = 'jhgan/ko-sroberta-nli'

# Function to load embeddings model
def load_embeddings_model(model_name: str) -> HuggingFaceEmbeddings:
    """Create a HuggingFaceEmbeddings wrapper for the given model.

    The wrapper is configured to run on CPU and to request normalized
    embedding vectors from the encoder.

    Args:
        model_name (str): Name of the embedding model to load.

    Returns:
        HuggingFaceEmbeddings: The configured embeddings object.
    """
    cpu_only = {'device':'cpu'}
    normalized_output = {'normalize_embeddings':True}
    return HuggingFaceEmbeddings(
        model_name=model_name,
        model_kwargs=cpu_only,
        encode_kwargs=normalized_output,
    )

In [None]:
# Load the embeddings model. model_name is reassigned here, so the value set in
# the earlier cell is not the one that gets loaded.

# model_name = "intfloat/multilingual-e5-large-instruct"
model_name = 'sentence-transformers/all-mpnet-base-v2'
# model_name = 'jhgan/ko-sroberta-nli'
embeddings = load_embeddings_model(model_name)

# Smoke test: embed one sentence and print the first three components.
text = "This is a test document."
query_result = embeddings.embed_query(text)
print(query_result[:3])

In [None]:
def vectorize_documents(data: List[Document], embedding_function: HuggingFaceEmbeddings) -> Chroma:
    """Index a list of Documents in Chroma using the given embeddings.

    Args:
        data (List[Document]): Documents to embed and store.
        embedding_function (HuggingFaceEmbeddings): Embeddings object used to
            encode the documents.

    Returns:
        Chroma: The vector store returned by ``Chroma.from_documents``.
    """
    # Cosine distance is requested explicitly through the collection metadata
    # (the previously tried alternative was "l2").
    index_settings = {"hnsw:space": "cosine"}
    return Chroma.from_documents(
        data,
        embedding=embedding_function,
        collection_metadata=index_settings,
    )

In [None]:
def init_llm(
    file_path: str = '/content/drive/MyDrive/amazon_rag.csv',
    source_column: str = "product_name",
    model_name: str = 'sentence-transformers/all-mpnet-base-v2',
):
    """Build the product vector store used for retrieval.

    Despite its name, this function does not create a language model: it reads
    the product CSV, loads the embeddings model and indexes the rows in Chroma.
    Calling it without arguments behaves exactly as before.

    Args:
        file_path (str, optional): CSV file to index. Defaults to the RAG CSV
            written earlier in the notebook.
        source_column (str, optional): Column passed to ``read_csv`` as the
            document source. Defaults to "product_name".
        model_name (str, optional): Embedding model passed to
            ``load_embeddings_model``. Defaults to
            'sentence-transformers/all-mpnet-base-v2'.

    Returns:
        Chroma: A Chroma object that contains the vectorized documents.

    Raises:
        FileNotFoundError: Propagated from ``read_csv`` when ``file_path``
            does not exist.
    """
    data = read_csv(file_path=file_path, source_column=source_column)
    embedding_function = load_embeddings_model(model_name)
    return vectorize_documents(data, embedding_function)

In [None]:
# Build the product vector store. This rebinds `db`, so the handle to the
# 'history' collection created earlier is no longer reachable under that name.
db = init_llm()

In [None]:
# Query the vector database for up to five hits; each hit is a (document, score)
# pair, as unpacked in the cells below.
query = "iPhone USB charger and adapter"
found_docs = db.similarity_search_with_score(query, k=5)

In [None]:
# Display all returned (document, score) pairs.
found_docs

In [None]:
# Unpack the last returned hit into its document and score and print both.
# NOTE(review): presumably the weakest of the five matches, assuming hits are ordered best-first — confirm.
document, score = found_docs[-1]
print(document.page_content)
print(f"\nScore: {score}")

In [None]:
# Same content as printed above, accessed by index: hit -> document -> page_content.
found_docs[-1][0].page_content

In [None]:
# Get the 'source' metadata entry of the first hit's document.
found_docs[0][0].metadata['source']

In [None]:
# def run_vector_search(query: str):
#     # print(query)
#     query_vector = db.similarity_search_with_score(query, k=5)
#     # print(query_vector)
#     # document, score = query_vector

#     # return query_vector.metadata['source'], document.page_content, score
#     return query_vector

def run_vector_search(query: str, k = 3) -> pd.DataFrame:
    """Run a similarity search against the global ``db`` and tabulate the hits.

    Args:
        query (str): Free-text search query.
        k (int, optional): Number of hits to request. Defaults to 3.

    Returns:
        pd.DataFrame: One row per hit, with a ``source`` column (taken from the
        document metadata, "Unknown Source" when absent), one column per
        ``field: value`` line found in the page content, and the ``score``
        returned by the search.
    """
    hits = db.similarity_search_with_score(query, k=k)

    rows = []
    for document, score in hits:
        # page_content is parsed as "field: value" lines. Split on the first
        # colon only so values may contain colons themselves; lines without
        # any colon are skipped.
        fields = {}
        for line in document.page_content.split('\n'):
            if ':' not in line:
                continue
            name, value = line.split(':', 1)
            fields[name.strip()] = value.strip()

        # Key order fixes the column order: source, parsed fields, score.
        rows.append({
            "source": document.metadata.get("source", "Unknown Source"),
            **fields,
            "score": score,
        })

    return pd.DataFrame(rows)

In [None]:
# Example call with the default k=3; the returned DataFrame is displayed as the cell output.
run_vector_search('what is the iphone cable?')

In [None]:
from transformers import AutoTokenizer
import transformers
import torch

# Load model
# model = "nilq/mistral-1L-tiny"
#huggingface-cli login
#Reference: https://huggingface.co/docs/huggingface_hub/en/guides/cli

model = "ArliAI/Gemma-2-2B-ArliAI-RPMax-v1.1"#'ArliAI/Mistral-Small-22B-ArliAI-RPMax-v1.1'

# Take the Hugging Face token from the environment instead of hard-coding it.
# SECURITY: a real token used to be hard-coded in this cell; treat it as leaked
# and revoke it in the Hugging Face account settings.
# When the variable is unset this is None, which lets transformers fall back to
# a locally stored login, if there is one.
hf_token = os.environ.get("HUGGINGFACEHUB_API_TOKEN")

tokenizer = AutoTokenizer.from_pretrained(
    model,
    token=hf_token,  # `token` replaces the deprecated `use_auth_token`
)

# Text-generation pipeline. The tokenizer loaded above is passed in explicitly
# (it was previously created but never used), and the token is forwarded so the
# model download is authenticated the same way as the tokenizer download.
pipeline = transformers.pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    token=hf_token,
    torch_dtype=torch.float16,
    device_map="auto",
)

In [None]:
from langchain_community.llms.huggingface_pipeline import HuggingFacePipeline

# Wrap the transformers pipeline so it can be used as the LLM in the LangChain chains below.
hf = HuggingFacePipeline(pipeline=pipeline)

In [None]:
from langchain.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain import PromptTemplate

# Prompt template with {context} and {query} placeholders.
# Note: the final RAG cell assigns this same template string again before using it.
promptTemplate_fstring = """
You are a customer service assistant, tasked with providing clear and concise answers based on the given context.
Context:
{context}
Question:
{query}
Answer:
"""

In [None]:
from langchain_core.prompts import PromptTemplate

# Minimal question/answer prompt with a single {question} placeholder.
template = """Question: {question}
Answer:"""

prompt = PromptTemplate.from_template(template)
print(prompt)
# Pipe the formatted prompt into the wrapped Hugging Face model.
chain = prompt | hf

question = "What is electroencephalography?"

# Smoke test of the model on its own, without any retrieved context.
print(chain.invoke({"question": question}))

In [None]:
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate

# Query definition
query = "suggest cool iPhone USB charger and adapter"
# query = "what is the iphone cable?"
# query = "What is the caracteristic of iPhone USB charger and adapter"

# Retrieve the closest products for the query (default k of run_vector_search).
doc_context = run_vector_search(query)

# Only the name and description columns are handed to the model as context.
doc = doc_context[['product_name', 'about_product']]
# doc = doc_context[[ 'about_product']]

# Render the selected columns as plain text for the prompt.
context = doc.to_string(index=False)

# Prompt with {context} and {query} placeholders.
promptTemplate_fstring = """
You are a customer service assistant, tasked with providing clear and concise answers based on the given context.
Context:
{context}
Question:
{query}
Answer:
"""

# from_template derives the input variables from the placeholders in the string.
prompt = PromptTemplate.from_template(promptTemplate_fstring)

# Compose prompt and model with the pipe operator, matching the earlier
# question/answer cell, instead of the deprecated LLMChain / .run() API.
chain = prompt | hf

# Run the chain; the model output comes back as a string.
response = chain.invoke({"query": query, "context": context})

# Print the response
print(response)