In [51]:
import os
from dotenv import load_dotenv
from langchain_community.chat_models.huggingface import ChatHuggingFace
from langchain.chains import LLMChain
from langchain_community.llms import HuggingFaceHub, HuggingFaceEndpoint
from langchain.prompts import PromptTemplate
from langchain.schema import (
    HumanMessage,
    SystemMessage,
)
from langchain_community.llms.huggingface_pipeline import HuggingFacePipeline
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from dotenv import load_dotenv


In [52]:
# Selecting LLM Model from HuggingFace 
REPO_ID = "google/gemma-2b-it"

In [53]:
# Getting HF_TOKEN
load_dotenv(".env.template")
HF_TOKEN = os.environ.get("HF_TOKEN")

Run Locally using LangChain Pipeline

In [54]:
hf = HuggingFacePipeline.from_model_id(
    model_id=REPO_ID,
    task="text-generation",
    pipeline_kwargs={"max_new_tokens": 100},
)

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Device has 1 GPUs available. Provide device={deviceId} to `from_model_id` to use availableGPUs for execution. deviceId is -1 (default) for CPU and can be a positive integer associated with CUDA device id.


In [55]:
template = """Question: {question}

Answer: Let's think step by step."""

prompt = PromptTemplate.from_template(template)

In [56]:
llm_chain = LLMChain(prompt=prompt, llm=hf)

Getting The Data

In [1]:
from langchain_community.document_loaders import TextLoader, PyPDFLoader, DirectoryLoader
from langchain_community.embeddings.sentence_transformer import SentenceTransformerEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_text_splitters import CharacterTextSplitter
from langchain_community.document_loaders import WebBaseLoader

In [None]:
# pip install unstructured

In [37]:
txtDatas = []
for filename in os.listdir("../Bank Product Data/"):
    if(filename.endswith(".txt")):
        txtDatas.append(filename)

print(txtDatas)

['Business bank accounts.txt', 'Buy now, pay later.txt', 'Car Loans.txt', 'Commonwealth Bank of Australia.txt', 'CommonWealth Our company.txt', 'Commonwealth Private.txt', 'Credit cards.txt', 'Debt consolidation loans.txt', 'Disputing a transaction.txt', 'electric-vehicle-loan.txt', 'Everyday Account Smart Access.txt', 'EVERYDAY ACCOUNT.txt', 'Fixed Rate Personal Loans.txt', 'Foreign Currency Account.txt', 'GoalSaver.txt', 'Helpful ways to offset the rising cost of living.txt', 'Home Improvement Loan.txt', 'How do I activate my CommBank card.txt', 'How do I close my CommBank account.txt', 'How do I report my card lost, stolen or damaged.txt', 'How long does it take to transfer money.txt', 'InstalPay.txt', 'International Money Transfers (IMT).txt', 'International Money Transfers.txt', 'Klarna.txt', 'NetBank Saver.txt', 'Pensioner Security Account.txt', 'Personal loans.txt', 'Premier and Private Banking.txt', 'Premier Banking.txt', 'Saving Account & Term Deposit.txt', 'Secured Personal L

In [42]:
output_filename = "combined.txt"

def concat_text_files(filenames, output_filename):
  with open(output_filename, 'w') as outfile:
    for filename in filenames:
      path = "../Bank Product Data/"+filename
      with open(path, 'r', encoding='utf-8') as infile:
        outfile.write(infile.read())

In [43]:
concat_text_files(txtDatas, output_filename)

UnicodeEncodeError: 'charmap' codec can't encode character '\u202f' in position 5018: character maps to <undefined>

In [48]:
loader = TextLoader("./combined.txt")
data = loader.load()

In [None]:
# loader = WebBaseLoader("https://www.ocbc.com.hk/webpages/en-us/html/retail_banking/personal/faq.html")


# loader = DirectoryLoader('../', glob="**/*.txt")
# docs = loader.load()
# data = loader.load()
# print(len(data))

# loader = DirectoryLoader("../Bank Product Data",{
#     ".txt": (path) => TextLoader(path),
# }
# )
# loader.load()

In [49]:
# split document into chunks
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
docs = text_splitter.split_documents(data)


Created a chunk of size 2042, which is longer than the specified 1000
Created a chunk of size 1659, which is longer than the specified 1000


In [50]:
# create the open-source embedding function
embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

In [57]:
# load it into Chroma
db = Chroma.from_documents(docs, embedding_function)



In [64]:
# Creating RAG Chain

from langchain import hub
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain.chains import RetrievalQA

In [59]:
retriever = db.as_retriever(search_type="mmr", search_kwargs={'k': 4, 'fetch_k': 20})
prompt = hub.pull("rlm/rag-prompt")

def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm_chain
)

In [65]:
# Ask the LLM
rag_chain.invoke("Why Choose CommBank for your next car?")