In [None]:
# Running transformers requires a GPU, so open `https://colab.research.google.com/`, select the free T4 runtime, and execute the code below.

# Log in to huggingface.co, generate a token with write access, and store it as a Colab secret named HF_TOKEN.

!pip install -q --upgrade torch==2.5.1+cu124 torchvision==0.20.1+cu124 torchaudio==2.5.1+cu124 --index-url https://download.pytorch.org/whl/cu124

!pip install -q requests==2.32.4 diffusers transformers==4.48.3 gradio langchain langchain-community faiss-cpu langchain_experimental bitsandbytes==0.46.0 accelerate==1.3.0



from huggingface_hub import login
from transformers import AutoTokenizer, AutoModelForCausalLM,  BitsAndBytesConfig, pipeline
import torch
from google.colab import userdata
import gradio as gr
from langchain_experimental.text_splitter import SemanticChunker
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.document_loaders.csv_loader import CSVLoader
from langchain_community.vectorstores import FAISS
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain, RetrievalQA
from langchain_community.llms import HuggingFacePipeline



# Fetch the Hugging Face token stored under the Colab secret 'HF_TOKEN'
# and use it to authenticate this session with the Hub.
hf_token = userdata.get('HF_TOKEN')
login(token=hf_token, add_to_git_credential=True)


# Columns of the California housing CSV. The same list is passed as both
# the content columns and the metadata columns of the loader.
housing_columns = [
    'longitude',
    'latitude',
    'housing_median_age',
    'total_rooms',
    'total_bedrooms',
    'population',
    'households',
    'median_income',
    'median_house_value',
]

loader = CSVLoader(
    file_path="/content/sample_data/california_housing_test.csv",
    content_columns=housing_columns,
    metadata_columns=housing_columns,
)

documents = loader.load()

# Build the embedding model once and share it between the splitter and the
# vector store instead of constructing it twice.
embedding = HuggingFaceEmbeddings()

# SemanticChunker must receive an embeddings *instance*; passing the
# HuggingFaceEmbeddings class itself (as before) gives it nothing it can
# call to embed text.
text_splitter = SemanticChunker(embedding)
chunks = text_splitter.split_documents(documents)

# Index the chunks produced by the splitter. Previously the unsplit
# documents were indexed, so the chunking step above had no effect.
vector = FAISS.from_documents(documents=chunks, embedding=embedding)

# Similarity search returning the 3 closest entries per query.
retriever = vector.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 3},
)



# Hugging Face Hub id of the chat model loaded below.
code_qwen = "Qwen/CodeQwen1.5-7B-Chat"

# Example conversation in role/content chat format. Nothing in the code
# visible here reads it; kept in case another cell does.
messages = [
    {"role": "system", "content": "You are a helpful assistant"},
    {"role": "user", "content": "Tell me a joke."},
]

# 4-bit NF4 quantization with double quantization enabled; bfloat16 is the
# compute dtype.
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# Tokenizer for the model; the end-of-sequence token is reused for padding.
tokenizer = AutoTokenizer.from_pretrained(code_qwen)
tokenizer.pad_token = tokenizer.eos_token

# Load the causal LM with the quantization settings above and automatic
# device placement.
model = AutoModelForCausalLM.from_pretrained(
    code_qwen,
    quantization_config=quant_config,
    device_map="auto",
)


# Text-generation pipeline around the loaded model and tokenizer, capped at
# 512 newly generated tokens per call.
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=512,
    # temperature only applies in sampling mode, so enable sampling
    # explicitly rather than relying on the checkpoint's generation config.
    do_sample=True,
    temperature=0.7,
)

# Wrap the pipeline so LangChain chains can use it as an LLM.
llm = HuggingFacePipeline(pipeline=pipe)

# Retrieval-augmented QA chain: answers a query with the LLM, using the
# retriever to supply context.
chain = RetrievalQA.from_chain_type(llm, retriever=retriever)

query = "Please explain california housing test in couple of sentences"

# Chain.run() is deprecated in LangChain; invoke() is the supported entry
# point. RetrievalQA reads the question from the "query" key and returns the
# answer under the "result" key.
response = chain.invoke({"query": query})
print(response["result"])



# memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)

# conversation_chain = ConversationalRetrievalChain.from_llm(llm=model, retriever=retriever, memory=memory)



# result = conversation_chain.invoke({"question": query})

# print(result["answer"])



