In [5]:
import pandas as pd
import re
from nltk.tokenize import sent_tokenize

# Load the raw conversation dataset; the steps below read its `id` and `conversations` columns.
data_path = "chat_data.csv"  # Path to the CSV; adjust it if the file is stored zipped or elsewhere
df = pd.read_csv(data_path)

# Example of cleaning a single conversation
def clean_text(text):
    """
    Normalise a raw conversation string.

    Steps:
    - Drop every character that is not a word character, whitespace,
      or one of ``. , ! ?``.
    - Collapse every run of whitespace (spaces, tabs, newlines) into a
      single space.
    - Trim leading and trailing whitespace.

    Args:
        text: The raw text. ``None`` and NaN (what pandas uses for missing
            cells) are treated as empty; any other non-string value is
            converted with ``str()``.

    Returns:
        The cleaned string (``""`` for missing input).
    """
    # Missing cells arrive as None or float NaN; `re.sub` would raise on them.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    if not isinstance(text, str):
        text = str(text)

    # Strip special characters *before* collapsing whitespace: removing a
    # symbol that sits between two spaces ("a @ b") would otherwise leave a
    # double space behind.
    text = re.sub(r'[^\w\s.,!?]', '', text)
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

# Clean every conversation; the result goes into a new column so the raw text stays available.
df['cleaned_conversations'] = df['conversations'].apply(clean_text)

# Splitting large texts into smaller chunks (at most ~512 whitespace-separated words each)
def chunk_text(text, chunk_size=512):
    """
    Split text into chunks of whole sentences.

    Sentences (as found by ``sent_tokenize``) are packed greedily into the
    current chunk until adding the next one would push the chunk past
    ``chunk_size`` words; the chunk is then closed and a new one started.

    Args:
        text: The text to split.
        chunk_size: Maximum number of whitespace-separated words per chunk.
            This is a word count, not a model-token count.

    Returns:
        A list of chunk strings, each made of sentences joined by a single
        space. A single sentence longer than ``chunk_size`` is kept intact
        as its own (oversized) chunk. Never contains empty strings.
    """
    sentences = sent_tokenize(text)  # Split into sentences
    chunks = []
    current_chunk = []
    current_length = 0

    for sentence in sentences:
        sentence_length = len(sentence.split())  # Word count in the sentence
        # Only close the chunk if it already holds something; otherwise an
        # oversized first sentence would produce an empty "" chunk.
        if current_chunk and current_length + sentence_length > chunk_size:
            chunks.append(" ".join(current_chunk))
            current_chunk = []
            current_length = 0
        current_chunk.append(sentence)
        current_length += sentence_length

    # Add any remaining chunk
    if current_chunk:
        chunks.append(" ".join(current_chunk))

    return chunks

# Apply chunking: each conversation becomes a list of chunk strings
df['conversation_chunks'] = df['cleaned_conversations'].apply(chunk_text)

# Flatten: one record per (conversation id, chunk) pair.
# Iterating over just the two needed columns avoids `iterrows()`, which
# builds a full Series for every row and is slow on large frames.
flattened_data = [
    {'id': conversation_id, 'chunk': chunk}
    for conversation_id, conversation_chunks in zip(df['id'], df['conversation_chunks'])
    for chunk in conversation_chunks
]

# Explicit columns keep the schema stable even if no chunks were produced,
# so later `chunk_df['chunk']` lookups do not raise KeyError.
chunk_df = pd.DataFrame(flattened_data, columns=['id', 'chunk'])

# Review preprocessed data
chunk_df.head()


Unnamed: 0,id,chunk
0,identity_0,"from human, value Ive been feeling so sad and ..."
1,identity_0,Remember to be kind to yourself and celebrate ...
2,identity_1,"from human, value Hi, Im feeling really scared..."
3,identity_1,"from human, value Thank you for reminding me o..."
4,identity_2,"from human, value Hey, I hope youre doing well..."


In [6]:
import faiss
import numpy as np
import torch
from sentence_transformers import SentenceTransformer
from sklearn.decomposition import PCA

# Load the sentence-embedding model on the GPU when one is available, otherwise on the CPU
embedding_model = SentenceTransformer('all-MiniLM-L6-v2', device='cuda' if torch.cuda.is_available() else 'cpu')

# Batch encode chunks
chunks = chunk_df['chunk'].tolist()
embeddings = embedding_model.encode(chunks, batch_size=32, show_progress_bar=True, convert_to_numpy=True)

# Reduce dimensions using PCA. `whiten=True` rescales every component to unit
# variance (it does not normalise the vectors themselves). A fixed
# `random_state` keeps the projection reproducible across re-runs.
# NOTE: queries must be projected with this same fitted `pca`; the index saved
# below is only meaningful together with it.
pca_dim = 128  # Dimensionality of the reduced vectors (shared by PCA and the index)
pca = PCA(n_components=pca_dim, whiten=True, random_state=42)
# FAISS works on C-contiguous float32 arrays; make that explicit instead of
# relying on whatever dtype PCA happens to return.
reduced_embeddings = np.ascontiguousarray(pca.fit_transform(embeddings), dtype=np.float32)

# Build a FAISS index with IVF for scalable approximate search
nlist = 100  # Number of clusters for inverted indexing
quantizer = faiss.IndexFlatL2(pca_dim)  # Base index for clustering
faiss_index = faiss.IndexIVFFlat(quantizer, pca_dim, nlist, faiss.METRIC_L2)

# Train the index on reduced embeddings
if not faiss_index.is_trained:
    faiss_index.train(reduced_embeddings)

# Add vectors to the index
faiss_index.add(reduced_embeddings)

# An IVF index probes only one cluster per query by default, which hurts
# recall; probing several of the `nlist` clusters trades a little speed for
# markedly better results.
faiss_index.nprobe = 10

# Optional: Save the index for reuse
faiss.write_index(faiss_index, "faiss_index_ivf.index")

print(f"FAISS index built successfully with {faiss_index.ntotal} vectors.")


Batches:   0%|          | 0/6135 [00:00<?, ?it/s]

FAISS index built successfully with 196319 vectors.


In [15]:

def retrieve_relevant_chunks(query, k=5):
    """
    Retrieve up to ``k`` chunks that are closest to the query in the index.

    The query is embedded with ``embedding_model``, projected with the fitted
    ``pca``, and searched against ``faiss_index``; hits are mapped back to
    text through the positional rows of ``chunk_df``.

    Args:
        query: The query string.
        k: Maximum number of chunks to return.

    Returns:
        A list of chunk strings, best match first. It may hold fewer than
        ``k`` items when the index cannot supply ``k`` neighbours.
    """
    # Encode the query
    query_embedding = embedding_model.encode([query], convert_to_numpy=True)

    # Reduce dimensions using the same PCA used earlier
    query_embedding_reduced = pca.transform(query_embedding)

    # Search the FAISS index (it expects a contiguous float32 matrix)
    _, indices = faiss_index.search(
        np.ascontiguousarray(query_embedding_reduced, dtype=np.float32), k
    )

    # FAISS pads the result with -1 when it finds fewer than k neighbours.
    # Those must be skipped: `.iloc[-1]` would silently return the *last*
    # chunk as if it were a real hit.
    chunk_column = chunk_df['chunk']
    results = [chunk_column.iloc[int(idx)] for idx in indices[0] if idx >= 0]
    return results


In [17]:
# Smoke-test the retriever with a sample query (uses the default k=5)
query = "How can I reset my password?"
relevant_chunks = retrieve_relevant_chunks(query)
# Prints the raw list of retrieved chunk strings
print(relevant_chunks)


['from human, value I like the idea of selfcompassion, but its hard to convince myself that I deserve it. Challenging negative thoughts might be difficult, but Im willing to give it a shot. As for finding joy, its been a while since Ive truly felt happy. How do I even begin? from gpt, value Starting small is key, Charlie. Begin by acknowledging one positive thing about yourself each day, no matter how small. Take time to appreciate your strengths and accomplishments, however insignificant they may seem. Its through these small steps that you can gradually invite joy back into your life. Remember, Im here to guide and support you throughout this journey. from human, value Thank you, Alex. It means a lot to have someone who believes in me when I struggle to believe in myself. Im scared, but Im willing to give therapy a chance. from gpt, value Charlie, Im here for you every step of the way. It takes strength and courage to confront our inner demons and work towards healing. Together, we c

In [39]:
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

# Load the causal language model and its tokenizer (Llama 2 7B base).
# NOTE: this Hugging Face repo is gated; the download fails with a 401
# error unless access has been granted and the session is authenticated.
model_name = "meta-llama/Llama-2-7b-hf"  # Adjust to your desired model path
tokenizer = AutoTokenizer.from_pretrained(model_name)
llm_model = AutoModelForCausalLM.from_pretrained(model_name)

# Wrap the model and tokenizer in a Hugging Face text-generation pipeline
llm_pipeline = pipeline("text-generation", model=llm_model, tokenizer=tokenizer)

# Define your RAG pipeline here
def rag_pipeline(query):
    """
    Answer a question with retrieval-augmented generation.

    1. Retrieve the context chunks most relevant to ``query``.
    2. Build a ``Context / Question / Answer:`` prompt from them.
    3. Let the LLM complete the prompt and return only that completion.

    Args:
        query: The user's question.

    Returns:
        The generated answer text, without the prompt and with surrounding
        whitespace stripped.
    """
    # Retrieve relevant chunks and join them into one context string
    context = retrieve_relevant_chunks(query)
    context_str = "\n".join(context)

    # Formulate a prompt
    prompt = f"Context: {context_str}\n\nQuestion: {query}\n\nAnswer:"

    # Generate response.
    # - `max_new_tokens` limits only the completion. `max_length` would cap
    #   prompt + completion, and the prompt (several retrieved chunks) is
    #   already longer than 200 tokens.
    # - `return_full_text=False` returns just the completion instead of
    #   echoing the whole prompt back to the caller.
    response = llm_pipeline(
        prompt,
        max_new_tokens=200,
        num_return_sequences=1,
        return_full_text=False,
    )
    return response[0]['generated_text'].strip()

# Smoke-test the RAG pipeline end to end with a sample question and print the generated text
response = rag_pipeline("How do I reset my password?")
print(response)


OSError: You are trying to access a gated repo.
Make sure to have access to it at https://huggingface.co/meta-llama/Llama-2-7b-hf.
401 Client Error. (Request ID: Root=1-674faabd-0be79050114ff4f47331a8da;83920360-085e-454b-9bcd-dadefda39b37)

Cannot access gated repo for url https://huggingface.co/meta-llama/Llama-2-7b-hf/resolve/main/config.json.
Access to model meta-llama/Llama-2-7b-hf is restricted. You must have access to it and be authenticated to access it. Please log in.