In [4]:
pip install faiss-cpu




In [5]:
import pandas as pd
import spacy
import numpy as np
import torch
from transformers import BertModel, BertTokenizer, RagTokenizer, RagTokenForGeneration
import faiss

## Data Preprocessing with spaCy

This section of the code is dedicated to loading and preprocessing a dataset of Medium articles. The aim is to clean and prepare text data for further natural language processing or machine learning tasks.

### Dependencies

The script uses `pandas` for data manipulation and `spaCy` for natural language processing:

- **pandas**: A powerful Python data analysis toolkit for reading and manipulating tabular data.
- **spaCy**: An advanced natural language processing library designed for large-scale information extraction tasks.

```python
import pandas as pd
import spacy
```


In [6]:
import pandas as pd
import spacy

# Input location and spaCy pipeline name, declared once so they are easy to find and change.
MEDIUM_CSV_PATH = '/content/medium.csv'
SPACY_MODEL_NAME = 'en_core_web_sm'

# Read the articles table, then load the spaCy pipeline used by the text helpers below.
data = pd.read_csv(MEDIUM_CSV_PATH)
nlp = spacy.load(SPACY_MODEL_NAME)

def preprocess(text):
    """Normalise ``text`` into a space-separated string of lemmas.

    The text is lower-cased and passed through the spaCy pipeline ``nlp``.
    Punctuation, stop-word and whitespace tokens are discarded; the lemmas of
    the remaining tokens are joined with single spaces.

    Args:
        text: Raw string to normalise.

    Returns:
        The cleaned, lemmatised string (empty when no token survives).
    """
    kept_lemmas = []
    for tok in nlp(text.lower()):
        # Skip anything that carries no content for downstream matching.
        if tok.is_punct or tok.is_stop or tok.is_space:
            continue
        kept_lemmas.append(tok.lemma_)
    return ' '.join(kept_lemmas)

# Lemmatise the title and body columns; each raw column feeds the processed column paired with it.
for raw_column, processed_column in (('Title', 'processed_title'), ('Text', 'processed_text')):
    data[processed_column] = data[raw_column].apply(preprocess)

# Preview the first rows of the two new columns as a sanity check.
preview_columns = ['processed_title', 'processed_text']
print(data[preview_columns].head())


                                     processed_title  \
0    beginner guide word embed gensim word2vec model   
1  hand graph neural network pytorch pytorch geom...   
2                                 use ggplot2 python   
3  databrick save datum frame csv file local comp...   
4  step step implementation gradient descent back...   

                                      processed_text  
0  1 introduction word2vec word2vec popular techn...  
1  article introduce concept graph neural network...  
2  introduction thank strict implementation gramm...  
3  photo credit mika baumeister unsplash work pyt...  
4  step step implementation gradient descent back...  


## Article Text Chunking Function

The `chunk_article` function is designed to segment an article's text into smaller, manageable chunks. This is particularly useful for processing large texts or for applications like text summarization, where handling smaller sections of text can improve both efficiency and effectiveness.

### Purpose

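The function takes a piece of text and groups its sentences into chunks of at most `max_length` whitespace-separated words (300 by default). Sentences are never split, so a single sentence longer than the limit becomes a chunk of its own. This keeps each chunk substantial enough to maintain contextual meaning without being too cumbersome for NLP tasks.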



In [7]:
def chunk_article(text, max_length=300):
    """Split ``text`` into sentence-aligned chunks of at most ``max_length`` words.

    Sentences come from the spaCy pipeline ``nlp`` and are never split. A chunk
    is closed as soon as adding the next sentence would push its
    whitespace-separated word count above ``max_length``; a single sentence
    longer than ``max_length`` therefore becomes a chunk of its own.

    Args:
        text: Article text to split.
        max_length: Maximum number of whitespace-separated words per chunk.

    Returns:
        List of chunk strings (sentences joined by single spaces) in document
        order. Empty when ``text`` yields no sentences.
    """
    chunks = []
    current_chunk = []
    current_length = 0

    for sentence in nlp(text).sents:
        sentence_length = len(sentence.text.split())
        # Flush only a chunk that actually holds sentences; otherwise an
        # over-long first sentence would emit an empty '' chunk.
        if current_chunk and current_length + sentence_length > max_length:
            chunks.append(' '.join(current_chunk))
            current_chunk = []
            current_length = 0
        current_chunk.append(sentence.text)
        current_length += sentence_length

    # Whatever is left after the last sentence forms the final chunk.
    if current_chunk:
        chunks.append(' '.join(current_chunk))
    return chunks


In [8]:
# Split every article body into word-bounded chunks (one list of chunks per row).
data['chunks'] = data['Text'].apply(chunk_article)

# Flatten to one row per chunk and persist the result as a tab-separated file.
passages = data.explode('chunks')
passages.to_csv('passages.tsv', sep='\t', index=False)


## Creating and Indexing Embeddings with BERT and FAISS

This section of the code demonstrates how to create text embeddings using a BERT model and subsequently index these embeddings using FAISS for efficient similarity searches. This process is crucial for applications like semantic search, where you want to quickly retrieve the most relevant texts based on their semantic content.

### Dependencies

- **Transformers**: Provides access to the pre-trained BERT model and tokenizer.
- **Torch**: Used for managing model inputs and outputs.
- **NumPy**: Aids in handling arrays—specifically, stacking embeddings into a matrix.
- **FAISS**: Facebook AI Similarity Search, used for efficient similarity searching.

### BERT Model and Tokenizer Initialization

First, we initialize the tokenizer and model from the Hugging Face `transformers` library. We use the `bert-base-uncased` model, which is a general-purpose pre-trained model suitable for a variety of NLP tasks.



In [None]:
# Single checkpoint name shared by tokenizer and encoder so the two always match.
BERT_CHECKPOINT = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(BERT_CHECKPOINT)
model = BertModel.from_pretrained(BERT_CHECKPOINT)

def create_embeddings(text):
    """Embed ``text`` by mean-pooling the encoder's final hidden states.

    The input is tokenised with the global ``tokenizer`` (padded, truncated to
    512 tokens) and run through the global ``model`` without gradients. Token
    vectors are averaged using the attention mask as weights, so padding
    positions added for batched input do not dilute the result. For a single
    unpadded string the mask is all ones and this is the plain token mean.

    Args:
        text: A string, or a list of strings to embed as one padded batch.

    Returns:
        NumPy array with size-1 dimensions squeezed out: a 1-D vector for a
        single text, a 2-D ``(batch, hidden)`` matrix for several texts.
    """
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
        token_states = outputs.last_hidden_state
        # Broadcast the mask over the hidden axis; padded tokens get weight 0.
        mask = inputs["attention_mask"].unsqueeze(-1).to(token_states.dtype)
        summed = (token_states * mask).sum(dim=1)
        # clamp guards against division by zero for a fully masked row.
        token_counts = mask.sum(dim=1).clamp(min=1)
        pooled = summed / token_counts
    return pooled.squeeze().numpy()
# One embedding row per exploded chunk, in the row order of data.explode('chunks').
chunk_texts = data.explode('chunks')['chunks']
embeddings = np.vstack([create_embeddings(chunk) for chunk in chunk_texts])

# FAISS index
# Take the dimensionality from the embedding matrix instead of hard-coding 768,
# so the index keeps matching the vectors if the encoder is swapped.
index = faiss.IndexFlatL2(embeddings.shape[1])
index.add(embeddings)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [None]:
# Persist the in-memory FAISS index to ./index.faiss; the retriever setup cell refers to this path.
faiss.write_index(index, './index.faiss')

In [None]:
def search(query, k=5):
    """Look up the ``k`` closest indexed vectors for ``query``.

    The query is embedded with ``create_embeddings`` and searched against the
    global FAISS ``index``.

    Args:
        query: Query string to embed.
        k: Number of neighbours to request.

    Returns:
        The ``(distances, indices)`` pair produced by ``index.search``.
    """
    query_vector = create_embeddings(query)
    # The index searches batches, so present the single vector as a one-row matrix.
    query_batch = query_vector[np.newaxis, :]
    dists, ids = index.search(query_batch, k)
    return dists, ids



In [None]:
# Smoke-test the retrieval path: prints the (distances, indices) pair returned for a sample query.
print(search("Deep Learning"))

In [None]:
pip install datasets

## Setting Up the Retrieval-Augmented Generation (RAG) Model

This section describes how to configure and initialize the Retrieval-Augmented Generation (RAG) model, which combines the power of a dense retriever (using FAISS) and a sequence-to-sequence model. The RAG model leverages both the retrieval of relevant document passages and the generation of coherent text responses, making it ideal for applications such as question answering and information retrieval from a large corpus.

### Import Required Libraries

Before setting up the model, ensure all necessary libraries are imported. This includes `transformers` for the RAG components and `faiss` for efficient similarity search which is crucial for the retrieval component of RAG.

```python
from transformers import RagRetriever, RagTokenForGeneration, RagConfig, RagTokenizer
import faiss  # This ensures faiss is loaded and available
```


In [None]:
from transformers import RagRetriever, RagTokenForGeneration, RagConfig, RagTokenizer
import faiss
# Load the configuration that ships with the pretrained RAG checkpoint.
config = RagConfig.from_pretrained("facebook/rag-token-nq")
# Build the retriever, pointing it at the passages file and FAISS index written by earlier cells.
# NOTE(review): `config` is passed explicitly together with index_name/passages_path/index_path —
# confirm against the RagRetriever documentation that these keyword overrides are still applied
# in that case. Also confirm that "flat" with a TSV passages file is an accepted combination;
# the documented custom-index route appears to use index_name="custom" with a `datasets`
# directory written by save_to_disk — TODO verify.
# NOTE(review): ./index.faiss holds mean-pooled bert-base-uncased vectors; presumably the RAG
# question encoder embeds queries in a different vector space — TODO confirm retrieval quality.
retriever = RagRetriever.from_pretrained("facebook/rag-token-nq",
                                         config=config,
                                         index_name="flat",
                                         passages_path="./passages.tsv",
                                         index_path="./index.faiss")
# Rebinds the global `model`: create_embeddings() reads this name, so once this cell has run it
# no longer sees the BERT encoder created earlier.
model = RagTokenForGeneration.from_pretrained("facebook/rag-token-nq", config=config, retriever=retriever)

In [None]:
# Cached RAG tokenizer; loaded on first use so that defining this cell stays cheap.
_rag_tokenizer = None


def _get_rag_tokenizer():
    """Return the tokenizer that belongs to the RAG checkpoint, loading it once."""
    global _rag_tokenizer
    if _rag_tokenizer is None:
        _rag_tokenizer = RagTokenizer.from_pretrained("facebook/rag-token-nq")
    return _rag_tokenizer


def generate_answer(query):
    """Generate an answer to ``query`` with the global RAG ``model``.

    The query is encoded and the output decoded with the RAG checkpoint's own
    tokenizer. The global ``tokenizer`` is the BERT tokenizer used for chunk
    embeddings and does not match the RAG generator's vocabulary, so it must
    not be used here.

    Args:
        query: Question text.

    Returns:
        The first generated sequence decoded to a string, special tokens removed.
    """
    rag_tokenizer = _get_rag_tokenizer()
    input_ids = rag_tokenizer(query, return_tensors="pt").input_ids
    with torch.no_grad():
        output_ids = model.generate(input_ids=input_ids, num_beams=5)
    return rag_tokenizer.decode(output_ids[0], skip_special_tokens=True)
# Example run: ask one question and print the generated answer.
query = "What is the future of artificial intelligence?"
answer = generate_answer(query)
print("Generated Answer:", answer)
