In [None]:
!pip install datasets pandas pymongo sentence_transformers
!pip install -U transformers
# Install below if using GPU
!pip install accelerate


In [None]:
pip install pandas pymupdf


In [None]:
import fitz  # PyMuPDF
import pandas as pd

# Replace with your PDF file path
pdf_path = "/content/sample_data/challenges_music_recom.pdf"

# Open the PDF inside a context manager so the underlying file handle is
# released as soon as the text has been extracted (the original left the
# document open for the lifetime of the kernel).
with fitz.open(pdf_path) as document:
    # One entry per page, in page order: the plain text returned by get_text()
    text_data = [page.get_text() for page in document]

# One row per PDF page, with the page text in the "Text" column
dataset_df = pd.DataFrame(text_data, columns=["Text"])

# Display the first 5 rows
dataset_df.head()


In [None]:
# Remove any rows where the "Text" column is missing
dataset_df = dataset_df.dropna(subset=["Text"])

# Text extraction yields an empty string (not NaN) for a page with no
# extractable text, so dropna alone never removes such pages. Drop
# whitespace-only rows too, otherwise they are later stored with an empty
# embedding. The original index is kept so rows still map to page numbers;
# .copy() avoids chained-assignment warnings when columns are added later.
dataset_df = dataset_df[dataset_df["Text"].str.strip() != ""].copy()

print("\nNumber of missing values in each column after removal:")
print(dataset_df.isnull().sum())

# Display the first 5 rows of the cleaned DataFrame
dataset_df.head(5)

In [None]:
from sentence_transformers import SentenceTransformer

# Load the embedding model from Hugging Face
# "thenlper/gte-large" is the model identifier passed to SentenceTransformer;
# the resulting object is what get_embedding() calls .encode(...) on.
embedding_model = SentenceTransformer("thenlper/gte-large")

# Function to get embeddings for a given text
def get_embedding(text: str) -> list[float]:
    """Return the embedding of ``text`` as a plain Python list.

    Args:
        text: The string to embed.

    Returns:
        ``embedding_model.encode(text)`` converted with ``.tolist()``, or an
        empty list when ``text`` is not a string or contains only whitespace.
        Callers use the empty list as the "no embedding" signal.
    """
    # Guard non-string cells (None, NaN, numbers) as well as blank strings:
    # calling .strip() on them would raise instead of yielding "no embedding".
    if not isinstance(text, str) or not text.strip():
        print("Attempted to get embedding for empty text.")
        return []

    embedding = embedding_model.encode(text)

    # Convert to a plain list so the value can be stored in MongoDB documents
    return embedding.tolist()

# Compute one embedding per row of "Text" and store it in a new "embedding" column
page_texts = dataset_df["Text"]
dataset_df["embedding"] = page_texts.apply(lambda page_text: get_embedding(page_text))

# Preview the DataFrame, now including the embeddings
dataset_df.head()


In [None]:
!pip install pymongo

In [None]:
import pymongo
from google.colab import userdata

def get_mongo_client(mongo_uri):
  """Create a MongoDB client and verify that the server is reachable.

  Args:
    mongo_uri: MongoDB connection string.

  Returns:
    A ``pymongo.MongoClient`` that has answered a ping, or ``None`` when the
    connection fails.
  """
  client = None
  try:
    client = pymongo.MongoClient(mongo_uri, appname="devrel.content.python")
    # MongoClient connects lazily, so constructing it proves nothing. A cheap
    # "ping" round-trip forces server selection and surfaces failures here
    # instead of at the first real query.
    client.admin.command("ping")
    print("Connection to MongoDB successful")
    return client
  except pymongo.errors.ConnectionFailure as e:
    print(f"Connection failed: {e}")
    # Release the sockets/threads of the unusable client
    if client is not None:
      client.close()
    return None

import os

# Prefer the MONGO_URI environment variable so the connection string (which
# embeds credentials) does not have to be pasted into the notebook; when the
# variable is absent, fall back to the value entered here.
mongo_uri = os.environ.get("MONGO_URI", "enter your uri")
if not mongo_uri:
  print("MONGO_URI not set in environment variables")

mongo_client = get_mongo_client(mongo_uri)

# get_mongo_client returns None on failure; stop with a clear message instead
# of failing on the next line with "'NoneType' object is not subscriptable".
if mongo_client is None:
  raise RuntimeError("Could not connect to MongoDB; check the MONGO_URI value.")

# Select the database and collection that will hold the extracted PDF text
db = mongo_client['pdftexts']
collection = db['pdfgen']

In [None]:
# Delete any existing records in the collection
# NOTE: delete_many is called with an empty filter ({}), so this is destructive
# for whatever `collection` currently points at; re-running the notebook
# starts ingestion from an empty collection.
collection.delete_many({})

In [None]:
# One dict per DataFrame row, keyed by column name
documents = dataset_df.to_dict("records")

# insert_many rejects an empty list, so skip the call (and the success
# message) when there is nothing to ingest.
if documents:
    collection.insert_many(documents)
    print("Data ingestion into MongoDB completed")
else:
    print("No documents to ingest; skipping insert")

In [None]:
def vector_search(user_query, collection, index_name="vector_index", num_candidates=150, limit=4):
    """
    Perform a vector search in the MongoDB collection based on the user query.

    Args:
    user_query (str): The user's query string.
    collection (MongoCollection): The MongoDB collection to search.
    index_name (str): Name of the vector search index to query. Defaults to
        "vector_index"; the index must already exist on the collection.
    num_candidates (int): Number of candidate matches to consider.
    limit (int): Maximum number of matches to return.

    Returns:
    list: A list of matching documents, each with the "Text" field and a
        "score" field holding the vector search score. An empty list is
        returned when no embedding could be generated for the query.
    """

    # Generate embedding for the user query
    query_embedding = get_embedding(user_query)

    if not query_embedding:
        # No usable embedding (e.g. blank query). Return an empty list so the
        # result is always iterable as documents, as the docstring promises;
        # returning a message string here made callers that iterate the
        # result call .get() on single characters.
        return []

    # Define the vector search stage
    vector_search_stage = {
        "$vectorSearch": {
            "index": index_name,
            "queryVector": query_embedding,
            "path": "embedding",
            "numCandidates": num_candidates,
            "limit": limit,
        }
    }

    # Inclusion projection: only Text and the score are emitted, which already
    # leaves the (large) embedding field out, so no separate $unset is needed.
    project_stage = {
        "$project": {
            "_id": 0,  # Exclude the _id field
            "Text": 1,  # Include the Text field
            "score": {
                "$meta": "vectorSearchScore"  # Include the search score
            }
        }
    }

    pipeline = [vector_search_stage, project_stage]

    # Execute the search and materialise the cursor
    return list(collection.aggregate(pipeline))


In [None]:
def get_search_result(query, collection):
    """
    Get search results based on the user's query.

    Args:
    query (str): The user's query string.
    collection (MongoCollection): The MongoDB collection to search.

    Returns:
    str: A header line followed by the "Text" of each matching document,
        each terminated by a blank line ("N/A" is used for a document without
        a "Text" field). Only the header is returned when the search produced
        no documents or reported a failure.
    """

    # Perform vector search to retrieve relevant documents
    get_knowledge = vector_search(query, collection)

    header = "Relevant information from the documents:\n\n"

    # vector_search may report a failure as a plain message string. Iterating
    # a string would yield characters (which have no .get), so surface the
    # message and treat it as "no documents" instead of crashing.
    if isinstance(get_knowledge, str):
        print(get_knowledge)
        return header

    # Combine the relevant text into a single string
    passages = (
        f"{result.get('Text', 'N/A')}\n\n" for result in (get_knowledge or [])
    )
    return header + "".join(passages)

In [None]:
# Conduct a query with retrieval of sources
# get_search_result returns the retrieved page texts as one formatted string.
query = "What are the music streaming platforms that exist?"
source_information = get_search_result(query, collection)

# Combine the query with the search results
# The retrieved text comes first and the question last; this string is the
# prompt that is tokenized and passed to the language model later on.
combined_information = f"Answer the following question using the information provided:\n\n{source_information}\n\nQuestion: {query}"

# Print the combined information (for debugging)
print(combined_information)

In [None]:
from huggingface_hub import login

# Authenticate with the Hugging Face Hub before downloading the model below.
# login() is called without arguments, so no token is hardcoded in the notebook
# (presumably it prompts for one interactively — confirm against the
# huggingface_hub documentation).
login()


In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

# Tokenizer and model are loaded from the same Hugging Face checkpoint
tokenizer = AutoTokenizer.from_pretrained(
    "mistralai/Mistral-7B-v0.1",
)
# Weights are loaded as float16 with automatic device placement
model = AutoModelForCausalLM.from_pretrained(
    "mistralai/Mistral-7B-v0.1",
    torch_dtype=torch.float16,
    device_map="auto",
)



In [None]:
# Tokenize the prompt and move the resulting tensors onto the model's device
input_ids = tokenizer(combined_information, return_tensors="pt").to(model.device)

# Generate at most 20 new tokens, then decode and print the first output sequence
response = model.generate(max_new_tokens=20, **input_ids)
print(tokenizer.decode(response[0]))