In [1]:
!pip install datasets pandas pymongo sentence_transformers
!pip install -U transformers
# Install below if using GPU
!pip install accelerate




In [2]:
pip install pandas pymupdf




In [1]:
import fitz  # PyMuPDF
import pandas as pd

# Replace with your PDF file path
pdf_path = "/content/sample_data/challenges_music_recom.pdf"

# Open the PDF file
document = fitz.open(pdf_path)

# Extract text from each page and store it in a list
text_data = []
for page_num in range(len(document)):
    page = document[page_num]
    text = page.get_text()
    text_data.append(text)

# Convert the list into a pandas DataFrame
dataset_df = pd.DataFrame(text_data, columns=["Text"])

# Display the first 5 rows
dataset_df.head()


Unnamed: 0,Text
0,International Journal of Multimedia Informatio...
1,96\nInternational Journal of Multimedia Inform...
2,International Journal of Multimedia Informatio...
3,98\nInternational Journal of Multimedia Inform...
4,International Journal of Multimedia Informatio...


In [2]:
# Remove any rows where the "Text" column is missing
dataset_df = dataset_df.dropna(subset=["Text"])
print("\nNumber of missing values in each column after removal:")
print(dataset_df.isnull().sum())

# Display the first 5 rows of the cleaned DataFrame
dataset_df.head(5)


Number of missing values in each column after removal:
Text    0
dtype: int64


Unnamed: 0,Text
0,International Journal of Multimedia Informatio...
1,96\nInternational Journal of Multimedia Inform...
2,International Journal of Multimedia Informatio...
3,98\nInternational Journal of Multimedia Inform...
4,International Journal of Multimedia Informatio...


In [3]:
from sentence_transformers import SentenceTransformer

# Load the embedding model from Hugging Face
embedding_model = SentenceTransformer("thenlper/gte-large")

# Function to get embeddings for a given text
def get_embedding(text: str) -> list[float]:
    if not text.strip():
        print("Attempted to get embedding for empty text.")
        return []

    embedding = embedding_model.encode(text)

    return embedding.tolist()

# Apply the embedding function to the "Text" column
dataset_df["embedding"] = dataset_df["Text"].apply(get_embedding)

# Display the first 5 rows of the DataFrame with the embeddings
dataset_df.head()


  from tqdm.autonotebook import tqdm, trange


Unnamed: 0,Text,embedding
0,International Journal of Multimedia Informatio...,"[0.004441555123776197, -0.005392851307988167, ..."
1,96\nInternational Journal of Multimedia Inform...,"[0.0002883921260945499, 0.005521760322153568, ..."
2,International Journal of Multimedia Informatio...,"[-0.012531926855444908, 0.0199411790817976, -0..."
3,98\nInternational Journal of Multimedia Inform...,"[-0.011059354990720749, 0.016245555132627487, ..."
4,International Journal of Multimedia Informatio...,"[-0.009623144753277302, 0.023151641711592674, ..."


In [35]:
!pip install pymongo



In [4]:
import pymongo
from google.colab import userdata

def get_mongo_client(mongo_uri):
  """Establish connection to the MongoDB."""
  try:
    client = pymongo.MongoClient(mongo_uri, appname="devrel.content.python")
    print("Connection to MongoDB successful")
    return client
  except pymongo.errors.ConnectionFailure as e:
    print(f"Connection failed: {e}")
    return None

mongo_uri = "mongodb+srv://sumedha:1024@cluster0.pcy3g.mongodb.net/?retryWrites=true&w=majority&appName=Cluster0"
if not mongo_uri:
  print("MONGO_URI not set in environment variables")

mongo_client = get_mongo_client(mongo_uri)

# Ingest data into MongoDB
db = mongo_client['pdftexts']
collection = db['pdfgen']

Connection to MongoDB successful


In [5]:
# Delete any existing records in the collection
collection.delete_many({})

DeleteResult({'n': 22, 'electionId': ObjectId('7fffffff0000000000000091'), 'opTime': {'ts': Timestamp(1724202675, 54), 't': 145}, 'ok': 1.0, '$clusterTime': {'clusterTime': Timestamp(1724202675, 55), 'signature': {'hash': b'\xf1py\xa3\x05\xeed\xad\xf84\xb6\xdf\xb5\xec\xd82\xa15\xa1\x06', 'keyId': 7342970547904446472}}, 'operationTime': Timestamp(1724202675, 54)}, acknowledged=True)

In [6]:
documents = dataset_df.to_dict("records")
collection.insert_many(documents)

print("Data ingestion into MongoDB completed")

Data ingestion into MongoDB completed


In [7]:
def vector_search(user_query, collection):
    """
    Perform a vector search in the MongoDB collection based on the user query.

    Args:
    user_query (str): The user's query string.
    collection (MongoCollection): The MongoDB collection to search.

    Returns:
    list: A list of matching documents.
    """

    # Generate embedding for the user query
    query_embedding = get_embedding(user_query)

    if not query_embedding:  # Check if embedding generation failed
        return "Invalid query or embedding generation failed."

    # Define the vector search pipeline
    vector_search_stage = {
        "$vectorSearch": {
            "index": "vector_index",  # Ensure this index exists in your collection
            "queryVector": query_embedding,
            "path": "embedding",
            "numCandidates": 150,  # Number of candidate matches to consider
            "limit": 4  # Return top 4 matches
        }
    }

    unset_stage = {
        "$unset": "embedding"  # Exclude the 'embedding' field from the results
    }

    project_stage = {
        "$project": {
            "_id": 0,  # Exclude the _id field
            "Text": 1,  # Include the Text field
            "score": {
                "$meta": "vectorSearchScore"  # Include the search score
            }
        }
    }

    pipeline = [vector_search_stage, unset_stage, project_stage]

    # Execute the search
    results = collection.aggregate(pipeline)
    return list(results)


In [8]:
def get_search_result(query, collection):
    """
    Get search results based on the user's query.

    Args:
    query (str): The user's query string.
    collection (MongoCollection): The MongoDB collection to search.

    Returns:
    str: A formatted string of search results.
    """

    # Perform vector search to retrieve relevant documents
    get_knowledge = vector_search(query, collection)

    # Combine the relevant text into a single string
    search_result = "Relevant information from the documents:\n\n"
    for result in get_knowledge:
        text = result.get('Text', 'N/A')
        search_result += f"{text}\n\n"

    return search_result

In [9]:
# Conduct a query with retrieval of sources
query = "What are the music streaming platforms that exist?"
source_information = get_search_result(query, collection)

# Combine the query with the search results
combined_information = f"Answer the following question using the information provided:\n\n{source_information}\n\nQuestion: {query}"

# Print the combined information (for debugging)
print(combined_information)

Answer the following question using the information provided:

Relevant information from the documents:

International Journal of Multimedia Information Retrieval (2018) 7:95–116
https://doi.org/10.1007/s13735-018-0154-2
TRENDS AND SURVEYS
Current challenges and visions in music recommender
systems research
Markus Schedl1
· Hamed Zamani2 · Ching-Wei Chen3 ·
Yashar Deldjoo4 · Mehdi Elahi5
Received: 28 September 2017 / Revised: 17 March 2018 / Accepted: 27 March 2018 / Published online: 5 April 2018
© The Author(s) 2018
Abstract
Music recommender systems (MRSs) have experienced a boom in recent years, thanks to the emergence and success of online
streaming services, which nowadays make available almost all music in the world at the user’s ﬁngertip. While today’s MRSs
considerably help users to ﬁnd interesting music in these huge catalogs, MRS research is still facing substantial challenges.
In particular when it comes to build, incorporate, and evaluate recommendation strategies that int

In [10]:
from huggingface_hub import login

login()


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [10]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1")
model = AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-v0.1", device_map = "auto" ,torch_dtype=torch.float16)



Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



In [11]:
# Moving tensors to GPU
input_ids = tokenizer(combined_information, return_tensors="pt")
input_ids = input_ids.to(model.device)
response = model.generate(**input_ids, max_new_tokens=20)
print(tokenizer.decode(response[0]))

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


<s> Answer the following question using the information provided:

Relevant information from the documents:

International Journal of Multimedia Information Retrieval (2018) 7:95–116
https://doi.org/10.1007/s13735-018-0154-2
TRENDS AND SURVEYS
Current challenges and visions in music recommender
systems research
Markus Schedl1
· Hamed Zamani2 · Ching-Wei Chen3 ·
Yashar Deldjoo4 · Mehdi Elahi5
Received: 28 September 2017 / Revised: 17 March 2018 / Accepted: 27 March 2018 / Published online: 5 April 2018
© The Author(s) 2018
Abstract
Music recommender systems (MRSs) have experienced a boom in recent years, thanks to the emergence and success of online
streaming services, which nowadays make available almost all music in the world at the user’s ﬁngertip. While today’s MRSs
considerably help users to ﬁnd interesting music in these huge catalogs, MRS research is still facing substantial challenges.
In particular when it comes to build, incorporate, and evaluate recommendation strategies that