In [27]:
import json
from datasets import load_dataset
import pandas as pd
import openai
import os
from dotenv import load_dotenv
import pymongo
import tiktoken
from sentence_transformers import SentenceTransformer

load_dotenv()
openai.api_key = os.getenv("OPENAI_API")

In [2]:
model = SentenceTransformer("BAAI/bge-large-en-v1.5", device="cuda")

In [3]:
# <https://huggingface.co/datasets/AIatMongoDB/embedded_movies>
# dataset = load_dataset("AIatMongoDB/embedded_movies")
dataset_df = pd.read_json("data.json")

# Convert the dataset to a pandas dataframe
# dataset_df = pd.DataFrame(dataset['train'])

dataset_df.head(5)

Unnamed: 0,category,question,answer
0,marriage,Apa sih yang harus aku siapkan sebelum menikah?,"Sebelum menikah, penting untuk menyiapkan komu..."
1,marriage,"Kalau aku menikah sekarang, apakah aku masih b...",Menikah tidak harus menghalangi cita-citamu. B...
2,marriage,Apa saja yang perlu dipersiapkan sebelum menikah?,"Sebelum menikah, penting untuk mempersiapkan d..."
3,marriage,"Kak, jika aku menikah muda, bagaimana cara mem...",Membagi waktu antara pekerjaan dan keluarga bi...
4,marriage,"Kak, kenapa harus menunggu untuk menikah?","Menunggu itu penting, karena saat kita masih m..."


In [8]:
# Remove data point where plot column is missing
# dataset_df = dataset_df.dropna(subset=['plot'])
# print("\\nNumber of missing values in each column after removal:")
print(dataset_df.isnull().sum())

# Remove the plot_embedding from each data point in the dataset as we are going to create new embeddings with the new OpenAI embedding Model "text-embedding-3-small"
# dataset_df = dataset_df.drop(columns=['plot_embedding'])
dataset_df.head(5)

category    0
question    0
answer      0
dtype: int64


Unnamed: 0,category,question,answer
0,marriage,Apa sih yang harus aku siapkan sebelum menikah?,"Sebelum menikah, penting untuk menyiapkan komu..."
1,marriage,"Kalau aku menikah sekarang, apakah aku masih b...",Menikah tidak harus menghalangi cita-citamu. B...
2,marriage,Apa saja yang perlu dipersiapkan sebelum menikah?,"Sebelum menikah, penting untuk mempersiapkan d..."
3,marriage,"Kak, jika aku menikah muda, bagaimana cara mem...",Membagi waktu antara pekerjaan dan keluarga bi...
4,marriage,"Kak, kenapa harus menunggu untuk menikah?","Menunggu itu penting, karena saat kita masih m..."


In [4]:
EMBEDDING_MODEL = "text-embedding-3-small"

def get_embedding(text):
    """Generate an embedding for the given text using OpenAI's API."""

    # Check for valid input
    if not text or not isinstance(text, str):
        return None

    try:
        # Call OpenAI API to get the embedding
        embedding = openai.embeddings.create(input=text, model=EMBEDDING_MODEL).data[0].embedding
        return embedding
    except Exception as e:
        print(f"Error in get_embedding: {e}")
        return None

In [9]:
def get_embedding(text):
    """Generate an embedding for the given text using OpenAI's API."""
    
    # Check for valid input
    if not text or not isinstance(text, str):
        return None

    try:
        # Call OpenAI API to get the embedding
        embedding = model.encode(text)
        return embedding.tolist()
    except Exception as e:
        print(f"Error in get_embedding: {e}")
        return None

In [1]:
# dataset_df["plot_embedding_optimised"] = dataset_df['plot'].apply(get_embedding)
dataset_df["embedding_optimised"] = dataset_df.apply(lambda x: get_embedding(x.to_json(orient='index')), axis=1)

dataset_df.head()

NameError: name 'dataset_df' is not defined

In [23]:
print(get_embedding('When earthy Dolly Portland is rejected by Captain Gaskell in favor of a socialite, she aids Jamesy McCardle, in league with Malay pirates, in his plot to seize his ship.'))

[0.02042803168296814, 0.019738078117370605, 0.0377715639770031, 0.06250166147947311, 0.008962629362940788, 0.025704145431518555, 0.03122377209365368, 0.003767686430364847, 0.0009292386821471155, -0.05124594643712044, 0.0005128146149218082, -0.03744688257575035, 0.0027885616291314363, 0.029059212654829025, 0.01201330590993166, 0.03612108901143074, 0.021334439516067505, -0.037555109709501266, 0.008292969316244125, 0.02359369955956936, 0.017113549634814262, 0.026637611910700798, -0.021591482684016228, 0.01168862171471119, -0.005150975193828344, 0.020915057510137558, -0.0045117540284991264, 0.012574737891554832, 0.01760057546198368, 0.0038218004629015923, -0.009848746471107006, -0.030844973400235176, 0.01612596958875656, 0.014069637283682823, 0.011742736212909222, -0.018682854250073433, -0.007772121578454971, 0.024229539558291435, -0.02279551886022091, -0.021794408559799194, -0.065586157143116, -0.09491594135761261, 0.04199245572090149, 0.024878906086087227, 0.011330116540193558, -0.004592

In [11]:
def get_mongo_client(mongo_uri):
    """Establish connection to the MongoDB."""
    try:
        client = pymongo.MongoClient(mongo_uri)
        print("Connection to MongoDB successful")
        return client
    except pymongo.errors.ConnectionFailure as e:
        print(f"Connection failed: {e}")
        return None

mongo_uri = os.getenv("MONGO_URI")
if not mongo_uri:
    print("MONGO_URI not set in environment variables")

mongo_client = get_mongo_client(mongo_uri)

# Ingest data into MongoDB
db = mongo_client['child_marriage']
collection = db['data_knowledge']

documents = dataset_df.to_dict('records')
# collection.insert_many(documents)

print("Data ingestion into MongoDB completed")

Connection to MongoDB successful
Data ingestion into MongoDB completed


In [32]:
def vector_search(user_query, collection):
    """
    Perform a vector search in the MongoDB collection based on the user query.

    Args:
    user_query (str): The user's query string.
    collection (MongoCollection): The MongoDB collection to search.

    Returns:
    list: A list of matching documents.
    """

    # Generate embedding for the user query
    query_embedding = get_embedding(user_query)

    if query_embedding is None:
        return "Invalid query or embedding generation failed."

    # Define the vector search pipeline
    pipeline = [
        {
            "$vectorSearch": {
                "index": "vector_index",
                "queryVector": query_embedding,
                "path": "embedding_optimised",
                "numCandidates": 150,  # Number of candidate matches to consider
                "limit": 5  # Return top 5 matches
            }
        },
        {
            "$project": {
                "_id": 0,  # Exclude the _id field
                # "plot_embedding_opitimzed": 0,  # Exclude the plot_embedding_opitimzed field
                "category": 1,  # Include the plot field
                "question": 1,  # Include the title field
                "answer": 1, # Include the genres field
                "score": {
                    "$meta": "vectorSearchScore"  # Include the search score
                }
            }
        }
    ]

    # Execute the search
    results = collection.aggregate(pipeline)
    return list(results)

In [50]:
def handle_user_query(query, collection):

  get_knowledge = vector_search(query, collection)

  search_result = ''
  count = 0
  for result in get_knowledge:
      count += 1
      search_result += f"{count}. Category: {result.get('category', 'N/A')}, question: {result.get('question', 'N/A')} answer: {result.get('answer', 'N/A')}\\n"
      
  completion = openai.chat.completions.create(
      model="gpt-3.5-turbo", 
      messages=[
          {"role": "system", "content": "You psychology."},
          {"role": "user", "content": "Answer this user query: " + query + " with the following context: " + search_result}
      ]
  )
  
  print("Answer this user query: " + query + " with the following context: " + search_result +"\\n")

  return (completion.choices[0].message.content), search_result

In [52]:
encoding = tiktoken.get_encoding('cl100k_base')

# for i in range(10):
#     query = "What is child marriage?"
#     response, source_information = handle_user_query(query, collection)
# 
#     print(f"Query: {query}")
#     print(f"Response: {response}\n")
#     print(f"Source Information: \\n{source_information}\n")
#     print(f"Tokens: {len(encoding.encode(response))}")
#     print("\n")

const_input = (0.5/1000000*765) + (0.02/1000000*181) # model gpt-3.5-turbo & text-embedding-3-small
const_output = 1.5/1000000*1030 # model gpt-3.5-turbo
cost_total = const_input + const_output # $0,01963 / 20 request, tgl 22/10/2024, untuk 1000 user $19,63/20 request
query = "kenapa nikah dini dilarang?"
print(f"Tokens input: {encoding.encode(query)},{len(encoding.encode(query))}")
response, source_information = handle_user_query(query, collection)
print(f"Response: {response}\n")
print(f"Source Information: \\n{source_information}\n")
print(f"Tokens output: {encoding.encode(response)},{len(encoding.encode(response))}")

Tokens input: [2779, 18826, 76202, 1494, 294, 6729, 294, 3653, 526, 30],10
Answer this user query: kenapa nikah dini dilarang? with the following context: 1. Category: marriage, question: Kenapa harus menunggu sampai dewasa untuk menikah? answer: Menunggu sampai dewasa memberi kamu kesempatan untuk lebih siap menghadapi berbagai tantangan dalam hidup. Pendidikan membantu kamu belajar cara membuat keputusan yang baik dan bertanggung jawab dalam pernikahan nanti.\n2. Category: social, question: Kak, kenapa aku nggak boleh nikah sekarang? Aku udah cinta banget sama pacarku. answer: Rasa cinta itu indah, tapi menikah butuh lebih dari sekadar cinta. Kamu harus siap mental, fisik, dan juga secara finansial. Lebih baik kamu nikmati dulu masa mudamu, fokus pada impianmu, nanti kamu bisa menikah saat sudah benar-benar siap.\n3. Category: marriage, question: Kak, apakah nikah muda bikin kita lebih cepat dewasa? answer: Dewasa itu bukan hanya soal menikah. Kamu bisa menjadi dewasa melalui pengala