In [None]:
# Import libraries
import pandas as pd
import numpy as np
import re as re
from datetime import datetime
import os
import requests
import google.generativeai as genai
import json
import spacy 
from bertopic import BERTopic
import hdbscan

from nltk.tokenize import sent_tokenize
import time
from pymongo.mongo_client import MongoClient
from pymongo.server_api import ServerApi

import vertexai
from vertexai.generative_models import GenerativeModel, Part, FinishReason
import vertexai.preview.generative_models as generative_models


## all available on conda except google-cloud-aiplatform and google-generativeai

In [None]:
# Load the spaCy "en_core_web_trf" pipeline once; `nlp` is reused by
# process_row_metadata for named-entity recognition.
nlp = spacy.load("en_core_web_trf") 

In [None]:
# # Get the API key from the environment variable
# openai_api_key = os.getenv("OPENAI_API_KEY")

# # Set the API key
# openai.api_key = openai_api_key

# #Defining Seed

# seed = 42


In [None]:
# Path to the exported chat text file that the reading cell below opens.
# NOTE(review): this is an absolute, machine-specific path; consider building it
# from a configurable data directory so the notebook runs on other machines.
chat_path = r"C:\Users\Shardul\OneDrive - London Business School\coding\bonoAI\raw\demo-chat-2.txt"

In [None]:
# Function to parse each line of the chat file
def parse_line(line, timestamp_format="%m/%d/%y, %I:%M %p"):
    """Parse one line of an exported chat into its components.

    A message line is expected to look like ``<timestamp> - <sender>: <text>``.

    Args:
        line: A single raw line from the export file.
        timestamp_format: ``datetime.strptime`` format of the leading
            timestamp. The default matches a US 12-hour export such as
            ``1/31/24, 9:05 PM``; pass another format for exports that use a
            different date layout.

    Returns:
        A ``(timestamp, sender, raw_text)`` tuple, or ``(None, None, None)``
        when the line does not match the expected layout (for example the
        continuation of a multi-line message).
    """
    try:
        # Only the first " - " separates the timestamp from the message body.
        stamp_text, message = line.strip().split(" - ", 1)
        timestamp = datetime.strptime(stamp_text, timestamp_format)
        # Only the first ": " separates the sender from the text.
        sender, raw_text = message.split(": ", 1)
    except ValueError:
        # Raised by a failed tuple unpacking or by strptime: not a message line.
        return None, None, None

    # Turn literal backslash-n sequences (two characters) into real newlines.
    return timestamp, sender, raw_text.replace("\\n", "\n")

In [None]:
# Read the chat file line by line, folding continuation lines into the
# message they belong to.
chat_data = []
current_message = None
with open(chat_path, "r", encoding="utf-8") as file:
    for line in file:
        timestamp, sender, raw_text = parse_line(line)
        if timestamp is not None and sender is not None and raw_text is not None:
            # A new message starts here: store the one collected so far.
            if current_message is not None:
                chat_data.append(current_message)
            current_message = {"timestamp": timestamp, "sender": sender, "raw_text": raw_text}
        elif current_message is not None:
            # Unparseable line: treat it as a continuation of the current message.
            current_message["raw_text"] += "\n" + line.strip()

# Flush the final message; without this the last message in the file is lost,
# because messages are otherwise only stored when the *next* one begins.
if current_message is not None:
    chat_data.append(current_message)

In [None]:
# Create a DataFrame from the parsed chat data
df_main = pd.DataFrame(chat_data)

# Display the DataFrame
print(df_main)

In [None]:
df = df_main.copy()


In [None]:
df_main

In [None]:
# Collapse line breaks inside each message into spaces (the regex r'\n' matches
# a newline character), so every message becomes a single line of text.
df['raw_text'] = df['raw_text'].replace(r'\n', ' ', regex=True)


In [None]:
df['raw_text']

In [None]:
# Create additional columns based on the raw_text column.

# First http(s) URL found in each message (NaN when there is none).
link_values = df['raw_text'].str.extract(r'(https?://\S+)', expand=False)

# A message is a "link message" only when a URL was actually extracted.
# Testing for the bare substring "http" flagged messages that merely mention
# the word, leaving them with neither a link nor plain-text content.
df['link_dummy'] = link_values.notna()
df['link_value'] = link_values


# df['link_label'] = df['link_value'].apply(lambda x: 'Twitter' if x and 'twitter' in x else ('Blog' if x and 'blog' in x else None))
# df['link_content'] = df['raw_text']

# 1 for messages without a link, 0 otherwise.
df['plaintext_dummy'] = (~df['link_dummy']).astype(int)


# Keep the message text only for plain-text messages; link messages get None.
df['plaintext_content'] = df['raw_text'].where(df['plaintext_dummy'] == 1, None)

# Display the DataFrame with additional columns
print(df)

In [None]:
df.head(5)

In [None]:
# Take an explicit copy: later cells add columns to df_dummy with .loc, and
# writing into a slice of df triggers SettingWithCopyWarning and may not stick.
df_dummy = df[['sender','link_value', 'plaintext_content', 'timestamp']].copy()

In [None]:
# Prefix every extracted link with the r.jina.ai base URL; rows without a link stay None.
df_dummy.loc[:, 'link_jina'] = df_dummy['link_value'].map(
    lambda url: None if pd.isnull(url) else f"https://r.jina.ai/{url}"
)

In [None]:
def get_link_content(link_jina, timeout=30):
    """Download the text behind a URL.

    Args:
        link_jina: URL to fetch, or a null value (None/NaN) for rows without a link.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a single unresponsive host would block the cell forever.

    Returns:
        The response body as text; None when `link_jina` is null; or a string
        of the form "Error: <details>" when the request fails or the server
        answers with an HTTP error status.
    """
    if pd.isnull(link_jina):
        return None

    try:
        response = requests.get(link_jina, timeout=timeout)
        response.raise_for_status()  # Turn HTTP error statuses into exceptions
        return response.text
    except requests.exceptions.RequestException as e:
        # Best effort: keep processing the other rows and record the failure as text.
        return f"Error: {e}"

In [None]:
# Fetch the page text for every row that has a link (one HTTP request per link,
# so this cell can be slow). Failed requests are stored as "Error: ..." strings.
df_dummy.loc[:,'link_content'] = df_dummy['link_jina'].apply(get_link_content)


In [None]:
# Work on a random 30% sample of the rows.
# NOTE(review): no random_state is passed, so the sample differs on every run.
df_small = df_dummy.sample(frac=.3)


In [None]:
## This is just a dev environment tool, we will remove this in production

def remove_nan_rows(df, column_name, percentage=0.99):
    """Randomly thin out the rows whose `column_name` value is null.

    Args:
        df: DataFrame to filter.
        column_name: Column whose null rows are candidates for removal.
        percentage: Fraction of the null rows to drop (rounded down to whole rows).

    Returns:
        A new DataFrame holding the surviving null rows followed by every
        non-null row.
    """
    is_missing = df[column_name].isnull()
    missing_index = df[is_missing].index
    drop_count = int(len(missing_index) * percentage)

    # Pick, without replacement, the null rows that survive.
    survivors = np.random.choice(
        missing_index,
        size=len(missing_index) - drop_count,
        replace=False,
    )

    return pd.concat([df.loc[survivors], df[~is_missing]])

# Apply to your DataFrame
df_small = remove_nan_rows(df_small.copy(), column_name="link_value", percentage=0.9)
print(df_small)

In [None]:
df_small = df_small.iloc[0:5]
df_small

In [None]:
df_small['link_content'].value_counts()

In [None]:
def df_to_list_of_dicts(df):
    """Converts a DataFrame into a list of dictionaries.

    Args:
        df: A Pandas DataFrame.

    Returns:
        A list of dictionaries, where each dictionary represents a row in the
        DataFrame. Rows that have both 'plaintext_content' and 'link_content'
        additionally get a 'combined_input' entry joining the two with
        " [DELIMITER] ".
    """
    data_list = []
    for _, row in df.iterrows():
        data_dict = row.to_dict()

        plaintext = data_dict.get('plaintext_content')
        link_text = data_dict.get('link_content')

        # Only combine when both parts are present (neither None nor NaN).
        if pd.notnull(plaintext) and pd.notnull(link_text):
            data_dict['combined_input'] = plaintext + " [DELIMITER] " + link_text

        data_list.append(data_dict)
    return data_list

In [None]:
list_of_dicts = df_to_list_of_dicts(df_small.copy())  # Use a copy to avoid modifying original
print(list_of_dicts)

In [None]:
# Collect the fetched link text of every row, in row order.
# NOTE(review): rows without a link carry a null 'link_content', so this list can
# contain None entries; they are kept so positions line up with list_of_dicts.
content_list = [item['link_content'] for item in list_of_dicts if 'link_content' in item]


In [None]:
content_list

In [None]:

# Generation settings handed to the Gemini calls in this notebook.
generation_config = {
    "max_output_tokens": 8192,
    "temperature": 1,
    "top_p": 0.95,
}

# Safety thresholds handed to the Gemini calls: every listed harm category
# is set to BLOCK_ONLY_HIGH.
safety_settings = {
    generative_models.HarmCategory.HARM_CATEGORY_HATE_SPEECH: generative_models.HarmBlockThreshold.BLOCK_ONLY_HIGH ,
    generative_models.HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: generative_models.HarmBlockThreshold.BLOCK_ONLY_HIGH ,
    generative_models.HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: generative_models.HarmBlockThreshold.BLOCK_ONLY_HIGH ,
    generative_models.HarmCategory.HARM_CATEGORY_HARASSMENT: generative_models.HarmBlockThreshold.BLOCK_ONLY_HIGH ,
}




In [None]:
# Smoke test: send all fetched contents to Gemini in one streamed request.
# NOTE(review): prompt_description_instructions is assigned in a later cell, so
# on a fresh top-to-bottom run this line raises NameError unless that cell has
# been executed first.
# content_list is interpolated as the repr of the whole Python list.
prompt_test = f"{prompt_description_instructions}: {content_list}"  

def generate():
  """Stream a Gemini response for prompt_test and print each part as it arrives."""
  vertexai.init(project="bonoai-421313", location="us-central1")
  model = GenerativeModel("gemini-1.0-pro")
  responses = model.generate_content(
      [prompt_test],
      generation_config=generation_config,
      safety_settings=safety_settings,
      stream=True,
  )

  # With stream=True the call yields partial responses; print them back to back.
  for response in responses:
    print(response.text, end="")

generate()

In [None]:
# Entity types to keep from spaCy's NER output. spaCy reports entity labels in
# upper case (e.g. "PERSON", "ORG"), so the names are spelled that way here;
# lower-case names would never match `entity.label_`.
labels = [
    "PERSON",      # people, including fictional characters
    "FAC",         # buildings, airports, highways, bridges
    "ORG",         # organizations, companies, agencies, institutions
    "GPE",         # geopolitical entities like countries, cities, states
    "LOC",         # non-GPE locations
    "PRODUCT",     # vehicles, foods, apparel, appliances, software, toys
    "EVENT",       # named sports, scientific milestones, historical events
    "WORK_OF_ART", # titles of books, songs, movies
    "LAW",         # named laws, acts, or legislations
    "LANGUAGE",    # any named language
    "DATE",        # absolute or relative dates or periods
    "TIME",        # time units smaller than a day
    "PERCENT",     # percentage (e.g., "twenty percent", "18%")
    "MONEY",       # monetary values, including unit
    "QUANTITY",    # measurements, e.g., weight or distance
]



In [None]:
# Initialise the Vertex AI SDK and create the shared Gemini model handle.
# Note: despite its name, model_name holds a GenerativeModel instance, not a string.
vertexai.init(project="bonoai-421313", location="us-central1")
model_name = GenerativeModel("gemini-1.0-pro")
# Upper bound, in characters, for the chunks produced by divide_into_chunks.
max_length_per_chunk = 5000

In [None]:
# Instructions prepended to each message when asking Gemini for structured
# metadata (see process_row_metadata). A plain string is used because the text
# has no placeholders; the duplicated "Persona:" label has been removed.
prompt_metadata_instructions = """

Persona: You are an analytical assistant for a tech-savvy user. Analyze the following content and provide a structured output. Do not include self-referential statements.

Instructions:
1. Ignore fields such as timestamp and sender
2. Return the summary in the form of a raw dictionary in plain English with the following keys. DO NOT USE python or json prefix anywhere
* Short Summary: Write a 2 line description of the main content.
* Labels: Provide a comma-separated list of relevant tags (e.g., twitter, url, design, website, experimental).
* Event: Is this related to an event (Calendar meeting, deadline, etc.)? Answer 'Yes' or 'No'.
* Topic: Identify the primary topic (e.g., Design, Productivity).
* Journal: If this content is from a recognized source, provide the name (Newspaper, Media Outlet, Academic Journal).
* Action Needed: Explicitly state any to-dos or actions required from the user.
* Author: If possible, identify the author of the content.
"""

In [None]:
# Instructions prepended to each text chunk when asking Gemini for a detailed
# summary (see summarize_chunk). A plain string is used because the text has no
# placeholders; the duplicated "Persona:" label and the stray "in from" have
# been corrected.
prompt_description_instructions = """

Persona: You are an analytical assistant for a tech-savvy user. Analyze the following content and provide a structured summary.

Instructions:
1. Ignore all meta fields such as timestamp, sender, entities, labels in the input data
2. Do not be self-referential at all, for example adding statements such as 'I hope this summary is helpful. Please let me know if you have any other questions.'
3. Return the summary in the form of a raw dictionary in plain English with the key "Detailed Summary".
4. DO NOT USE 'python' or 'json' prefix anywhere
5. Keep all main points, key events and any significant outcomes or conclusions from the article content
6. Highlight important data, mention relevant figures involved, and discuss the implications or impact mentioned in the article content
"""

In [None]:
def process_row_metadata(data_dict):
    """Annotates one row dictionary with spaCy entities and a Gemini metadata summary.

    The analysed text is the row's plain text and/or its fetched link content,
    joined with " [DELIMITER] " when both are present; it is the empty string
    when the row has neither.

    Args:
        data_dict: A dictionary representing a DataFrame row. It is updated in place.

    Returns:
        The same dictionary with three entries added:
        'entities' (list of (text, label) tuples), 'entity_labels'
        (comma-separated labels of the entities whose type is listed in the
        module-level `labels`), and 'summary' (the Gemini response text).
    """

    # Extract and combine content, skipping parts that are None/NaN
    available_parts = [
        part
        for part in (data_dict.get('plaintext_content'), data_dict.get('link_content'))
        if pd.notnull(part)
    ]
    combined_input = " [DELIMITER] ".join(available_parts)

    # spaCy Named Entity Recognition
    doc = nlp(combined_input)
    entities = [(entity.text, entity.label_) for entity in doc.ents]

    # Compare labels case-insensitively: spaCy reports them in upper case
    # ("PERSON"), so a case-sensitive test against lower-case names in `labels`
    # would never match and 'entity_labels' would always be empty.
    wanted_labels = {str(label).lower() for label in labels}
    entity_labels = [label for _, label in entities if label.lower() in wanted_labels]

    # Update the dictionary with spaCy results
    data_dict["entities"] = entities
    data_dict["entity_labels"] = ", ".join(entity_labels)

    # Gemini API Call
    prompt = f"{prompt_metadata_instructions}: {combined_input}"
    responses = model_name.generate_content(
        [prompt],
        generation_config=generation_config,  # Your generation settings
        safety_settings=safety_settings     # Your safety settings
    )
    summary = responses.text

    # # MongoDB Structure
    # whatsapp_doc = {
    #     "timestamp": data_dict["timestamp"],  # Assuming you have this field from parsing.
    #     "sender": data_dict["sender"],
    #     "raw_text": data_dict["plaintext_content"], 
    #     "link_references": [data_dict["link_value"]]  # If there are multiple links, add them all
    # }

    # article_doc = {
    #     "url": data_dict["link_value"],
    #     "content": data_dict["link_content"], 
    #     "summary": summary 
    # }

    # Add Summary to the Dictionary
    data_dict['summary'] = summary
    return data_dict


# Process each dictionary (one spaCy pass and one Gemini request per row).
# NOTE(review): the same comprehension is executed again in a later cell
# ("Process Each Dictionary For Metadata"), which repeats every Gemini request.
processed_docs = [process_row_metadata(data_dict) for data_dict in list_of_dicts]
print(processed_docs)


In [None]:
chunk = []

In [None]:
def divide_into_chunks(text, max_length):
    """
    Divide the text into chunks, each of size less than or equal to max_length.

    The text is tokenized into sentences with nltk and consecutive sentences
    are joined with single spaces for as long as they fit. A sentence that is
    longer than max_length on its own is cut into max_length-sized pieces so
    the size limit always holds.

    Args:
        text (str): The input text to divide. None, non-string values (e.g. NaN)
            and blank strings yield an empty list.
        max_length (int): Maximum length of each chunk in characters.

    Returns:
        List[str]: A list of non-empty string chunks, each of length <= max_length.

    Raises:
        ValueError: If max_length is not positive.
    """
    # Rows without content arrive as None/NaN; there is nothing to split.
    if not isinstance(text, str) or not text.strip():
        return []
    if max_length <= 0:
        raise ValueError("max_length must be a positive number of characters")

    chunks = []          # Finished chunks
    current_chunk = ""   # Sentences accumulated for the chunk in progress

    # Tokenize the text into sentences using nltk
    for sentence in sent_tokenize(text):
        # Cut over-long sentences so that no single piece can exceed max_length.
        pieces = [
            sentence[start:start + max_length]
            for start in range(0, len(sentence), max_length)
        ]
        for piece in pieces:
            candidate = piece if not current_chunk else f"{current_chunk} {piece}"
            if len(candidate) > max_length:
                # current_chunk is non-empty here, because a lone piece always
                # fits; so an empty chunk is never emitted.
                chunks.append(current_chunk)
                current_chunk = piece
            else:
                current_chunk = candidate

    # Add the last chunk if it contains any text
    if current_chunk:
        chunks.append(current_chunk)

    return chunks

In [None]:
def summarize_chunk(chunk, prompt_description_instructions, model_name, generation_config, safety_settings, max_retries=5, retry_delay=60):
    """Summarizes a text chunk using Gemini.

    Args:
        chunk: The text chunk to summarize.
        prompt_description_instructions: Instructions to include in the Gemini prompt.
        model_name: The Gemini model object to use; its generate_content method is called.
        generation_config: Configuration settings for the generation process.
        safety_settings: Safety settings for the generation process.
        max_retries: How many times to retry after a "Quota exceeded" error
            before giving up and re-raising it.
        retry_delay: Seconds to sleep before each retry.

    Returns:
        The generated summary text.

    Raises:
        Exception: Any error from the model call that is not a quota error, or
            the quota error itself once max_retries retries have been used.
    """
    prompt = f"{prompt_description_instructions}: {chunk}"
    retries_used = 0

    # Loop instead of recursing so a long quota outage cannot exhaust the call
    # stack, and so the number of retries is bounded.
    while True:
        try:
            response = model_name.generate_content(
                [prompt],
                generation_config=generation_config,
                safety_settings=safety_settings
            )
            return response.text
        except Exception as e:
            if "Quota exceeded" not in str(e) or retries_used >= max_retries:
                raise  # Not a quota issue, or retries exhausted
            retries_used += 1
            print(f"Quota exceeded, sleeping for {retry_delay} seconds...")
            time.sleep(retry_delay)


In [None]:
def summarize_document(combined_input, prompt_instructions, model_name, generation_config, safety_settings, max_length_per_chunk):
    """Summarizes a document in a hierarchical manner using Gemini.

    The text is split into chunks, each chunk is summarized, and the joined
    chunk summaries are summarized once more to produce the final summary.

    Args:
        combined_input: The input text to summarize. None, non-string values
            (e.g. NaN for rows without content) and blank strings are accepted
            and produce an empty result without calling the model.
        prompt_instructions: Instructions for the Gemini prompts.
        model_name: The Gemini model object used for summarization.
        generation_config: Generation settings for Gemini.
        safety_settings: Safety settings for Gemini.
        max_length_per_chunk: Maximum length of text chunks, in characters.

    Returns:
        dict: A dictionary containing the final summary ('final_summary') and
              the intermediate summaries ('intermediate_summaries'). For empty
              input these are None and an empty list.
    """
    # Rows without a link have no content; skip them instead of failing in the
    # sentence tokenizer or sending an empty prompt to the model.
    if not isinstance(combined_input, str) or not combined_input.strip():
        return {
            'final_summary': None,
            'intermediate_summaries': []
        }

    # Divide into chunks
    chunks = divide_into_chunks(combined_input, max_length_per_chunk)

    for index, chunk in enumerate(chunks):
        print(f"Chunk in main summary function number {index + 1}: {chunk}")

    # Summarize each chunk
    intermediate_summaries = [
        summarize_chunk(chunk, prompt_instructions, model_name, generation_config, safety_settings)
        for chunk in chunks
    ]

    print('Intermediate Summary', intermediate_summaries)

    # Concatenate intermediate summaries
    concatenated_summary = " ".join(intermediate_summaries)

    print(concatenated_summary)

    # Summarize the concatenated summary
    final_summary = summarize_chunk(concatenated_summary, prompt_instructions, model_name, generation_config, safety_settings)

    print(final_summary)

    return {
        'final_summary': final_summary,
        'intermediate_summaries': intermediate_summaries
    }


In [None]:
# Build the hierarchical summary for the fetched content of every row.
full_description = [
    summarize_document(
        content,
        prompt_description_instructions,
        model_name,
        generation_config,
        safety_settings,
        max_length_per_chunk,
    )
    for content in content_list
]

In [None]:
full_description

In [None]:
for description in full_description:
    final_summary = description['final_summary']
    print(final_summary)

In [None]:
# Process Each Dictionary For Metadata
processed_docs = [process_row_metadata(data_dict) for data_dict in list_of_dicts]
#processed_list = [process_row(data_dict) for data_dict in list_of_dicts]
print(processed_docs)


In [None]:
# Merge the dictionaries: each metadata dict is paired by position with its
# summary dict. On a key collision the value from `desc` wins, and zip stops at
# the shorter of the two lists.
merged_docs = [
    {**doc, **desc} for doc, desc in zip(processed_docs, full_description)
]

# Print the merged list to see the output
print(merged_docs)

In [147]:
# Bare expression; it is not the last statement of the cell, so nothing is displayed.
merged_docs


# Convert list of dictionaries to a DataFrame
df_merged_final = pd.DataFrame(merged_docs)

# Save DataFrame to CSV
# NOTE(review): absolute, machine-specific output path; consider a configurable
# output directory so the notebook runs on other machines.
df_merged_final.to_csv(r'C:\Users\Shardul\OneDrive - London Business School\coding\bonoAI\output\merged_documents.csv', index=False)

In [None]:
text = "Your long document text goes here. It might include several sentences that you need to process in parts."
max_length = 1000  # maximum length of each chunk
chunks_demo = divide_into_chunks(text, max_length)
for index, chunk in enumerate(chunks_demo):
    print(f"Chunk {index + 1}: {chunk}")

In [None]:
# Convert the List of Dictionaries to a DataFrame
df_processed = pd.DataFrame(processed_docs)
df_processed.head()

In [None]:
# Render each row's entity list as "text (LABEL), text (LABEL), ...".
df_processed['entities_str'] = df_processed['entities'].apply(lambda x: ', '.join([f"{ent[0]} ({ent[1]})" for ent in x]))
# Text that gets embedded later: the entity string followed by the Gemini summary.
df_processed['combined_summary_text'] = df_processed['entities_str'] + " " + df_processed['summary']

In [None]:
from vertexai.language_models import TextEmbeddingInput, TextEmbeddingModel

# Embedding configuration used by get_embeddings_df.
MODEL = "text-embedding-preview-0409"
TASK = "QUESTION_ANSWERING"
# NOTE(review): TITLE and OUTPUT_DIMENSIONALITY are not referenced anywhere in
# the visible notebook code — confirm whether they are still needed.
TITLE = "Embedding Title"
OUTPUT_DIMENSIONALITY = ""


In [None]:
def get_embeddings_df(df):
    """Embed the 'combined_summary_text' column with the Vertex AI model MODEL.

    The texts are sent in batches, using the task type TASK. The model is
    called directly here because no `embed_text` helper is defined in this
    notebook.

    Args:
        df: DataFrame with a 'combined_summary_text' column of strings.
            It is modified in place.

    Returns:
        The same DataFrame with an 'embedding_vector' column holding one
        embedding object per row (the numeric vector is its `.values`).
    """
    BATCH_SIZE = 250  # Adjust based on your model's limits

    texts = df['combined_summary_text'].tolist()
    embeddings = []

    if texts:
        # Load the model once rather than once per batch.
        model = TextEmbeddingModel.from_pretrained(MODEL)
        for start in range(0, len(texts), BATCH_SIZE):
            batch_inputs = [
                TextEmbeddingInput(text=text, task_type=TASK)
                for text in texts[start:start + BATCH_SIZE]
            ]
            embeddings.extend(model.get_embeddings(batch_inputs))

    df['embedding_vector'] = embeddings
    return df


## Alternative function 
# def get_embeddings_df(df):
#     # Initialize an empty list to store embeddings
#     embeddings = []

#     # Iterate over each row in the dataframe using iterrows()
#     for index, row in df.iterrows():
#         input_text = [row['combined_summary_text']]  # Ensure text is in list format
        
#         # Embed Text
#         embedding = embed_text(
#             model_name=MODEL,
#             task_type=TASK,
#             texts=input_text
#         )[0]  # Extract the embedding since there's only one input per call
        
#         # Append the embedding to the list
#         embeddings.append(embedding)

#     # Assign the list of embeddings to a new dataframe column
#     df['embedding_vector'] = embeddings

#     return df

# get_embeddings_df(df_processed)



In [None]:
get_embeddings_df(df_processed)

In [None]:
# Pull the numeric vector out of each embedding via its `.values` attribute.
# Assumes every element of 'embedding_vector' exposes `.values` — TODO confirm.
df_processed['formatted_embeddings'] = df_processed['embedding_vector'].apply(lambda x: x.values)


In [None]:
df_processed['formatted_embeddings']

In [None]:
# Stack the per-row embedding vectors into one NumPy array (one row per document).
# A single conversion suffices whatever the element type, and it does not index
# into the column, so it also works when the frame is empty.
embeddings_array = np.array(df_processed["formatted_embeddings"].tolist())


In [None]:
embeddings_array

In [None]:
# Sanity check 1: the embeddings array must contain at least one value.
# These checks only print a message; they do not stop the notebook.
if embeddings_array.size == 0:
    print("Error: The embeddings array is empty!")
else:
    print("Embeddings array is ready for modeling.")


# Sanity check 2: every 'combined_summary_text' entry must be a non-null string.
if df_processed['combined_summary_text'].isnull().any() or not all(isinstance(x, str) for x in df_processed['combined_summary_text']):
    print("Data error: Check 'combined_summary_text' for nulls or non-string entries.")
else:
    print("Data is properly formatted.")



In [None]:
### Below is WIP

In [None]:
from sklearn.decomposition import PCA

# PCA cannot produce more components than min(n_samples, n_features); clamp the
# target of 50 so the cell also works on the small development sample.
n_samples, n_features = embeddings_array.shape
pca = PCA(n_components=min(50, n_samples, n_features))
reduced_embeddings = pca.fit_transform(embeddings_array)

clusterer = hdbscan.HDBSCAN(min_cluster_size=2, min_samples=1, metric='euclidean')
# NOTE(review): this rebinds `labels`, which earlier holds the list of entity
# type names read by process_row_metadata; re-running that function after this
# cell sees the cluster ids instead. Consider a distinct name here and in the
# plotting cell that reads it.
labels = clusterer.fit_predict(reduced_embeddings)
# HDBSCAN marks noise points with -1, so only ids >= 0 count as clusters.
print("Number of clusters found:", len(np.unique(labels[labels >= 0])))
print("Labels assigned to data points:", labels)

In [None]:
import matplotlib.pyplot as plt  # `plt` is used below but was never imported
from sklearn.manifold import TSNE

# t-SNE requires perplexity < n_samples; keep the default of 30 when there is
# enough data and shrink it for small samples.
tsne_perplexity = min(30, max(1, embeddings_array.shape[0] - 1))
tsne = TSNE(n_components=2, learning_rate='auto', init='random', perplexity=tsne_perplexity)
reduced_data = tsne.fit_transform(embeddings_array)

# Scatter the 2-D projection, coloured by the cluster id of each point.
plt.figure(figsize=(10, 8))
plt.scatter(reduced_data[:, 0], reduced_data[:, 1], c=labels, cmap='Spectral', s=50)
plt.title('Data Distribution via t-SNE')
plt.show()

In [None]:
# Create dummy data to test UMAP
dummy_data = np.random.rand(10, 5)  # 10 samples, 5 features each

# NOTE(review): UMAP is imported only in a later cell (`from umap import UMAP`),
# so on a fresh top-to-bottom run this line raises NameError.
# NOTE(review): n_neighbors=15 is larger than the 10 dummy samples — confirm intended.
umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine', random_state=42)
try:
    reduced_embeddings = umap_model.fit_transform(dummy_data)
    print("UMAP works with dummy data, output:", reduced_embeddings)
except Exception as e:
    # Failure is only reported; the notebook keeps running.
    print("UMAP failed on dummy data with error:", str(e))


In [None]:
# Assuming 'embeddings_array' is your precomputed embeddings and is correctly structured
# Ensure n_neighbors is not greater than the number of samples
if embeddings_array.shape[0] < umap_model.n_neighbors:
    print(f"Reducing n_neighbors from {umap_model.n_neighbors} to {embeddings_array.shape[0] // 2}")
    # The value actually set is floored at 2, which can differ from the number printed above.
    umap_model.n_neighbors = max(2, embeddings_array.shape[0] // 2)

# Attempt to fit UMAP again
try:
    reduced_embeddings = umap_model.fit_transform(embeddings_array)
    print("UMAP successfully reduced dimensionality.")
except Exception as e:
    # NOTE(review): on failure reduced_embeddings keeps whatever an earlier cell
    # assigned, and the KMeans step below then runs on that stale data.
    print("UMAP failed with adjusted parameters due to:", str(e))

# Use KMeans for clustering
# NOTE(review): KMeans is imported only in a later cell, so this raises NameError
# on a fresh top-to-bottom run. n_clusters=10 also needs at least 10 samples.
kmeans_model = KMeans(n_clusters=10, random_state=42)
clusters = kmeans_model.fit_predict(reduced_embeddings)


In [None]:
from umap import UMAP

# Adjust UMAP parameters or check input size
umap_model = UMAP(n_neighbors=3, n_components=2, min_dist=0.1, metric='euclidean', n_jobs=1)

# Try manually fitting UMAP to see if it works outside BERTopic
try:
    umap_model.fit(embeddings_array)  # Directly fitting to check if UMAP works with given embeddings
    print("UMAP dimensionality reduction successful.")
except Exception as e:
    print(f"UMAP Error: {e}")


In [None]:
# Validate embeddings array is not empty and contains valid numerical data:
if embeddings_array.size == 0:
    raise ValueError("The embeddings array is empty.")
if np.isnan(embeddings_array).any():
    raise ValueError("NaN values found in embeddings array.")
if embeddings_array.ndim != 2:
    raise ValueError("Embeddings array must be two-dimensional.")


In [None]:
from sklearn.cluster import KMeans

n_clusters = 10

# Initialize BERTopic so that it clusters with KMeans (n_clusters clusters)
# in place of its default clustering model.
cluster_model = KMeans(n_clusters=n_clusters)
topic_model = BERTopic(hdbscan_model=cluster_model)

# df_processed['combined_summary_text'] is the text being modelled; the
# precomputed embeddings_array is passed alongside it.
df_processed['topic'], probabilities = topic_model.fit_transform(df_processed['combined_summary_text'], embeddings_array)

In [None]:
# Initialize BERTopic with embedding_model set to None since we are using precomputed embeddings
topic_model = BERTopic(embedding_model=None)

In [None]:
# Create a BERTopic instance
topic_model = BERTopic(embedding_model="sentence-transformers/all-MiniLM-L6-v2")  # Or your preferred embedding model

# fit_transform expects the documents (strings) as its first argument; the
# precomputed vectors are supplied separately through `embeddings`.
topics, probabilities = topic_model.fit_transform(
    df_processed["combined_summary_text"].tolist(),
    embeddings=embeddings_array,
)


In [None]:
# Fit the model: documents first, precomputed vectors through `embeddings`
# (passing the embedding column as the documents makes BERTopic treat the
# vectors as text).
topics, probabilities = topic_model.fit_transform(
    df_processed['combined_summary_text'].tolist(),
    embeddings=embeddings_array,
)


In [None]:
# Add topic numbers back to the original dataframe
df_processed['topic'] = topics

In [None]:
# Get the Topic Info
topic_info = topic_model.get_topic_info()

# Print topic information
print(topic_info)

# To visualize the topics, if feasible depending on your environment (best in Jupyter Notebooks)
topic_model.visualize_topics()

In [None]:
## Nomic is WIP

In [None]:
# from nomic import atlas


In [None]:
# project = atlas.map_data(
#     embeddings=embeddings_array
# )

In [None]:
!nomic login

In [None]:
# nomic cred
# id: bonodemo
# key: <REDACTED> - never commit API keys; rotate the key that was stored here and read it from an environment variable (e.g. NOMIC_API_KEY)

In [None]:
# # SECURITY: the MongoDB connection strings that were here embedded usernames and
# # passwords. Rotate those passwords and keep the URI out of the notebook.


# MONGO_URI = os.getenv("MONGO_URI")  # e.g. "mongodb+srv://<user>:<password>@<cluster-host>/?retryWrites=true&w=majority"

# # Create a new client and connect to the server
# client = MongoClient(MONGO_URI, server_api=ServerApi('1'))
# # Send a ping to confirm a successful connection
# try:
#     client.admin.command('ping')
#     print("Pinged your deployment. You successfully connected to MongoDB!")
# except Exception as e:
#     print(e)


# for whatsapp_doc, article_doc in processed_docs:
#     whatsapp_result = db.bonoAIWhatsappChats.insert_one(whatsapp_doc)
#     print(f"WhatsApp Chat Inserted with ID: {whatsapp_result.inserted_id}")

#     article_result = db.bonoAIArticles.insert_one(article_doc)
#     print(f"Article Inserted with ID: {article_result.inserted_id}")


# whatsapp_count = db.bonoAIWhatsappChats.count_documents({})  # Pass an empty filter
# article_count = db.bonoAIArticles.count_documents({})  # Pass an empty filter

# print(f"WhatsApp Chats Count: {whatsapp_count}")
# print(f"Articles Count: {article_count}")

