In [201]:
# Import libraries
import pandas as pd
import numpy as np
import re as re
from datetime import datetime
import os
import requests
import google.generativeai as genai
import json
import spacy 
from bertopic import BERTopic
import hdbscan

from nltk.tokenize import sent_tokenize
import time
from pymongo.mongo_client import MongoClient
from pymongo.server_api import ServerApi

import vertexai
from vertexai.generative_models import GenerativeModel, Part, FinishReason
import vertexai.preview.generative_models as generative_models
import uuid
from langchain.text_splitter import RecursiveCharacterTextSplitter


## all available on conda except google-cloud-aiplatform and google-generativeai

In [202]:
nlp = spacy.load("en_core_web_trf") 

In [203]:
# # Get the API key from the environment variable
# openai_api_key = os.getenv("OPENAI_API_KEY")

# # Set the API key
# openai.api_key = openai_api_key

# #Defining Seed

# seed = 42


In [204]:
# Replace 'path_to_your_chat.txt' with the actual path to your exported chat file
# If the chat file is in your Google Drive, provide the correct path
chat_path = r"C:\Users\Shardul\OneDrive - London Business School\coding\bonoAI\raw\demo-chat-2.txt"

In [205]:
# Function to parse each line of the chat file
def parse_line(line):
    try:
        # Split the line into timestamp and message content
        timestamp, message = line.strip().split(" - ", 1)
        # Parse the timestamp into a datetime object
        timestamp = datetime.strptime(timestamp, "%m/%d/%y, %I:%M %p")
        # Extract sender and raw text from the message content
        sender, raw_text = message.split(": ", 1)
        # Remove '\n' characters from raw_text
        raw_text = raw_text.replace("\\n", "\n")
        return timestamp, sender, raw_text
    except ValueError:
        # Skip lines that don't match the expected format
        return None, None, None

In [206]:
# Read the chat file line by line and parse each line
chat_data = []
current_message = None
with open(chat_path, "r", encoding="utf-8") as file:
    for line in file:
        timestamp, sender, raw_text = parse_line(line)
        if timestamp is not None and sender is not None and raw_text is not None:
            if current_message is not None:
                chat_data.append(current_message)
            current_message = {"timestamp": timestamp, "sender": sender, "raw_text": raw_text}
        else:
            if current_message is not None:
                current_message["raw_text"] += "\n" + line.strip()

In [207]:
# Create a DataFrame from the parsed chat data
df_main = pd.DataFrame(chat_data)

# Display the DataFrame
print(df_main)

             timestamp          sender  \
0  2024-05-01 00:35:00  Shardul Vaidya   
1  2024-05-01 00:35:00  Shardul Vaidya   
2  2024-05-01 00:35:00  Shardul Vaidya   
3  2024-05-01 00:35:00  Shardul Vaidya   
4  2024-05-01 00:35:00  Shardul Vaidya   
5  2024-05-01 00:35:00  Shardul Vaidya   
6  2024-05-01 00:35:00  Shardul Vaidya   
7  2024-05-01 00:35:00  Shardul Vaidya   
8  2024-05-01 00:35:00  Shardul Vaidya   
9  2024-05-01 00:35:00  Shardul Vaidya   
10 2024-05-01 00:35:00  Shardul Vaidya   
11 2024-05-01 00:35:00  Shardul Vaidya   

                                             raw_text  
0   https://variety.com/2022/tv/news/succession-je...  
1   https://www.theguardian.com/music/gallery/2018...  
2   https://flavoredbyfatima.com/2021/07/06/karach...  
3   https://www.scientificamerican.com/article/nuc...  
4   https://www.artandobject.com/news/how-money-la...  
5   https://en.wikipedia.org/wiki/Propylaia_(Acrop...  
6   https://en.wikipedia.org/wiki/Temple_of_Athena...  
7   h

In [208]:
def generate_uuid(df):
    df['unique_id'] = [uuid.uuid4() for _ in range(len(df))]
    return df

# Example usage
df = generate_uuid(df_main) 

In [209]:
# Replacing the "\n" with actual new lines
df['raw_text'] = df['raw_text'].replace(r'\n', ' ', regex=True)


In [210]:
df['raw_text']

0     https://variety.com/2022/tv/news/succession-je...
1     https://www.theguardian.com/music/gallery/2018...
2     https://flavoredbyfatima.com/2021/07/06/karach...
3     https://www.scientificamerican.com/article/nuc...
4     https://www.artandobject.com/news/how-money-la...
5     https://en.wikipedia.org/wiki/Propylaia_(Acrop...
6     https://en.wikipedia.org/wiki/Temple_of_Athena...
7     https://www.deccanherald.com/sports/cricket/ho...
8     https://www.espn.com/nba/story/_/id/36305442/k...
9     https://theathletic.com/5233957/2024/01/29/sac...
10    https://theprint.in/opinion/bihars-chirand-has...
11    https://www.newscientist.com/article/2409000-p...
Name: raw_text, dtype: object

In [211]:
# Create additional columns based on raw_text column
df['link_dummy'] = df['raw_text'].str.contains('http')

# Extract HTTP links from raw_text and fill link_value column
df['link_value'] = df['raw_text'].str.extract(r'(https?://\S+)', expand=False)


# df['link_label'] = df['link_value'].apply(lambda x: 'Twitter' if x and 'twitter' in x else ('Blog' if x and 'blog' in x else None))
# df['link_content'] = df['raw_text']
df['plaintext_dummy'] = df['link_dummy'].apply(lambda x: 1 if not x else 0)


df['plaintext_content'] = df['raw_text'].where(df['plaintext_dummy'] == 1, None)

# Display the DataFrame with additional columns
print(df)

             timestamp          sender  \
0  2024-05-01 00:35:00  Shardul Vaidya   
1  2024-05-01 00:35:00  Shardul Vaidya   
2  2024-05-01 00:35:00  Shardul Vaidya   
3  2024-05-01 00:35:00  Shardul Vaidya   
4  2024-05-01 00:35:00  Shardul Vaidya   
5  2024-05-01 00:35:00  Shardul Vaidya   
6  2024-05-01 00:35:00  Shardul Vaidya   
7  2024-05-01 00:35:00  Shardul Vaidya   
8  2024-05-01 00:35:00  Shardul Vaidya   
9  2024-05-01 00:35:00  Shardul Vaidya   
10 2024-05-01 00:35:00  Shardul Vaidya   
11 2024-05-01 00:35:00  Shardul Vaidya   

                                             raw_text  \
0   https://variety.com/2022/tv/news/succession-je...   
1   https://www.theguardian.com/music/gallery/2018...   
2   https://flavoredbyfatima.com/2021/07/06/karach...   
3   https://www.scientificamerican.com/article/nuc...   
4   https://www.artandobject.com/news/how-money-la...   
5   https://en.wikipedia.org/wiki/Propylaia_(Acrop...   
6   https://en.wikipedia.org/wiki/Temple_of_Athena... 

In [212]:
df.head(5)

Unnamed: 0,timestamp,sender,raw_text,unique_id,link_dummy,link_value,plaintext_dummy,plaintext_content
0,2024-05-01 00:35:00,Shardul Vaidya,https://variety.com/2022/tv/news/succession-je...,ffefa829-9388-4183-996c-189f3b992907,True,https://variety.com/2022/tv/news/succession-je...,0,
1,2024-05-01 00:35:00,Shardul Vaidya,https://www.theguardian.com/music/gallery/2018...,18f95c6d-b49e-4f04-af48-ad2f037f1dfe,True,https://www.theguardian.com/music/gallery/2018...,0,
2,2024-05-01 00:35:00,Shardul Vaidya,https://flavoredbyfatima.com/2021/07/06/karach...,0f3bbfbd-5ef7-4589-be15-75b67963e98c,True,https://flavoredbyfatima.com/2021/07/06/karach...,0,
3,2024-05-01 00:35:00,Shardul Vaidya,https://www.scientificamerican.com/article/nuc...,368a3618-fd82-44c4-a63d-7b40d65b575c,True,https://www.scientificamerican.com/article/nuc...,0,
4,2024-05-01 00:35:00,Shardul Vaidya,https://www.artandobject.com/news/how-money-la...,04a8599c-4b0a-4c7f-8220-9c398fb190d7,True,https://www.artandobject.com/news/how-money-la...,0,


In [213]:
df_dummy = df[['sender','link_value', 'plaintext_content', 'timestamp', 'unique_id']]

In [214]:
# Add a new column 'link_jina' with appended base URL
df_dummy.loc[:,'link_jina'] = df_dummy['link_value'].apply(lambda x: f"https://r.jina.ai/{x}" if pd.notnull(x) else None)

In [215]:
def get_link_content(link_jina):
    if pd.notnull(link_jina):
        try:
            response = requests.get(link_jina)
            response.raise_for_status()  # Check for HTTP errors
            return response.text
        except requests.exceptions.RequestException as e:
            return f"Error: {e}"
    else:
        return None

In [216]:
df_dummy.loc[:,'link_content'] = df_dummy['link_jina'].apply(get_link_content)


In [232]:
df_small = df_dummy.sample(frac=.3)


In [218]:
## This is just a dev environment tool, we will remove this in production

def remove_nan_rows(df, column_name, percentage=0.99):
    null_rows = df[df[column_name].isnull()]
    num_rows_to_remove = int(len(null_rows) * percentage)

    # Randomly select rows to keep
    keep_indices = np.random.choice(null_rows.index, size=len(null_rows) - num_rows_to_remove, replace=False)

    # Keep the selected rows and non-null rows
    df_filtered = pd.concat([df.loc[keep_indices], df[~df[column_name].isnull()]])


    return df_filtered

# Apply to your DataFrame
df_small = remove_nan_rows(df_small.copy(), column_name="link_value", percentage=0.9)
print(df_small)

            sender                                         link_value  \
8   Shardul Vaidya  https://www.espn.com/nba/story/_/id/36305442/k...   
11  Shardul Vaidya  https://www.newscientist.com/article/2409000-p...   
9   Shardul Vaidya  https://theathletic.com/5233957/2024/01/29/sac...   
4   Shardul Vaidya  https://www.artandobject.com/news/how-money-la...   

   plaintext_content           timestamp  \
8               None 2024-05-01 00:35:00   
11              None 2024-05-01 00:35:00   
9               None 2024-05-01 00:35:00   
4               None 2024-05-01 00:35:00   

                               unique_id  \
8   bfb015a4-4336-4e24-8c87-66038a2660df   
11  c9f9c983-6d3f-4779-9b95-07841e432327   
9   5fbdfc8f-4c6d-4a0a-b606-a6a66bf0aadf   
4   04a8599c-4b0a-4c7f-8220-9c398fb190d7   

                                            link_jina  \
8   https://r.jina.ai/https://www.espn.com/nba/sto...   
11  https://r.jina.ai/https://www.newscientist.com...   
9   https://r.jina.ai

In [233]:
df_small = df_small.iloc[0:5]
df_small

Unnamed: 0,sender,link_value,plaintext_content,timestamp,unique_id,link_jina,link_content
1,Shardul Vaidya,https://www.theguardian.com/music/gallery/2018...,,2024-05-01 00:35:00,18f95c6d-b49e-4f04-af48-ad2f037f1dfe,https://r.jina.ai/https://www.theguardian.com/...,Title: A Great Day in Harlem: behind Art Kane'...
2,Shardul Vaidya,https://flavoredbyfatima.com/2021/07/06/karach...,,2024-05-01 00:35:00,0f3bbfbd-5ef7-4589-be15-75b67963e98c,https://r.jina.ai/https://flavoredbyfatima.com...,Title: Karachi Style Deighi Biryani - Flavored...
0,Shardul Vaidya,https://variety.com/2022/tv/news/succession-je...,,2024-05-01 00:35:00,ffefa829-9388-4183-996c-189f3b992907,https://r.jina.ai/https://variety.com/2022/tv/...,Title: 'Succession’ Star Jeremy Strong and Ada...
3,Shardul Vaidya,https://www.scientificamerican.com/article/nuc...,,2024-05-01 00:35:00,368a3618-fd82-44c4-a63d-7b40d65b575c,https://r.jina.ai/https://www.scientificameric...,Title: Nuclear-Testing 'Downwinders' Speak abo...


In [220]:
df_small['link_content'].value_counts()

link_content
Title: Why Stephen A. can't see the Suns coming back to beat the Nuggets - Videos - Watch ESPN\n\nURL Source: https://www.espn.com/nba/story/_/id/36305442/kevin-durant-high-school-year\n\nMarkdown Content:\nStephen A. Smith explains his bleak outlook for the Suns after going down 2-0 to the Nuggets. (2:48)\n                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                      

In [234]:
def df_to_list_of_dicts(df):
  """Converts a DataFrame into a list of dictionaries.

  Args:
      df: A Pandas DataFrame.

  Returns:
      A list of dictionaries, where each dictionary represents a row in the DataFrame.
  """
def df_to_list_of_dicts(df):
    data_list = []
    for index, row in df.iterrows():
        data_dict = row.to_dict()

        # Handle potential None values
        if pd.notnull(data_dict.get('plaintext_content')) and pd.notnull(data_dict.get('link_content')):  
            data_dict['combined_input'] = data_dict['plaintext_content'] + " [DELIMITER] " + data_dict['link_content']

        data_list.append(data_dict)
    return data_list

In [235]:
list_of_dicts = df_to_list_of_dicts(df_small.copy())  # Use a copy to avoid modifying original
print(list_of_dicts)



In [236]:
content_list = [item['link_content'] for item in list_of_dicts if 'link_content' in item]


In [237]:
content_list

["Title: A Great Day in Harlem: behind Art Kane's classic 1958 jazz photograph\n\nURL Source: https://www.theguardian.com/music/gallery/2018/dec/17/a-great-day-in-harlem-behind-art-kaness-classic-1958-jazz-photograph\n\nPublished Time: 2018-12-17T07:00:02.000Z\n\nMarkdown Content:\nThe young art director’s idea to photograph as many of the luminaries of the New York jazz scene as possible together for Esquire’s 1959 Golden Age of Jazz edition began his career as a photographer. Police closed the road to all but residential traffic, and 57 musicians duly assembled in Harlem between Fifth and Madison Avenues. The group included Dizzy Gillespie, Art Blakey, Thelonius Monk, Coleman Hawkins, Lester Young, Charles Mingus, Gerry Mulligan and Count Basie.\n\n*   Art Kane: Harlem 1958 is available at [Wall of Sound Editions](https://www.wallofsoundgallery.com/en/art-kane/)\n",
 'Title: Karachi Style Deighi Biryani - Flavored By Fatima\n\nURL Source: https://flavoredbyfatima.com/2021/07/06/karac

In [238]:

generation_config = {
    "max_output_tokens": 8192,
    "temperature": 1,
    "top_p": 0.95,
}

safety_settings = {
    generative_models.HarmCategory.HARM_CATEGORY_HATE_SPEECH: generative_models.HarmBlockThreshold.BLOCK_ONLY_HIGH ,
    generative_models.HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: generative_models.HarmBlockThreshold.BLOCK_ONLY_HIGH ,
    generative_models.HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: generative_models.HarmBlockThreshold.BLOCK_ONLY_HIGH ,
    generative_models.HarmCategory.HARM_CATEGORY_HARASSMENT: generative_models.HarmBlockThreshold.BLOCK_ONLY_HIGH ,
}




In [239]:
prompt_test = f"{prompt_description_instructions}: {content_list}"  

def generate():
  vertexai.init(project="bonoai-421313", location="us-central1")
  model = GenerativeModel("gemini-1.0-pro")
  responses = model.generate_content(
      [prompt_test],
      generation_config=generation_config,
      safety_settings=safety_settings,
      stream=True,
  )

  for response in responses:
    print(response.text, end="")

generate()

```json
{
 "Detailed Summary": "A discussion between two individuals who are interested in the technical aspects of data science. The following points are touched upon:\n\n* There is interest in exploring ways to automatically detect and categorize the source of the text, such as by website, social media platform, email, podcast, etc., as well as the type of media it is, such as a review, an essay, a news article, etc.\n* The goal is to develop methods for analyzing and comparing data from a variety of sources in an efficient and automated manner. One approach is to extract salient phrases or sentences. Other methods could consider such things as word choice, the length of the text, or any special formatting used. These could then be used to automatically tag the data.\n* Once the data set has been tagged, various tools could be used to explore, compare, and categorize the data. The participants mention methods such as word trees, clustering analysis, and time series analysis to identi

In [240]:
labels = [
    "person",      # people, including fictional characters
    "fac",         # buildings, airports, highways, bridges
    "org",         # organizations, companies, agencies, institutions
    "gpe",         # geopolitical entities like countries, cities, states
    "loc",         # non-gpe locations
    "product",     # vehicles, foods, appareal, appliances, software, toys 
    "event",       # named sports, scientific milestones, historical events
    "work_of_art", # titles of books, songs, movies
    "law",         # named laws, acts, or legislations
    "language",    # any named language
    "date",        # absolute or relative dates or periods
    "time",        # time units smaller than a day
    "percent",     # percentage (e.g., "twenty percent", "18%")
    "money",       # monetary values, including unit
    "quantity",    # measurements, e.g., weight or distance
]



In [241]:
vertexai.init(project="bonoai-421313", location="us-central1")
model_name = GenerativeModel("gemini-1.0-pro")
max_length_per_chunk = 5000

In [242]:
prompt_metadata_instructions = f""" 

Persona: Persona: You are an analytical assistant for a tech-savvy user. Analyze the following content and provide a structured output. Do not include self-referential statements.

Instructions: 
1. Ignore fields such as timestamp and sender
2. Return the summary in the form of a raw dictionary in plain English with the following keys. DO NOT USE python or json prefix anywhere 
* Short Summary: Write a 2 line description of the main content.
* Labels: Provide a comma-separated list of relevant tags (e.g., twitter, url, design, website, experimental).
* Event: Is this related to an event (Calendar meeting, deadline, etc.)? Answer 'Yes' or 'No'.
* Topic:  Identify the primary topic (e.g., Design, Productivity).
* Journal: If this content is from a recognized source, provide the name (Newspaper, Media Outlet, Academic Journal).
* Action Needed:  Explicitly state any to-dos or actions required from the user.
* Author: If possible, identify the author of the content.  
 """

In [243]:
prompt_description_instructions = f""" 

Persona: Persona: You are an analytical assistant for a tech-savvy user. Analyze the following content and provide a structured summary. 

Instructions: 
1. Ignore all meta fields such as timestamp, sender, entities, labels in from the input data
2. Do not be self-referential at all, for example adding statements such as 'I hope this summary is helpful. Please let me know if you have any other questions.'
3. Return the summary in the form of a raw dictionary in plain English with the key "Detailed Summary". 
4. DO NOT USE 'python' or 'json' prefix anywhere 
5. Keep all main points, key events and any significant outcomes or conclusions from the article content
6. Highlight important data, mention relevant figures involved, and discuss the implications or impact mentioned in the article content



 """

In [244]:
def process_row_metadata(data_dict):
    """Processes a single dictionary, sending content to Gemini and adding the summary.

    Args:
        data_dict: A dictionary representing a DataFrame row.

    Returns:
        The modified dictionary with the 'summary' field added.
    """

    # Extract and Combine Content
    if pd.notnull(data_dict.get('plaintext_content')) and pd.notnull(data_dict.get('link_content')):  
        combined_input = data_dict['plaintext_content'] + " [DELIMITER] " + data_dict['link_content']
    elif pd.notnull(data_dict.get('plaintext_content')):
        combined_input = data_dict['plaintext_content']
    elif pd.notnull(data_dict.get('link_content')):
        combined_input = data_dict['link_content'] 
    else:
        combined_input = ""  # Or some default handling


    # spaCy Named Entity Recognition
    doc = nlp(combined_input)
    entities = [(entity.text, entity.label_) for entity in doc.ents] 
    entity_labels = [label for _, label in entities if label in labels]

    # Update the dictionary with spaCy results 
    data_dict["entities"] = entities
    data_dict["entity_labels"] = ", ".join(entity_labels)

    # Gemini API Call
    prompt = f"{prompt_metadata_instructions}: {combined_input}"  
    responses = model_name.generate_content(
        [prompt],
        generation_config=generation_config,  # Your generation settings
        safety_settings=safety_settings     # Your safety settings
    )
    summary = responses.text
    # summary = response.text

    # # MongoDB Structure
    # whatsapp_doc = {
    #     "timestamp": data_dict["timestamp"],  # Assuming you have this field from parsing.
    #     "sender": data_dict["sender"],
    #     "raw_text": data_dict["plaintext_content"], 
    #     "link_references": [data_dict["link_value"]]  # If there are multiple links, add them all
    # }

    # article_doc = {
    #     "url": data_dict["link_value"],
    #     "content": data_dict["link_content"], 
    #     "summary": summary 
    # }



    # Add Summary to the Dictionary
    data_dict['summary'] = summary
    return data_dict


# Process Each Dictionary
processed_docs = [process_row_metadata(data_dict) for data_dict in list_of_dicts]
print(processed_docs)




In [245]:
chunk = []

In [246]:
def divide_into_chunks(text, max_length):
    """
    Divide the text into chunks, each of size less than or equal to max_length.
    This function uses nltk to tokenize the text into sentences and then groups
    these sentences into chunks.

    Args:
        text (str): The input text to divide.
        max_length (int): Maximum length of each chunk in characters.

    Returns:
        List[str]: A list of string chunks, each of length <= max_length.
    """
# Tokenize the text into sentences using nltk
    sentences = sent_tokenize(text)

    i = 0

    chunks = []  # List to hold chunks
    current_chunk = ""  # String to accumulate sentences into a chunk

    # Iterate over each sentence, grouping sentences into chunks
    for sentence in sentences:
        # Check if adding this sentence would exceed the max_length
        if len(current_chunk + ("" if current_chunk == "" else " ") + sentence) > max_length:
            # If the current chunk + new sentence exceeds max_length, save the current chunk
            chunks.append(current_chunk)
            current_chunk = sentence  # Start a new chunk with the current sentence
            
        else:
            # Add the sentence to the current chunk with a space if it's not empty
            current_chunk += ("" if current_chunk == "" else " ") + sentence
    
    # Add the last chunk if it contains any text
    if current_chunk:
        chunks.append(current_chunk)

    for index, chunk in enumerate(chunks):
        print(f"Chunk {index + 1}: {chunk}")
    
    return chunks

In [247]:
def summarize_chunk(chunk, prompt_description_instructions, model_name, generation_config, safety_settings):
    """Summarizes a text chunk using Gemini.

    Args:
        chunk: The text chunk to summarize.
        prompt_description_instructions: Instructions to include in the Gemini prompt.
        model_name: The Gemini model to use for summarization.
        generation_config: Configuration settings for the generation process.
        safety_settings: Safety settings for the generation process.

    Returns:
        The generated summary text.
    """
    try:
        prompt = f"{prompt_description_instructions}: {chunk}"
        response = model_name.generate_content(
            [prompt],
            generation_config=generation_config, 
            safety_settings=safety_settings 
        )
        return response.text
    except Exception as e:
        if "Quota exceeded" in str(e):
            print("Quota exceeded, sleeping for 60 seconds...")
            time.sleep(60)  # Sleep for 60 seconds before retrying
            return summarize_chunk(chunk, prompt_description_instructions, model_name, generation_config, safety_settings)  # Retry summarization
        else:
            raise e  # Re-raise the exception if it's not a quota issue 


In [248]:
def summarize_document(combined_input, prompt_instructions, model_name, generation_config, safety_settings, max_length_per_chunk):
    """Summarizes a document in a hierarchical manner using Gemini.

    Args:
        combined_input: The input text to summarize.
        prompt_instructions: Instructions for the Gemini prompts.
        model: The Gemini model for summarization.
        generation_config: Generation settings for Gemini.
        safety_settings: Safety settings for Gemini.
        max_length_per_chunk: Maximum length of text chunks.

    Returns:
        dict: A dictionary containing the final summary ('final_summary') and 
              the intermediate summaries ('intermediate_summaries'). 
    """

    # Divide into chunks
    chunks = divide_into_chunks(combined_input, max_length_per_chunk)
    
    for index, chunk in enumerate(chunks):
        print(f"Chunk in main summary function number {index + 1}: {chunk}")

    

    # Summarize each chunk
    intermediate_summaries = [
        summarize_chunk(chunk, prompt_instructions, model_name, generation_config, safety_settings) 
        for chunk in chunks
    ]

    print('Intermediate Summary', intermediate_summaries)

    

    # Concatenate intermediate summaries
    concatenated_summary = " ".join(intermediate_summaries)

    print(concatenated_summary)

    

    # Summarize the concatenated summary
    final_summary = summarize_chunk(concatenated_summary, prompt_instructions, model_name, generation_config, safety_settings)

    print(final_summary)
    

    return {
        'final_summary': final_summary,
        'intermediate_summaries': intermediate_summaries
    }


In [249]:
# Process Each Dictionary
full_description = [summarize_document(content_list, prompt_description_instructions, model_name, generation_config, safety_settings, max_length_per_chunk) for content_list in content_list]

Chunk 1: Title: A Great Day in Harlem: behind Art Kane's classic 1958 jazz photograph

URL Source: https://www.theguardian.com/music/gallery/2018/dec/17/a-great-day-in-harlem-behind-art-kaness-classic-1958-jazz-photograph

Published Time: 2018-12-17T07:00:02.000Z

Markdown Content:
The young art director’s idea to photograph as many of the luminaries of the New York jazz scene as possible together for Esquire’s 1959 Golden Age of Jazz edition began his career as a photographer. Police closed the road to all but residential traffic, and 57 musicians duly assembled in Harlem between Fifth and Madison Avenues. The group included Dizzy Gillespie, Art Blakey, Thelonius Monk, Coleman Hawkins, Lester Young, Charles Mingus, Gerry Mulligan and Count Basie. *   Art Kane: Harlem 1958 is available at [Wall of Sound Editions](https://www.wallofsoundgallery.com/en/art-kane/)
Chunk in main summary function number 1: Title: A Great Day in Harlem: behind Art Kane's classic 1958 jazz photograph

URL Sou

In [250]:
full_description

[{'final_summary': '```json\n{\n"Detailed Summary": "## A Great Day in Harlem: Behind Art Kane\'s Classic 1958 Jazz Photograph\\n\\n**This article delves into the story behind Art Kane\'s iconic photograph \'A Great Day in Harlem\', capturing 57 of the most prominent jazz musicians of 1958.**\\n\\n**Here are the key takeaways:**\\n\\n* The photograph was commissioned by Esquire magazine for their 1959 \'Golden Age of Jazz\' issue.\n* Art Kane, then a young art director, assembled the musicians in Harlem on a street corner between Fifth and Madison Avenues.\n* Notably, the impressive lineup included legendary figures like Dizzy Gillespie, Art Blakey, Thelonius Monk, Coleman Hawkins, Lester Young, Charles Mingus, Gerry Mulligan, and Count Basie.\n* Police cleared the street for the photoshoot, enabling Kane to take the remarkable image.\n* \'A Great Day in Harlem\' has transcended time as a symbol of the vibrant jazz scene and the camaraderie amongst the musicians of that era.\\n\\n**Fur

In [251]:
for description in full_description:
    final_summary = description['final_summary']
    print(final_summary)

```json
{
"Detailed Summary": "## A Great Day in Harlem: Behind Art Kane's Classic 1958 Jazz Photograph\n\n**This article delves into the story behind Art Kane's iconic photograph 'A Great Day in Harlem', capturing 57 of the most prominent jazz musicians of 1958.**\n\n**Here are the key takeaways:**\n\n* The photograph was commissioned by Esquire magazine for their 1959 'Golden Age of Jazz' issue.
* Art Kane, then a young art director, assembled the musicians in Harlem on a street corner between Fifth and Madison Avenues.
* Notably, the impressive lineup included legendary figures like Dizzy Gillespie, Art Blakey, Thelonius Monk, Coleman Hawkins, Lester Young, Charles Mingus, Gerry Mulligan, and Count Basie.
* Police cleared the street for the photoshoot, enabling Kane to take the remarkable image.
* 'A Great Day in Harlem' has transcended time as a symbol of the vibrant jazz scene and the camaraderie amongst the musicians of that era.\n\n**Furthermore, the article sheds light on the p

In [253]:
# Process Each Dictionary For Metadata
processed_docs = [process_row_metadata(data_dict) for data_dict in list_of_dicts]
#processed_list = [process_row(data_dict) for data_dict in list_of_dicts]
print(processed_docs)




In [254]:
# Merge the dictionaries
merged_docs = [
    {**doc, **desc} for doc, desc in zip(processed_docs, full_description)
]

# Print the merged list to see the output
print(merged_docs)



In [255]:
merged_docs

# Convert list of dictionaries to a DataFrame
df_merged_final = pd.DataFrame(merged_docs)

# Save DataFrame to CSV
df_merged_final.to_csv(r'C:\Users\Shardul\OneDrive - London Business School\coding\bonoAI\output\merged_documents.csv', index=False)

In [256]:
text = "Your long document text goes here. It might include several sentences that you need to process in parts."
max_length = 1000  # maximum length of each chunk
chunks_demo = divide_into_chunks(text, max_length)
for index, chunk in enumerate(chunks_demo):
    print(f"Chunk {index + 1}: {chunk}")

Chunk 1: Your long document text goes here. It might include several sentences that you need to process in parts.
Chunk 1: Your long document text goes here. It might include several sentences that you need to process in parts.


In [257]:
# Convert the List of Dictionaries to a DataFrame
df_processed = pd.DataFrame(processed_docs)
df_processed.head()

Unnamed: 0,sender,link_value,plaintext_content,timestamp,unique_id,link_jina,link_content,entities,entity_labels,summary
0,Shardul Vaidya,https://www.theguardian.com/music/gallery/2018...,,2024-05-01 00:35:00,18f95c6d-b49e-4f04-af48-ad2f037f1dfe,https://r.jina.ai/https://www.theguardian.com/...,Title: A Great Day in Harlem: behind Art Kane'...,[(A Great Day in Harlem: behind Art Kane's cla...,,"```json\n{\n ""Short Summary"": ""Art Kane's fam..."
1,Shardul Vaidya,https://flavoredbyfatima.com/2021/07/06/karach...,,2024-05-01 00:35:00,0f3bbfbd-5ef7-4589-be15-75b67963e98c,https://r.jina.ai/https://flavoredbyfatima.com...,Title: Karachi Style Deighi Biryani - Flavored...,"[(2021-07-06T05:08:56, DATE), (Pakistani, NORP...",,## Content Summary:\n\n### Short Summary:\n\nT...
2,Shardul Vaidya,https://variety.com/2022/tv/news/succession-je...,,2024-05-01 00:35:00,ffefa829-9388-4183-996c-189f3b992907,https://r.jina.ai/https://variety.com/2022/tv/...,Title: 'Succession’ Star Jeremy Strong and Ada...,"[(Succession, WORK_OF_ART), (Jeremy Strong, PE...",,"```python\n{\n ""Short Summary"": ""This article ..."
3,Shardul Vaidya,https://www.scientificamerican.com/article/nuc...,,2024-05-01 00:35:00,368a3618-fd82-44c4-a63d-7b40d65b575c,https://r.jina.ai/https://www.scientificameric...,Title: Nuclear-Testing 'Downwinders' Speak abo...,"[(2022-01-27T14:00:00, DATE), (Sandra Evans Wa...",,"```python\n{\n ""Short Summary"": ""This article ..."


In [258]:
df_merged_final

Unnamed: 0,sender,link_value,plaintext_content,timestamp,unique_id,link_jina,link_content,entities,entity_labels,summary,final_summary,intermediate_summaries
0,Shardul Vaidya,https://www.theguardian.com/music/gallery/2018...,,2024-05-01 00:35:00,18f95c6d-b49e-4f04-af48-ad2f037f1dfe,https://r.jina.ai/https://www.theguardian.com/...,Title: A Great Day in Harlem: behind Art Kane'...,[(A Great Day in Harlem: behind Art Kane's cla...,,"```json\n{\n ""Short Summary"": ""Art Kane's fam...","```json\n{\n""Detailed Summary"": ""## A Great Da...","[```json\n{\n""Detailed Summary"": ""## A Great D..."
1,Shardul Vaidya,https://flavoredbyfatima.com/2021/07/06/karach...,,2024-05-01 00:35:00,0f3bbfbd-5ef7-4589-be15-75b67963e98c,https://r.jina.ai/https://flavoredbyfatima.com...,Title: Karachi Style Deighi Biryani - Flavored...,"[(2021-07-06T05:08:56, DATE), (Pakistani, NORP...",,## Content Summary:\n\n### Short Summary:\n\nT...,"```json\n{\n ""Detailed Summary"": {\n ""Titl...","[```json\n{\n ""Detailed Summary"": ""This articl..."
2,Shardul Vaidya,https://variety.com/2022/tv/news/succession-je...,,2024-05-01 00:35:00,ffefa829-9388-4183-996c-189f3b992907,https://r.jina.ai/https://variety.com/2022/tv/...,Title: 'Succession’ Star Jeremy Strong and Ada...,"[(Succession, WORK_OF_ART), (Jeremy Strong, PE...",,"```python\n{\n ""Short Summary"": ""This article ...",## Structured Summary of Jeremy Strong and Ada...,"[{\n""Detailed Summary"": ""## Jeremy Strong and ..."
3,Shardul Vaidya,https://www.scientificamerican.com/article/nuc...,,2024-05-01 00:35:00,368a3618-fd82-44c4-a63d-7b40d65b575c,https://r.jina.ai/https://www.scientificameric...,Title: Nuclear-Testing 'Downwinders' Speak abo...,"[(2022-01-27T14:00:00, DATE), (Sandra Evans Wa...",,"```python\n{\n ""Short Summary"": ""This article ...","```json\n{\n ""Detailed Summary"": {\n ""Cont...","[```json\n{\n""Detailed Summary"": {\n\n""**Nucle..."


In [None]:
# def process_summary(final_summary):
#     tokenizer = get_encoding('cl100k_base')  # Or your Text-Gecko compatible tokenizer
#     text_splitter = RecursiveCharacterTextSplitter(
#         chunk_size=500,  # Adjust as needed for Text-Gecko
#         chunk_overlap=20,
#         length_function=lambda text: len(tokenizer.encode(text)),
#         separators=['\n\n', '\n', ' ', '']
#     )
#     return text_splitter.split_text(final_summary)

# df_merged_final['chunked_summaries'] = df_merged_final['final_summary'].apply(process_summary)

In [259]:
import pandas as pd
import tiktoken

class DocumentProcessor:
    def __init__(self, df, tokenizer_model='cl100k_base'):
        self.df = df
        self.tokenizer = tiktoken.get_encoding(tokenizer_model)

    def _clean_data(self, text):
        """Clean the input text. Implement your cleaning here."""
        return text.strip()

    def tiktoken_len(self, text):
        """Calculate the number of tokens in the text."""
        tokens = self.tokenizer.encode(text, disallowed_special=())
        return len(tokens)

    def process_documents(self):
        """Process documents from DataFrame."""
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=500,
            chunk_overlap=20,
            length_function=self.tiktoken_len,
            separators=['\n\n', '\n', ' ', '']
        )

        processed_docs = []
        for idx, row in self.df.iterrows():
            text = self._clean_data(row['link_content'])
            chunks = text_splitter.split_text(text)

            for i, chunk in enumerate(chunks):
                processed_docs.append({
                    'unique_id': row['unique_id'],  # Only copy the unique_id from the original row
                    'text': f"Part {i+1} / {len(chunks)} of content \n {chunk}",
                    'chunk_id': i + 1,
                    'total_chunks': len(chunks)
                })

        return pd.DataFrame(processed_docs)


document_processor = DocumentProcessor(df_merged_final)
df_input_for_embeddings = document_processor.process_documents()


In [260]:
document_processor

<__main__.DocumentProcessor at 0x23b8462ffb0>

In [261]:
df_input_for_embeddings

Unnamed: 0,unique_id,text,chunk_id,total_chunks
0,18f95c6d-b49e-4f04-af48-ad2f037f1dfe,Part 1 / 1 of content \n Title: A Great Day in...,1,1
1,0f3bbfbd-5ef7-4589-be15-75b67963e98c,Part 1 / 6 of content \n Title: Karachi Style ...,1,6
2,0f3bbfbd-5ef7-4589-be15-75b67963e98c,Part 2 / 6 of content \n Do not cook the potat...,2,6
3,0f3bbfbd-5ef7-4589-be15-75b67963e98c,Part 3 / 6 of content \n * * *\n\nA few things...,3,6
4,0f3bbfbd-5ef7-4589-be15-75b67963e98c,Part 4 / 6 of content \n * 1 bay leaf\n\n* ...,4,6
5,0f3bbfbd-5ef7-4589-be15-75b67963e98c,Part 5 / 6 of content \n * 2 tbsp vinegar\n\...,5,6
6,0f3bbfbd-5ef7-4589-be15-75b67963e98c,Part 6 / 6 of content \n ### Recipe Video\n\n#...,6,6
7,ffefa829-9388-4183-996c-189f3b992907,Part 1 / 12 of content \n Title: 'Succession’ ...,1,12
8,ffefa829-9388-4183-996c-189f3b992907,Part 2 / 12 of content \n ### Popular on Varie...,2,12
9,ffefa829-9388-4183-996c-189f3b992907,Part 3 / 12 of content \n **JEREMY STRONG:** M...,3,12


In [None]:
### Embeddings need to be generated from df_input_for_embeddings['text'] column and merged by unique_id

In [262]:
from vertexai.language_models import TextEmbeddingInput, TextEmbeddingModel

MODEL = "text-embedding-preview-0409"
TASK = "QUESTION_ANSWERING"
TITLE = "Embedding Title"
OUTPUT_DIMENSIONALITY = ""


def get_embeddings_df(df):
    BATCH_SIZE = 250  # Adjust based on your model's limits

    def process_batch(batch_df):
        texts = batch_df['text'].tolist() 
        embeddings = embed_text(
            model_name=MODEL, 
            task_type=TASK,
            texts=texts
        )
        return embeddings

    embeddings = []
    for i in range(0, len(df), BATCH_SIZE):
        batch_df = df.iloc[i:i + BATCH_SIZE]
        batch_embeddings = process_batch(batch_df)
        embeddings.extend(batch_embeddings)

    df['embedding_vector'] = embeddings
    return df

In [263]:
get_embeddings_df(df_input_for_embeddings)

NameError: name 'embed_text' is not defined

In [264]:
def process_batch(batch_df):
    texts = batch_df['text'].tolist()  # Extract 'text' column
    text_inputs = [TextEmbeddingInput(content=text) for text in texts]  # Format inputs for Vertex AI model 

    embeddings = model.embed(inputs=text_inputs)  # Call the Vertex AI model

    return [embedding.embedding_vector.values for embedding in embeddings]  # Extract embeddings

In [None]:



## Alternative function 
# def get_embeddings_df(df):
#     # Initialize an empty list to store embeddings
#     embeddings = []

#     # Iterate over each row in the dataframe using iterrows()
#     for index, row in df.iterrows():
#         input_text = [row['combined_summary_text']]  # Ensure text is in list format
        
#         # Embed Text
#         embedding = embed_text(
#             model_name=MODEL,
#             task_type=TASK,
#             texts=input_text
#         )[0]  # Extract the embedding since there's only one input per call
        
#         # Append the embedding to the list
#         embeddings.append(embedding)

#     # Assign the list of embeddings to a new dataframe column
#     df['embedding_vector'] = embeddings

#     return df

# get_embeddings_df(df_processed)



In [None]:
get_embeddings_df(df_processed)

In [None]:
df_processed['formatted_embeddings'] = df_processed['embedding_vector'].apply(lambda x: x.values)


In [None]:
df_processed['formatted_embeddings']

In [None]:
# Check if the column elements are actually lists 
if isinstance(df_processed["formatted_embeddings"].iloc[0], list):
    embeddings_array = np.array(df_processed["formatted_embeddings"].tolist())
else:
    # Handle cases where the column elements are not lists (e.g., strings)
    # ... add logic to convert strings into lists of numbers if needed ...
    embeddings_array = np.array(df_processed["formatted_embeddings"].tolist()) # Update this line


In [None]:
embeddings_array

In [None]:
# Example: Assuming `embeddings_array` should be created or verified
# Ensure embeddings_array is properly computed and not empty
if embeddings_array.size == 0:
    print("Error: The embeddings array is empty!")
else:
    print("Embeddings array is ready for modeling.")


# Check if the data column is empty or has non-string types
if df_processed['combined_summary_text'].isnull().any() or not all(isinstance(x, str) for x in df_processed['combined_summary_text']):
    print("Data error: Check 'combined_summary_text' for nulls or non-string entries.")
else:
    print("Data is properly formatted.")



In [None]:
### Below is WIP

In [None]:
from sklearn.decomposition import PCA

pca = PCA(n_components=50)  # Reduce to 50 dimensions as an example
reduced_embeddings = pca.fit_transform(embeddings_array)

clusterer = hdbscan.HDBSCAN(min_cluster_size=2, min_samples=1, metric='euclidean')
labels = clusterer.fit_predict(reduced_embeddings)
print("Number of clusters found:", len(np.unique(labels[labels >= 0])))
print("Labels assigned to data points:", labels)

In [None]:
from sklearn.manifold import TSNE

tsne = TSNE(n_components=2, learning_rate='auto', init='random')
reduced_data = tsne.fit_transform(embeddings_array)

plt.figure(figsize=(10, 8))
plt.scatter(reduced_data[:, 0], reduced_data[:, 1], c=labels, cmap='Spectral', s=50)
plt.title('Data Distribution via t-SNE')
plt.show()

In [None]:
# Create dummy data to test UMAP
dummy_data = np.random.rand(10, 5)  # 10 samples, 5 features each

umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine', random_state=42)
try:
    reduced_embeddings = umap_model.fit_transform(dummy_data)
    print("UMAP works with dummy data, output:", reduced_embeddings)
except Exception as e:
    print("UMAP failed on dummy data with error:", str(e))


In [None]:
# Assuming 'embeddings_array' is your precomputed embeddings and is correctly structured
# Ensure n_neighbors is not greater than the number of samples
if embeddings_array.shape[0] < umap_model.n_neighbors:
    print(f"Reducing n_neighbors from {umap_model.n_neighbors} to {embeddings_array.shape[0] // 2}")
    umap_model.n_neighbors = max(2, embeddings_array.shape[0] // 2)

# Attempt to fit UMAP again
try:
    reduced_embeddings = umap_model.fit_transform(embeddings_array)
    print("UMAP successfully reduced dimensionality.")
except Exception as e:
    print("UMAP failed with adjusted parameters due to:", str(e))

# Use KMeans for clustering
kmeans_model = KMeans(n_clusters=10, random_state=42)
clusters = kmeans_model.fit_predict(reduced_embeddings)


In [None]:
from umap import UMAP

# Adjust UMAP parameters or check input size
umap_model = UMAP(n_neighbors=3, n_components=2, min_dist=0.1, metric='euclidean', n_jobs=1)

# Try manually fitting UMAP to see if it works outside BERTopic
try:
    umap_model.fit(embeddings_array)  # Directly fitting to check if UMAP works with given embeddings
    print("UMAP dimensionality reduction successful.")
except Exception as e:
    print(f"UMAP Error: {e}")


In [None]:
# Validate embeddings array is not empty and contains valid numerical data:
if embeddings_array.size == 0:
    raise ValueError("The embeddings array is empty.")
if np.isnan(embeddings_array).any():
    raise ValueError("NaN values found in embeddings array.")
if embeddings_array.ndim != 2:
    raise ValueError("Embeddings array must be two-dimensional.")


In [None]:
from sklearn.cluster import KMeans

n_clusters = 10

# Load and initialize BERTopic to use KMeans clustering with 8 clusters only.
cluster_model = KMeans(n_clusters=n_clusters)
topic_model = BERTopic(hdbscan_model=cluster_model)

# df is a dataframe. df['title'] is the column of text we're modeling
df_processed['topic'], probabilities = topic_model.fit_transform(df_processed['combined_summary_text'], embeddings_array)

In [None]:
# Initialize BERTopic with embedding_model set to None since we are using precomputed embeddings
topic_model = BERTopic(embedding_model=None)

In [None]:
# Create a BERTopic instance
topic_model = BERTopic(embedding_model="sentence-transformers/all-MiniLM-L6-v2")  # Or your preferred embedding model 

# Fit the model on your embeddings
topics, probabilities = topic_model.fit_transform(df_processed["formatted_embeddings"])


In [None]:
# Fit the model
topics, probabilities = topic_model.fit_transform(df_processed['formatted_embeddings'])


In [None]:
# Add topic numbers back to the original dataframe
df_processed['topic'] = topics

In [None]:
# Get the Topic Info
topic_info = topic_model.get_topic_info()

# Print topic information
print(topic_info)

# To visualize the topics, if feasible depending on your environment (best in Jupyter Notebooks)
topic_model.visualize_topics()

In [None]:
## Nomic is WIP

In [None]:
# from nomic import atlas


In [None]:
# project = atlas.map_data(
#     embeddings=embeddings_array
# )

In [None]:
!nomic login

In [None]:
# nomic cred
# id: bonodemo
# key: nk-LlPaaXJBULXS3hxXWUCkQGt-NsxMpLtXsM737Fd79Wk

In [None]:
# #MONGO_URI = "mongodb+srv://backend-demo-user:mS5qycxBRSPXmVG@bonoaicluster.n8jbhf5.mongodb.net/?retryWrites=true&w=majority&appName=BonoAICluster"


# MONGO_URI = "mongodb+srv://shardulvaidya95:wMmjqUoEXWSR9J7h@bonoaicluster.n8jbhf5.mongodb.net/?retryWrites=true&w=majority&appName=BonoAICluster"

# # Create a new client and connect to the server
# client = MongoClient(MONGO_URI, server_api=ServerApi('1'))
# # Send a ping to confirm a successful connection
# try:
#     client.admin.command('ping')
#     print("Pinged your deployment. You successfully connected to MongoDB!")
# except Exception as e:
#     print(e)


# for whatsapp_doc, article_doc in processed_docs:
#     whatsapp_result = db.bonoAIWhatsappChats.insert_one(whatsapp_doc)
#     print(f"WhatsApp Chat Inserted with ID: {whatsapp_result.inserted_id}")

#     article_result = db.bonoAIArticles.insert_one(article_doc)
#     print(f"Article Inserted with ID: {article_result.inserted_id}")


# whatsapp_count = db.bonoAIWhatsappChats.count_documents({})  # Pass an empty filter
# article_count = db.bonoAIArticles.count_documents({})  # Pass an empty filter

# print(f"WhatsApp Chats Count: {whatsapp_count}")
# print(f"Articles Count: {article_count}")

